End of training

Browse files

Files changed (5) hide show

README.md +1 -1
all_results.json +16 -0
eval_results.json +10 -0
train_results.json +9 -0
trainer_state.json +1222 -0

README.md CHANGED Viewed

@@ -14,7 +14,7 @@ should probably proofread and complete it, then remove this comment. -->
 # griffin-1024-llama3t-8layer-simple_wikipedia_LM-vN
-This model is a fine-tuned version of [griffin-1024-llama3t-8layer](https://huggingface.co/griffin-1024-llama3t-8layer) on an unknown dataset.
 It achieves the following results on the evaluation set:
 - Loss: 4.3584
 - Accuracy: 0.3789

 # griffin-1024-llama3t-8layer-simple_wikipedia_LM-vN
+This model is a fine-tuned version of [griffin-1024-llama3t-8layer](https://huggingface.co/griffin-1024-llama3t-8layer) on the pszemraj/simple_wikipedia_LM dataset.
 It achieves the following results on the evaluation set:
 - Loss: 4.3584
 - Accuracy: 0.3789

all_results.json ADDED Viewed

	@@ -0,0 +1,16 @@

+{
+    "epoch": 1.995634549423137,
+    "eval_accuracy": 0.3789325513196481,
+    "eval_loss": 4.358436584472656,
+    "eval_runtime": 20.603,
+    "eval_samples": 250,
+    "eval_samples_per_second": 12.134,
+    "eval_steps_per_second": 3.058,
+    "perplexity": 78.13488159488827,
+    "total_flos": 6.441101073108173e+16,
+    "train_loss": 8.280340445041656,
+    "train_runtime": 18888.2342,
+    "train_samples": 51310,
+    "train_samples_per_second": 5.433,
+    "train_steps_per_second": 0.042
+}

eval_results.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+    "epoch": 1.995634549423137,
+    "eval_accuracy": 0.3789325513196481,
+    "eval_loss": 4.358436584472656,
+    "eval_runtime": 20.603,
+    "eval_samples": 250,
+    "eval_samples_per_second": 12.134,
+    "eval_steps_per_second": 3.058,
+    "perplexity": 78.13488159488827
+}

train_results.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "epoch": 1.995634549423137,
+    "total_flos": 6.441101073108173e+16,
+    "train_loss": 8.280340445041656,
+    "train_runtime": 18888.2342,
+    "train_samples": 51310,
+    "train_samples_per_second": 5.433,
+    "train_steps_per_second": 0.042
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,1222 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.995634549423137,
+  "eval_steps": 100,
+  "global_step": 800,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.012472715933894606,
+      "grad_norm": 6.119478225708008,
+      "learning_rate": 3.75e-05,
+      "loss": 36.8721,
+      "step": 5
+    },
+    {
+      "epoch": 0.024945431867789213,
+      "grad_norm": 2.9732778072357178,
+      "learning_rate": 7.5e-05,
+      "loss": 33.0439,
+      "step": 10
+    },
+    {
+      "epoch": 0.037418147801683815,
+      "grad_norm": 1.5332609415054321,
+      "learning_rate": 0.0001125,
+      "loss": 30.1165,
+      "step": 15
+    },
+    {
+      "epoch": 0.049890863735578425,
+      "grad_norm": 1.2270578145980835,
+      "learning_rate": 0.00015,
+      "loss": 28.647,
+      "step": 20
+    },
+    {
+      "epoch": 0.06236357966947303,
+      "grad_norm": 1.053142786026001,
+      "learning_rate": 0.00018749999999999998,
+      "loss": 26.0629,
+      "step": 25
+    },
+    {
+      "epoch": 0.07483629560336763,
+      "grad_norm": 1.0131248235702515,
+      "learning_rate": 0.000225,
+      "loss": 23.8703,
+      "step": 30
+    },
+    {
+      "epoch": 0.08730901153726224,
+      "grad_norm": 0.9197985529899597,
+      "learning_rate": 0.0002625,
+      "loss": 21.521,
+      "step": 35
+    },
+    {
+      "epoch": 0.09978172747115685,
+      "grad_norm": 1.0926002264022827,
+      "learning_rate": 0.0003,
+      "loss": 19.8433,
+      "step": 40
+    },
+    {
+      "epoch": 0.11225444340505145,
+      "grad_norm": 0.7152827382087708,
+      "learning_rate": 0.0003,
+      "loss": 18.618,
+      "step": 45
+    },
+    {
+      "epoch": 0.12472715933894606,
+      "grad_norm": 0.6178381443023682,
+      "learning_rate": 0.0003,
+      "loss": 17.3644,
+      "step": 50
+    },
+    {
+      "epoch": 0.13719987527284067,
+      "grad_norm": 0.48063215613365173,
+      "learning_rate": 0.0003,
+      "loss": 16.6105,
+      "step": 55
+    },
+    {
+      "epoch": 0.14967259120673526,
+      "grad_norm": 0.46090102195739746,
+      "learning_rate": 0.0003,
+      "loss": 16.2326,
+      "step": 60
+    },
+    {
+      "epoch": 0.16214530714062989,
+      "grad_norm": 0.4266461730003357,
+      "learning_rate": 0.0003,
+      "loss": 15.8385,
+      "step": 65
+    },
+    {
+      "epoch": 0.17461802307452448,
+      "grad_norm": 0.3876805901527405,
+      "learning_rate": 0.0003,
+      "loss": 15.3119,
+      "step": 70
+    },
+    {
+      "epoch": 0.18709073900841908,
+      "grad_norm": 0.3796117603778839,
+      "learning_rate": 0.0003,
+      "loss": 15.2481,
+      "step": 75
+    },
+    {
+      "epoch": 0.1995634549423137,
+      "grad_norm": 0.37646082043647766,
+      "learning_rate": 0.0003,
+      "loss": 14.7319,
+      "step": 80
+    },
+    {
+      "epoch": 0.2120361708762083,
+      "grad_norm": 0.3688748776912689,
+      "learning_rate": 0.0003,
+      "loss": 14.6364,
+      "step": 85
+    },
+    {
+      "epoch": 0.2245088868101029,
+      "grad_norm": 0.37435677647590637,
+      "learning_rate": 0.0003,
+      "loss": 14.2134,
+      "step": 90
+    },
+    {
+      "epoch": 0.23698160274399752,
+      "grad_norm": 0.36440223455429077,
+      "learning_rate": 0.0003,
+      "loss": 13.9198,
+      "step": 95
+    },
+    {
+      "epoch": 0.2494543186778921,
+      "grad_norm": 0.33530500531196594,
+      "learning_rate": 0.0003,
+      "loss": 13.6044,
+      "step": 100
+    },
+    {
+      "epoch": 0.2494543186778921,
+      "eval_accuracy": 0.007913978494623657,
+      "eval_loss": 12.544113159179688,
+      "eval_runtime": 18.5829,
+      "eval_samples_per_second": 13.453,
+      "eval_steps_per_second": 3.39,
+      "step": 100
+    },
+    {
+      "epoch": 0.26192703461178674,
+      "grad_norm": 0.3251523971557617,
+      "learning_rate": 0.0003,
+      "loss": 13.3181,
+      "step": 105
+    },
+    {
+      "epoch": 0.27439975054568133,
+      "grad_norm": 0.3473041355609894,
+      "learning_rate": 0.0003,
+      "loss": 12.9976,
+      "step": 110
+    },
+    {
+      "epoch": 0.2868724664795759,
+      "grad_norm": 0.3266255557537079,
+      "learning_rate": 0.0003,
+      "loss": 12.7667,
+      "step": 115
+    },
+    {
+      "epoch": 0.2993451824134705,
+      "grad_norm": 0.35194671154022217,
+      "learning_rate": 0.0003,
+      "loss": 12.7544,
+      "step": 120
+    },
+    {
+      "epoch": 0.3118178983473651,
+      "grad_norm": 0.34635770320892334,
+      "learning_rate": 0.0003,
+      "loss": 12.2756,
+      "step": 125
+    },
+    {
+      "epoch": 0.32429061428125977,
+      "grad_norm": 0.3480404019355774,
+      "learning_rate": 0.0003,
+      "loss": 12.1192,
+      "step": 130
+    },
+    {
+      "epoch": 0.33676333021515437,
+      "grad_norm": 0.3309994339942932,
+      "learning_rate": 0.0003,
+      "loss": 11.8339,
+      "step": 135
+    },
+    {
+      "epoch": 0.34923604614904896,
+      "grad_norm": 0.33558282256126404,
+      "learning_rate": 0.0003,
+      "loss": 11.6745,
+      "step": 140
+    },
+    {
+      "epoch": 0.36170876208294356,
+      "grad_norm": 0.3359847664833069,
+      "learning_rate": 0.0003,
+      "loss": 11.3363,
+      "step": 145
+    },
+    {
+      "epoch": 0.37418147801683815,
+      "grad_norm": 0.33947232365608215,
+      "learning_rate": 0.0003,
+      "loss": 11.0303,
+      "step": 150
+    },
+    {
+      "epoch": 0.38665419395073275,
+      "grad_norm": 0.32984089851379395,
+      "learning_rate": 0.0003,
+      "loss": 10.9271,
+      "step": 155
+    },
+    {
+      "epoch": 0.3991269098846274,
+      "grad_norm": 0.3498048782348633,
+      "learning_rate": 0.0003,
+      "loss": 10.6215,
+      "step": 160
+    },
+    {
+      "epoch": 0.411599625818522,
+      "grad_norm": 0.354889839887619,
+      "learning_rate": 0.0003,
+      "loss": 10.5165,
+      "step": 165
+    },
+    {
+      "epoch": 0.4240723417524166,
+      "grad_norm": 0.34426406025886536,
+      "learning_rate": 0.0003,
+      "loss": 10.0716,
+      "step": 170
+    },
+    {
+      "epoch": 0.4365450576863112,
+      "grad_norm": 0.34653356671333313,
+      "learning_rate": 0.0003,
+      "loss": 10.0709,
+      "step": 175
+    },
+    {
+      "epoch": 0.4490177736202058,
+      "grad_norm": 0.3454643189907074,
+      "learning_rate": 0.0003,
+      "loss": 9.7226,
+      "step": 180
+    },
+    {
+      "epoch": 0.4614904895541004,
+      "grad_norm": 0.3724479377269745,
+      "learning_rate": 0.0003,
+      "loss": 9.5827,
+      "step": 185
+    },
+    {
+      "epoch": 0.47396320548799503,
+      "grad_norm": 0.37687671184539795,
+      "learning_rate": 0.0003,
+      "loss": 9.3702,
+      "step": 190
+    },
+    {
+      "epoch": 0.4864359214218896,
+      "grad_norm": 0.3670942187309265,
+      "learning_rate": 0.0003,
+      "loss": 9.2377,
+      "step": 195
+    },
+    {
+      "epoch": 0.4989086373557842,
+      "grad_norm": 0.3864516019821167,
+      "learning_rate": 0.0003,
+      "loss": 8.9524,
+      "step": 200
+    },
+    {
+      "epoch": 0.4989086373557842,
+      "eval_accuracy": 0.04734701857282502,
+      "eval_loss": 8.425415992736816,
+      "eval_runtime": 17.9427,
+      "eval_samples_per_second": 13.933,
+      "eval_steps_per_second": 3.511,
+      "step": 200
+    },
+    {
+      "epoch": 0.5113813532896788,
+      "grad_norm": 0.3540992736816406,
+      "learning_rate": 0.0003,
+      "loss": 8.9811,
+      "step": 205
+    },
+    {
+      "epoch": 0.5238540692235735,
+      "grad_norm": 0.35756129026412964,
+      "learning_rate": 0.0003,
+      "loss": 8.6522,
+      "step": 210
+    },
+    {
+      "epoch": 0.536326785157468,
+      "grad_norm": 0.38473081588745117,
+      "learning_rate": 0.0003,
+      "loss": 8.6516,
+      "step": 215
+    },
+    {
+      "epoch": 0.5487995010913627,
+      "grad_norm": 0.3616325259208679,
+      "learning_rate": 0.0003,
+      "loss": 8.5213,
+      "step": 220
+    },
+    {
+      "epoch": 0.5612722170252572,
+      "grad_norm": 0.375959187746048,
+      "learning_rate": 0.0003,
+      "loss": 8.3109,
+      "step": 225
+    },
+    {
+      "epoch": 0.5737449329591519,
+      "grad_norm": 0.38421833515167236,
+      "learning_rate": 0.0003,
+      "loss": 8.2747,
+      "step": 230
+    },
+    {
+      "epoch": 0.5862176488930465,
+      "grad_norm": 0.379168301820755,
+      "learning_rate": 0.0003,
+      "loss": 8.197,
+      "step": 235
+    },
+    {
+      "epoch": 0.598690364826941,
+      "grad_norm": 0.39803043007850647,
+      "learning_rate": 0.0003,
+      "loss": 8.0836,
+      "step": 240
+    },
+    {
+      "epoch": 0.6111630807608357,
+      "grad_norm": 0.41287195682525635,
+      "learning_rate": 0.0003,
+      "loss": 7.9406,
+      "step": 245
+    },
+    {
+      "epoch": 0.6236357966947302,
+      "grad_norm": 0.3857806324958801,
+      "learning_rate": 0.0003,
+      "loss": 7.9488,
+      "step": 250
+    },
+    {
+      "epoch": 0.6361085126286249,
+      "grad_norm": 0.3808286488056183,
+      "learning_rate": 0.0003,
+      "loss": 7.7673,
+      "step": 255
+    },
+    {
+      "epoch": 0.6485812285625195,
+      "grad_norm": 0.4393250048160553,
+      "learning_rate": 0.0003,
+      "loss": 7.707,
+      "step": 260
+    },
+    {
+      "epoch": 0.6610539444964141,
+      "grad_norm": 0.4232034981250763,
+      "learning_rate": 0.0003,
+      "loss": 7.7852,
+      "step": 265
+    },
+    {
+      "epoch": 0.6735266604303087,
+      "grad_norm": 0.42222586274147034,
+      "learning_rate": 0.0003,
+      "loss": 7.6145,
+      "step": 270
+    },
+    {
+      "epoch": 0.6859993763642033,
+      "grad_norm": 0.35792261362075806,
+      "learning_rate": 0.0003,
+      "loss": 7.5498,
+      "step": 275
+    },
+    {
+      "epoch": 0.6984720922980979,
+      "grad_norm": 0.343427449464798,
+      "learning_rate": 0.0003,
+      "loss": 7.4698,
+      "step": 280
+    },
+    {
+      "epoch": 0.7109448082319925,
+      "grad_norm": 0.4176105856895447,
+      "learning_rate": 0.0003,
+      "loss": 7.3752,
+      "step": 285
+    },
+    {
+      "epoch": 0.7234175241658871,
+      "grad_norm": 0.40987178683280945,
+      "learning_rate": 0.0003,
+      "loss": 7.342,
+      "step": 290
+    },
+    {
+      "epoch": 0.7358902400997818,
+      "grad_norm": 0.4014261066913605,
+      "learning_rate": 0.0003,
+      "loss": 7.1609,
+      "step": 295
+    },
+    {
+      "epoch": 0.7483629560336763,
+      "grad_norm": 0.4236806035041809,
+      "learning_rate": 0.0003,
+      "loss": 7.1721,
+      "step": 300
+    },
+    {
+      "epoch": 0.7483629560336763,
+      "eval_accuracy": 0.03885043988269795,
+      "eval_loss": 6.619859218597412,
+      "eval_runtime": 18.2015,
+      "eval_samples_per_second": 13.735,
+      "eval_steps_per_second": 3.461,
+      "step": 300
+    },
+    {
+      "epoch": 0.760835671967571,
+      "grad_norm": 0.4133549630641937,
+      "learning_rate": 0.0003,
+      "loss": 7.1892,
+      "step": 305
+    },
+    {
+      "epoch": 0.7733083879014655,
+      "grad_norm": 0.44653546810150146,
+      "learning_rate": 0.0003,
+      "loss": 7.0446,
+      "step": 310
+    },
+    {
+      "epoch": 0.7857811038353602,
+      "grad_norm": 0.41286739706993103,
+      "learning_rate": 0.0003,
+      "loss": 6.9656,
+      "step": 315
+    },
+    {
+      "epoch": 0.7982538197692548,
+      "grad_norm": 0.3720580041408539,
+      "learning_rate": 0.0003,
+      "loss": 6.907,
+      "step": 320
+    },
+    {
+      "epoch": 0.8107265357031493,
+      "grad_norm": 0.39917078614234924,
+      "learning_rate": 0.0003,
+      "loss": 6.9853,
+      "step": 325
+    },
+    {
+      "epoch": 0.823199251637044,
+      "grad_norm": 0.4373719096183777,
+      "learning_rate": 0.0003,
+      "loss": 6.8592,
+      "step": 330
+    },
+    {
+      "epoch": 0.8356719675709385,
+      "grad_norm": 0.4183291792869568,
+      "learning_rate": 0.0003,
+      "loss": 6.7432,
+      "step": 335
+    },
+    {
+      "epoch": 0.8481446835048332,
+      "grad_norm": 0.40696659684181213,
+      "learning_rate": 0.0003,
+      "loss": 6.7505,
+      "step": 340
+    },
+    {
+      "epoch": 0.8606173994387278,
+      "grad_norm": 0.36887314915657043,
+      "learning_rate": 0.0003,
+      "loss": 6.7657,
+      "step": 345
+    },
+    {
+      "epoch": 0.8730901153726224,
+      "grad_norm": 0.4768717885017395,
+      "learning_rate": 0.0003,
+      "loss": 6.7173,
+      "step": 350
+    },
+    {
+      "epoch": 0.885562831306517,
+      "grad_norm": 0.43819448351860046,
+      "learning_rate": 0.0003,
+      "loss": 6.5465,
+      "step": 355
+    },
+    {
+      "epoch": 0.8980355472404116,
+      "grad_norm": 0.40145763754844666,
+      "learning_rate": 0.0003,
+      "loss": 6.512,
+      "step": 360
+    },
+    {
+      "epoch": 0.9105082631743062,
+      "grad_norm": 0.49852269887924194,
+      "learning_rate": 0.0003,
+      "loss": 6.5335,
+      "step": 365
+    },
+    {
+      "epoch": 0.9229809791082008,
+      "grad_norm": 0.454698771238327,
+      "learning_rate": 0.0003,
+      "loss": 6.4527,
+      "step": 370
+    },
+    {
+      "epoch": 0.9354536950420954,
+      "grad_norm": 0.4860341250896454,
+      "learning_rate": 0.0003,
+      "loss": 6.4102,
+      "step": 375
+    },
+    {
+      "epoch": 0.9479264109759901,
+      "grad_norm": 0.39718613028526306,
+      "learning_rate": 0.0003,
+      "loss": 6.4694,
+      "step": 380
+    },
+    {
+      "epoch": 0.9603991269098846,
+      "grad_norm": 0.4210009276866913,
+      "learning_rate": 0.0003,
+      "loss": 6.4807,
+      "step": 385
+    },
+    {
+      "epoch": 0.9728718428437793,
+      "grad_norm": 0.4482674300670624,
+      "learning_rate": 0.0003,
+      "loss": 6.414,
+      "step": 390
+    },
+    {
+      "epoch": 0.9853445587776738,
+      "grad_norm": 0.42889419198036194,
+      "learning_rate": 0.0003,
+      "loss": 6.3543,
+      "step": 395
+    },
+    {
+      "epoch": 0.9978172747115684,
+      "grad_norm": 0.5144391059875488,
+      "learning_rate": 0.0003,
+      "loss": 6.2087,
+      "step": 400
+    },
+    {
+      "epoch": 0.9978172747115684,
+      "eval_accuracy": 0.22513000977517106,
+      "eval_loss": 5.719752311706543,
+      "eval_runtime": 17.8865,
+      "eval_samples_per_second": 13.977,
+      "eval_steps_per_second": 3.522,
+      "step": 400
+    },
+    {
+      "epoch": 1.010289990645463,
+      "grad_norm": 0.6417849063873291,
+      "learning_rate": 0.0003,
+      "loss": 6.048,
+      "step": 405
+    },
+    {
+      "epoch": 1.0227627065793576,
+      "grad_norm": 0.5739749073982239,
+      "learning_rate": 0.0003,
+      "loss": 5.9866,
+      "step": 410
+    },
+    {
+      "epoch": 1.0352354225132523,
+      "grad_norm": 0.49603304266929626,
+      "learning_rate": 0.0003,
+      "loss": 5.9419,
+      "step": 415
+    },
+    {
+      "epoch": 1.047708138447147,
+      "grad_norm": 0.5403385162353516,
+      "learning_rate": 0.0003,
+      "loss": 5.8366,
+      "step": 420
+    },
+    {
+      "epoch": 1.0601808543810414,
+      "grad_norm": 0.6306777000427246,
+      "learning_rate": 0.0003,
+      "loss": 5.7657,
+      "step": 425
+    },
+    {
+      "epoch": 1.072653570314936,
+      "grad_norm": 0.7016925811767578,
+      "learning_rate": 0.0003,
+      "loss": 5.6619,
+      "step": 430
+    },
+    {
+      "epoch": 1.0851262862488307,
+      "grad_norm": 0.6606624722480774,
+      "learning_rate": 0.0003,
+      "loss": 5.6094,
+      "step": 435
+    },
+    {
+      "epoch": 1.0975990021827253,
+      "grad_norm": 0.7023086547851562,
+      "learning_rate": 0.0003,
+      "loss": 5.6074,
+      "step": 440
+    },
+    {
+      "epoch": 1.11007171811662,
+      "grad_norm": 0.8505487442016602,
+      "learning_rate": 0.0003,
+      "loss": 5.6959,
+      "step": 445
+    },
+    {
+      "epoch": 1.1225444340505144,
+      "grad_norm": 0.6713190674781799,
+      "learning_rate": 0.0003,
+      "loss": 5.6344,
+      "step": 450
+    },
+    {
+      "epoch": 1.135017149984409,
+      "grad_norm": 0.5908814668655396,
+      "learning_rate": 0.0003,
+      "loss": 5.4591,
+      "step": 455
+    },
+    {
+      "epoch": 1.1474898659183037,
+      "grad_norm": 0.7601476311683655,
+      "learning_rate": 0.0003,
+      "loss": 5.5622,
+      "step": 460
+    },
+    {
+      "epoch": 1.1599625818521984,
+      "grad_norm": 0.5737589001655579,
+      "learning_rate": 0.0003,
+      "loss": 5.4541,
+      "step": 465
+    },
+    {
+      "epoch": 1.172435297786093,
+      "grad_norm": 0.8831024169921875,
+      "learning_rate": 0.0003,
+      "loss": 5.4784,
+      "step": 470
+    },
+    {
+      "epoch": 1.1849080137199874,
+      "grad_norm": 0.8297187089920044,
+      "learning_rate": 0.0003,
+      "loss": 5.4252,
+      "step": 475
+    },
+    {
+      "epoch": 1.197380729653882,
+      "grad_norm": 0.857667863368988,
+      "learning_rate": 0.0003,
+      "loss": 5.3268,
+      "step": 480
+    },
+    {
+      "epoch": 1.2098534455877767,
+      "grad_norm": 0.8937066793441772,
+      "learning_rate": 0.0003,
+      "loss": 5.279,
+      "step": 485
+    },
+    {
+      "epoch": 1.2223261615216714,
+      "grad_norm": 0.784275472164154,
+      "learning_rate": 0.0003,
+      "loss": 5.3079,
+      "step": 490
+    },
+    {
+      "epoch": 1.234798877455566,
+      "grad_norm": 0.7549949884414673,
+      "learning_rate": 0.0003,
+      "loss": 5.3977,
+      "step": 495
+    },
+    {
+      "epoch": 1.2472715933894605,
+      "grad_norm": 0.7452312111854553,
+      "learning_rate": 0.0003,
+      "loss": 5.4917,
+      "step": 500
+    },
+    {
+      "epoch": 1.2472715933894605,
+      "eval_accuracy": 0.32684261974584555,
+      "eval_loss": 4.947990894317627,
+      "eval_runtime": 19.5683,
+      "eval_samples_per_second": 12.776,
+      "eval_steps_per_second": 3.219,
+      "step": 500
+    },
+    {
+      "epoch": 1.2597443093233551,
+      "grad_norm": 0.6744974255561829,
+      "learning_rate": 0.0003,
+      "loss": 5.1679,
+      "step": 505
+    },
+    {
+      "epoch": 1.2722170252572498,
+      "grad_norm": 1.0095832347869873,
+      "learning_rate": 0.0003,
+      "loss": 5.3918,
+      "step": 510
+    },
+    {
+      "epoch": 1.2846897411911444,
+      "grad_norm": 0.7461665272712708,
+      "learning_rate": 0.0003,
+      "loss": 5.2346,
+      "step": 515
+    },
+    {
+      "epoch": 1.2971624571250389,
+      "grad_norm": 0.88801109790802,
+      "learning_rate": 0.0003,
+      "loss": 5.2033,
+      "step": 520
+    },
+    {
+      "epoch": 1.3096351730589335,
+      "grad_norm": 0.7549375891685486,
+      "learning_rate": 0.0003,
+      "loss": 5.098,
+      "step": 525
+    },
+    {
+      "epoch": 1.3221078889928282,
+      "grad_norm": 1.1236454248428345,
+      "learning_rate": 0.0003,
+      "loss": 5.2069,
+      "step": 530
+    },
+    {
+      "epoch": 1.3345806049267228,
+      "grad_norm": 0.9261302947998047,
+      "learning_rate": 0.0003,
+      "loss": 5.1925,
+      "step": 535
+    },
+    {
+      "epoch": 1.3470533208606175,
+      "grad_norm": 0.7248057126998901,
+      "learning_rate": 0.0003,
+      "loss": 5.109,
+      "step": 540
+    },
+    {
+      "epoch": 1.3595260367945121,
+      "grad_norm": 0.941017210483551,
+      "learning_rate": 0.0003,
+      "loss": 5.0975,
+      "step": 545
+    },
+    {
+      "epoch": 1.3719987527284065,
+      "grad_norm": 0.9451349973678589,
+      "learning_rate": 0.0003,
+      "loss": 5.1825,
+      "step": 550
+    },
+    {
+      "epoch": 1.3844714686623012,
+      "grad_norm": 0.9956802725791931,
+      "learning_rate": 0.0003,
+      "loss": 5.1017,
+      "step": 555
+    },
+    {
+      "epoch": 1.3969441845961958,
+      "grad_norm": 1.0484583377838135,
+      "learning_rate": 0.0003,
+      "loss": 5.1371,
+      "step": 560
+    },
+    {
+      "epoch": 1.4094169005300905,
+      "grad_norm": 1.1080021858215332,
+      "learning_rate": 0.0003,
+      "loss": 5.0146,
+      "step": 565
+    },
+    {
+      "epoch": 1.421889616463985,
+      "grad_norm": 0.9495016932487488,
+      "learning_rate": 0.0003,
+      "loss": 5.0971,
+      "step": 570
+    },
+    {
+      "epoch": 1.4343623323978796,
+      "grad_norm": 0.7586097717285156,
+      "learning_rate": 0.0003,
+      "loss": 5.0336,
+      "step": 575
+    },
+    {
+      "epoch": 1.4468350483317742,
+      "grad_norm": 0.647396981716156,
+      "learning_rate": 0.0003,
+      "loss": 5.0119,
+      "step": 580
+    },
+    {
+      "epoch": 1.4593077642656689,
+      "grad_norm": 0.7189023494720459,
+      "learning_rate": 0.0003,
+      "loss": 5.0908,
+      "step": 585
+    },
+    {
+      "epoch": 1.4717804801995635,
+      "grad_norm": 0.9973328113555908,
+      "learning_rate": 0.0003,
+      "loss": 4.7903,
+      "step": 590
+    },
+    {
+      "epoch": 1.4842531961334582,
+      "grad_norm": 0.8094688057899475,
+      "learning_rate": 0.0003,
+      "loss": 5.0103,
+      "step": 595
+    },
+    {
+      "epoch": 1.4967259120673526,
+      "grad_norm": 1.0308438539505005,
+      "learning_rate": 0.0003,
+      "loss": 4.9408,
+      "step": 600
+    },
+    {
+      "epoch": 1.4967259120673526,
+      "eval_accuracy": 0.35667253176930597,
+      "eval_loss": 4.673036575317383,
+      "eval_runtime": 19.5514,
+      "eval_samples_per_second": 12.787,
+      "eval_steps_per_second": 3.222,
+      "step": 600
+    },
+    {
+      "epoch": 1.5091986280012473,
+      "grad_norm": 0.7587366104125977,
+      "learning_rate": 0.0003,
+      "loss": 4.9818,
+      "step": 605
+    },
+    {
+      "epoch": 1.521671343935142,
+      "grad_norm": 1.0271868705749512,
+      "learning_rate": 0.0003,
+      "loss": 4.9614,
+      "step": 610
+    },
+    {
+      "epoch": 1.5341440598690363,
+      "grad_norm": 1.061369776725769,
+      "learning_rate": 0.0003,
+      "loss": 4.8608,
+      "step": 615
+    },
+    {
+      "epoch": 1.546616775802931,
+      "grad_norm": 0.9442321062088013,
+      "learning_rate": 0.0003,
+      "loss": 4.9478,
+      "step": 620
+    },
+    {
+      "epoch": 1.5590894917368257,
+      "grad_norm": 0.8110609650611877,
+      "learning_rate": 0.0003,
+      "loss": 5.0979,
+      "step": 625
+    },
+    {
+      "epoch": 1.5715622076707203,
+      "grad_norm": 0.6862745881080627,
+      "learning_rate": 0.0003,
+      "loss": 4.8345,
+      "step": 630
+    },
+    {
+      "epoch": 1.584034923604615,
+      "grad_norm": 0.8737391233444214,
+      "learning_rate": 0.0003,
+      "loss": 4.8572,
+      "step": 635
+    },
+    {
+      "epoch": 1.5965076395385096,
+      "grad_norm": 0.8002131581306458,
+      "learning_rate": 0.0003,
+      "loss": 4.8072,
+      "step": 640
+    },
+    {
+      "epoch": 1.6089803554724043,
+      "grad_norm": 0.7860103845596313,
+      "learning_rate": 0.0003,
+      "loss": 4.8922,
+      "step": 645
+    },
+    {
+      "epoch": 1.6214530714062987,
+      "grad_norm": 0.9875708222389221,
+      "learning_rate": 0.0003,
+      "loss": 4.9247,
+      "step": 650
+    },
+    {
+      "epoch": 1.6339257873401933,
+      "grad_norm": 0.8873936533927917,
+      "learning_rate": 0.0003,
+      "loss": 4.8795,
+      "step": 655
+    },
+    {
+      "epoch": 1.646398503274088,
+      "grad_norm": 0.7963967323303223,
+      "learning_rate": 0.0003,
+      "loss": 4.835,
+      "step": 660
+    },
+    {
+      "epoch": 1.6588712192079824,
+      "grad_norm": 0.8068607449531555,
+      "learning_rate": 0.0003,
+      "loss": 4.8713,
+      "step": 665
+    },
+    {
+      "epoch": 1.671343935141877,
+      "grad_norm": 0.9093911647796631,
+      "learning_rate": 0.0003,
+      "loss": 4.7725,
+      "step": 670
+    },
+    {
+      "epoch": 1.6838166510757717,
+      "grad_norm": 0.7699265480041504,
+      "learning_rate": 0.0003,
+      "loss": 4.7502,
+      "step": 675
+    },
+    {
+      "epoch": 1.6962893670096664,
+      "grad_norm": 0.7545697689056396,
+      "learning_rate": 0.0003,
+      "loss": 4.9555,
+      "step": 680
+    },
+    {
+      "epoch": 1.708762082943561,
+      "grad_norm": 0.7571801543235779,
+      "learning_rate": 0.0003,
+      "loss": 4.7616,
+      "step": 685
+    },
+    {
+      "epoch": 1.7212347988774557,
+      "grad_norm": 0.7757474184036255,
+      "learning_rate": 0.0003,
+      "loss": 4.6462,
+      "step": 690
+    },
+    {
+      "epoch": 1.7337075148113503,
+      "grad_norm": 0.7473092079162598,
+      "learning_rate": 0.0003,
+      "loss": 4.6699,
+      "step": 695
+    },
+    {
+      "epoch": 1.7461802307452448,
+      "grad_norm": 1.2531319856643677,
+      "learning_rate": 0.0003,
+      "loss": 4.8347,
+      "step": 700
+    },
+    {
+      "epoch": 1.7461802307452448,
+      "eval_accuracy": 0.37069794721407623,
+      "eval_loss": 4.498379707336426,
+      "eval_runtime": 20.0355,
+      "eval_samples_per_second": 12.478,
+      "eval_steps_per_second": 3.144,
+      "step": 700
+    },
+    {
+      "epoch": 1.7586529466791394,
+      "grad_norm": 1.3069407939910889,
+      "learning_rate": 0.0003,
+      "loss": 4.7338,
+      "step": 705
+    },
+    {
+      "epoch": 1.7711256626130338,
+      "grad_norm": 1.1146960258483887,
+      "learning_rate": 0.0003,
+      "loss": 4.8758,
+      "step": 710
+    },
+    {
+      "epoch": 1.7835983785469285,
+      "grad_norm": 1.0376973152160645,
+      "learning_rate": 0.0003,
+      "loss": 4.7604,
+      "step": 715
+    },
+    {
+      "epoch": 1.7960710944808231,
+      "grad_norm": 1.2044090032577515,
+      "learning_rate": 0.0003,
+      "loss": 4.7472,
+      "step": 720
+    },
+    {
+      "epoch": 1.8085438104147178,
+      "grad_norm": 1.0660207271575928,
+      "learning_rate": 0.0003,
+      "loss": 4.79,
+      "step": 725
+    },
+    {
+      "epoch": 1.8210165263486124,
+      "grad_norm": 0.7932606935501099,
+      "learning_rate": 0.0003,
+      "loss": 4.7476,
+      "step": 730
+    },
+    {
+      "epoch": 1.833489242282507,
+      "grad_norm": 0.8554738759994507,
+      "learning_rate": 0.0003,
+      "loss": 4.7839,
+      "step": 735
+    },
+    {
+      "epoch": 1.8459619582164017,
+      "grad_norm": 1.015703797340393,
+      "learning_rate": 0.0003,
+      "loss": 4.7935,
+      "step": 740
+    },
+    {
+      "epoch": 1.8584346741502962,
+      "grad_norm": 1.1005243062973022,
+      "learning_rate": 0.0003,
+      "loss": 4.7913,
+      "step": 745
+    },
+    {
+      "epoch": 1.8709073900841908,
+      "grad_norm": 0.8775972127914429,
+      "learning_rate": 0.0003,
+      "loss": 4.5128,
+      "step": 750
+    },
+    {
+      "epoch": 1.8833801060180855,
+      "grad_norm": 0.8116542100906372,
+      "learning_rate": 0.0003,
+      "loss": 4.6496,
+      "step": 755
+    },
+    {
+      "epoch": 1.89585282195198,
+      "grad_norm": 0.7614642381668091,
+      "learning_rate": 0.0003,
+      "loss": 4.7695,
+      "step": 760
+    },
+    {
+      "epoch": 1.9083255378858746,
+      "grad_norm": 1.0064287185668945,
+      "learning_rate": 0.0003,
+      "loss": 4.7929,
+      "step": 765
+    },
+    {
+      "epoch": 1.9207982538197692,
+      "grad_norm": 0.7342740297317505,
+      "learning_rate": 0.0003,
+      "loss": 4.6711,
+      "step": 770
+    },
+    {
+      "epoch": 1.9332709697536639,
+      "grad_norm": 0.9723834991455078,
+      "learning_rate": 0.0003,
+      "loss": 4.6212,
+      "step": 775
+    },
+    {
+      "epoch": 1.9457436856875585,
+      "grad_norm": 1.20729398727417,
+      "learning_rate": 0.0003,
+      "loss": 4.6513,
+      "step": 780
+    },
+    {
+      "epoch": 1.9582164016214532,
+      "grad_norm": 0.7920907735824585,
+      "learning_rate": 0.0003,
+      "loss": 4.6264,
+      "step": 785
+    },
+    {
+      "epoch": 1.9706891175553478,
+      "grad_norm": 0.6307650804519653,
+      "learning_rate": 0.0003,
+      "loss": 4.6481,
+      "step": 790
+    },
+    {
+      "epoch": 1.9831618334892422,
+      "grad_norm": 0.8942980766296387,
+      "learning_rate": 0.0003,
+      "loss": 4.6598,
+      "step": 795
+    },
+    {
+      "epoch": 1.995634549423137,
+      "grad_norm": 0.7046281099319458,
+      "learning_rate": 0.0003,
+      "loss": 4.7023,
+      "step": 800
+    },
+    {
+      "epoch": 1.995634549423137,
+      "eval_accuracy": 0.3789325513196481,
+      "eval_loss": 4.358436584472656,
+      "eval_runtime": 20.1663,
+      "eval_samples_per_second": 12.397,
+      "eval_steps_per_second": 3.124,
+      "step": 800
+    },
+    {
+      "epoch": 1.995634549423137,
+      "step": 800,
+      "total_flos": 6.441101073108173e+16,
+      "train_loss": 8.280340445041656,
+      "train_runtime": 18888.2342,
+      "train_samples_per_second": 5.433,
+      "train_steps_per_second": 0.042
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 800,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 2,
+  "save_steps": 100,
+  "total_flos": 6.441101073108173e+16,
+  "train_batch_size": 4,
+  "trial_name": null,
+  "trial_params": null
+}