klcsp
/

llama3.1-8b-gpt4o_100k_coding-fft

+---
+library_name: transformers
+license: llama3
+base_model: meta-llama/Meta-Llama-3-8B
+tags:
+- trl
+- sft
+- generated_from_trainer
+datasets:
+- generator
+model-index:
+- name: llama3.1-8b-gpt4o_100k_coding-fft
+  results: []
+---
+<!-- This model card has been generated automatically according to the information the Trainer had access to. You
+should probably proofread and complete it, then remove this comment. -->
+# llama3.1-8b-gpt4o_100k_coding-fft
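+This model is a fine-tuned version of [meta-llama/Meta-Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B) on the `generator` dataset (an automatically recorded placeholder name, not the identifier of a published dataset; the actual training data is not documented here).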
+It achieves the following results on the evaluation set (evaluated once, at the end of training — step 539, epoch 0.9991):
+- Loss: 2.2535
+## Model description
+More information needed
+## Intended uses & limitations
+More information needed
+## Training and evaluation data
+More information needed
+## Training procedure
+### Training hyperparameters
+The following hyperparameters were used during training:
+- learning_rate: 0.0003
+- train_batch_size: 2
+- eval_batch_size: 2
+- seed: 42
+- distributed_type: multi-GPU
+- num_devices: 8
+- gradient_accumulation_steps: 2
+- total_train_batch_size: 32
+- total_eval_batch_size: 16
+- optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
+- lr_scheduler_type: cosine
+- lr_scheduler_warmup_ratio: 0.1
+- num_epochs: 1
+### Training results
+| Training Loss | Epoch  | Step | Validation Loss |
+|:-------------:|:------:|:----:|:---------------:|
+| 0.5499        | 0.9991 | 539  | 2.2535          |
+### Framework versions
+- Transformers 4.45.1
+- Pytorch 2.4.1+cu121
+- Datasets 3.0.1
+- Tokenizers 0.20.0

all_results.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "epoch": 0.9990732159406858,
+    "total_flos": 28187736145920.0,
+    "train_loss": 1.0674916249259283,
+    "train_runtime": 10422.8572,
+    "train_samples": 116368,
+    "train_samples_per_second": 1.655,
+    "train_steps_per_second": 0.052
+}

generation_config.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+  "bos_token_id": 128256,
+  "do_sample": true,
+  "eos_token_id": 128257,
+  "max_length": 4096,
+  "pad_token_id": 128257,
+  "temperature": 0.6,
+  "top_p": 0.9,
+  "transformers_version": "4.45.1"
+}

train_results.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "epoch": 0.9990732159406858,
+    "total_flos": 28187736145920.0,
+    "train_loss": 1.0674916249259283,
+    "train_runtime": 10422.8572,
+    "train_samples": 116368,
+    "train_samples_per_second": 1.655,
+    "train_steps_per_second": 0.052
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,806 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.9990732159406858,
+  "eval_steps": 500,
+  "global_step": 539,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0018535681186283596,
+      "grad_norm": 26.35410018433926,
+      "learning_rate": 5.555555555555555e-06,
+      "loss": 1.6809,
+      "step": 1
+    },
+    {
+      "epoch": 0.009267840593141797,
+      "grad_norm": 39.94533926724377,
+      "learning_rate": 2.7777777777777772e-05,
+      "loss": 1.343,
+      "step": 5
+    },
+    {
+      "epoch": 0.018535681186283594,
+      "grad_norm": 4.5543620999927485,
+      "learning_rate": 5.5555555555555545e-05,
+      "loss": 0.8749,
+      "step": 10
+    },
+    {
+      "epoch": 0.027803521779425393,
+      "grad_norm": 4.24495034141875,
+      "learning_rate": 8.333333333333333e-05,
+      "loss": 0.8704,
+      "step": 15
+    },
+    {
+      "epoch": 0.03707136237256719,
+      "grad_norm": 39.42543160126464,
+      "learning_rate": 0.00011111111111111109,
+      "loss": 0.8733,
+      "step": 20
+    },
+    {
+      "epoch": 0.04633920296570899,
+      "grad_norm": 11.962483325702362,
+      "learning_rate": 0.0001388888888888889,
+      "loss": 1.0553,
+      "step": 25
+    },
+    {
+      "epoch": 0.05560704355885079,
+      "grad_norm": 14.195688905715766,
+      "learning_rate": 0.00016666666666666666,
+      "loss": 1.0623,
+      "step": 30
+    },
+    {
+      "epoch": 0.06487488415199258,
+      "grad_norm": 4.164105779251046,
+      "learning_rate": 0.00019444444444444443,
+      "loss": 0.8718,
+      "step": 35
+    },
+    {
+      "epoch": 0.07414272474513438,
+      "grad_norm": 6.991113945747281,
+      "learning_rate": 0.00022222222222222218,
+      "loss": 0.8619,
+      "step": 40
+    },
+    {
+      "epoch": 0.08341056533827618,
+      "grad_norm": 631.4969668103296,
+      "learning_rate": 0.00025,
+      "loss": 1.8292,
+      "step": 45
+    },
+    {
+      "epoch": 0.09267840593141798,
+      "grad_norm": 51.99035558508526,
+      "learning_rate": 0.0002777777777777778,
+      "loss": 1.6128,
+      "step": 50
+    },
+    {
+      "epoch": 0.10194624652455977,
+      "grad_norm": 5.615188207831111,
+      "learning_rate": 0.0002999968531502098,
+      "loss": 1.1458,
+      "step": 55
+    },
+    {
+      "epoch": 0.11121408711770157,
+      "grad_norm": 162.57564664314097,
+      "learning_rate": 0.0002998867272706619,
+      "loss": 1.7169,
+      "step": 60
+    },
+    {
+      "epoch": 0.12048192771084337,
+      "grad_norm": 27.819505339583905,
+      "learning_rate": 0.0002996193909122197,
+      "loss": 2.1196,
+      "step": 65
+    },
+    {
+      "epoch": 0.12974976830398516,
+      "grad_norm": 12.017979123334465,
+      "learning_rate": 0.00029919512447380625,
+      "loss": 1.3348,
+      "step": 70
+    },
+    {
+      "epoch": 0.13901760889712697,
+      "grad_norm": 529.5307829885717,
+      "learning_rate": 0.0002986143729523282,
+      "loss": 1.0105,
+      "step": 75
+    },
+    {
+      "epoch": 0.14828544949026876,
+      "grad_norm": 2.511805320667406,
+      "learning_rate": 0.000297877745475935,
+      "loss": 1.0586,
+      "step": 80
+    },
+    {
+      "epoch": 0.15755329008341057,
+      "grad_norm": 4.022429995067504,
+      "learning_rate": 0.0002969860146651276,
+      "loss": 0.9055,
+      "step": 85
+    },
+    {
+      "epoch": 0.16682113067655235,
+      "grad_norm": 512.3998660297882,
+      "learning_rate": 0.0002959401158223867,
+      "loss": 5.3655,
+      "step": 90
+    },
+    {
+      "epoch": 0.17608897126969417,
+      "grad_norm": 131.99417407337796,
+      "learning_rate": 0.00029474114595116896,
+      "loss": 3.1238,
+      "step": 95
+    },
+    {
+      "epoch": 0.18535681186283595,
+      "grad_norm": 19.953838142892188,
+      "learning_rate": 0.0002933903626053024,
+      "loss": 1.9603,
+      "step": 100
+    },
+    {
+      "epoch": 0.19462465245597776,
+      "grad_norm": 7.472797510255509,
+      "learning_rate": 0.00029188918256998564,
+      "loss": 1.4801,
+      "step": 105
+    },
+    {
+      "epoch": 0.20389249304911955,
+      "grad_norm": 9.70552960980865,
+      "learning_rate": 0.00029023918037577635,
+      "loss": 1.3544,
+      "step": 110
+    },
+    {
+      "epoch": 0.21316033364226136,
+      "grad_norm": 17.11916105312361,
+      "learning_rate": 0.00028844208664712575,
+      "loss": 1.0681,
+      "step": 115
+    },
+    {
+      "epoch": 0.22242817423540315,
+      "grad_norm": 22.28855318123011,
+      "learning_rate": 0.00028649978628719254,
+      "loss": 1.2611,
+      "step": 120
+    },
+    {
+      "epoch": 0.23169601482854496,
+      "grad_norm": 34.76579789562846,
+      "learning_rate": 0.00028441431650084016,
+      "loss": 1.6181,
+      "step": 125
+    },
+    {
+      "epoch": 0.24096385542168675,
+      "grad_norm": 15.11803328233237,
+      "learning_rate": 0.0002821878646578898,
+      "loss": 1.1601,
+      "step": 130
+    },
+    {
+      "epoch": 0.25023169601482853,
+      "grad_norm": 10.594239787348334,
+      "learning_rate": 0.0002798227659988717,
+      "loss": 1.0309,
+      "step": 135
+    },
+    {
+      "epoch": 0.2594995366079703,
+      "grad_norm": 3.881883182518721,
+      "learning_rate": 0.00027732150118568017,
+      "loss": 1.5651,
+      "step": 140
+    },
+    {
+      "epoch": 0.26876737720111216,
+      "grad_norm": 3.036719624432077,
+      "learning_rate": 0.00027468669369970207,
+      "loss": 1.1445,
+      "step": 145
+    },
+    {
+      "epoch": 0.27803521779425394,
+      "grad_norm": 14.352629655607771,
+      "learning_rate": 0.00027192110709014697,
+      "loss": 0.9305,
+      "step": 150
+    },
+    {
+      "epoch": 0.2873030583873957,
+      "grad_norm": 2.933168446331413,
+      "learning_rate": 0.0002690276420754655,
+      "loss": 0.9324,
+      "step": 155
+    },
+    {
+      "epoch": 0.2965708989805375,
+      "grad_norm": 4.661928504358525,
+      "learning_rate": 0.00026600933350089654,
+      "loss": 0.9491,
+      "step": 160
+    },
+    {
+      "epoch": 0.30583873957367935,
+      "grad_norm": 5.782107803222577,
+      "learning_rate": 0.0002628693471553335,
+      "loss": 0.8689,
+      "step": 165
+    },
+    {
+      "epoch": 0.31510658016682114,
+      "grad_norm": 4.287118434199453,
+      "learning_rate": 0.00025961097645084885,
+      "loss": 1.112,
+      "step": 170
+    },
+    {
+      "epoch": 0.3243744207599629,
+      "grad_norm": 170.8246279898043,
+      "learning_rate": 0.0002562376389683599,
+      "loss": 2.2669,
+      "step": 175
+    },
+    {
+      "epoch": 0.3336422613531047,
+      "grad_norm": 243.40178580373365,
+      "learning_rate": 0.00025275287287305814,
+      "loss": 3.2917,
+      "step": 180
+    },
+    {
+      "epoch": 0.34291010194624655,
+      "grad_norm": 160.38147120987801,
+      "learning_rate": 0.00024916033320336264,
+      "loss": 3.067,
+      "step": 185
+    },
+    {
+      "epoch": 0.35217794253938833,
+      "grad_norm": 30.070071532273786,
+      "learning_rate": 0.0002454637880372892,
+      "loss": 3.1301,
+      "step": 190
+    },
+    {
+      "epoch": 0.3614457831325301,
+      "grad_norm": 48.47525499191394,
+      "learning_rate": 0.0002416671145402575,
+      "loss": 2.6178,
+      "step": 195
+    },
+    {
+      "epoch": 0.3707136237256719,
+      "grad_norm": 48.37664262960068,
+      "learning_rate": 0.00023777429489847934,
+      "loss": 1.441,
+      "step": 200
+    },
+    {
+      "epoch": 0.3799814643188137,
+      "grad_norm": 3.8665655676681245,
+      "learning_rate": 0.0002337894121421954,
+      "loss": 1.2978,
+      "step": 205
+    },
+    {
+      "epoch": 0.38924930491195553,
+      "grad_norm": 2.9273559971935055,
+      "learning_rate": 0.00022971664586314054,
+      "loss": 1.0238,
+      "step": 210
+    },
+    {
+      "epoch": 0.3985171455050973,
+      "grad_norm": 20.165413444524287,
+      "learning_rate": 0.00022556026783072895,
+      "loss": 0.8787,
+      "step": 215
+    },
+    {
+      "epoch": 0.4077849860982391,
+      "grad_norm": 2.3870488681045203,
+      "learning_rate": 0.00022132463751155812,
+      "loss": 0.9467,
+      "step": 220
+    },
+    {
+      "epoch": 0.4170528266913809,
+      "grad_norm": 4.038604860534557,
+      "learning_rate": 0.00021701419749693034,
+      "loss": 0.9708,
+      "step": 225
+    },
+    {
+      "epoch": 0.4263206672845227,
+      "grad_norm": 2.7240106071190535,
+      "learning_rate": 0.00021263346884318777,
+      "loss": 1.0564,
+      "step": 230
+    },
+    {
+      "epoch": 0.4355885078776645,
+      "grad_norm": 1.6377519083501402,
+      "learning_rate": 0.00020818704632974896,
+      "loss": 0.7724,
+      "step": 235
+    },
+    {
+      "epoch": 0.4448563484708063,
+      "grad_norm": 1.0433662853439323,
+      "learning_rate": 0.00020367959363981936,
+      "loss": 0.8052,
+      "step": 240
+    },
+    {
+      "epoch": 0.4541241890639481,
+      "grad_norm": 1.5351301360333338,
+      "learning_rate": 0.00019911583846883197,
+      "loss": 0.8407,
+      "step": 245
+    },
+    {
+      "epoch": 0.4633920296570899,
+      "grad_norm": 1.2741936551633426,
+      "learning_rate": 0.0001945005675657475,
+      "loss": 0.8344,
+      "step": 250
+    },
+    {
+      "epoch": 0.4726598702502317,
+      "grad_norm": 2.4652187839401467,
+      "learning_rate": 0.00018983862171241577,
+      "loss": 0.9688,
+      "step": 255
+    },
+    {
+      "epoch": 0.4819277108433735,
+      "grad_norm": 1.1552631433359035,
+      "learning_rate": 0.00018513489064626398,
+      "loss": 0.8647,
+      "step": 260
+    },
+    {
+      "epoch": 0.4911955514365153,
+      "grad_norm": 1.113328634852908,
+      "learning_rate": 0.00018039430793163753,
+      "loss": 0.8514,
+      "step": 265
+    },
+    {
+      "epoch": 0.5004633920296571,
+      "grad_norm": 0.995245914359209,
+      "learning_rate": 0.00017562184578517203,
+      "loss": 0.8845,
+      "step": 270
+    },
+    {
+      "epoch": 0.5097312326227988,
+      "grad_norm": 1.5488835001137233,
+      "learning_rate": 0.00017082250986062502,
+      "loss": 0.7809,
+      "step": 275
+    },
+    {
+      "epoch": 0.5189990732159406,
+      "grad_norm": 1.5756484423065427,
+      "learning_rate": 0.00016600133399863594,
+      "loss": 0.8326,
+      "step": 280
+    },
+    {
+      "epoch": 0.5282669138090825,
+      "grad_norm": 206.03580929049917,
+      "learning_rate": 0.0001611633749469231,
+      "loss": 2.2034,
+      "step": 285
+    },
+    {
+      "epoch": 0.5375347544022243,
+      "grad_norm": 4.169210184616496,
+      "learning_rate": 0.0001563137070564528,
+      "loss": 1.9396,
+      "step": 290
+    },
+    {
+      "epoch": 0.5468025949953661,
+      "grad_norm": 7.018651154352153,
+      "learning_rate": 0.0001514574169591466,
+      "loss": 1.4702,
+      "step": 295
+    },
+    {
+      "epoch": 0.5560704355885079,
+      "grad_norm": 1.7304196471580864,
+      "learning_rate": 0.0001465995982327065,
+      "loss": 0.9425,
+      "step": 300
+    },
+    {
+      "epoch": 0.5653382761816497,
+      "grad_norm": 2.241775914452003,
+      "learning_rate": 0.00014174534605815525,
+      "loss": 0.7961,
+      "step": 305
+    },
+    {
+      "epoch": 0.5746061167747915,
+      "grad_norm": 3.456242970621782,
+      "learning_rate": 0.0001368997518756954,
+      "loss": 0.834,
+      "step": 310
+    },
+    {
+      "epoch": 0.5838739573679332,
+      "grad_norm": 5.072902047355265,
+      "learning_rate": 0.00013206789804449116,
+      "loss": 0.99,
+      "step": 315
+    },
+    {
+      "epoch": 0.593141797961075,
+      "grad_norm": 6.231760889035276,
+      "learning_rate": 0.0001272548525119758,
+      "loss": 0.8985,
+      "step": 320
+    },
+    {
+      "epoch": 0.6024096385542169,
+      "grad_norm": 0.8710278075890822,
+      "learning_rate": 0.0001224656634982746,
+      "loss": 0.7725,
+      "step": 325
+    },
+    {
+      "epoch": 0.6116774791473587,
+      "grad_norm": 1.799368795567863,
+      "learning_rate": 0.00011770535420131876,
+      "loss": 0.7621,
+      "step": 330
+    },
+    {
+      "epoch": 0.6209453197405005,
+      "grad_norm": 0.9742992413928552,
+      "learning_rate": 0.00011297891752820484,
+      "loss": 0.8327,
+      "step": 335
+    },
+    {
+      "epoch": 0.6302131603336423,
+      "grad_norm": 0.85009149768552,
+      "learning_rate": 0.0001082913108583245,
+      "loss": 0.6863,
+      "step": 340
+    },
+    {
+      "epoch": 0.6394810009267841,
+      "grad_norm": 0.8181844995131707,
+      "learning_rate": 0.0001036474508437579,
+      "loss": 0.6779,
+      "step": 345
+    },
+    {
+      "epoch": 0.6487488415199258,
+      "grad_norm": 0.9546399988989202,
+      "learning_rate": 9.905220825238491e-05,
+      "loss": 0.6872,
+      "step": 350
+    },
+    {
+      "epoch": 0.6580166821130676,
+      "grad_norm": 0.6995327936772554,
+      "learning_rate": 9.45104028591222e-05,
+      "loss": 0.7025,
+      "step": 355
+    },
+    {
+      "epoch": 0.6672845227062094,
+      "grad_norm": 0.7195227820211112,
+      "learning_rate": 9.002679839064463e-05,
+      "loss": 0.6807,
+      "step": 360
+    },
+    {
+      "epoch": 0.6765523632993512,
+      "grad_norm": 0.6283846451216366,
+      "learning_rate": 8.560609752889412e-05,
+      "loss": 0.6888,
+      "step": 365
+    },
+    {
+      "epoch": 0.6858202038924931,
+      "grad_norm": 0.7394400936445648,
+      "learning_rate": 8.125293697861548e-05,
+      "loss": 0.6542,
+      "step": 370
+    },
+    {
+      "epoch": 0.6950880444856349,
+      "grad_norm": 0.6824620929581083,
+      "learning_rate": 7.697188260409356e-05,
+      "loss": 0.671,
+      "step": 375
+    },
+    {
+      "epoch": 0.7043558850787767,
+      "grad_norm": 0.6116075298871171,
+      "learning_rate": 7.276742464019198e-05,
+      "loss": 0.6729,
+      "step": 380
+    },
+    {
+      "epoch": 0.7136237256719185,
+      "grad_norm": 0.6453928151826652,
+      "learning_rate": 6.864397298271699e-05,
+      "loss": 0.6626,
+      "step": 385
+    },
+    {
+      "epoch": 0.7228915662650602,
+      "grad_norm": 0.6716078881868212,
+      "learning_rate": 6.460585256304559e-05,
+      "loss": 0.6851,
+      "step": 390
+    },
+    {
+      "epoch": 0.732159406858202,
+      "grad_norm": 0.6801638199214254,
+      "learning_rate": 6.065729881186982e-05,
+      "loss": 0.6168,
+      "step": 395
+    },
+    {
+      "epoch": 0.7414272474513438,
+      "grad_norm": 0.4794662188577268,
+      "learning_rate": 5.680245321681471e-05,
+      "loss": 0.6531,
+      "step": 400
+    },
+    {
+      "epoch": 0.7506950880444856,
+      "grad_norm": 1.029961404045439,
+      "learning_rate": 5.304535897858999e-05,
+      "loss": 0.6295,
+      "step": 405
+    },
+    {
+      "epoch": 0.7599629286376274,
+      "grad_norm": 0.5284338458030478,
+      "learning_rate": 4.938995677023054e-05,
+      "loss": 0.6201,
+      "step": 410
+    },
+    {
+      "epoch": 0.7692307692307693,
+      "grad_norm": 0.4516339145236783,
+      "learning_rate": 4.584008060387455e-05,
+      "loss": 0.6083,
+      "step": 415
+    },
+    {
+      "epoch": 0.7784986098239111,
+      "grad_norm": 0.5727589086366411,
+      "learning_rate": 4.239945380941461e-05,
+      "loss": 0.6021,
+      "step": 420
+    },
+    {
+      "epoch": 0.7877664504170528,
+      "grad_norm": 0.4804742186453503,
+      "learning_rate": 3.907168512923842e-05,
+      "loss": 0.5933,
+      "step": 425
+    },
+    {
+      "epoch": 0.7970342910101946,
+      "grad_norm": 0.45776606255626884,
+      "learning_rate": 3.5860264933156324e-05,
+      "loss": 0.5774,
+      "step": 430
+    },
+    {
+      "epoch": 0.8063021316033364,
+      "grad_norm": 1.2302657626291176,
+      "learning_rate": 3.276856155748584e-05,
+      "loss": 0.5908,
+      "step": 435
+    },
+    {
+      "epoch": 0.8155699721964782,
+      "grad_norm": 0.9348743887035583,
+      "learning_rate": 2.9799817772131516e-05,
+      "loss": 0.592,
+      "step": 440
+    },
+    {
+      "epoch": 0.82483781278962,
+      "grad_norm": 0.4027186043623421,
+      "learning_rate": 2.6957147379367217e-05,
+      "loss": 0.5798,
+      "step": 445
+    },
+    {
+      "epoch": 0.8341056533827618,
+      "grad_norm": 0.5240634600695104,
+      "learning_rate": 2.4243531947887802e-05,
+      "loss": 0.5805,
+      "step": 450
+    },
+    {
+      "epoch": 0.8433734939759037,
+      "grad_norm": 0.5416112282286268,
+      "learning_rate": 2.1661817685554833e-05,
+      "loss": 0.6067,
+      "step": 455
+    },
+    {
+      "epoch": 0.8526413345690455,
+      "grad_norm": 0.47266676591358336,
+      "learning_rate": 1.921471245411794e-05,
+      "loss": 0.5962,
+      "step": 460
+    },
+    {
+      "epoch": 0.8619091751621872,
+      "grad_norm": 0.6382596120183166,
+      "learning_rate": 1.6904782929041693e-05,
+      "loss": 0.5791,
+      "step": 465
+    },
+    {
+      "epoch": 0.871177015755329,
+      "grad_norm": 0.49306751144827193,
+      "learning_rate": 1.4734451907417255e-05,
+      "loss": 0.6182,
+      "step": 470
+    },
+    {
+      "epoch": 0.8804448563484708,
+      "grad_norm": 0.49635162924553927,
+      "learning_rate": 1.2705995766783079e-05,
+      "loss": 0.5521,
+      "step": 475
+    },
+    {
+      "epoch": 0.8897126969416126,
+      "grad_norm": 0.49868563464480065,
+      "learning_rate": 1.0821542077519169e-05,
+      "loss": 0.5579,
+      "step": 480
+    },
+    {
+      "epoch": 0.8989805375347544,
+      "grad_norm": 0.384294096915848,
+      "learning_rate": 9.083067371319324e-06,
+      "loss": 0.5532,
+      "step": 485
+    },
+    {
+      "epoch": 0.9082483781278962,
+      "grad_norm": 0.6584770004045724,
+      "learning_rate": 7.492395068082619e-06,
+      "loss": 0.544,
+      "step": 490
+    },
+    {
+      "epoch": 0.917516218721038,
+      "grad_norm": 0.5198655418147264,
+      "learning_rate": 6.051193563397599e-06,
+      "loss": 0.555,
+      "step": 495
+    },
+    {
+      "epoch": 0.9267840593141798,
+      "grad_norm": 0.46652880015863485,
+      "learning_rate": 4.760974478625634e-06,
+      "loss": 0.5437,
+      "step": 500
+    },
+    {
+      "epoch": 0.9360518999073216,
+      "grad_norm": 1.0569529661255628,
+      "learning_rate": 3.623091075418977e-06,
+      "loss": 0.5573,
+      "step": 505
+    },
+    {
+      "epoch": 0.9453197405004634,
+      "grad_norm": 0.4373874071284105,
+      "learning_rate": 2.638736836336158e-06,
+      "loss": 0.5312,
+      "step": 510
+    },
+    {
+      "epoch": 0.9545875810936052,
+      "grad_norm": 0.46826046078794514,
+      "learning_rate": 1.8089442130434061e-06,
+      "loss": 0.5648,
+      "step": 515
+    },
+    {
+      "epoch": 0.963855421686747,
+      "grad_norm": 0.40638149868456475,
+      "learning_rate": 1.1345835434156736e-06,
+      "loss": 0.5417,
+      "step": 520
+    },
+    {
+      "epoch": 0.9731232622798888,
+      "grad_norm": 0.4274920327067933,
+      "learning_rate": 6.163621386722218e-07,
+      "loss": 0.5528,
+      "step": 525
+    },
+    {
+      "epoch": 0.9823911028730306,
+      "grad_norm": 0.41954954402917344,
+      "learning_rate": 2.5482354150493935e-07,
+      "loss": 0.549,
+      "step": 530
+    },
+    {
+      "epoch": 0.9916589434661723,
+      "grad_norm": 0.46407841167410513,
+      "learning_rate": 5.0346955976976467e-08,
+      "loss": 0.5499,
+      "step": 535
+    },
+    {
+      "epoch": 0.9990732159406858,
+      "eval_loss": 2.2535195350646973,
+      "eval_runtime": 2.3617,
+      "eval_samples_per_second": 1.694,
+      "eval_steps_per_second": 0.423,
+      "step": 539
+    },
+    {
+      "epoch": 0.9990732159406858,
+      "step": 539,
+      "total_flos": 28187736145920.0,
+      "train_loss": 1.0674916249259283,
+      "train_runtime": 10422.8572,
+      "train_samples_per_second": 1.655,
+      "train_steps_per_second": 0.052
+    }
+  ],
+  "logging_steps": 5,
+  "max_steps": 539,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 100,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 28187736145920.0,
+  "train_batch_size": 2,
+  "trial_name": null,
+  "trial_params": null
+}