{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.857142857142857,
  "eval_steps": 500,
  "global_step": 96,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.08163265306122448,
      "grad_norm": 0.9611280560493469,
      "learning_rate": 0.00019583333333333334,
      "loss": 9.6138,
      "step": 2
    },
    {
      "epoch": 0.16326530612244897,
      "grad_norm": 1.6276118755340576,
      "learning_rate": 0.00019166666666666667,
      "loss": 9.1817,
      "step": 4
    },
    {
      "epoch": 0.24489795918367346,
      "grad_norm": 1.9113811254501343,
      "learning_rate": 0.0001875,
      "loss": 8.6034,
      "step": 6
    },
    {
      "epoch": 0.32653061224489793,
      "grad_norm": 1.9783806800842285,
      "learning_rate": 0.00018333333333333334,
      "loss": 8.2011,
      "step": 8
    },
    {
      "epoch": 0.40816326530612246,
      "grad_norm": 1.256810188293457,
      "learning_rate": 0.0001791666666666667,
      "loss": 7.825,
      "step": 10
    },
    {
      "epoch": 0.4897959183673469,
      "grad_norm": 1.0618529319763184,
      "learning_rate": 0.000175,
      "loss": 7.5991,
      "step": 12
    },
    {
      "epoch": 0.5714285714285714,
      "grad_norm": 1.0928442478179932,
      "learning_rate": 0.00017083333333333333,
      "loss": 7.5382,
      "step": 14
    },
    {
      "epoch": 0.6530612244897959,
      "grad_norm": 1.0726333856582642,
      "learning_rate": 0.0001666666666666667,
      "loss": 7.5217,
      "step": 16
    },
    {
      "epoch": 0.7346938775510204,
      "grad_norm": 0.9622203707695007,
      "learning_rate": 0.00016250000000000002,
      "loss": 7.4223,
      "step": 18
    },
    {
      "epoch": 0.8163265306122449,
      "grad_norm": 0.9835846424102783,
      "learning_rate": 0.00015833333333333332,
      "loss": 7.3894,
      "step": 20
    },
    {
      "epoch": 0.8979591836734694,
      "grad_norm": 0.7535036206245422,
      "learning_rate": 0.00015416666666666668,
      "loss": 7.3294,
      "step": 22
    },
    {
      "epoch": 0.9795918367346939,
      "grad_norm": 0.5977053046226501,
      "learning_rate": 0.00015000000000000001,
      "loss": 7.3581,
      "step": 24
    },
    {
      "epoch": 1.0408163265306123,
      "grad_norm": 1.4706302881240845,
      "learning_rate": 0.00014583333333333335,
      "loss": 5.4914,
      "step": 26
    },
    {
      "epoch": 1.1224489795918366,
      "grad_norm": 0.9165309071540833,
      "learning_rate": 0.00014166666666666668,
      "loss": 7.2332,
      "step": 28
    },
    {
      "epoch": 1.2040816326530612,
      "grad_norm": 1.0459541082382202,
      "learning_rate": 0.0001375,
      "loss": 7.2485,
      "step": 30
    },
    {
      "epoch": 1.2857142857142856,
      "grad_norm": 0.7298290133476257,
      "learning_rate": 0.00013333333333333334,
      "loss": 7.2555,
      "step": 32
    },
    {
      "epoch": 1.3673469387755102,
      "grad_norm": 0.7277728319168091,
      "learning_rate": 0.00012916666666666667,
      "loss": 7.1953,
      "step": 34
    },
    {
      "epoch": 1.4489795918367347,
      "grad_norm": 0.6085571646690369,
      "learning_rate": 0.000125,
      "loss": 7.1782,
      "step": 36
    },
    {
      "epoch": 1.5306122448979593,
      "grad_norm": 0.621771514415741,
      "learning_rate": 0.00012083333333333333,
      "loss": 7.1948,
      "step": 38
    },
    {
      "epoch": 1.6122448979591837,
      "grad_norm": 0.8418042659759521,
      "learning_rate": 0.00011666666666666668,
      "loss": 7.1403,
      "step": 40
    },
    {
      "epoch": 1.693877551020408,
      "grad_norm": 0.4234924018383026,
      "learning_rate": 0.00011250000000000001,
      "loss": 7.1515,
      "step": 42
    },
    {
      "epoch": 1.7755102040816326,
      "grad_norm": 0.6041470766067505,
      "learning_rate": 0.00010833333333333333,
      "loss": 7.0812,
      "step": 44
    },
    {
      "epoch": 1.8571428571428572,
      "grad_norm": 0.8113789558410645,
      "learning_rate": 0.00010416666666666667,
      "loss": 7.1907,
      "step": 46
    },
    {
      "epoch": 1.9387755102040818,
      "grad_norm": 0.8824118375778198,
      "learning_rate": 0.0001,
      "loss": 7.1823,
      "step": 48
    },
    {
      "epoch": 2.0,
      "grad_norm": 0.4565996527671814,
      "learning_rate": 9.583333333333334e-05,
      "loss": 5.3337,
      "step": 50
    },
    {
      "epoch": 2.0816326530612246,
      "grad_norm": 0.6504805088043213,
      "learning_rate": 9.166666666666667e-05,
      "loss": 7.1268,
      "step": 52
    },
    {
      "epoch": 2.163265306122449,
      "grad_norm": 0.39629825949668884,
      "learning_rate": 8.75e-05,
      "loss": 7.1279,
      "step": 54
    },
    {
      "epoch": 2.2448979591836733,
      "grad_norm": 0.48973432183265686,
      "learning_rate": 8.333333333333334e-05,
      "loss": 7.1235,
      "step": 56
    },
    {
      "epoch": 2.326530612244898,
      "grad_norm": 0.4701223373413086,
      "learning_rate": 7.916666666666666e-05,
      "loss": 7.0456,
      "step": 58
    },
    {
      "epoch": 2.4081632653061225,
      "grad_norm": 1.1409740447998047,
      "learning_rate": 7.500000000000001e-05,
      "loss": 7.1346,
      "step": 60
    },
    {
      "epoch": 2.489795918367347,
      "grad_norm": 0.48205825686454773,
      "learning_rate": 7.083333333333334e-05,
      "loss": 7.0549,
      "step": 62
    },
    {
      "epoch": 2.571428571428571,
      "grad_norm": 0.6595655083656311,
      "learning_rate": 6.666666666666667e-05,
      "loss": 7.1589,
      "step": 64
    },
    {
      "epoch": 2.6530612244897958,
      "grad_norm": 0.8976457715034485,
      "learning_rate": 6.25e-05,
      "loss": 7.0514,
      "step": 66
    },
    {
      "epoch": 2.7346938775510203,
      "grad_norm": 0.7210097312927246,
      "learning_rate": 5.833333333333334e-05,
      "loss": 7.1126,
      "step": 68
    },
    {
      "epoch": 2.816326530612245,
      "grad_norm": 0.49198052287101746,
      "learning_rate": 5.4166666666666664e-05,
      "loss": 7.1056,
      "step": 70
    },
    {
      "epoch": 2.8979591836734695,
      "grad_norm": 0.48084384202957153,
      "learning_rate": 5e-05,
      "loss": 6.9983,
      "step": 72
    },
    {
      "epoch": 2.979591836734694,
      "grad_norm": 0.5943397879600525,
      "learning_rate": 4.5833333333333334e-05,
      "loss": 7.1149,
      "step": 74
    },
    {
      "epoch": 3.0408163265306123,
      "grad_norm": 0.49961674213409424,
      "learning_rate": 4.166666666666667e-05,
      "loss": 5.322,
      "step": 76
    },
    {
      "epoch": 3.122448979591837,
      "grad_norm": 0.5228291153907776,
      "learning_rate": 3.7500000000000003e-05,
      "loss": 7.0475,
      "step": 78
    },
    {
      "epoch": 3.204081632653061,
      "grad_norm": 0.7281824350357056,
      "learning_rate": 3.3333333333333335e-05,
      "loss": 7.0873,
      "step": 80
    },
    {
      "epoch": 3.2857142857142856,
      "grad_norm": 0.49217817187309265,
      "learning_rate": 2.916666666666667e-05,
      "loss": 7.0653,
      "step": 82
    },
    {
      "epoch": 3.36734693877551,
      "grad_norm": 0.6683712005615234,
      "learning_rate": 2.5e-05,
      "loss": 7.0252,
      "step": 84
    },
    {
      "epoch": 3.4489795918367347,
      "grad_norm": 0.7695971131324768,
      "learning_rate": 2.0833333333333336e-05,
      "loss": 7.0718,
      "step": 86
    },
    {
      "epoch": 3.5306122448979593,
      "grad_norm": 0.6157537698745728,
      "learning_rate": 1.6666666666666667e-05,
      "loss": 7.0925,
      "step": 88
    },
    {
      "epoch": 3.612244897959184,
      "grad_norm": 0.5091469287872314,
      "learning_rate": 1.25e-05,
      "loss": 6.9613,
      "step": 90
    },
    {
      "epoch": 3.693877551020408,
      "grad_norm": 0.5973119735717773,
      "learning_rate": 8.333333333333334e-06,
      "loss": 7.01,
      "step": 92
    },
    {
      "epoch": 3.7755102040816326,
      "grad_norm": 0.44761213660240173,
      "learning_rate": 4.166666666666667e-06,
      "loss": 7.1092,
      "step": 94
    },
    {
      "epoch": 3.857142857142857,
      "grad_norm": 0.6290495991706848,
      "learning_rate": 0.0,
      "loss": 7.0562,
      "step": 96
    },
    {
      "epoch": 3.857142857142857,
      "step": 96,
      "total_flos": 97587854499492.0,
      "train_loss": 7.217979848384857,
      "train_runtime": 2726.5862,
      "train_samples_per_second": 0.574,
      "train_steps_per_second": 0.035
    }
  ],
  "logging_steps": 2,
  "max_steps": 96,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 4,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 97587854499492.0,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}