{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.7346053772766696,
  "eval_steps": 200,
  "global_step": 10000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.03469210754553339,
      "eval_loss": 1.8023786544799805,
      "eval_runtime": 572.3832,
      "eval_samples_per_second": 10.074,
      "eval_steps_per_second": 1.26,
      "step": 200
    },
    {
      "epoch": 0.06938421509106678,
      "eval_loss": 1.7305923700332642,
      "eval_runtime": 572.203,
      "eval_samples_per_second": 10.077,
      "eval_steps_per_second": 1.26,
      "step": 400
    },
    {
      "epoch": 0.08673026886383348,
      "grad_norm": 6.57196569442749,
      "learning_rate": 2.4566348655680836e-05,
      "loss": 1.8068,
      "step": 500
    },
    {
      "epoch": 0.10407632263660017,
      "eval_loss": 1.7001726627349854,
      "eval_runtime": 572.0907,
      "eval_samples_per_second": 10.079,
      "eval_steps_per_second": 1.26,
      "step": 600
    },
    {
      "epoch": 0.13876843018213356,
      "eval_loss": 1.6665369272232056,
      "eval_runtime": 572.5138,
      "eval_samples_per_second": 10.071,
      "eval_steps_per_second": 1.259,
      "step": 800
    },
    {
      "epoch": 0.17346053772766695,
      "grad_norm": 6.407934665679932,
      "learning_rate": 2.4132697311361666e-05,
      "loss": 1.6773,
      "step": 1000
    },
    {
      "epoch": 0.17346053772766695,
      "eval_loss": 1.645666480064392,
      "eval_runtime": 572.4669,
      "eval_samples_per_second": 10.072,
      "eval_steps_per_second": 1.259,
      "step": 1000
    },
    {
      "epoch": 0.20815264527320035,
      "eval_loss": 1.6295970678329468,
      "eval_runtime": 572.6705,
      "eval_samples_per_second": 10.069,
      "eval_steps_per_second": 1.259,
      "step": 1200
    },
    {
      "epoch": 0.24284475281873374,
      "eval_loss": 1.6119849681854248,
      "eval_runtime": 572.6602,
      "eval_samples_per_second": 10.069,
      "eval_steps_per_second": 1.259,
      "step": 1400
    },
    {
      "epoch": 0.26019080659150046,
      "grad_norm": 6.23416805267334,
      "learning_rate": 2.36990459670425e-05,
      "loss": 1.6291,
      "step": 1500
    },
    {
      "epoch": 0.2775368603642671,
      "eval_loss": 1.5977734327316284,
      "eval_runtime": 572.802,
      "eval_samples_per_second": 10.066,
      "eval_steps_per_second": 1.259,
      "step": 1600
    },
    {
      "epoch": 0.31222896790980054,
      "eval_loss": 1.5906885862350464,
      "eval_runtime": 572.7238,
      "eval_samples_per_second": 10.068,
      "eval_steps_per_second": 1.259,
      "step": 1800
    },
    {
      "epoch": 0.3469210754553339,
      "grad_norm": 5.846036434173584,
      "learning_rate": 2.326539462272333e-05,
      "loss": 1.6032,
      "step": 2000
    },
    {
      "epoch": 0.3469210754553339,
      "eval_loss": 1.5792902708053589,
      "eval_runtime": 572.6421,
      "eval_samples_per_second": 10.069,
      "eval_steps_per_second": 1.259,
      "step": 2000
    },
    {
      "epoch": 0.38161318300086733,
      "eval_loss": 1.5674443244934082,
      "eval_runtime": 572.94,
      "eval_samples_per_second": 10.064,
      "eval_steps_per_second": 1.258,
      "step": 2200
    },
    {
      "epoch": 0.4163052905464007,
      "eval_loss": 1.5650794506072998,
      "eval_runtime": 573.1561,
      "eval_samples_per_second": 10.06,
      "eval_steps_per_second": 1.258,
      "step": 2400
    },
    {
      "epoch": 0.4336513443191674,
      "grad_norm": 6.9578962326049805,
      "learning_rate": 2.2831743278404163e-05,
      "loss": 1.5699,
      "step": 2500
    },
    {
      "epoch": 0.45099739809193407,
      "eval_loss": 1.5550028085708618,
      "eval_runtime": 572.973,
      "eval_samples_per_second": 10.063,
      "eval_steps_per_second": 1.258,
      "step": 2600
    },
    {
      "epoch": 0.4856895056374675,
      "eval_loss": 1.539338231086731,
      "eval_runtime": 573.1731,
      "eval_samples_per_second": 10.06,
      "eval_steps_per_second": 1.258,
      "step": 2800
    },
    {
      "epoch": 0.5203816131830009,
      "grad_norm": 6.062795639038086,
      "learning_rate": 2.2398091934084997e-05,
      "loss": 1.5555,
      "step": 3000
    },
    {
      "epoch": 0.5203816131830009,
      "eval_loss": 1.533992886543274,
      "eval_runtime": 573.3108,
      "eval_samples_per_second": 10.057,
      "eval_steps_per_second": 1.258,
      "step": 3000
    },
    {
      "epoch": 0.5550737207285342,
      "eval_loss": 1.5279603004455566,
      "eval_runtime": 573.3234,
      "eval_samples_per_second": 10.057,
      "eval_steps_per_second": 1.258,
      "step": 3200
    },
    {
      "epoch": 0.5897658282740676,
      "eval_loss": 1.5221937894821167,
      "eval_runtime": 573.2462,
      "eval_samples_per_second": 10.059,
      "eval_steps_per_second": 1.258,
      "step": 3400
    },
    {
      "epoch": 0.6071118820468343,
      "grad_norm": 5.474059581756592,
      "learning_rate": 2.196444058976583e-05,
      "loss": 1.5258,
      "step": 3500
    },
    {
      "epoch": 0.6244579358196011,
      "eval_loss": 1.5145606994628906,
      "eval_runtime": 573.1527,
      "eval_samples_per_second": 10.06,
      "eval_steps_per_second": 1.258,
      "step": 3600
    },
    {
      "epoch": 0.6591500433651344,
      "eval_loss": 1.5087436437606812,
      "eval_runtime": 573.3236,
      "eval_samples_per_second": 10.057,
      "eval_steps_per_second": 1.258,
      "step": 3800
    },
    {
      "epoch": 0.6938421509106678,
      "grad_norm": 4.400829315185547,
      "learning_rate": 2.1530789245446662e-05,
      "loss": 1.5145,
      "step": 4000
    },
    {
      "epoch": 0.6938421509106678,
      "eval_loss": 1.501986026763916,
      "eval_runtime": 572.9788,
      "eval_samples_per_second": 10.063,
      "eval_steps_per_second": 1.258,
      "step": 4000
    },
    {
      "epoch": 0.7285342584562012,
      "eval_loss": 1.4961259365081787,
      "eval_runtime": 572.9318,
      "eval_samples_per_second": 10.064,
      "eval_steps_per_second": 1.258,
      "step": 4200
    },
    {
      "epoch": 0.7632263660017347,
      "eval_loss": 1.4921443462371826,
      "eval_runtime": 573.2197,
      "eval_samples_per_second": 10.059,
      "eval_steps_per_second": 1.258,
      "step": 4400
    },
    {
      "epoch": 0.7805724197745013,
      "grad_norm": 5.124959945678711,
      "learning_rate": 2.1097137901127496e-05,
      "loss": 1.4981,
      "step": 4500
    },
    {
      "epoch": 0.797918473547268,
      "eval_loss": 1.48764967918396,
      "eval_runtime": 573.3463,
      "eval_samples_per_second": 10.057,
      "eval_steps_per_second": 1.258,
      "step": 4600
    },
    {
      "epoch": 0.8326105810928014,
      "eval_loss": 1.4827669858932495,
      "eval_runtime": 573.3276,
      "eval_samples_per_second": 10.057,
      "eval_steps_per_second": 1.258,
      "step": 4800
    },
    {
      "epoch": 0.8673026886383348,
      "grad_norm": 5.631836414337158,
      "learning_rate": 2.0663486556808327e-05,
      "loss": 1.4758,
      "step": 5000
    },
    {
      "epoch": 0.8673026886383348,
      "eval_loss": 1.4766356945037842,
      "eval_runtime": 573.3049,
      "eval_samples_per_second": 10.057,
      "eval_steps_per_second": 1.258,
      "step": 5000
    },
    {
      "epoch": 0.9019947961838681,
      "eval_loss": 1.4708250761032104,
      "eval_runtime": 573.3902,
      "eval_samples_per_second": 10.056,
      "eval_steps_per_second": 1.257,
      "step": 5200
    },
    {
      "epoch": 0.9366869037294016,
      "eval_loss": 1.4667783975601196,
      "eval_runtime": 573.338,
      "eval_samples_per_second": 10.057,
      "eval_steps_per_second": 1.258,
      "step": 5400
    },
    {
      "epoch": 0.9540329575021682,
      "grad_norm": 4.832674980163574,
      "learning_rate": 2.0229835212489158e-05,
      "loss": 1.4818,
      "step": 5500
    },
    {
      "epoch": 0.971379011274935,
      "eval_loss": 1.4649358987808228,
      "eval_runtime": 573.5907,
      "eval_samples_per_second": 10.052,
      "eval_steps_per_second": 1.257,
      "step": 5600
    },
    {
      "epoch": 1.0060711188204683,
      "eval_loss": 1.4911904335021973,
      "eval_runtime": 573.8034,
      "eval_samples_per_second": 10.049,
      "eval_steps_per_second": 1.257,
      "step": 5800
    },
    {
      "epoch": 1.0407632263660018,
      "grad_norm": 6.181447982788086,
      "learning_rate": 1.9796183868169993e-05,
      "loss": 1.3108,
      "step": 6000
    },
    {
      "epoch": 1.0407632263660018,
      "eval_loss": 1.5114498138427734,
      "eval_runtime": 573.9439,
      "eval_samples_per_second": 10.046,
      "eval_steps_per_second": 1.256,
      "step": 6000
    },
    {
      "epoch": 1.0754553339115351,
      "eval_loss": 1.5078836679458618,
      "eval_runtime": 573.7341,
      "eval_samples_per_second": 10.05,
      "eval_steps_per_second": 1.257,
      "step": 6200
    },
    {
      "epoch": 1.1101474414570685,
      "eval_loss": 1.512686848640442,
      "eval_runtime": 573.5532,
      "eval_samples_per_second": 10.053,
      "eval_steps_per_second": 1.257,
      "step": 6400
    },
    {
      "epoch": 1.1274934952298352,
      "grad_norm": 6.276436805725098,
      "learning_rate": 1.9362532523850823e-05,
      "loss": 1.1338,
      "step": 6500
    },
    {
      "epoch": 1.144839549002602,
      "eval_loss": 1.5086950063705444,
      "eval_runtime": 573.502,
      "eval_samples_per_second": 10.054,
      "eval_steps_per_second": 1.257,
      "step": 6600
    },
    {
      "epoch": 1.1795316565481353,
      "eval_loss": 1.5138036012649536,
      "eval_runtime": 573.4778,
      "eval_samples_per_second": 10.054,
      "eval_steps_per_second": 1.257,
      "step": 6800
    },
    {
      "epoch": 1.2142237640936686,
      "grad_norm": 5.294378280639648,
      "learning_rate": 1.8928881179531658e-05,
      "loss": 1.1411,
      "step": 7000
    },
    {
      "epoch": 1.2142237640936686,
      "eval_loss": 1.5119119882583618,
      "eval_runtime": 573.2773,
      "eval_samples_per_second": 10.058,
      "eval_steps_per_second": 1.258,
      "step": 7000
    },
    {
      "epoch": 1.2489158716392021,
      "eval_loss": 1.5059071779251099,
      "eval_runtime": 573.2436,
      "eval_samples_per_second": 10.059,
      "eval_steps_per_second": 1.258,
      "step": 7200
    },
    {
      "epoch": 1.2836079791847355,
      "eval_loss": 1.4931423664093018,
      "eval_runtime": 573.2431,
      "eval_samples_per_second": 10.059,
      "eval_steps_per_second": 1.258,
      "step": 7400
    },
    {
      "epoch": 1.3009540329575022,
      "grad_norm": 5.875624179840088,
      "learning_rate": 1.8495229835212492e-05,
      "loss": 1.1482,
      "step": 7500
    },
    {
      "epoch": 1.318300086730269,
      "eval_loss": 1.4929821491241455,
      "eval_runtime": 572.8059,
      "eval_samples_per_second": 10.066,
      "eval_steps_per_second": 1.259,
      "step": 7600
    },
    {
      "epoch": 1.3529921942758023,
      "eval_loss": 1.490503191947937,
      "eval_runtime": 572.7436,
      "eval_samples_per_second": 10.067,
      "eval_steps_per_second": 1.259,
      "step": 7800
    },
    {
      "epoch": 1.3876843018213356,
      "grad_norm": 5.962628364562988,
      "learning_rate": 1.8061578490893323e-05,
      "loss": 1.1534,
      "step": 8000
    },
    {
      "epoch": 1.3876843018213356,
      "eval_loss": 1.4796279668807983,
      "eval_runtime": 572.5741,
      "eval_samples_per_second": 10.07,
      "eval_steps_per_second": 1.259,
      "step": 8000
    },
    {
      "epoch": 1.4223764093668692,
      "eval_loss": 1.4942739009857178,
      "eval_runtime": 572.7895,
      "eval_samples_per_second": 10.067,
      "eval_steps_per_second": 1.259,
      "step": 8200
    },
    {
      "epoch": 1.4570685169124025,
      "eval_loss": 1.478100299835205,
      "eval_runtime": 574.02,
      "eval_samples_per_second": 10.045,
      "eval_steps_per_second": 1.256,
      "step": 8400
    },
    {
      "epoch": 1.4744145706851692,
      "grad_norm": 5.818081855773926,
      "learning_rate": 1.7627927146574154e-05,
      "loss": 1.1493,
      "step": 8500
    },
    {
      "epoch": 1.4917606244579358,
      "eval_loss": 1.4706262350082397,
      "eval_runtime": 573.645,
      "eval_samples_per_second": 10.052,
      "eval_steps_per_second": 1.257,
      "step": 8600
    },
    {
      "epoch": 1.5264527320034693,
      "eval_loss": 1.4702831506729126,
      "eval_runtime": 573.6402,
      "eval_samples_per_second": 10.052,
      "eval_steps_per_second": 1.257,
      "step": 8800
    },
    {
      "epoch": 1.5611448395490026,
      "grad_norm": 6.020638465881348,
      "learning_rate": 1.7194275802254988e-05,
      "loss": 1.1517,
      "step": 9000
    },
    {
      "epoch": 1.5611448395490026,
      "eval_loss": 1.4639151096343994,
      "eval_runtime": 573.5071,
      "eval_samples_per_second": 10.054,
      "eval_steps_per_second": 1.257,
      "step": 9000
    },
    {
      "epoch": 1.595836947094536,
      "eval_loss": 1.4722236394882202,
      "eval_runtime": 573.4545,
      "eval_samples_per_second": 10.055,
      "eval_steps_per_second": 1.257,
      "step": 9200
    },
    {
      "epoch": 1.6305290546400695,
      "eval_loss": 1.4613826274871826,
      "eval_runtime": 573.2765,
      "eval_samples_per_second": 10.058,
      "eval_steps_per_second": 1.258,
      "step": 9400
    },
    {
      "epoch": 1.647875108412836,
      "grad_norm": 5.535754680633545,
      "learning_rate": 1.676062445793582e-05,
      "loss": 1.1428,
      "step": 9500
    },
    {
      "epoch": 1.6652211621856028,
      "eval_loss": 1.4539824724197388,
      "eval_runtime": 573.1598,
      "eval_samples_per_second": 10.06,
      "eval_steps_per_second": 1.258,
      "step": 9600
    },
    {
      "epoch": 1.699913269731136,
      "eval_loss": 1.457112431526184,
      "eval_runtime": 573.1778,
      "eval_samples_per_second": 10.06,
      "eval_steps_per_second": 1.258,
      "step": 9800
    },
    {
      "epoch": 1.7346053772766696,
      "grad_norm": 6.019700527191162,
      "learning_rate": 1.6326973113616653e-05,
      "loss": 1.1466,
      "step": 10000
    },
    {
      "epoch": 1.7346053772766696,
      "eval_loss": 1.4443352222442627,
      "eval_runtime": 573.1133,
      "eval_samples_per_second": 10.061,
      "eval_steps_per_second": 1.258,
      "step": 10000
    }
  ],
  "logging_steps": 500,
  "max_steps": 28825,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 5000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1.74751582519296e+18,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}