{ "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 500, "global_step": 380, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02631578947368421, "grad_norm": 170.0, "learning_rate": 5.263157894736842e-06, "loss": 33.7533, "step": 1 }, { "epoch": 0.13157894736842105, "grad_norm": 137.0, "learning_rate": 2.6315789473684212e-05, "loss": 34.5932, "step": 5 }, { "epoch": 0.2631578947368421, "grad_norm": 67.5, "learning_rate": 5.2631578947368424e-05, "loss": 30.1164, "step": 10 }, { "epoch": 0.39473684210526316, "grad_norm": 17.25, "learning_rate": 7.894736842105263e-05, "loss": 21.7002, "step": 15 }, { "epoch": 0.5263157894736842, "grad_norm": 14.3125, "learning_rate": 0.00010526315789473685, "loss": 17.9593, "step": 20 }, { "epoch": 0.6578947368421053, "grad_norm": 5.375, "learning_rate": 0.00013157894736842108, "loss": 15.8965, "step": 25 }, { "epoch": 0.7894736842105263, "grad_norm": 3.203125, "learning_rate": 0.00015789473684210527, "loss": 14.3005, "step": 30 }, { "epoch": 0.9210526315789473, "grad_norm": 3.90625, "learning_rate": 0.00018421052631578948, "loss": 13.7073, "step": 35 }, { "epoch": 1.0, "eval_loss": 7.137722015380859, "eval_runtime": 0.2537, "eval_samples_per_second": 39.423, "eval_steps_per_second": 3.942, "step": 38 }, { "epoch": 1.0526315789473684, "grad_norm": 7.375, "learning_rate": 0.00019998312416333227, "loss": 12.9513, "step": 40 }, { "epoch": 1.1842105263157894, "grad_norm": 12.0, "learning_rate": 0.00019979333640833947, "loss": 10.9396, "step": 45 }, { "epoch": 1.3157894736842106, "grad_norm": 20.25, "learning_rate": 0.00019939306773179497, "loss": 8.1563, "step": 50 }, { "epoch": 1.4473684210526316, "grad_norm": 13.8125, "learning_rate": 0.00019878316236762196, "loss": 4.3271, "step": 55 }, { "epoch": 1.5789473684210527, "grad_norm": 5.84375, "learning_rate": 0.0001979649067087574, "loss": 2.3733, "step": 60 }, { "epoch": 1.7105263157894737, "grad_norm": 4.96875, "learning_rate": 0.00019694002659393305, "loss": 2.0099, "step": 65 }, { "epoch": 1.8421052631578947, "grad_norm": 1.171875, "learning_rate": 0.00019571068366759143, "loss": 1.7688, "step": 70 }, { "epoch": 1.973684210526316, "grad_norm": 1.046875, "learning_rate": 0.00019427947082061432, "loss": 1.6173, "step": 75 }, { "epoch": 2.0, "eval_loss": 2.9756951332092285, "eval_runtime": 0.2386, "eval_samples_per_second": 41.915, "eval_steps_per_second": 4.191, "step": 76 }, { "epoch": 2.1052631578947367, "grad_norm": 0.9453125, "learning_rate": 0.00019264940672148018, "loss": 1.5155, "step": 80 }, { "epoch": 2.236842105263158, "grad_norm": 0.84765625, "learning_rate": 0.00019082392944938466, "loss": 1.4557, "step": 85 }, { "epoch": 2.3684210526315788, "grad_norm": 0.7421875, "learning_rate": 0.00018880688924275378, "loss": 1.4092, "step": 90 }, { "epoch": 2.5, "grad_norm": 0.43359375, "learning_rate": 0.00018660254037844388, "loss": 1.3523, "step": 95 }, { "epoch": 2.6315789473684212, "grad_norm": 0.58203125, "learning_rate": 0.00018421553219875658, "loss": 1.3176, "step": 100 }, { "epoch": 2.763157894736842, "grad_norm": 0.58203125, "learning_rate": 0.0001816508993051943, "loss": 1.2811, "step": 105 }, { "epoch": 2.8947368421052633, "grad_norm": 0.82421875, "learning_rate": 0.00017891405093963938, "loss": 1.2585, "step": 110 }, { "epoch": 3.0, "eval_loss": 2.7225849628448486, "eval_runtime": 0.2375, "eval_samples_per_second": 42.102, "eval_steps_per_second": 4.21, "step": 114 }, { "epoch": 3.026315789473684, "grad_norm": 0.9453125, "learning_rate": 0.00017601075957535364, "loss": 1.2406, "step": 115 }, { "epoch": 3.1578947368421053, "grad_norm": 0.984375, "learning_rate": 0.0001729471487418621, "loss": 1.209, "step": 120 }, { "epoch": 3.2894736842105265, "grad_norm": 0.85546875, "learning_rate": 0.00016972968010939954, "loss": 1.1876, "step": 125 }, { "epoch": 3.4210526315789473, "grad_norm": 0.953125, "learning_rate": 0.00016636513986016213, "loss": 1.1913, "step": 130 }, { "epoch": 3.5526315789473686, "grad_norm": 0.6015625, "learning_rate": 0.0001628606243751082, "loss": 1.17, "step": 135 }, { "epoch": 3.6842105263157894, "grad_norm": 0.546875, "learning_rate": 0.00015922352526649803, "loss": 1.1573, "step": 140 }, { "epoch": 3.8157894736842106, "grad_norm": 0.734375, "learning_rate": 0.00015546151378774086, "loss": 1.157, "step": 145 }, { "epoch": 3.9473684210526314, "grad_norm": 0.81640625, "learning_rate": 0.00015158252465343242, "loss": 1.1493, "step": 150 }, { "epoch": 4.0, "eval_loss": 2.631202220916748, "eval_runtime": 0.238, "eval_samples_per_second": 42.017, "eval_steps_per_second": 4.202, "step": 152 }, { "epoch": 4.078947368421052, "grad_norm": 1.21875, "learning_rate": 0.00014759473930370736, "loss": 1.1375, "step": 155 }, { "epoch": 4.2105263157894735, "grad_norm": 1.1015625, "learning_rate": 0.00014350656864820733, "loss": 1.1162, "step": 160 }, { "epoch": 4.342105263157895, "grad_norm": 1.2734375, "learning_rate": 0.0001393266353260583, "loss": 1.1148, "step": 165 }, { "epoch": 4.473684210526316, "grad_norm": 1.0078125, "learning_rate": 0.00013506375551927547, "loss": 1.1125, "step": 170 }, { "epoch": 4.605263157894737, "grad_norm": 0.625, "learning_rate": 0.00013072692035795305, "loss": 1.1057, "step": 175 }, { "epoch": 4.7368421052631575, "grad_norm": 0.7421875, "learning_rate": 0.00012632527695645993, "loss": 1.1023, "step": 180 }, { "epoch": 4.868421052631579, "grad_norm": 0.703125, "learning_rate": 0.0001218681091206376, "loss": 1.0983, "step": 185 }, { "epoch": 5.0, "grad_norm": 0.6796875, "learning_rate": 0.00011736481776669306, "loss": 1.0934, "step": 190 }, { "epoch": 5.0, "eval_loss": 2.6199984550476074, "eval_runtime": 0.2365, "eval_samples_per_second": 42.281, "eval_steps_per_second": 4.228, "step": 190 }, { "epoch": 5.131578947368421, "grad_norm": 0.74609375, "learning_rate": 0.00011282490109308633, "loss": 1.0826, "step": 195 }, { "epoch": 5.2631578947368425, "grad_norm": 0.81640625, "learning_rate": 0.00010825793454723325, "loss": 1.0788, "step": 200 }, { "epoch": 5.394736842105263, "grad_norm": 0.68359375, "learning_rate": 0.00010367355062927726, "loss": 1.0782, "step": 205 }, { "epoch": 5.526315789473684, "grad_norm": 0.95703125, "learning_rate": 9.908141857552737e-05, "loss": 1.0606, "step": 210 }, { "epoch": 5.657894736842105, "grad_norm": 0.65234375, "learning_rate": 9.449122396441345e-05, "loss": 1.0564, "step": 215 }, { "epoch": 5.7894736842105265, "grad_norm": 0.5546875, "learning_rate": 8.991264828797319e-05, "loss": 1.0509, "step": 220 }, { "epoch": 5.921052631578947, "grad_norm": 0.6328125, "learning_rate": 8.535534853195786e-05, "loss": 1.0587, "step": 225 }, { "epoch": 6.0, "eval_loss": 2.6019885540008545, "eval_runtime": 0.2397, "eval_samples_per_second": 41.723, "eval_steps_per_second": 4.172, "step": 228 }, { "epoch": 6.052631578947368, "grad_norm": 0.6640625, "learning_rate": 8.082893680762619e-05, "loss": 1.0543, "step": 230 }, { "epoch": 6.184210526315789, "grad_norm": 0.72265625, "learning_rate": 7.634296007818576e-05, "loss": 1.0452, "step": 235 }, { "epoch": 6.315789473684211, "grad_norm": 0.84375, "learning_rate": 7.190688002264308e-05, "loss": 1.042, "step": 240 }, { "epoch": 6.447368421052632, "grad_norm": 0.66796875, "learning_rate": 6.753005307953167e-05, "loss": 1.0413, "step": 245 }, { "epoch": 6.578947368421053, "grad_norm": 0.80078125, "learning_rate": 6.322171071261071e-05, "loss": 1.0436, "step": 250 }, { "epoch": 6.7105263157894735, "grad_norm": 0.77734375, "learning_rate": 5.8990939940156e-05, "loss": 1.0367, "step": 255 }, { "epoch": 6.842105263157895, "grad_norm": 0.5703125, "learning_rate": 5.484666416891109e-05, "loss": 1.0299, "step": 260 }, { "epoch": 6.973684210526316, "grad_norm": 0.70703125, "learning_rate": 5.079762437312219e-05, "loss": 1.0289, "step": 265 }, { "epoch": 7.0, "eval_loss": 2.5995545387268066, "eval_runtime": 0.2387, "eval_samples_per_second": 41.887, "eval_steps_per_second": 4.189, "step": 266 }, { "epoch": 7.105263157894737, "grad_norm": 0.53125, "learning_rate": 4.685236065835443e-05, "loss": 1.0249, "step": 270 }, { "epoch": 7.2368421052631575, "grad_norm": 0.671875, "learning_rate": 4.301919424897338e-05, "loss": 1.0192, "step": 275 }, { "epoch": 7.368421052631579, "grad_norm": 0.486328125, "learning_rate": 3.9306209937284346e-05, "loss": 1.0285, "step": 280 }, { "epoch": 7.5, "grad_norm": 0.72265625, "learning_rate": 3.5721239031346066e-05, "loss": 1.025, "step": 285 }, { "epoch": 7.631578947368421, "grad_norm": 0.58984375, "learning_rate": 3.227184283742591e-05, "loss": 1.0347, "step": 290 }, { "epoch": 7.7631578947368425, "grad_norm": 0.51171875, "learning_rate": 2.89652967119336e-05, "loss": 1.0223, "step": 295 }, { "epoch": 7.894736842105263, "grad_norm": 0.52734375, "learning_rate": 2.5808574716471856e-05, "loss": 1.0197, "step": 300 }, { "epoch": 8.0, "eval_loss": 2.602214813232422, "eval_runtime": 0.2375, "eval_samples_per_second": 42.1, "eval_steps_per_second": 4.21, "step": 304 }, { "epoch": 8.026315789473685, "grad_norm": 0.5546875, "learning_rate": 2.2808334908367914e-05, "loss": 1.023, "step": 305 }, { "epoch": 8.157894736842104, "grad_norm": 0.498046875, "learning_rate": 1.9970905297711606e-05, "loss": 1.0158, "step": 310 }, { "epoch": 8.289473684210526, "grad_norm": 0.51953125, "learning_rate": 1.7302270500518182e-05, "loss": 1.0183, "step": 315 }, { "epoch": 8.421052631578947, "grad_norm": 0.46875, "learning_rate": 1.4808059116167305e-05, "loss": 1.0081, "step": 320 }, { "epoch": 8.552631578947368, "grad_norm": 0.53125, "learning_rate": 1.2493531855740625e-05, "loss": 1.0149, "step": 325 }, { "epoch": 8.68421052631579, "grad_norm": 0.494140625, "learning_rate": 1.0363570446297999e-05, "loss": 1.0197, "step": 330 }, { "epoch": 8.81578947368421, "grad_norm": 0.46484375, "learning_rate": 8.422667334494249e-06, "loss": 1.02, "step": 335 }, { "epoch": 8.947368421052632, "grad_norm": 0.5, "learning_rate": 6.674916211254289e-06, "loss": 1.0221, "step": 340 }, { "epoch": 9.0, "eval_loss": 2.605945110321045, "eval_runtime": 0.2375, "eval_samples_per_second": 42.099, "eval_steps_per_second": 4.21, "step": 342 }, { "epoch": 9.078947368421053, "grad_norm": 0.45703125, "learning_rate": 5.124003377490582e-06, "loss": 1.0218, "step": 345 }, { "epoch": 9.210526315789474, "grad_norm": 0.46484375, "learning_rate": 3.7731999690749585e-06, "loss": 1.0087, "step": 350 }, { "epoch": 9.342105263157896, "grad_norm": 0.4609375, "learning_rate": 2.6253550574632303e-06, "loss": 1.0178, "step": 355 }, { "epoch": 9.473684210526315, "grad_norm": 0.453125, "learning_rate": 1.6828896405244988e-06, "loss": 1.0225, "step": 360 }, { "epoch": 9.605263157894736, "grad_norm": 0.455078125, "learning_rate": 9.477915362496758e-07, "loss": 1.0206, "step": 365 }, { "epoch": 9.736842105263158, "grad_norm": 0.490234375, "learning_rate": 4.216111901092501e-07, "loss": 1.017, "step": 370 }, { "epoch": 9.868421052631579, "grad_norm": 0.46484375, "learning_rate": 1.0545840490313596e-07, "loss": 1.0248, "step": 375 }, { "epoch": 10.0, "grad_norm": 0.57421875, "learning_rate": 0.0, "loss": 1.0175, "step": 380 }, { "epoch": 10.0, "eval_loss": 2.6068081855773926, "eval_runtime": 0.235, "eval_samples_per_second": 42.55, "eval_steps_per_second": 4.255, "step": 380 }, { "epoch": 10.0, "step": 380, "total_flos": 1.158687595912233e+18, "train_loss": 3.413645140748275, "train_runtime": 927.0586, "train_samples_per_second": 26.201, "train_steps_per_second": 0.41 } ], "logging_steps": 5, "max_steps": 380, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 100, "total_flos": 1.158687595912233e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }