{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 352, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005681818181818182, "grad_norm": 61.450797180202606, "learning_rate": 5.555555555555555e-07, "loss": 2.2212, "step": 1 }, { "epoch": 0.028409090909090908, "grad_norm": 64.39059584640545, "learning_rate": 2.7777777777777783e-06, "loss": 2.1229, "step": 5 }, { "epoch": 0.056818181818181816, "grad_norm": 6.91974733739602, "learning_rate": 5.555555555555557e-06, "loss": 1.6002, "step": 10 }, { "epoch": 0.08522727272727272, "grad_norm": 4.765316545239553, "learning_rate": 8.333333333333334e-06, "loss": 1.0834, "step": 15 }, { "epoch": 0.11363636363636363, "grad_norm": 2.7902519705279327, "learning_rate": 1.1111111111111113e-05, "loss": 0.8354, "step": 20 }, { "epoch": 0.14204545454545456, "grad_norm": 1.4800453018498552, "learning_rate": 1.388888888888889e-05, "loss": 0.763, "step": 25 }, { "epoch": 0.17045454545454544, "grad_norm": 1.1251067723566994, "learning_rate": 1.6666666666666667e-05, "loss": 0.7216, "step": 30 }, { "epoch": 0.19886363636363635, "grad_norm": 1.0559075058285379, "learning_rate": 1.9444444444444445e-05, "loss": 0.7044, "step": 35 }, { "epoch": 0.22727272727272727, "grad_norm": 1.2534613751009074, "learning_rate": 1.999209397227302e-05, "loss": 0.6903, "step": 40 }, { "epoch": 0.2556818181818182, "grad_norm": 1.6289307968117395, "learning_rate": 1.995999715857997e-05, "loss": 0.6679, "step": 45 }, { "epoch": 0.2840909090909091, "grad_norm": 1.2476804276032436, "learning_rate": 1.9903294664725023e-05, "loss": 0.6615, "step": 50 }, { "epoch": 0.3125, "grad_norm": 0.9819151769054784, "learning_rate": 1.9822126571413616e-05, "loss": 0.6576, "step": 55 }, { "epoch": 0.3409090909090909, "grad_norm": 0.7874872470633155, "learning_rate": 1.97166934004041e-05, "loss": 0.644, "step": 60 }, { "epoch": 0.3693181818181818, "grad_norm": 0.8685235909784279, "learning_rate": 1.9587255619128648e-05, "loss": 0.6417, "step": 65 }, { "epoch": 0.3977272727272727, "grad_norm": 0.8355500376299562, "learning_rate": 1.9434132997221347e-05, "loss": 0.6415, "step": 70 }, { "epoch": 0.42613636363636365, "grad_norm": 0.7019935692437175, "learning_rate": 1.9257703816543144e-05, "loss": 0.6351, "step": 75 }, { "epoch": 0.45454545454545453, "grad_norm": 0.7150838153313119, "learning_rate": 1.9058403936655235e-05, "loss": 0.6301, "step": 80 }, { "epoch": 0.48295454545454547, "grad_norm": 0.777998409699531, "learning_rate": 1.8836725718049562e-05, "loss": 0.6323, "step": 85 }, { "epoch": 0.5113636363636364, "grad_norm": 0.9386535132438508, "learning_rate": 1.8593216805796612e-05, "loss": 0.6262, "step": 90 }, { "epoch": 0.5397727272727273, "grad_norm": 0.7833891503314497, "learning_rate": 1.8328478776615336e-05, "loss": 0.6226, "step": 95 }, { "epoch": 0.5681818181818182, "grad_norm": 0.6233679945069268, "learning_rate": 1.804316565270765e-05, "loss": 0.6215, "step": 100 }, { "epoch": 0.5965909090909091, "grad_norm": 0.6035859026208489, "learning_rate": 1.7737982286028938e-05, "loss": 0.6145, "step": 105 }, { "epoch": 0.625, "grad_norm": 0.6777356473856782, "learning_rate": 1.7413682616986185e-05, "loss": 0.6131, "step": 110 }, { "epoch": 0.6534090909090909, "grad_norm": 0.6612579810251372, "learning_rate": 1.7071067811865477e-05, "loss": 0.6126, "step": 115 }, { "epoch": 0.6818181818181818, "grad_norm": 0.5914172959469909, "learning_rate": 1.671098428359037e-05, "loss": 0.6109, "step": 120 }, { "epoch": 0.7102272727272727, "grad_norm": 0.6559176879020109, "learning_rate": 1.6334321600700612e-05, "loss": 0.6117, "step": 125 }, { "epoch": 0.7386363636363636, "grad_norm": 0.6684982270631625, "learning_rate": 1.5942010289717108e-05, "loss": 0.6076, "step": 130 }, { "epoch": 0.7670454545454546, "grad_norm": 0.6372604947073411, "learning_rate": 1.5535019536322158e-05, "loss": 0.6027, "step": 135 }, { "epoch": 0.7954545454545454, "grad_norm": 0.6994385308149875, "learning_rate": 1.5114354791034225e-05, "loss": 0.6101, "step": 140 }, { "epoch": 0.8238636363636364, "grad_norm": 0.595747522856983, "learning_rate": 1.4681055285292138e-05, "loss": 0.6029, "step": 145 }, { "epoch": 0.8522727272727273, "grad_norm": 0.5480963244425426, "learning_rate": 1.4236191464085286e-05, "loss": 0.6015, "step": 150 }, { "epoch": 0.8806818181818182, "grad_norm": 0.5398374925046976, "learning_rate": 1.3780862341472183e-05, "loss": 0.601, "step": 155 }, { "epoch": 0.9090909090909091, "grad_norm": 0.5638894303413493, "learning_rate": 1.331619278552068e-05, "loss": 0.597, "step": 160 }, { "epoch": 0.9375, "grad_norm": 0.6681163362607668, "learning_rate": 1.2843330739377003e-05, "loss": 0.5955, "step": 165 }, { "epoch": 0.9659090909090909, "grad_norm": 0.5783323357337038, "learning_rate": 1.2363444385329052e-05, "loss": 0.5938, "step": 170 }, { "epoch": 0.9943181818181818, "grad_norm": 0.5688860290477228, "learning_rate": 1.1877719258869827e-05, "loss": 0.5958, "step": 175 }, { "epoch": 1.0, "eval_loss": 0.6028689742088318, "eval_runtime": 4.9572, "eval_samples_per_second": 70.806, "eval_steps_per_second": 1.21, "step": 176 }, { "epoch": 1.0227272727272727, "grad_norm": 0.5776530983447593, "learning_rate": 1.1387355319890685e-05, "loss": 0.5715, "step": 180 }, { "epoch": 1.0511363636363635, "grad_norm": 0.5351985628101088, "learning_rate": 1.0893563988239773e-05, "loss": 0.5662, "step": 185 }, { "epoch": 1.0795454545454546, "grad_norm": 0.555723989781456, "learning_rate": 1.039756515096926e-05, "loss": 0.5641, "step": 190 }, { "epoch": 1.1079545454545454, "grad_norm": 0.5820199823963064, "learning_rate": 9.900584148664705e-06, "loss": 0.5663, "step": 195 }, { "epoch": 1.1363636363636362, "grad_norm": 0.4816152928348661, "learning_rate": 9.403848748301802e-06, "loss": 0.5604, "step": 200 }, { "epoch": 1.1647727272727273, "grad_norm": 0.5345678354575093, "learning_rate": 8.908586110108794e-06, "loss": 0.5643, "step": 205 }, { "epoch": 1.1931818181818181, "grad_norm": 0.5415303447950298, "learning_rate": 8.416019755927851e-06, "loss": 0.5612, "step": 210 }, { "epoch": 1.2215909090909092, "grad_norm": 0.5423360313378033, "learning_rate": 7.927366546564911e-06, "loss": 0.5615, "step": 215 }, { "epoch": 1.25, "grad_norm": 0.4718754739107113, "learning_rate": 7.443833675595254e-06, "loss": 0.5606, "step": 220 }, { "epoch": 1.2784090909090908, "grad_norm": 0.4930197840237037, "learning_rate": 6.966615687051517e-06, "loss": 0.5637, "step": 225 }, { "epoch": 1.3068181818181819, "grad_norm": 0.48706485409767547, "learning_rate": 6.496891524361757e-06, "loss": 0.5628, "step": 230 }, { "epoch": 1.3352272727272727, "grad_norm": 0.4651113637210652, "learning_rate": 6.03582161782806e-06, "loss": 0.5632, "step": 235 }, { "epoch": 1.3636363636363638, "grad_norm": 0.5580807947723885, "learning_rate": 5.584545017840886e-06, "loss": 0.5592, "step": 240 }, { "epoch": 1.3920454545454546, "grad_norm": 0.5164154281834604, "learning_rate": 5.144176580911431e-06, "loss": 0.5569, "step": 245 }, { "epoch": 1.4204545454545454, "grad_norm": 0.5214361592191502, "learning_rate": 4.7158042154738094e-06, "loss": 0.5568, "step": 250 }, { "epoch": 1.4488636363636362, "grad_norm": 0.4690952469954382, "learning_rate": 4.3004861942610575e-06, "loss": 0.5555, "step": 255 }, { "epoch": 1.4772727272727273, "grad_norm": 0.4614475658830548, "learning_rate": 3.899248539894756e-06, "loss": 0.5577, "step": 260 }, { "epoch": 1.5056818181818183, "grad_norm": 0.4420892345248393, "learning_rate": 3.513082490146864e-06, "loss": 0.554, "step": 265 }, { "epoch": 1.5340909090909092, "grad_norm": 0.4142734694806993, "learning_rate": 3.1429420491358696e-06, "loss": 0.5552, "step": 270 }, { "epoch": 1.5625, "grad_norm": 0.4431041177417911, "learning_rate": 2.7897416305068325e-06, "loss": 0.5533, "step": 275 }, { "epoch": 1.5909090909090908, "grad_norm": 0.43134061646610855, "learning_rate": 2.454353798417698e-06, "loss": 0.5526, "step": 280 }, { "epoch": 1.6193181818181817, "grad_norm": 0.4436282928131185, "learning_rate": 2.137607111912734e-06, "loss": 0.5516, "step": 285 }, { "epoch": 1.6477272727272727, "grad_norm": 0.42887307203464214, "learning_rate": 1.840284078008393e-06, "loss": 0.5529, "step": 290 }, { "epoch": 1.6761363636363638, "grad_norm": 0.4325445800220451, "learning_rate": 1.5631192185484557e-06, "loss": 0.5509, "step": 295 }, { "epoch": 1.7045454545454546, "grad_norm": 0.42115802401305363, "learning_rate": 1.3067972556041753e-06, "loss": 0.5542, "step": 300 }, { "epoch": 1.7329545454545454, "grad_norm": 0.41199771612659136, "learning_rate": 1.0719514199022473e-06, "loss": 0.5524, "step": 305 }, { "epoch": 1.7613636363636362, "grad_norm": 0.3982652388798851, "learning_rate": 8.591618864596541e-07, "loss": 0.5516, "step": 310 }, { "epoch": 1.7897727272727273, "grad_norm": 0.43027763841489375, "learning_rate": 6.689543412899913e-07, "loss": 0.5529, "step": 315 }, { "epoch": 1.8181818181818183, "grad_norm": 0.4194862616952164, "learning_rate": 5.017986827221733e-07, "loss": 0.5511, "step": 320 }, { "epoch": 1.8465909090909092, "grad_norm": 0.3966942779222067, "learning_rate": 3.5810786053987025e-07, "loss": 0.5494, "step": 325 }, { "epoch": 1.875, "grad_norm": 0.41819439445707, "learning_rate": 2.3823685580949273e-07, "loss": 0.5546, "step": 330 }, { "epoch": 1.9034090909090908, "grad_norm": 0.40593144742056747, "learning_rate": 1.4248180391703614e-07, "loss": 0.547, "step": 335 }, { "epoch": 1.9318181818181817, "grad_norm": 0.4100887183324941, "learning_rate": 7.10792629802659e-08, "loss": 0.5527, "step": 340 }, { "epoch": 1.9602272727272727, "grad_norm": 0.4017736667114781, "learning_rate": 2.420562944358329e-08, "loss": 0.55, "step": 345 }, { "epoch": 1.9886363636363638, "grad_norm": 0.3955471531873619, "learning_rate": 1.9767022993444353e-09, "loss": 0.5494, "step": 350 }, { "epoch": 2.0, "eval_loss": 0.5863133072853088, "eval_runtime": 4.9616, "eval_samples_per_second": 70.744, "eval_steps_per_second": 1.209, "step": 352 }, { "epoch": 2.0, "step": 352, "total_flos": 73701638799360.0, "train_loss": 0.6391877403313463, "train_runtime": 1025.7787, "train_samples_per_second": 21.89, "train_steps_per_second": 0.343 } ], "logging_steps": 5, "max_steps": 352, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 73701638799360.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }