{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.2827125681933636, "eval_steps": 100000, "global_step": 300, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.000942375227311212, "grad_norm": 14.25, "learning_rate": 1e-05, "loss": 0.46586317, "memory(GiB)": 64.76, "step": 1, "train_speed(iter/s)": 0.003324 }, { "epoch": 0.00471187613655606, "grad_norm": 2.46875, "learning_rate": 9.999648647603774e-06, "loss": 0.26192743, "memory(GiB)": 75.3, "step": 5, "train_speed(iter/s)": 0.003362 }, { "epoch": 0.00942375227311212, "grad_norm": 1.15625, "learning_rate": 9.998221363123425e-06, "loss": 0.10271888, "memory(GiB)": 75.3, "step": 10, "train_speed(iter/s)": 0.003359 }, { "epoch": 0.01413562840966818, "grad_norm": 1.1796875, "learning_rate": 9.995696500215899e-06, "loss": 0.09046092, "memory(GiB)": 75.3, "step": 15, "train_speed(iter/s)": 0.003358 }, { "epoch": 0.01884750454622424, "grad_norm": 1.140625, "learning_rate": 9.992074613325435e-06, "loss": 0.08653282, "memory(GiB)": 75.3, "step": 20, "train_speed(iter/s)": 0.00335 }, { "epoch": 0.023559380682780302, "grad_norm": 1.078125, "learning_rate": 9.987356497795944e-06, "loss": 0.08451628, "memory(GiB)": 75.3, "step": 25, "train_speed(iter/s)": 0.003358 }, { "epoch": 0.02827125681933636, "grad_norm": 1.09375, "learning_rate": 9.981543189696349e-06, "loss": 0.0772208, "memory(GiB)": 75.3, "step": 30, "train_speed(iter/s)": 0.003356 }, { "epoch": 0.03298313295589242, "grad_norm": 1.125, "learning_rate": 9.97463596559307e-06, "loss": 0.08322463, "memory(GiB)": 75.3, "step": 35, "train_speed(iter/s)": 0.003356 }, { "epoch": 0.03769500909244848, "grad_norm": 1.09375, "learning_rate": 9.966636342269706e-06, "loss": 0.07725406, "memory(GiB)": 75.3, "step": 40, "train_speed(iter/s)": 0.003355 }, { "epoch": 0.04240688522900454, "grad_norm": 1.15625, "learning_rate": 9.957546076393944e-06, "loss": 0.07683957, "memory(GiB)": 75.3, "step": 45, "train_speed(iter/s)": 0.003356 }, { "epoch": 0.047118761365560605, "grad_norm": 1.1328125, "learning_rate": 9.947367164131823e-06, "loss": 0.07508552, "memory(GiB)": 75.3, "step": 50, "train_speed(iter/s)": 0.003355 }, { "epoch": 0.05183063750211667, "grad_norm": 1.0703125, "learning_rate": 9.936101840709373e-06, "loss": 0.07236413, "memory(GiB)": 75.3, "step": 55, "train_speed(iter/s)": 0.003353 }, { "epoch": 0.05654251363867272, "grad_norm": 1.0703125, "learning_rate": 9.923752579921787e-06, "loss": 0.07231579, "memory(GiB)": 75.3, "step": 60, "train_speed(iter/s)": 0.003349 }, { "epoch": 0.06125438977522878, "grad_norm": 1.0234375, "learning_rate": 9.910322093590177e-06, "loss": 0.07145001, "memory(GiB)": 75.3, "step": 65, "train_speed(iter/s)": 0.003348 }, { "epoch": 0.06596626591178484, "grad_norm": 1.0546875, "learning_rate": 9.895813330966086e-06, "loss": 0.07301619, "memory(GiB)": 75.3, "step": 70, "train_speed(iter/s)": 0.003349 }, { "epoch": 0.0706781420483409, "grad_norm": 1.1015625, "learning_rate": 9.880229478083849e-06, "loss": 0.0724276, "memory(GiB)": 75.3, "step": 75, "train_speed(iter/s)": 0.00335 }, { "epoch": 0.07539001818489696, "grad_norm": 1.0390625, "learning_rate": 9.863573957060953e-06, "loss": 0.06874905, "memory(GiB)": 75.3, "step": 80, "train_speed(iter/s)": 0.003349 }, { "epoch": 0.08010189432145302, "grad_norm": 1.0859375, "learning_rate": 9.845850425346563e-06, "loss": 0.07212579, "memory(GiB)": 75.3, "step": 85, "train_speed(iter/s)": 0.003347 }, { "epoch": 0.08481377045800909, "grad_norm": 
1.1171875, "learning_rate": 9.827062774918377e-06, "loss": 0.07294501, "memory(GiB)": 75.3, "step": 90, "train_speed(iter/s)": 0.003346 }, { "epoch": 0.08952564659456515, "grad_norm": 0.98828125, "learning_rate": 9.807215131427966e-06, "loss": 0.06517277, "memory(GiB)": 75.3, "step": 95, "train_speed(iter/s)": 0.003345 }, { "epoch": 0.09423752273112121, "grad_norm": 0.984375, "learning_rate": 9.786311853294799e-06, "loss": 0.06962139, "memory(GiB)": 75.3, "step": 100, "train_speed(iter/s)": 0.003346 }, { "epoch": 0.09894939886767727, "grad_norm": 0.98828125, "learning_rate": 9.764357530749178e-06, "loss": 0.06724482, "memory(GiB)": 75.3, "step": 105, "train_speed(iter/s)": 0.003339 }, { "epoch": 0.10366127500423333, "grad_norm": 1.015625, "learning_rate": 9.741356984824234e-06, "loss": 0.06572815, "memory(GiB)": 75.3, "step": 110, "train_speed(iter/s)": 0.003339 }, { "epoch": 0.10837315114078938, "grad_norm": 1.0390625, "learning_rate": 9.717315266297277e-06, "loss": 0.06739124, "memory(GiB)": 75.3, "step": 115, "train_speed(iter/s)": 0.003342 }, { "epoch": 0.11308502727734544, "grad_norm": 0.9375, "learning_rate": 9.692237654580658e-06, "loss": 0.06834027, "memory(GiB)": 75.3, "step": 120, "train_speed(iter/s)": 0.003342 }, { "epoch": 0.1177969034139015, "grad_norm": 1.078125, "learning_rate": 9.66612965656245e-06, "loss": 0.0658385, "memory(GiB)": 75.3, "step": 125, "train_speed(iter/s)": 0.003343 }, { "epoch": 0.12250877955045757, "grad_norm": 1.0859375, "learning_rate": 9.638997005397174e-06, "loss": 0.0717117, "memory(GiB)": 75.3, "step": 130, "train_speed(iter/s)": 0.003344 }, { "epoch": 0.12722065568701363, "grad_norm": 0.9765625, "learning_rate": 9.610845659246833e-06, "loss": 0.0667814, "memory(GiB)": 75.3, "step": 135, "train_speed(iter/s)": 0.003344 }, { "epoch": 0.13193253182356968, "grad_norm": 0.9140625, "learning_rate": 9.581681799972528e-06, "loss": 0.06573244, "memory(GiB)": 75.3, "step": 140, "train_speed(iter/s)": 0.003345 }, { "epoch": 0.13664440796012575, "grad_norm": 1.03125, "learning_rate": 9.551511831776966e-06, "loss": 0.06967602, "memory(GiB)": 75.3, "step": 145, "train_speed(iter/s)": 0.003346 }, { "epoch": 0.1413562840966818, "grad_norm": 0.90625, "learning_rate": 9.520342379798141e-06, "loss": 0.06216406, "memory(GiB)": 75.3, "step": 150, "train_speed(iter/s)": 0.003345 }, { "epoch": 0.14606816023323788, "grad_norm": 1.0390625, "learning_rate": 9.488180288654485e-06, "loss": 0.06460171, "memory(GiB)": 75.3, "step": 155, "train_speed(iter/s)": 0.003346 }, { "epoch": 0.15078003636979392, "grad_norm": 1.078125, "learning_rate": 9.45503262094184e-06, "loss": 0.06467786, "memory(GiB)": 75.3, "step": 160, "train_speed(iter/s)": 0.003346 }, { "epoch": 0.15549191250635, "grad_norm": 1.0625, "learning_rate": 9.420906655682553e-06, "loss": 0.06358048, "memory(GiB)": 75.3, "step": 165, "train_speed(iter/s)": 0.003346 }, { "epoch": 0.16020378864290605, "grad_norm": 1.015625, "learning_rate": 9.385809886727044e-06, "loss": 0.06778824, "memory(GiB)": 75.3, "step": 170, "train_speed(iter/s)": 0.003343 }, { "epoch": 0.16491566477946212, "grad_norm": 1.046875, "learning_rate": 9.349750021108212e-06, "loss": 0.06321884, "memory(GiB)": 75.3, "step": 175, "train_speed(iter/s)": 0.003343 }, { "epoch": 0.16962754091601817, "grad_norm": 0.97265625, "learning_rate": 9.31273497734901e-06, "loss": 0.06310185, "memory(GiB)": 75.3, "step": 180, "train_speed(iter/s)": 0.003344 }, { "epoch": 0.17433941705257422, "grad_norm": 0.9765625, "learning_rate": 9.274772883723587e-06, "loss": 
0.06271737, "memory(GiB)": 75.3, "step": 185, "train_speed(iter/s)": 0.003344 }, { "epoch": 0.1790512931891303, "grad_norm": 0.97265625, "learning_rate": 9.235872076472378e-06, "loss": 0.06393245, "memory(GiB)": 75.3, "step": 190, "train_speed(iter/s)": 0.003345 }, { "epoch": 0.18376316932568634, "grad_norm": 1.03125, "learning_rate": 9.196041097971509e-06, "loss": 0.06558744, "memory(GiB)": 75.3, "step": 195, "train_speed(iter/s)": 0.003346 }, { "epoch": 0.18847504546224242, "grad_norm": 0.98046875, "learning_rate": 9.155288694856942e-06, "loss": 0.06127087, "memory(GiB)": 75.3, "step": 200, "train_speed(iter/s)": 0.003345 }, { "epoch": 0.19318692159879847, "grad_norm": 0.875, "learning_rate": 9.113623816103775e-06, "loss": 0.06313071, "memory(GiB)": 75.3, "step": 205, "train_speed(iter/s)": 0.003343 }, { "epoch": 0.19789879773535454, "grad_norm": 1.0, "learning_rate": 9.071055611061102e-06, "loss": 0.06330621, "memory(GiB)": 75.3, "step": 210, "train_speed(iter/s)": 0.003343 }, { "epoch": 0.2026106738719106, "grad_norm": 0.9453125, "learning_rate": 9.027593427442867e-06, "loss": 0.06415906, "memory(GiB)": 75.3, "step": 215, "train_speed(iter/s)": 0.003343 }, { "epoch": 0.20732255000846667, "grad_norm": 0.94140625, "learning_rate": 8.98324680927517e-06, "loss": 0.06299359, "memory(GiB)": 75.3, "step": 220, "train_speed(iter/s)": 0.003343 }, { "epoch": 0.21203442614502271, "grad_norm": 0.9609375, "learning_rate": 8.938025494800454e-06, "loss": 0.06004124, "memory(GiB)": 75.3, "step": 225, "train_speed(iter/s)": 0.003343 }, { "epoch": 0.21674630228157876, "grad_norm": 0.97265625, "learning_rate": 8.891939414339048e-06, "loss": 0.06477681, "memory(GiB)": 75.3, "step": 230, "train_speed(iter/s)": 0.003344 }, { "epoch": 0.22145817841813484, "grad_norm": 0.92578125, "learning_rate": 8.844998688108535e-06, "loss": 0.06010489, "memory(GiB)": 75.3, "step": 235, "train_speed(iter/s)": 0.003344 }, { "epoch": 0.22617005455469089, "grad_norm": 0.9609375, "learning_rate": 8.797213624001403e-06, "loss": 0.05960445, "memory(GiB)": 75.3, "step": 240, "train_speed(iter/s)": 0.003345 }, { "epoch": 0.23088193069124696, "grad_norm": 1.0, "learning_rate": 8.748594715321512e-06, "loss": 0.06301316, "memory(GiB)": 75.3, "step": 245, "train_speed(iter/s)": 0.003345 }, { "epoch": 0.235593806827803, "grad_norm": 0.94921875, "learning_rate": 8.699152638479817e-06, "loss": 0.06120233, "memory(GiB)": 75.3, "step": 250, "train_speed(iter/s)": 0.003345 }, { "epoch": 0.24030568296435909, "grad_norm": 0.97265625, "learning_rate": 8.6488982506499e-06, "loss": 0.06014684, "memory(GiB)": 75.3, "step": 255, "train_speed(iter/s)": 0.003345 }, { "epoch": 0.24501755910091513, "grad_norm": 1.0, "learning_rate": 8.597842587383797e-06, "loss": 0.05922247, "memory(GiB)": 75.3, "step": 260, "train_speed(iter/s)": 0.003345 }, { "epoch": 0.24972943523747118, "grad_norm": 0.97265625, "learning_rate": 8.545996860188668e-06, "loss": 0.05851297, "memory(GiB)": 75.3, "step": 265, "train_speed(iter/s)": 0.003345 }, { "epoch": 0.25444131137402726, "grad_norm": 1.0625, "learning_rate": 8.493372454064809e-06, "loss": 0.05934198, "memory(GiB)": 75.3, "step": 270, "train_speed(iter/s)": 0.003345 }, { "epoch": 0.2591531875105833, "grad_norm": 0.90234375, "learning_rate": 8.439980925005587e-06, "loss": 0.06134464, "memory(GiB)": 75.3, "step": 275, "train_speed(iter/s)": 0.003346 }, { "epoch": 0.26386506364713935, "grad_norm": 0.90234375, "learning_rate": 8.385833997459804e-06, "loss": 0.05825667, "memory(GiB)": 75.3, "step": 280, 
"train_speed(iter/s)": 0.003346 }, { "epoch": 0.26857693978369546, "grad_norm": 0.8828125, "learning_rate": 8.330943561757092e-06, "loss": 0.06092241, "memory(GiB)": 75.3, "step": 285, "train_speed(iter/s)": 0.003346 }, { "epoch": 0.2732888159202515, "grad_norm": 0.91796875, "learning_rate": 8.275321671496862e-06, "loss": 0.05940055, "memory(GiB)": 75.3, "step": 290, "train_speed(iter/s)": 0.003347 }, { "epoch": 0.27800069205680755, "grad_norm": 0.9140625, "learning_rate": 8.218980540901417e-06, "loss": 0.05920713, "memory(GiB)": 75.3, "step": 295, "train_speed(iter/s)": 0.003347 }, { "epoch": 0.2827125681933636, "grad_norm": 0.92578125, "learning_rate": 8.16193254213377e-06, "loss": 0.05777416, "memory(GiB)": 75.3, "step": 300, "train_speed(iter/s)": 0.003347 } ], "logging_steps": 5, "max_steps": 1061, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.6327530207541985e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }