{ "best_metric": 0.8340622782707214, "best_model_checkpoint": "./results/checkpoint-1000", "epoch": 2.9940436796823295, "eval_steps": 500, "global_step": 1131, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.05294506949040371, "grad_norm": 62.576904296875, "learning_rate": 2.0000000000000003e-06, "loss": 16.7099, "step": 20 }, { "epoch": 0.10589013898080742, "grad_norm": 35.65088653564453, "learning_rate": 4.000000000000001e-06, "loss": 14.7891, "step": 40 }, { "epoch": 0.15883520847121113, "grad_norm": 30.480260848999023, "learning_rate": 6e-06, "loss": 11.6833, "step": 60 }, { "epoch": 0.21178027796161483, "grad_norm": 24.958763122558594, "learning_rate": 8.000000000000001e-06, "loss": 7.878, "step": 80 }, { "epoch": 0.26472534745201853, "grad_norm": 4.896631717681885, "learning_rate": 1e-05, "loss": 4.1299, "step": 100 }, { "epoch": 0.31767041694242226, "grad_norm": 1.8821851015090942, "learning_rate": 1.2e-05, "loss": 2.512, "step": 120 }, { "epoch": 0.37061548643282594, "grad_norm": 0.7400406002998352, "learning_rate": 1.4000000000000001e-05, "loss": 1.8807, "step": 140 }, { "epoch": 0.42356055592322966, "grad_norm": 0.601634681224823, "learning_rate": 1.6000000000000003e-05, "loss": 1.5617, "step": 160 }, { "epoch": 0.47650562541363334, "grad_norm": 0.5250083208084106, "learning_rate": 1.8e-05, "loss": 1.461, "step": 180 }, { "epoch": 0.5294506949040371, "grad_norm": 0.5037821531295776, "learning_rate": 2e-05, "loss": 1.3703, "step": 200 }, { "epoch": 0.5823957643944407, "grad_norm": 0.47873854637145996, "learning_rate": 2.2000000000000003e-05, "loss": 1.2966, "step": 220 }, { "epoch": 0.6353408338848445, "grad_norm": 3.9664485454559326, "learning_rate": 2.4e-05, "loss": 1.2427, "step": 240 }, { "epoch": 0.6882859033752482, "grad_norm": 0.45016345381736755, "learning_rate": 2.6000000000000002e-05, "loss": 1.2139, "step": 260 }, { "epoch": 0.7412309728656519, "grad_norm": 0.5136398077011108, "learning_rate": 2.8000000000000003e-05, "loss": 1.1679, "step": 280 }, { "epoch": 0.7941760423560555, "grad_norm": 0.3736862242221832, "learning_rate": 3e-05, "loss": 1.1604, "step": 300 }, { "epoch": 0.8471211118464593, "grad_norm": 0.38698282837867737, "learning_rate": 3.2000000000000005e-05, "loss": 1.1407, "step": 320 }, { "epoch": 0.900066181336863, "grad_norm": 0.4257580637931824, "learning_rate": 3.4000000000000007e-05, "loss": 1.1177, "step": 340 }, { "epoch": 0.9530112508272667, "grad_norm": 0.3982521891593933, "learning_rate": 3.6e-05, "loss": 1.1199, "step": 360 }, { "epoch": 1.0059563203176705, "grad_norm": 0.3849237859249115, "learning_rate": 3.8e-05, "loss": 1.0925, "step": 380 }, { "epoch": 1.0589013898080741, "grad_norm": 0.3753887414932251, "learning_rate": 4e-05, "loss": 1.0605, "step": 400 }, { "epoch": 1.1118464592984778, "grad_norm": 0.3810591697692871, "learning_rate": 4.2e-05, "loss": 1.0541, "step": 420 }, { "epoch": 1.1647915287888815, "grad_norm": 0.3707886040210724, "learning_rate": 4.4000000000000006e-05, "loss": 1.0334, "step": 440 }, { "epoch": 1.2177365982792852, "grad_norm": 0.36902502179145813, "learning_rate": 4.600000000000001e-05, "loss": 1.0249, "step": 460 }, { "epoch": 1.270681667769689, "grad_norm": 0.3862062692642212, "learning_rate": 4.8e-05, "loss": 1.0141, "step": 480 }, { "epoch": 1.3236267372600927, "grad_norm": 0.36468520760536194, "learning_rate": 5e-05, "loss": 1.063, "step": 500 }, { "epoch": 1.3236267372600927, "eval_loss": 0.9045532941818237, "eval_runtime": 74.2771, "eval_samples_per_second": 10.165, "eval_steps_per_second": 2.545, "step": 500 }, { "epoch": 1.3765718067504964, "grad_norm": 0.3426459729671478, "learning_rate": 4.8415213946117275e-05, "loss": 1.01, "step": 520 }, { "epoch": 1.4295168762409, "grad_norm": 0.3736313581466675, "learning_rate": 4.6830427892234554e-05, "loss": 0.984, "step": 540 }, { "epoch": 1.4824619457313037, "grad_norm": 0.36285898089408875, "learning_rate": 4.524564183835183e-05, "loss": 0.9816, "step": 560 }, { "epoch": 1.5354070152217076, "grad_norm": 0.37838441133499146, "learning_rate": 4.36608557844691e-05, "loss": 0.9807, "step": 580 }, { "epoch": 1.588352084712111, "grad_norm": 0.3449678421020508, "learning_rate": 4.207606973058637e-05, "loss": 0.982, "step": 600 }, { "epoch": 1.641297154202515, "grad_norm": 0.3467804789543152, "learning_rate": 4.0491283676703644e-05, "loss": 0.9553, "step": 620 }, { "epoch": 1.6942422236929184, "grad_norm": 0.3551880419254303, "learning_rate": 3.8906497622820917e-05, "loss": 0.9701, "step": 640 }, { "epoch": 1.7471872931833223, "grad_norm": 0.3425547182559967, "learning_rate": 3.7321711568938196e-05, "loss": 0.973, "step": 660 }, { "epoch": 1.800132362673726, "grad_norm": 0.32189810276031494, "learning_rate": 3.573692551505547e-05, "loss": 0.9599, "step": 680 }, { "epoch": 1.8530774321641297, "grad_norm": 0.34214696288108826, "learning_rate": 3.415213946117274e-05, "loss": 0.952, "step": 700 }, { "epoch": 1.9060225016545336, "grad_norm": 0.33412784337997437, "learning_rate": 3.256735340729002e-05, "loss": 0.9453, "step": 720 }, { "epoch": 1.958967571144937, "grad_norm": 0.33273905515670776, "learning_rate": 3.098256735340729e-05, "loss": 0.9387, "step": 740 }, { "epoch": 2.011912640635341, "grad_norm": 0.32698702812194824, "learning_rate": 2.939778129952457e-05, "loss": 0.9456, "step": 760 }, { "epoch": 2.0648577101257444, "grad_norm": 0.36428529024124146, "learning_rate": 2.7812995245641837e-05, "loss": 0.9445, "step": 780 }, { "epoch": 2.1178027796161483, "grad_norm": 0.3233266770839691, "learning_rate": 2.6228209191759113e-05, "loss": 0.9185, "step": 800 }, { "epoch": 2.1707478491065517, "grad_norm": 0.3173067569732666, "learning_rate": 2.4643423137876386e-05, "loss": 0.9146, "step": 820 }, { "epoch": 2.2236929185969556, "grad_norm": 0.33917301893234253, "learning_rate": 2.305863708399366e-05, "loss": 0.9195, "step": 840 }, { "epoch": 2.2766379880873595, "grad_norm": 0.3438282608985901, "learning_rate": 2.1473851030110938e-05, "loss": 0.9356, "step": 860 }, { "epoch": 2.329583057577763, "grad_norm": 0.33590319752693176, "learning_rate": 1.988906497622821e-05, "loss": 0.9319, "step": 880 }, { "epoch": 2.382528127068167, "grad_norm": 0.5414553880691528, "learning_rate": 1.8304278922345483e-05, "loss": 0.9208, "step": 900 }, { "epoch": 2.4354731965585703, "grad_norm": 0.34509792923927307, "learning_rate": 1.671949286846276e-05, "loss": 0.9026, "step": 920 }, { "epoch": 2.488418266048974, "grad_norm": 0.30984020233154297, "learning_rate": 1.5134706814580033e-05, "loss": 0.9066, "step": 940 }, { "epoch": 2.541363335539378, "grad_norm": 0.31895536184310913, "learning_rate": 1.3549920760697307e-05, "loss": 0.9275, "step": 960 }, { "epoch": 2.5943084050297816, "grad_norm": 0.3005692660808563, "learning_rate": 1.1965134706814581e-05, "loss": 0.9241, "step": 980 }, { "epoch": 2.6472534745201854, "grad_norm": 0.325959712266922, "learning_rate": 1.0380348652931855e-05, "loss": 0.9181, "step": 1000 }, { "epoch": 2.6472534745201854, "eval_loss": 0.8340622782707214, "eval_runtime": 74.3137, "eval_samples_per_second": 10.16, "eval_steps_per_second": 2.543, "step": 1000 }, { "epoch": 2.700198544010589, "grad_norm": 0.37024152278900146, "learning_rate": 8.79556259904913e-06, "loss": 0.9156, "step": 1020 }, { "epoch": 2.753143613500993, "grad_norm": 0.3021298050880432, "learning_rate": 7.2107765451664034e-06, "loss": 0.9204, "step": 1040 }, { "epoch": 2.8060886829913967, "grad_norm": 0.35478419065475464, "learning_rate": 5.625990491283677e-06, "loss": 0.9342, "step": 1060 }, { "epoch": 2.8590337524818, "grad_norm": 0.34113916754722595, "learning_rate": 4.041204437400952e-06, "loss": 0.9187, "step": 1080 }, { "epoch": 2.9119788219722036, "grad_norm": 0.33516696095466614, "learning_rate": 2.456418383518225e-06, "loss": 0.9238, "step": 1100 }, { "epoch": 2.9649238914626075, "grad_norm": 0.2989369034767151, "learning_rate": 8.716323296354993e-07, "loss": 0.9188, "step": 1120 } ], "logging_steps": 20, "max_steps": 1131, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.101970198757376e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }