{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.5805515239477503, "eval_steps": 9, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005805515239477504, "eval_loss": 1.2620487213134766, "eval_runtime": 8.1244, "eval_samples_per_second": 35.695, "eval_steps_per_second": 4.554, "step": 1 }, { "epoch": 0.01741654571843251, "grad_norm": 0.4328065812587738, "learning_rate": 1.5e-05, "loss": 1.2349, "step": 3 }, { "epoch": 0.03483309143686502, "grad_norm": 0.46443697810173035, "learning_rate": 3e-05, "loss": 1.2858, "step": 6 }, { "epoch": 0.05224963715529753, "grad_norm": 0.44606202840805054, "learning_rate": 4.5e-05, "loss": 1.2235, "step": 9 }, { "epoch": 0.05224963715529753, "eval_loss": 1.2377574443817139, "eval_runtime": 8.1496, "eval_samples_per_second": 35.584, "eval_steps_per_second": 4.54, "step": 9 }, { "epoch": 0.06966618287373004, "grad_norm": 0.5522386431694031, "learning_rate": 4.993910125649561e-05, "loss": 1.2714, "step": 12 }, { "epoch": 0.08708272859216255, "grad_norm": 0.5637324452400208, "learning_rate": 4.962019382530521e-05, "loss": 1.1623, "step": 15 }, { "epoch": 0.10449927431059507, "grad_norm": 0.5763392448425293, "learning_rate": 4.9031542398457974e-05, "loss": 1.0162, "step": 18 }, { "epoch": 0.10449927431059507, "eval_loss": 1.0046803951263428, "eval_runtime": 8.16, "eval_samples_per_second": 35.539, "eval_steps_per_second": 4.534, "step": 18 }, { "epoch": 0.12191582002902758, "grad_norm": 0.5673230886459351, "learning_rate": 4.817959636416969e-05, "loss": 0.9585, "step": 21 }, { "epoch": 0.13933236574746008, "grad_norm": 0.47465190291404724, "learning_rate": 4.707368982147318e-05, "loss": 0.8755, "step": 24 }, { "epoch": 0.1567489114658926, "grad_norm": 0.5033220052719116, "learning_rate": 4.572593931387604e-05, "loss": 0.788, "step": 27 }, { "epoch": 0.1567489114658926, "eval_loss": 0.743035078048706, "eval_runtime": 8.1945, "eval_samples_per_second": 35.389, "eval_steps_per_second": 4.515, "step": 27 }, { "epoch": 0.1741654571843251, "grad_norm": 0.4010712802410126, "learning_rate": 4.415111107797445e-05, "loss": 0.6837, "step": 30 }, { "epoch": 0.19158200290275762, "grad_norm": 0.38208135962486267, "learning_rate": 4.2366459261474933e-05, "loss": 0.6335, "step": 33 }, { "epoch": 0.20899854862119013, "grad_norm": 0.3118175268173218, "learning_rate": 4.039153688314145e-05, "loss": 0.5961, "step": 36 }, { "epoch": 0.20899854862119013, "eval_loss": 0.5937825441360474, "eval_runtime": 8.2105, "eval_samples_per_second": 35.32, "eval_steps_per_second": 4.506, "step": 36 }, { "epoch": 0.22641509433962265, "grad_norm": 0.26881372928619385, "learning_rate": 3.824798160583012e-05, "loss": 0.61, "step": 39 }, { "epoch": 0.24383164005805516, "grad_norm": 0.2959866225719452, "learning_rate": 3.5959278669726935e-05, "loss": 0.5413, "step": 42 }, { "epoch": 0.2612481857764877, "grad_norm": 0.31090644001960754, "learning_rate": 3.355050358314172e-05, "loss": 0.5359, "step": 45 }, { "epoch": 0.2612481857764877, "eval_loss": 0.5296854972839355, "eval_runtime": 8.2316, "eval_samples_per_second": 35.23, "eval_steps_per_second": 4.495, "step": 45 }, { "epoch": 0.27866473149492016, "grad_norm": 0.30757585167884827, "learning_rate": 3.104804738999169e-05, "loss": 0.4991, "step": 48 }, { "epoch": 0.2960812772133527, "grad_norm": 0.2986898720264435, "learning_rate": 2.8479327524001636e-05, "loss": 0.5406, "step": 51 }, { "epoch": 0.3134978229317852, "grad_norm": 0.29928526282310486, "learning_rate": 2.587248741756253e-05, "loss": 0.5209, "step": 54 }, { "epoch": 0.3134978229317852, "eval_loss": 0.499483197927475, "eval_runtime": 8.228, "eval_samples_per_second": 35.245, "eval_steps_per_second": 4.497, "step": 54 }, { "epoch": 0.3309143686502177, "grad_norm": 0.3250415325164795, "learning_rate": 2.3256088156396868e-05, "loss": 0.5339, "step": 57 }, { "epoch": 0.3483309143686502, "grad_norm": 0.2665368616580963, "learning_rate": 2.0658795558326743e-05, "loss": 0.4771, "step": 60 }, { "epoch": 0.36574746008708275, "grad_norm": 0.2685997784137726, "learning_rate": 1.8109066104575023e-05, "loss": 0.4493, "step": 63 }, { "epoch": 0.36574746008708275, "eval_loss": 0.48386040329933167, "eval_runtime": 8.2199, "eval_samples_per_second": 35.28, "eval_steps_per_second": 4.501, "step": 63 }, { "epoch": 0.38316400580551524, "grad_norm": 0.3184368908405304, "learning_rate": 1.56348351646022e-05, "loss": 0.4958, "step": 66 }, { "epoch": 0.4005805515239477, "grad_norm": 0.2603450417518616, "learning_rate": 1.3263210930352737e-05, "loss": 0.4831, "step": 69 }, { "epoch": 0.41799709724238027, "grad_norm": 0.3131684362888336, "learning_rate": 1.1020177413231334e-05, "loss": 0.4193, "step": 72 }, { "epoch": 0.41799709724238027, "eval_loss": 0.4739016890525818, "eval_runtime": 8.222, "eval_samples_per_second": 35.271, "eval_steps_per_second": 4.5, "step": 72 }, { "epoch": 0.43541364296081275, "grad_norm": 0.2975787818431854, "learning_rate": 8.930309757836517e-06, "loss": 0.4181, "step": 75 }, { "epoch": 0.4528301886792453, "grad_norm": 0.3197750747203827, "learning_rate": 7.016504991533726e-06, "loss": 0.4859, "step": 78 }, { "epoch": 0.4702467343976778, "grad_norm": 0.29346856474876404, "learning_rate": 5.299731159831953e-06, "loss": 0.4102, "step": 81 }, { "epoch": 0.4702467343976778, "eval_loss": 0.46982458233833313, "eval_runtime": 8.2212, "eval_samples_per_second": 35.274, "eval_steps_per_second": 4.501, "step": 81 }, { "epoch": 0.4876632801161103, "grad_norm": 0.30700182914733887, "learning_rate": 3.798797596089351e-06, "loss": 0.5111, "step": 84 }, { "epoch": 0.5050798258345428, "grad_norm": 0.3279879689216614, "learning_rate": 2.5301488425208296e-06, "loss": 0.4594, "step": 87 }, { "epoch": 0.5224963715529753, "grad_norm": 0.2903614938259125, "learning_rate": 1.5076844803522922e-06, "loss": 0.4378, "step": 90 }, { "epoch": 0.5224963715529753, "eval_loss": 0.4680546224117279, "eval_runtime": 8.2219, "eval_samples_per_second": 35.272, "eval_steps_per_second": 4.5, "step": 90 }, { "epoch": 0.5399129172714079, "grad_norm": 0.32795748114585876, "learning_rate": 7.426068431000882e-07, "loss": 0.503, "step": 93 }, { "epoch": 0.5573294629898403, "grad_norm": 0.3038257658481598, "learning_rate": 2.4329828146074095e-07, "loss": 0.4698, "step": 96 }, { "epoch": 0.5747460087082729, "grad_norm": 0.30126094818115234, "learning_rate": 1.522932452260595e-08, "loss": 0.4015, "step": 99 }, { "epoch": 0.5747460087082729, "eval_loss": 0.4675796627998352, "eval_runtime": 8.2149, "eval_samples_per_second": 35.302, "eval_steps_per_second": 4.504, "step": 99 } ], "logging_steps": 3, "max_steps": 100, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 9, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.20235311726592e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }