{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.009048545446319504, "eval_steps": 9, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 9.048545446319505e-05, "eval_loss": 2.0614755153656006, "eval_runtime": 1431.9492, "eval_samples_per_second": 12.998, "eval_steps_per_second": 1.625, "step": 1 }, { "epoch": 0.00027145636338958513, "grad_norm": 3.0326344966888428, "learning_rate": 1.5e-05, "loss": 1.1455, "step": 3 }, { "epoch": 0.0005429127267791703, "grad_norm": 2.6346302032470703, "learning_rate": 3e-05, "loss": 1.2541, "step": 6 }, { "epoch": 0.0008143690901687554, "grad_norm": 2.24157977104187, "learning_rate": 4.5e-05, "loss": 1.1746, "step": 9 }, { "epoch": 0.0008143690901687554, "eval_loss": 1.376136302947998, "eval_runtime": 1439.157, "eval_samples_per_second": 12.933, "eval_steps_per_second": 1.617, "step": 9 }, { "epoch": 0.0010858254535583405, "grad_norm": 2.4520862102508545, "learning_rate": 4.993910125649561e-05, "loss": 0.9936, "step": 12 }, { "epoch": 0.0013572818169479257, "grad_norm": 2.1917343139648438, "learning_rate": 4.962019382530521e-05, "loss": 0.7332, "step": 15 }, { "epoch": 0.0016287381803375108, "grad_norm": 2.0286247730255127, "learning_rate": 4.9031542398457974e-05, "loss": 0.6707, "step": 18 }, { "epoch": 0.0016287381803375108, "eval_loss": 0.708565354347229, "eval_runtime": 1438.9184, "eval_samples_per_second": 12.935, "eval_steps_per_second": 1.617, "step": 18 }, { "epoch": 0.001900194543727096, "grad_norm": 2.2299928665161133, "learning_rate": 4.817959636416969e-05, "loss": 0.5779, "step": 21 }, { "epoch": 0.002171650907116681, "grad_norm": 3.2380049228668213, "learning_rate": 4.707368982147318e-05, "loss": 0.6919, "step": 24 }, { "epoch": 0.002443107270506266, "grad_norm": 2.030574321746826, "learning_rate": 4.572593931387604e-05, "loss": 0.4923, "step": 27 }, { "epoch": 0.002443107270506266, "eval_loss": 0.5967397689819336, "eval_runtime": 1439.4951, "eval_samples_per_second": 12.93, "eval_steps_per_second": 1.617, "step": 27 }, { "epoch": 0.0027145636338958513, "grad_norm": 3.188636541366577, "learning_rate": 4.415111107797445e-05, "loss": 0.4312, "step": 30 }, { "epoch": 0.0029860199972854364, "grad_norm": 2.3065080642700195, "learning_rate": 4.2366459261474933e-05, "loss": 0.6207, "step": 33 }, { "epoch": 0.0032574763606750216, "grad_norm": 1.9325222969055176, "learning_rate": 4.039153688314145e-05, "loss": 0.5588, "step": 36 }, { "epoch": 0.0032574763606750216, "eval_loss": 0.5455918908119202, "eval_runtime": 1439.2701, "eval_samples_per_second": 12.932, "eval_steps_per_second": 1.617, "step": 36 }, { "epoch": 0.0035289327240646067, "grad_norm": 2.0116512775421143, "learning_rate": 3.824798160583012e-05, "loss": 0.5004, "step": 39 }, { "epoch": 0.003800389087454192, "grad_norm": 2.0821337699890137, "learning_rate": 3.5959278669726935e-05, "loss": 0.4352, "step": 42 }, { "epoch": 0.0040718454508437765, "grad_norm": 1.7970950603485107, "learning_rate": 3.355050358314172e-05, "loss": 0.533, "step": 45 }, { "epoch": 0.0040718454508437765, "eval_loss": 0.5220184326171875, "eval_runtime": 1438.7352, "eval_samples_per_second": 12.937, "eval_steps_per_second": 1.617, "step": 45 }, { "epoch": 0.004343301814233362, "grad_norm": 2.69805645942688, "learning_rate": 3.104804738999169e-05, "loss": 0.4621, "step": 48 }, { "epoch": 0.004614758177622947, "grad_norm": 1.981372594833374, "learning_rate": 2.8479327524001636e-05, "loss": 0.6349, "step": 51 }, { "epoch": 0.004886214541012532, "grad_norm": 2.258195161819458, "learning_rate": 2.587248741756253e-05, "loss": 0.5486, "step": 54 }, { "epoch": 0.004886214541012532, "eval_loss": 0.5081658363342285, "eval_runtime": 1439.3002, "eval_samples_per_second": 12.932, "eval_steps_per_second": 1.617, "step": 54 }, { "epoch": 0.005157670904402117, "grad_norm": 1.517822504043579, "learning_rate": 2.3256088156396868e-05, "loss": 0.5922, "step": 57 }, { "epoch": 0.005429127267791703, "grad_norm": 1.7860591411590576, "learning_rate": 2.0658795558326743e-05, "loss": 0.437, "step": 60 }, { "epoch": 0.005700583631181287, "grad_norm": 4.32674503326416, "learning_rate": 1.8109066104575023e-05, "loss": 0.5061, "step": 63 }, { "epoch": 0.005700583631181287, "eval_loss": 0.4976564645767212, "eval_runtime": 1439.5584, "eval_samples_per_second": 12.93, "eval_steps_per_second": 1.616, "step": 63 }, { "epoch": 0.005972039994570873, "grad_norm": 1.9437718391418457, "learning_rate": 1.56348351646022e-05, "loss": 0.5445, "step": 66 }, { "epoch": 0.006243496357960458, "grad_norm": 1.3337701559066772, "learning_rate": 1.3263210930352737e-05, "loss": 0.4558, "step": 69 }, { "epoch": 0.006514952721350043, "grad_norm": 2.283390998840332, "learning_rate": 1.1020177413231334e-05, "loss": 0.5451, "step": 72 }, { "epoch": 0.006514952721350043, "eval_loss": 0.4924093782901764, "eval_runtime": 1439.0551, "eval_samples_per_second": 12.934, "eval_steps_per_second": 1.617, "step": 72 }, { "epoch": 0.006786409084739628, "grad_norm": 2.103365421295166, "learning_rate": 8.930309757836517e-06, "loss": 0.5615, "step": 75 }, { "epoch": 0.007057865448129213, "grad_norm": 1.8247758150100708, "learning_rate": 7.016504991533726e-06, "loss": 0.4278, "step": 78 }, { "epoch": 0.007329321811518798, "grad_norm": 2.0524744987487793, "learning_rate": 5.299731159831953e-06, "loss": 0.5058, "step": 81 }, { "epoch": 0.007329321811518798, "eval_loss": 0.48786625266075134, "eval_runtime": 1439.1036, "eval_samples_per_second": 12.934, "eval_steps_per_second": 1.617, "step": 81 }, { "epoch": 0.007600778174908384, "grad_norm": 2.635622501373291, "learning_rate": 3.798797596089351e-06, "loss": 0.5012, "step": 84 }, { "epoch": 0.00787223453829797, "grad_norm": 1.4843945503234863, "learning_rate": 2.5301488425208296e-06, "loss": 0.4902, "step": 87 }, { "epoch": 0.008143690901687553, "grad_norm": 1.9264837503433228, "learning_rate": 1.5076844803522922e-06, "loss": 0.4289, "step": 90 }, { "epoch": 0.008143690901687553, "eval_loss": 0.48586252331733704, "eval_runtime": 1438.3431, "eval_samples_per_second": 12.941, "eval_steps_per_second": 1.618, "step": 90 }, { "epoch": 0.008415147265077139, "grad_norm": 2.084927797317505, "learning_rate": 7.426068431000882e-07, "loss": 0.5326, "step": 93 }, { "epoch": 0.008686603628466724, "grad_norm": 1.7900018692016602, "learning_rate": 2.4329828146074095e-07, "loss": 0.4558, "step": 96 }, { "epoch": 0.00895805999185631, "grad_norm": 1.933836817741394, "learning_rate": 1.522932452260595e-08, "loss": 0.5505, "step": 99 }, { "epoch": 0.00895805999185631, "eval_loss": 0.4853900074958801, "eval_runtime": 1438.1766, "eval_samples_per_second": 12.942, "eval_steps_per_second": 1.618, "step": 99 } ], "logging_steps": 3, "max_steps": 100, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 9, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.44418812198912e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }