{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.961832061068702, "eval_steps": 500, "global_step": 325, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.015267175572519083, "grad_norm": 183.11753845214844, "learning_rate": 6.060606060606061e-06, "loss": 46.1063, "step": 1 }, { "epoch": 0.07633587786259542, "grad_norm": 136.03738403320312, "learning_rate": 3.0303030303030306e-05, "loss": 44.0302, "step": 5 }, { "epoch": 0.15267175572519084, "grad_norm": 69.2432632446289, "learning_rate": 6.060606060606061e-05, "loss": 38.4659, "step": 10 }, { "epoch": 0.22900763358778625, "grad_norm": 17.486797332763672, "learning_rate": 9.090909090909092e-05, "loss": 30.3029, "step": 15 }, { "epoch": 0.3053435114503817, "grad_norm": 13.530756950378418, "learning_rate": 0.00012121212121212122, "loss": 26.6709, "step": 20 }, { "epoch": 0.3816793893129771, "grad_norm": 7.521498680114746, "learning_rate": 0.00015151515151515152, "loss": 24.4319, "step": 25 }, { "epoch": 0.4580152671755725, "grad_norm": 5.912084102630615, "learning_rate": 0.00018181818181818183, "loss": 22.862, "step": 30 }, { "epoch": 0.5343511450381679, "grad_norm": 10.610209465026855, "learning_rate": 0.00019997685019798912, "loss": 21.5999, "step": 35 }, { "epoch": 0.6106870229007634, "grad_norm": 20.944725036621094, "learning_rate": 0.0001997165380022878, "loss": 19.4719, "step": 40 }, { "epoch": 0.6870229007633588, "grad_norm": 34.12383270263672, "learning_rate": 0.000199167731989929, "loss": 14.6832, "step": 45 }, { "epoch": 0.7633587786259542, "grad_norm": 42.86738204956055, "learning_rate": 0.0001983320199330545, "loss": 8.7569, "step": 50 }, { "epoch": 0.8396946564885496, "grad_norm": 12.474686622619629, "learning_rate": 0.00019721181966290613, "loss": 4.3457, "step": 55 }, { "epoch": 0.916030534351145, "grad_norm": 9.623456954956055, "learning_rate": 0.00019581037207470382, "loss": 3.4309, "step": 60 }, { "epoch": 0.9923664122137404, "grad_norm": 3.5216312408447266, "learning_rate": 0.00019413173175128473, "loss": 2.9056, "step": 65 }, { "epoch": 0.9923664122137404, "eval_loss": 2.611328125, "eval_runtime": 19.2134, "eval_samples_per_second": 47.935, "eval_steps_per_second": 0.781, "step": 65 }, { "epoch": 1.0687022900763359, "grad_norm": 2.9582359790802, "learning_rate": 0.00019218075523263104, "loss": 2.7809, "step": 70 }, { "epoch": 1.1450381679389312, "grad_norm": 2.319239616394043, "learning_rate": 0.00018996308696522433, "loss": 2.3224, "step": 75 }, { "epoch": 1.2213740458015268, "grad_norm": 1.3839267492294312, "learning_rate": 0.00018748514297187648, "loss": 2.2039, "step": 80 }, { "epoch": 1.297709923664122, "grad_norm": 0.5840837955474854, "learning_rate": 0.00018475409228928312, "loss": 2.1174, "step": 85 }, { "epoch": 1.3740458015267176, "grad_norm": 1.5493711233139038, "learning_rate": 0.00018177783622700327, "loss": 2.0565, "step": 90 }, { "epoch": 1.450381679389313, "grad_norm": 0.7415986657142639, "learning_rate": 0.00017856498550787144, "loss": 2.003, "step": 95 }, { "epoch": 1.5267175572519083, "grad_norm": 0.6342356204986572, "learning_rate": 0.00017512483535597867, "loss": 1.9686, "step": 100 }, { "epoch": 1.6030534351145038, "grad_norm": 1.0893248319625854, "learning_rate": 0.00017146733860429612, "loss": 1.9499, "step": 105 }, { "epoch": 1.6793893129770994, "grad_norm": 1.233128547668457, "learning_rate": 0.0001676030768997445, "loss": 1.9192, "step": 110 }, { "epoch": 1.7557251908396947, "grad_norm": 0.7829602360725403, "learning_rate": 0.00016354323008901776, "loss": 1.8934, "step": 115 }, { "epoch": 1.83206106870229, "grad_norm": 1.0393383502960205, "learning_rate": 0.00015929954387373103, "loss": 1.8579, "step": 120 }, { "epoch": 1.9083969465648853, "grad_norm": 2.433302879333496, "learning_rate": 0.00015488429582847192, "loss": 1.8576, "step": 125 }, { "epoch": 1.984732824427481, "grad_norm": 1.2537367343902588, "learning_rate": 0.00015031025988006936, "loss": 1.8271, "step": 130 }, { "epoch": 2.0, "eval_loss": 1.8229883909225464, "eval_runtime": 19.0953, "eval_samples_per_second": 48.232, "eval_steps_per_second": 0.786, "step": 131 }, { "epoch": 2.0610687022900764, "grad_norm": 1.04417085647583, "learning_rate": 0.00014559066935084588, "loss": 1.975, "step": 135 }, { "epoch": 2.1374045801526718, "grad_norm": 0.9754623174667358, "learning_rate": 0.00014073917867277557, "loss": 1.7901, "step": 140 }, { "epoch": 2.213740458015267, "grad_norm": 0.6031882762908936, "learning_rate": 0.0001357698238833126, "loss": 1.7584, "step": 145 }, { "epoch": 2.2900763358778624, "grad_norm": 1.7654844522476196, "learning_rate": 0.000130696982017182, "loss": 1.7665, "step": 150 }, { "epoch": 2.366412213740458, "grad_norm": 1.8184305429458618, "learning_rate": 0.0001255353295116187, "loss": 1.7496, "step": 155 }, { "epoch": 2.4427480916030535, "grad_norm": 2.4291305541992188, "learning_rate": 0.00012029979974539234, "loss": 1.7389, "step": 160 }, { "epoch": 2.519083969465649, "grad_norm": 0.7844381928443909, "learning_rate": 0.00011500553983446527, "loss": 1.7327, "step": 165 }, { "epoch": 2.595419847328244, "grad_norm": 1.0221455097198486, "learning_rate": 0.00010966786680927874, "loss": 1.7365, "step": 170 }, { "epoch": 2.67175572519084, "grad_norm": 1.1956524848937988, "learning_rate": 0.00010430222330045304, "loss": 1.7204, "step": 175 }, { "epoch": 2.7480916030534353, "grad_norm": 0.7325518131256104, "learning_rate": 9.892413286110886e-05, "loss": 1.7177, "step": 180 }, { "epoch": 2.8244274809160306, "grad_norm": 0.8538561463356018, "learning_rate": 9.354915505506839e-05, "loss": 1.7193, "step": 185 }, { "epoch": 2.900763358778626, "grad_norm": 1.252325415611267, "learning_rate": 8.81928404408726e-05, "loss": 1.7058, "step": 190 }, { "epoch": 2.9770992366412212, "grad_norm": 0.7734937071800232, "learning_rate": 8.287068558185225e-05, "loss": 1.7019, "step": 195 }, { "epoch": 2.9923664122137406, "eval_loss": 1.7041354179382324, "eval_runtime": 19.3108, "eval_samples_per_second": 47.694, "eval_steps_per_second": 0.777, "step": 196 }, { "epoch": 3.053435114503817, "grad_norm": 0.6631619334220886, "learning_rate": 7.759808821241406e-05, "loss": 1.8697, "step": 200 }, { "epoch": 3.1297709923664123, "grad_norm": 0.7187236547470093, "learning_rate": 7.239030269025311e-05, "loss": 1.7181, "step": 205 }, { "epoch": 3.2061068702290076, "grad_norm": 0.5320985913276672, "learning_rate": 6.726239586337408e-05, "loss": 1.7351, "step": 210 }, { "epoch": 3.282442748091603, "grad_norm": 0.43638336658477783, "learning_rate": 6.22292034796035e-05, "loss": 1.7156, "step": 215 }, { "epoch": 3.3587786259541983, "grad_norm": 0.3966742753982544, "learning_rate": 5.730528726470792e-05, "loss": 1.7158, "step": 220 }, { "epoch": 3.435114503816794, "grad_norm": 0.326159805059433, "learning_rate": 5.2504892793295e-05, "loss": 1.7055, "step": 225 }, { "epoch": 3.5114503816793894, "grad_norm": 0.4766685664653778, "learning_rate": 4.7841908274384616e-05, "loss": 1.7006, "step": 230 }, { "epoch": 3.5877862595419847, "grad_norm": 0.41363418102264404, "learning_rate": 4.332982437088825e-05, "loss": 1.7106, "step": 235 }, { "epoch": 3.66412213740458, "grad_norm": 0.5006980299949646, "learning_rate": 3.898169516924398e-05, "loss": 1.6938, "step": 240 }, { "epoch": 3.7404580152671754, "grad_norm": 0.4720315933227539, "learning_rate": 3.4810100412128747e-05, "loss": 1.6886, "step": 245 }, { "epoch": 3.816793893129771, "grad_norm": 0.5057269334793091, "learning_rate": 3.0827109103512643e-05, "loss": 1.6912, "step": 250 }, { "epoch": 3.8931297709923665, "grad_norm": 0.38378995656967163, "learning_rate": 2.7044244591351232e-05, "loss": 1.7001, "step": 255 }, { "epoch": 3.969465648854962, "grad_norm": 0.3008043169975281, "learning_rate": 2.3472451228937253e-05, "loss": 1.7024, "step": 260 }, { "epoch": 4.0, "eval_loss": 1.6962379217147827, "eval_runtime": 18.9852, "eval_samples_per_second": 48.512, "eval_steps_per_second": 0.79, "step": 262 }, { "epoch": 4.0458015267175576, "grad_norm": 0.9348434805870056, "learning_rate": 2.0122062711363532e-05, "loss": 1.8574, "step": 265 }, { "epoch": 4.122137404580153, "grad_norm": 0.7455368638038635, "learning_rate": 1.7002772178705716e-05, "loss": 1.6594, "step": 270 }, { "epoch": 4.198473282442748, "grad_norm": 0.5774383544921875, "learning_rate": 1.4123604172419713e-05, "loss": 1.6527, "step": 275 }, { "epoch": 4.2748091603053435, "grad_norm": 0.5370898842811584, "learning_rate": 1.149288852608743e-05, "loss": 1.6587, "step": 280 }, { "epoch": 4.351145038167939, "grad_norm": 0.7321135997772217, "learning_rate": 9.118236266049707e-06, "loss": 1.6676, "step": 285 }, { "epoch": 4.427480916030534, "grad_norm": 0.5155964493751526, "learning_rate": 7.0065175916482095e-06, "loss": 1.6579, "step": 290 }, { "epoch": 4.5038167938931295, "grad_norm": 0.6737932562828064, "learning_rate": 5.163841998782837e-06, "loss": 1.6508, "step": 295 }, { "epoch": 4.580152671755725, "grad_norm": 0.9017395377159119, "learning_rate": 3.595540604290437e-06, "loss": 1.6375, "step": 300 }, { "epoch": 4.65648854961832, "grad_norm": 0.5460083484649658, "learning_rate": 2.30615072228183e-06, "loss": 1.6522, "step": 305 }, { "epoch": 4.732824427480916, "grad_norm": 0.5443113446235657, "learning_rate": 1.2994027370611173e-06, "loss": 1.648, "step": 310 }, { "epoch": 4.809160305343512, "grad_norm": 0.6177972555160522, "learning_rate": 5.782093106048159e-07, "loss": 1.6559, "step": 315 }, { "epoch": 4.885496183206107, "grad_norm": 0.4734289050102234, "learning_rate": 1.446569558255395e-07, "loss": 1.6443, "step": 320 }, { "epoch": 4.961832061068702, "grad_norm": 0.6619871854782104, "learning_rate": 0.0, "loss": 1.6463, "step": 325 }, { "epoch": 4.961832061068702, "eval_loss": 1.664337158203125, "eval_runtime": 18.9808, "eval_samples_per_second": 48.523, "eval_steps_per_second": 0.79, "step": 325 }, { "epoch": 4.961832061068702, "step": 325, "total_flos": 9.909828121379471e+17, "train_loss": 5.476599056537335, "train_runtime": 4095.1846, "train_samples_per_second": 10.222, "train_steps_per_second": 0.079 } ], "logging_steps": 5, "max_steps": 325, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9.909828121379471e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }