{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.935672514619883, "eval_steps": 25, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00935672514619883, "grad_norm": 4.463105201721191, "learning_rate": 2e-05, "loss": 3.2142, "step": 1 }, { "epoch": 0.00935672514619883, "eval_loss": 3.3358547687530518, "eval_runtime": 13.9023, "eval_samples_per_second": 6.474, "eval_steps_per_second": 6.474, "step": 1 }, { "epoch": 0.01871345029239766, "grad_norm": 4.7624897956848145, "learning_rate": 4e-05, "loss": 3.3766, "step": 2 }, { "epoch": 0.028070175438596492, "grad_norm": 3.991523265838623, "learning_rate": 6e-05, "loss": 3.3922, "step": 3 }, { "epoch": 0.03742690058479532, "grad_norm": 3.8783044815063477, "learning_rate": 8e-05, "loss": 3.0767, "step": 4 }, { "epoch": 0.04678362573099415, "grad_norm": 4.1902995109558105, "learning_rate": 0.0001, "loss": 3.2965, "step": 5 }, { "epoch": 0.056140350877192984, "grad_norm": 4.054313659667969, "learning_rate": 0.00012, "loss": 3.0089, "step": 6 }, { "epoch": 0.06549707602339182, "grad_norm": 4.280858993530273, "learning_rate": 0.00014, "loss": 3.0863, "step": 7 }, { "epoch": 0.07485380116959064, "grad_norm": 3.999479293823242, "learning_rate": 0.00016, "loss": 2.9925, "step": 8 }, { "epoch": 0.08421052631578947, "grad_norm": 3.4866421222686768, "learning_rate": 0.00018, "loss": 2.941, "step": 9 }, { "epoch": 0.0935672514619883, "grad_norm": 3.900364398956299, "learning_rate": 0.0002, "loss": 2.7885, "step": 10 }, { "epoch": 0.10292397660818714, "grad_norm": 3.4323623180389404, "learning_rate": 0.0001999390827019096, "loss": 2.8013, "step": 11 }, { "epoch": 0.11228070175438597, "grad_norm": 3.4795432090759277, "learning_rate": 0.00019975640502598244, "loss": 2.6806, "step": 12 }, { "epoch": 0.1216374269005848, "grad_norm": 3.2568836212158203, "learning_rate": 0.00019945218953682734, "loss": 2.6366, "step": 13 }, { "epoch": 0.13099415204678364, "grad_norm": 2.9930436611175537, "learning_rate": 0.00019902680687415705, "loss": 2.7908, "step": 14 }, { "epoch": 0.14035087719298245, "grad_norm": 2.845224380493164, "learning_rate": 0.00019848077530122083, "loss": 2.5384, "step": 15 }, { "epoch": 0.1497076023391813, "grad_norm": 2.690312385559082, "learning_rate": 0.00019781476007338058, "loss": 2.7595, "step": 16 }, { "epoch": 0.15906432748538013, "grad_norm": 3.0404510498046875, "learning_rate": 0.00019702957262759965, "loss": 2.7844, "step": 17 }, { "epoch": 0.16842105263157894, "grad_norm": 2.836411952972412, "learning_rate": 0.0001961261695938319, "loss": 2.4923, "step": 18 }, { "epoch": 0.17777777777777778, "grad_norm": 3.020054578781128, "learning_rate": 0.00019510565162951537, "loss": 2.6076, "step": 19 }, { "epoch": 0.1871345029239766, "grad_norm": 2.719973087310791, "learning_rate": 0.00019396926207859084, "loss": 2.6418, "step": 20 }, { "epoch": 0.19649122807017544, "grad_norm": 3.120027780532837, "learning_rate": 0.00019271838545667876, "loss": 2.7253, "step": 21 }, { "epoch": 0.20584795321637428, "grad_norm": 2.7467753887176514, "learning_rate": 0.0001913545457642601, "loss": 2.4353, "step": 22 }, { "epoch": 0.2152046783625731, "grad_norm": 3.3954317569732666, "learning_rate": 0.0001898794046299167, "loss": 2.6932, "step": 23 }, { "epoch": 0.22456140350877193, "grad_norm": 2.6571295261383057, "learning_rate": 0.00018829475928589271, "loss": 2.5214, "step": 24 }, { "epoch": 0.23391812865497075, "grad_norm": 2.4160666465759277, "learning_rate": 0.00018660254037844388, "loss": 2.6052, "step": 25 }, { "epoch": 0.23391812865497075, "eval_loss": 2.5258662700653076, "eval_runtime": 13.9413, "eval_samples_per_second": 6.456, "eval_steps_per_second": 6.456, "step": 25 }, { "epoch": 0.2432748538011696, "grad_norm": 2.6495888233184814, "learning_rate": 0.0001848048096156426, "loss": 2.5469, "step": 26 }, { "epoch": 0.25263157894736843, "grad_norm": 2.838324785232544, "learning_rate": 0.00018290375725550417, "loss": 2.5969, "step": 27 }, { "epoch": 0.26198830409356727, "grad_norm": 2.747297525405884, "learning_rate": 0.00018090169943749476, "loss": 2.4122, "step": 28 }, { "epoch": 0.27134502923976606, "grad_norm": 2.5374674797058105, "learning_rate": 0.00017880107536067218, "loss": 2.2964, "step": 29 }, { "epoch": 0.2807017543859649, "grad_norm": 2.8469207286834717, "learning_rate": 0.0001766044443118978, "loss": 2.6656, "step": 30 }, { "epoch": 0.29005847953216374, "grad_norm": 2.8864634037017822, "learning_rate": 0.00017431448254773944, "loss": 2.2717, "step": 31 }, { "epoch": 0.2994152046783626, "grad_norm": 2.550114393234253, "learning_rate": 0.0001719339800338651, "loss": 2.3912, "step": 32 }, { "epoch": 0.3087719298245614, "grad_norm": 2.7608063220977783, "learning_rate": 0.00016946583704589973, "loss": 2.3453, "step": 33 }, { "epoch": 0.31812865497076026, "grad_norm": 2.7784292697906494, "learning_rate": 0.00016691306063588583, "loss": 2.495, "step": 34 }, { "epoch": 0.32748538011695905, "grad_norm": 2.7670254707336426, "learning_rate": 0.00016427876096865394, "loss": 2.3523, "step": 35 }, { "epoch": 0.3368421052631579, "grad_norm": 2.92022442817688, "learning_rate": 0.0001615661475325658, "loss": 2.5027, "step": 36 }, { "epoch": 0.34619883040935673, "grad_norm": 2.6825602054595947, "learning_rate": 0.00015877852522924732, "loss": 2.5576, "step": 37 }, { "epoch": 0.35555555555555557, "grad_norm": 2.5809407234191895, "learning_rate": 0.0001559192903470747, "loss": 2.4248, "step": 38 }, { "epoch": 0.3649122807017544, "grad_norm": 2.5570051670074463, "learning_rate": 0.0001529919264233205, "loss": 2.3693, "step": 39 }, { "epoch": 0.3742690058479532, "grad_norm": 2.4310109615325928, "learning_rate": 0.00015000000000000001, "loss": 2.4687, "step": 40 }, { "epoch": 0.38362573099415204, "grad_norm": 2.647545099258423, "learning_rate": 0.00014694715627858908, "loss": 2.5393, "step": 41 }, { "epoch": 0.3929824561403509, "grad_norm": 2.5504465103149414, "learning_rate": 0.00014383711467890774, "loss": 2.3839, "step": 42 }, { "epoch": 0.4023391812865497, "grad_norm": 2.3321890830993652, "learning_rate": 0.00014067366430758004, "loss": 2.5113, "step": 43 }, { "epoch": 0.41169590643274856, "grad_norm": 2.408841371536255, "learning_rate": 0.00013746065934159123, "loss": 2.3157, "step": 44 }, { "epoch": 0.42105263157894735, "grad_norm": 2.5375423431396484, "learning_rate": 0.00013420201433256689, "loss": 2.676, "step": 45 }, { "epoch": 0.4304093567251462, "grad_norm": 2.8962650299072266, "learning_rate": 0.00013090169943749476, "loss": 2.8322, "step": 46 }, { "epoch": 0.439766081871345, "grad_norm": 2.144554615020752, "learning_rate": 0.0001275637355816999, "loss": 2.2713, "step": 47 }, { "epoch": 0.44912280701754387, "grad_norm": 2.3191635608673096, "learning_rate": 0.00012419218955996676, "loss": 2.3502, "step": 48 }, { "epoch": 0.4584795321637427, "grad_norm": 2.344190835952759, "learning_rate": 0.00012079116908177593, "loss": 2.5839, "step": 49 }, { "epoch": 0.4678362573099415, "grad_norm": 2.446531295776367, "learning_rate": 0.00011736481776669306, "loss": 2.5168, "step": 50 }, { "epoch": 0.4678362573099415, "eval_loss": 2.3896028995513916, "eval_runtime": 13.9611, "eval_samples_per_second": 6.446, "eval_steps_per_second": 6.446, "step": 50 }, { "epoch": 0.47719298245614034, "grad_norm": 2.4833006858825684, "learning_rate": 0.00011391731009600654, "loss": 2.6256, "step": 51 }, { "epoch": 0.4865497076023392, "grad_norm": 2.2681424617767334, "learning_rate": 0.00011045284632676536, "loss": 2.3367, "step": 52 }, { "epoch": 0.495906432748538, "grad_norm": 2.4194841384887695, "learning_rate": 0.00010697564737441252, "loss": 2.3812, "step": 53 }, { "epoch": 0.5052631578947369, "grad_norm": 2.4216341972351074, "learning_rate": 0.00010348994967025012, "loss": 2.6421, "step": 54 }, { "epoch": 0.5146198830409356, "grad_norm": 2.5757429599761963, "learning_rate": 0.0001, "loss": 2.7692, "step": 55 }, { "epoch": 0.5239766081871345, "grad_norm": 2.352524757385254, "learning_rate": 9.651005032974994e-05, "loss": 2.1221, "step": 56 }, { "epoch": 0.5333333333333333, "grad_norm": 2.435523271560669, "learning_rate": 9.302435262558747e-05, "loss": 2.5788, "step": 57 }, { "epoch": 0.5426900584795321, "grad_norm": 2.124344825744629, "learning_rate": 8.954715367323468e-05, "loss": 2.2217, "step": 58 }, { "epoch": 0.552046783625731, "grad_norm": 2.3110225200653076, "learning_rate": 8.608268990399349e-05, "loss": 2.4934, "step": 59 }, { "epoch": 0.5614035087719298, "grad_norm": 2.773723840713501, "learning_rate": 8.263518223330697e-05, "loss": 2.8535, "step": 60 }, { "epoch": 0.5707602339181287, "grad_norm": 2.4425599575042725, "learning_rate": 7.920883091822408e-05, "loss": 2.6014, "step": 61 }, { "epoch": 0.5801169590643275, "grad_norm": 2.4114837646484375, "learning_rate": 7.580781044003324e-05, "loss": 2.2151, "step": 62 }, { "epoch": 0.5894736842105263, "grad_norm": 2.333024024963379, "learning_rate": 7.243626441830009e-05, "loss": 2.3344, "step": 63 }, { "epoch": 0.5988304093567252, "grad_norm": 2.4764840602874756, "learning_rate": 6.909830056250527e-05, "loss": 2.5599, "step": 64 }, { "epoch": 0.6081871345029239, "grad_norm": 2.4353721141815186, "learning_rate": 6.579798566743314e-05, "loss": 2.0338, "step": 65 }, { "epoch": 0.6175438596491228, "grad_norm": 2.4556148052215576, "learning_rate": 6.25393406584088e-05, "loss": 2.0903, "step": 66 }, { "epoch": 0.6269005847953216, "grad_norm": 2.566187858581543, "learning_rate": 5.9326335692419995e-05, "loss": 2.4776, "step": 67 }, { "epoch": 0.6362573099415205, "grad_norm": 3.249467611312866, "learning_rate": 5.616288532109225e-05, "loss": 2.4283, "step": 68 }, { "epoch": 0.6456140350877193, "grad_norm": 2.354640483856201, "learning_rate": 5.305284372141095e-05, "loss": 2.2281, "step": 69 }, { "epoch": 0.6549707602339181, "grad_norm": 2.517160177230835, "learning_rate": 5.000000000000002e-05, "loss": 2.2233, "step": 70 }, { "epoch": 0.664327485380117, "grad_norm": 2.740892171859741, "learning_rate": 4.700807357667952e-05, "loss": 2.4251, "step": 71 }, { "epoch": 0.6736842105263158, "grad_norm": 2.4594883918762207, "learning_rate": 4.4080709652925336e-05, "loss": 2.5892, "step": 72 }, { "epoch": 0.6830409356725147, "grad_norm": 2.5558207035064697, "learning_rate": 4.12214747707527e-05, "loss": 2.5979, "step": 73 }, { "epoch": 0.6923976608187135, "grad_norm": 2.260289192199707, "learning_rate": 3.843385246743417e-05, "loss": 2.3547, "step": 74 }, { "epoch": 0.7017543859649122, "grad_norm": 2.2581794261932373, "learning_rate": 3.5721239031346066e-05, "loss": 2.3779, "step": 75 }, { "epoch": 0.7017543859649122, "eval_loss": 2.32470965385437, "eval_runtime": 14.0307, "eval_samples_per_second": 6.414, "eval_steps_per_second": 6.414, "step": 75 }, { "epoch": 0.7111111111111111, "grad_norm": 2.6566221714019775, "learning_rate": 3.308693936411421e-05, "loss": 2.3354, "step": 76 }, { "epoch": 0.7204678362573099, "grad_norm": 2.7329909801483154, "learning_rate": 3.053416295410026e-05, "loss": 2.3228, "step": 77 }, { "epoch": 0.7298245614035088, "grad_norm": 2.649616241455078, "learning_rate": 2.8066019966134904e-05, "loss": 2.2475, "step": 78 }, { "epoch": 0.7391812865497076, "grad_norm": 2.667179584503174, "learning_rate": 2.5685517452260567e-05, "loss": 2.6119, "step": 79 }, { "epoch": 0.7485380116959064, "grad_norm": 2.4291374683380127, "learning_rate": 2.339555568810221e-05, "loss": 2.3532, "step": 80 }, { "epoch": 0.7578947368421053, "grad_norm": 2.347785472869873, "learning_rate": 2.119892463932781e-05, "loss": 2.5149, "step": 81 }, { "epoch": 0.7672514619883041, "grad_norm": 2.5152335166931152, "learning_rate": 1.9098300562505266e-05, "loss": 2.4291, "step": 82 }, { "epoch": 0.776608187134503, "grad_norm": 2.509918212890625, "learning_rate": 1.7096242744495837e-05, "loss": 2.2923, "step": 83 }, { "epoch": 0.7859649122807018, "grad_norm": 2.535485029220581, "learning_rate": 1.5195190384357404e-05, "loss": 2.2462, "step": 84 }, { "epoch": 0.7953216374269005, "grad_norm": 2.524515151977539, "learning_rate": 1.339745962155613e-05, "loss": 2.3981, "step": 85 }, { "epoch": 0.8046783625730994, "grad_norm": 2.53387713432312, "learning_rate": 1.1705240714107302e-05, "loss": 2.3842, "step": 86 }, { "epoch": 0.8140350877192982, "grad_norm": 2.3665547370910645, "learning_rate": 1.0120595370083318e-05, "loss": 2.0239, "step": 87 }, { "epoch": 0.8233918128654971, "grad_norm": 2.359243631362915, "learning_rate": 8.645454235739903e-06, "loss": 2.3947, "step": 88 }, { "epoch": 0.8327485380116959, "grad_norm": 2.518428325653076, "learning_rate": 7.281614543321269e-06, "loss": 2.1668, "step": 89 }, { "epoch": 0.8421052631578947, "grad_norm": 2.3977906703948975, "learning_rate": 6.030737921409169e-06, "loss": 2.2668, "step": 90 }, { "epoch": 0.8514619883040936, "grad_norm": 2.6835556030273438, "learning_rate": 4.8943483704846475e-06, "loss": 2.255, "step": 91 }, { "epoch": 0.8608187134502924, "grad_norm": 2.503506898880005, "learning_rate": 3.873830406168111e-06, "loss": 2.2606, "step": 92 }, { "epoch": 0.8701754385964913, "grad_norm": 2.585240125656128, "learning_rate": 2.970427372400353e-06, "loss": 2.1736, "step": 93 }, { "epoch": 0.87953216374269, "grad_norm": 2.5094244480133057, "learning_rate": 2.1852399266194314e-06, "loss": 2.3768, "step": 94 }, { "epoch": 0.8888888888888888, "grad_norm": 2.3380393981933594, "learning_rate": 1.5192246987791981e-06, "loss": 2.4361, "step": 95 }, { "epoch": 0.8982456140350877, "grad_norm": 2.5652716159820557, "learning_rate": 9.731931258429638e-07, "loss": 2.1897, "step": 96 }, { "epoch": 0.9076023391812865, "grad_norm": 2.379408836364746, "learning_rate": 5.478104631726711e-07, "loss": 2.397, "step": 97 }, { "epoch": 0.9169590643274854, "grad_norm": 2.5933423042297363, "learning_rate": 2.4359497401758024e-07, "loss": 2.5667, "step": 98 }, { "epoch": 0.9263157894736842, "grad_norm": 2.5865583419799805, "learning_rate": 6.09172980904238e-08, "loss": 2.4204, "step": 99 }, { "epoch": 0.935672514619883, "grad_norm": 2.6614973545074463, "learning_rate": 0.0, "loss": 2.624, "step": 100 }, { "epoch": 0.935672514619883, "eval_loss": 2.30788254737854, "eval_runtime": 13.8631, "eval_samples_per_second": 6.492, "eval_steps_per_second": 6.492, "step": 100 } ], "logging_steps": 1, "max_steps": 100, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.738765926137856e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }