{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.008, "eval_steps": 500, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 8e-05, "grad_norm": 0.1861909031867981, "learning_rate": 4e-05, "loss": 0.6283, "step": 1 }, { "epoch": 0.00016, "grad_norm": 0.16733917593955994, "learning_rate": 8e-05, "loss": 0.6772, "step": 2 }, { "epoch": 0.00024, "grad_norm": 0.20496870577335358, "learning_rate": 0.00012, "loss": 0.8642, "step": 3 }, { "epoch": 0.00032, "grad_norm": 0.19442355632781982, "learning_rate": 0.00016, "loss": 0.7268, "step": 4 }, { "epoch": 0.0004, "grad_norm": 0.1718096137046814, "learning_rate": 0.0002, "loss": 0.6486, "step": 5 }, { "epoch": 0.00048, "grad_norm": 0.1942625492811203, "learning_rate": 0.00019555555555555556, "loss": 0.7806, "step": 6 }, { "epoch": 0.00056, "grad_norm": 0.17527641355991364, "learning_rate": 0.00019111111111111114, "loss": 0.5136, "step": 7 }, { "epoch": 0.00064, "grad_norm": 0.2261490821838379, "learning_rate": 0.0001866666666666667, "loss": 0.8569, "step": 8 }, { "epoch": 0.00072, "grad_norm": 0.17914293706417084, "learning_rate": 0.00018222222222222224, "loss": 0.7845, "step": 9 }, { "epoch": 0.0008, "grad_norm": 0.21953529119491577, "learning_rate": 0.00017777777777777779, "loss": 0.6735, "step": 10 }, { "epoch": 0.00088, "grad_norm": 0.1893182247877121, "learning_rate": 0.00017333333333333334, "loss": 0.8183, "step": 11 }, { "epoch": 0.00096, "grad_norm": 0.20547300577163696, "learning_rate": 0.00016888888888888889, "loss": 1.0212, "step": 12 }, { "epoch": 0.00104, "grad_norm": 0.2539304196834564, "learning_rate": 0.00016444444444444444, "loss": 0.8543, "step": 13 }, { "epoch": 0.00112, "grad_norm": 0.17561326920986176, "learning_rate": 0.00016, "loss": 0.5858, "step": 14 }, { "epoch": 0.0012, "grad_norm": 0.19194333255290985, "learning_rate": 0.00015555555555555556, "loss": 0.8022, "step": 15 }, { "epoch": 0.00128, "grad_norm": 0.22850346565246582, "learning_rate": 0.0001511111111111111, "loss": 0.5618, "step": 16 }, { "epoch": 0.00136, "grad_norm": 0.2263827621936798, "learning_rate": 0.00014666666666666666, "loss": 0.9284, "step": 17 }, { "epoch": 0.00144, "grad_norm": 0.2525479197502136, "learning_rate": 0.00014222222222222224, "loss": 0.7528, "step": 18 }, { "epoch": 0.00152, "grad_norm": 0.19500723481178284, "learning_rate": 0.0001377777777777778, "loss": 0.7256, "step": 19 }, { "epoch": 0.0016, "grad_norm": 0.24944712221622467, "learning_rate": 0.00013333333333333334, "loss": 0.8527, "step": 20 }, { "epoch": 0.00168, "grad_norm": 0.20906241238117218, "learning_rate": 0.00012888888888888892, "loss": 0.8557, "step": 21 }, { "epoch": 0.00176, "grad_norm": 0.196335569024086, "learning_rate": 0.00012444444444444444, "loss": 0.8134, "step": 22 }, { "epoch": 0.00184, "grad_norm": 0.15679122507572174, "learning_rate": 0.00012, "loss": 1.0036, "step": 23 }, { "epoch": 0.00192, "grad_norm": 0.2803596556186676, "learning_rate": 0.00011555555555555555, "loss": 0.8113, "step": 24 }, { "epoch": 0.002, "grad_norm": 0.22238129377365112, "learning_rate": 0.00011111111111111112, "loss": 0.5934, "step": 25 }, { "epoch": 0.00208, "grad_norm": 0.23300746083259583, "learning_rate": 0.00010666666666666667, "loss": 0.7823, "step": 26 }, { "epoch": 0.00216, "grad_norm": 0.2822686731815338, "learning_rate": 0.00010222222222222222, "loss": 0.7634, "step": 27 }, { "epoch": 0.00224, "grad_norm": 0.25708135962486267, "learning_rate": 9.777777777777778e-05, "loss": 
0.7359, "step": 28 }, { "epoch": 0.00232, "grad_norm": 0.2508324980735779, "learning_rate": 9.333333333333334e-05, "loss": 1.0442, "step": 29 }, { "epoch": 0.0024, "grad_norm": 0.2428746372461319, "learning_rate": 8.888888888888889e-05, "loss": 1.0047, "step": 30 }, { "epoch": 0.00248, "grad_norm": 0.2691624164581299, "learning_rate": 8.444444444444444e-05, "loss": 0.633, "step": 31 }, { "epoch": 0.00256, "grad_norm": 0.1821339726448059, "learning_rate": 8e-05, "loss": 0.5192, "step": 32 }, { "epoch": 0.00264, "grad_norm": 0.2801157534122467, "learning_rate": 7.555555555555556e-05, "loss": 0.5962, "step": 33 }, { "epoch": 0.00272, "grad_norm": 0.22572503983974457, "learning_rate": 7.111111111111112e-05, "loss": 0.5493, "step": 34 }, { "epoch": 0.0028, "grad_norm": 0.25089773535728455, "learning_rate": 6.666666666666667e-05, "loss": 0.7187, "step": 35 }, { "epoch": 0.00288, "grad_norm": 0.2668441832065582, "learning_rate": 6.222222222222222e-05, "loss": 0.9809, "step": 36 }, { "epoch": 0.00296, "grad_norm": 0.281890869140625, "learning_rate": 5.7777777777777776e-05, "loss": 0.8585, "step": 37 }, { "epoch": 0.00304, "grad_norm": 0.22123876214027405, "learning_rate": 5.333333333333333e-05, "loss": 0.6978, "step": 38 }, { "epoch": 0.00312, "grad_norm": 0.24300044775009155, "learning_rate": 4.888888888888889e-05, "loss": 0.7604, "step": 39 }, { "epoch": 0.0032, "grad_norm": 0.2412249892950058, "learning_rate": 4.4444444444444447e-05, "loss": 0.9841, "step": 40 }, { "epoch": 0.00328, "grad_norm": 0.2551029324531555, "learning_rate": 4e-05, "loss": 0.7346, "step": 41 }, { "epoch": 0.00336, "grad_norm": 0.2715751528739929, "learning_rate": 3.555555555555556e-05, "loss": 0.9846, "step": 42 }, { "epoch": 0.00344, "grad_norm": 0.22599095106124878, "learning_rate": 3.111111111111111e-05, "loss": 0.7476, "step": 43 }, { "epoch": 0.00352, "grad_norm": 0.26102009415626526, "learning_rate": 2.6666666666666667e-05, "loss": 0.7947, "step": 44 }, { "epoch": 0.0036, "grad_norm": 0.2690563499927521, "learning_rate": 2.2222222222222223e-05, "loss": 0.7454, "step": 45 }, { "epoch": 0.00368, "grad_norm": 0.2677896320819855, "learning_rate": 1.777777777777778e-05, "loss": 0.8543, "step": 46 }, { "epoch": 0.00376, "grad_norm": 0.2913207709789276, "learning_rate": 1.3333333333333333e-05, "loss": 0.771, "step": 47 }, { "epoch": 0.00384, "grad_norm": 0.2139165699481964, "learning_rate": 8.88888888888889e-06, "loss": 0.6454, "step": 48 }, { "epoch": 0.00392, "grad_norm": 0.3593102693557739, "learning_rate": 4.444444444444445e-06, "loss": 1.0135, "step": 49 }, { "epoch": 0.004, "grad_norm": 0.2176782488822937, "learning_rate": 0.0, "loss": 1.0192, "step": 50 }, { "epoch": 0.00408, "grad_norm": 0.17560550570487976, "learning_rate": 0.0, "loss": 0.4516, "step": 51 }, { "epoch": 0.00416, "grad_norm": 0.1952800750732422, "learning_rate": 0.0, "loss": 0.9039, "step": 52 }, { "epoch": 0.00424, "grad_norm": 0.9650332927703857, "learning_rate": 0.00019923169267707086, "loss": 1.2938, "step": 53 }, { "epoch": 0.00432, "grad_norm": 0.2434082180261612, "learning_rate": 0.0001992156862745098, "loss": 0.6807, "step": 54 }, { "epoch": 0.0044, "grad_norm": 0.3336096405982971, "learning_rate": 0.00019919967987194878, "loss": 1.0532, "step": 55 }, { "epoch": 0.00448, "grad_norm": 0.21415311098098755, "learning_rate": 0.00019918367346938775, "loss": 1.1118, "step": 56 }, { "epoch": 0.00456, "grad_norm": 0.17886948585510254, "learning_rate": 0.00019916766706682676, "loss": 0.7198, "step": 57 }, { "epoch": 0.00464, "grad_norm": 
0.1993376612663269, "learning_rate": 0.0001991516606642657, "loss": 0.8233, "step": 58 }, { "epoch": 0.00472, "grad_norm": 0.21231591701507568, "learning_rate": 0.00019913565426170468, "loss": 0.75, "step": 59 }, { "epoch": 0.0048, "grad_norm": 0.22594475746154785, "learning_rate": 0.00019911964785914368, "loss": 0.9078, "step": 60 }, { "epoch": 0.00488, "grad_norm": 0.21537600457668304, "learning_rate": 0.00019910364145658266, "loss": 0.7157, "step": 61 }, { "epoch": 0.00496, "grad_norm": 0.23297461867332458, "learning_rate": 0.0001990876350540216, "loss": 0.8651, "step": 62 }, { "epoch": 0.00504, "grad_norm": 0.19292137026786804, "learning_rate": 0.00019907162865146058, "loss": 0.7759, "step": 63 }, { "epoch": 0.00512, "grad_norm": 0.4145672023296356, "learning_rate": 0.00019905562224889958, "loss": 1.1774, "step": 64 }, { "epoch": 0.0052, "grad_norm": 0.19476854801177979, "learning_rate": 0.00019903961584633856, "loss": 0.768, "step": 65 }, { "epoch": 0.00528, "grad_norm": 0.22146064043045044, "learning_rate": 0.0001990236094437775, "loss": 0.8986, "step": 66 }, { "epoch": 0.00536, "grad_norm": 0.2676100432872772, "learning_rate": 0.0001990076030412165, "loss": 0.8371, "step": 67 }, { "epoch": 0.00544, "grad_norm": 0.21964198350906372, "learning_rate": 0.00019899159663865548, "loss": 0.7568, "step": 68 }, { "epoch": 0.00552, "grad_norm": 0.18942779302597046, "learning_rate": 0.00019897559023609445, "loss": 0.8593, "step": 69 }, { "epoch": 0.0056, "grad_norm": 0.21514566242694855, "learning_rate": 0.0001989595838335334, "loss": 0.6191, "step": 70 }, { "epoch": 0.00568, "grad_norm": 0.21467895805835724, "learning_rate": 0.0001989435774309724, "loss": 0.4907, "step": 71 }, { "epoch": 0.00576, "grad_norm": 0.20334534347057343, "learning_rate": 0.00019892757102841138, "loss": 0.6906, "step": 72 }, { "epoch": 0.00584, "grad_norm": 0.21853458881378174, "learning_rate": 0.00019891156462585035, "loss": 0.8808, "step": 73 }, { "epoch": 0.00592, "grad_norm": 0.2489083856344223, "learning_rate": 0.00019889555822328933, "loss": 0.9344, "step": 74 }, { "epoch": 0.006, "grad_norm": 0.28158295154571533, "learning_rate": 0.0001988795518207283, "loss": 0.6637, "step": 75 }, { "epoch": 0.00608, "grad_norm": 0.267078161239624, "learning_rate": 0.00019886354541816728, "loss": 0.7328, "step": 76 }, { "epoch": 0.00616, "grad_norm": 0.2382790893316269, "learning_rate": 0.00019884753901560625, "loss": 0.9646, "step": 77 }, { "epoch": 0.00624, "grad_norm": 0.26280778646469116, "learning_rate": 0.00019883153261304523, "loss": 0.7376, "step": 78 }, { "epoch": 0.00632, "grad_norm": 0.2713547945022583, "learning_rate": 0.0001988155262104842, "loss": 1.0505, "step": 79 }, { "epoch": 0.0064, "grad_norm": 0.23571620881557465, "learning_rate": 0.00019879951980792318, "loss": 0.8017, "step": 80 }, { "epoch": 0.00648, "grad_norm": 0.2187693864107132, "learning_rate": 0.00019878351340536215, "loss": 0.6597, "step": 81 }, { "epoch": 0.00656, "grad_norm": 0.22543781995773315, "learning_rate": 0.00019876750700280113, "loss": 0.8349, "step": 82 }, { "epoch": 0.00664, "grad_norm": 0.23452317714691162, "learning_rate": 0.0001987515006002401, "loss": 0.8549, "step": 83 }, { "epoch": 0.00672, "grad_norm": 0.20848535001277924, "learning_rate": 0.00019873549419767908, "loss": 1.0107, "step": 84 }, { "epoch": 0.0068, "grad_norm": 0.19626936316490173, "learning_rate": 0.00019871948779511805, "loss": 0.8725, "step": 85 }, { "epoch": 0.00688, "grad_norm": 0.17012615501880646, "learning_rate": 0.00019870348139255703, "loss": 0.6128, 
"step": 86 }, { "epoch": 0.00696, "grad_norm": 0.21431602537631989, "learning_rate": 0.000198687474989996, "loss": 0.8825, "step": 87 }, { "epoch": 0.00704, "grad_norm": 0.18353329598903656, "learning_rate": 0.000198671468587435, "loss": 0.621, "step": 88 }, { "epoch": 0.00712, "grad_norm": 0.20687150955200195, "learning_rate": 0.00019865546218487395, "loss": 0.8183, "step": 89 }, { "epoch": 0.0072, "grad_norm": 0.1761425882577896, "learning_rate": 0.00019863945578231293, "loss": 1.1441, "step": 90 }, { "epoch": 0.00728, "grad_norm": 0.2332560122013092, "learning_rate": 0.0001986234493797519, "loss": 0.8049, "step": 91 }, { "epoch": 0.00736, "grad_norm": 0.2265244722366333, "learning_rate": 0.0001986074429771909, "loss": 0.5617, "step": 92 }, { "epoch": 0.00744, "grad_norm": 0.22966669499874115, "learning_rate": 0.00019859143657462985, "loss": 0.8118, "step": 93 }, { "epoch": 0.00752, "grad_norm": 0.1718137413263321, "learning_rate": 0.00019857543017206883, "loss": 0.6905, "step": 94 }, { "epoch": 0.0076, "grad_norm": 0.1688937246799469, "learning_rate": 0.0001985594237695078, "loss": 0.532, "step": 95 }, { "epoch": 0.00768, "grad_norm": 0.263818621635437, "learning_rate": 0.0001985434173669468, "loss": 0.5655, "step": 96 }, { "epoch": 0.00776, "grad_norm": 0.20886196196079254, "learning_rate": 0.00019852741096438575, "loss": 0.655, "step": 97 }, { "epoch": 0.00784, "grad_norm": 0.23567333817481995, "learning_rate": 0.00019851140456182473, "loss": 0.7477, "step": 98 }, { "epoch": 0.00792, "grad_norm": 0.22903132438659668, "learning_rate": 0.00019849539815926373, "loss": 0.764, "step": 99 }, { "epoch": 0.008, "grad_norm": 0.1896105259656906, "learning_rate": 0.0001984793917567027, "loss": 0.7269, "step": 100 } ], "logging_steps": 1, "max_steps": 12500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.039229864103936e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }