{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9968051118210862, "eval_steps": 59, "global_step": 234, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004259850905218318, "grad_norm": 0.42660918831825256, "learning_rate": 2e-05, "loss": 1.4194, "step": 1 }, { "epoch": 0.004259850905218318, "eval_loss": 1.3981385231018066, "eval_runtime": 17.5749, "eval_samples_per_second": 22.475, "eval_steps_per_second": 2.845, "step": 1 }, { "epoch": 0.008519701810436636, "grad_norm": 0.38132771849632263, "learning_rate": 4e-05, "loss": 1.4291, "step": 2 }, { "epoch": 0.012779552715654952, "grad_norm": 0.4677501916885376, "learning_rate": 6e-05, "loss": 1.606, "step": 3 }, { "epoch": 0.01703940362087327, "grad_norm": 0.4839603900909424, "learning_rate": 8e-05, "loss": 1.5193, "step": 4 }, { "epoch": 0.021299254526091587, "grad_norm": 0.52900630235672, "learning_rate": 0.0001, "loss": 1.7253, "step": 5 }, { "epoch": 0.025559105431309903, "grad_norm": 0.4611320495605469, "learning_rate": 0.00012, "loss": 1.4042, "step": 6 }, { "epoch": 0.029818956336528223, "grad_norm": 0.5078997611999512, "learning_rate": 0.00014, "loss": 1.8641, "step": 7 }, { "epoch": 0.03407880724174654, "grad_norm": 0.5692968368530273, "learning_rate": 0.00016, "loss": 1.0603, "step": 8 }, { "epoch": 0.038338658146964855, "grad_norm": 0.5424911379814148, "learning_rate": 0.00018, "loss": 0.9217, "step": 9 }, { "epoch": 0.042598509052183174, "grad_norm": 0.6595712304115295, "learning_rate": 0.0002, "loss": 1.0443, "step": 10 }, { "epoch": 0.046858359957401494, "grad_norm": 0.552948534488678, "learning_rate": 0.00019999016517595753, "loss": 0.9727, "step": 11 }, { "epoch": 0.051118210862619806, "grad_norm": 0.523713231086731, "learning_rate": 0.00019996066263830531, "loss": 1.0042, "step": 12 }, { "epoch": 0.055378061767838126, "grad_norm": 0.3326718807220459, "learning_rate": 0.0001999114981900887, "loss": 0.6851, "step": 13 }, { "epoch": 0.059637912673056445, "grad_norm": 0.40246546268463135, "learning_rate": 0.00019984268150178167, "loss": 0.6865, "step": 14 }, { "epoch": 0.06389776357827476, "grad_norm": 0.3299888074398041, "learning_rate": 0.00019975422610938462, "loss": 0.6413, "step": 15 }, { "epoch": 0.06815761448349308, "grad_norm": 0.321532666683197, "learning_rate": 0.00019964614941176195, "loss": 0.6425, "step": 16 }, { "epoch": 0.0724174653887114, "grad_norm": 0.30551549792289734, "learning_rate": 0.0001995184726672197, "loss": 0.6573, "step": 17 }, { "epoch": 0.07667731629392971, "grad_norm": 0.3162730932235718, "learning_rate": 0.00019937122098932428, "loss": 0.7957, "step": 18 }, { "epoch": 0.08093716719914804, "grad_norm": 0.2646523714065552, "learning_rate": 0.00019920442334196248, "loss": 0.6842, "step": 19 }, { "epoch": 0.08519701810436635, "grad_norm": 0.35260164737701416, "learning_rate": 0.00019901811253364456, "loss": 0.7045, "step": 20 }, { "epoch": 0.08945686900958466, "grad_norm": 0.36994901299476624, "learning_rate": 0.00019881232521105089, "loss": 0.7506, "step": 21 }, { "epoch": 0.09371671991480299, "grad_norm": 0.3145638108253479, "learning_rate": 0.0001985871018518236, "loss": 0.6522, "step": 22 }, { "epoch": 0.0979765708200213, "grad_norm": 0.28740495443344116, "learning_rate": 0.00019834248675660486, "loss": 0.5763, "step": 23 }, { "epoch": 0.10223642172523961, "grad_norm": 0.29527685046195984, "learning_rate": 0.00019807852804032305, "loss": 0.8533, "step": 24 }, { "epoch": 0.10649627263045794, "grad_norm": 0.3023378849029541, "learning_rate": 0.00019779527762272877, "loss": 0.738, "step": 25 }, { "epoch": 0.11075612353567625, "grad_norm": 0.2749658524990082, "learning_rate": 0.00019749279121818235, "loss": 0.6354, "step": 26 }, { "epoch": 0.11501597444089456, "grad_norm": 0.3914307951927185, "learning_rate": 0.0001971711283246951, "loss": 0.8604, "step": 27 }, { "epoch": 0.11927582534611289, "grad_norm": 0.47873714566230774, "learning_rate": 0.00019683035221222618, "loss": 0.7972, "step": 28 }, { "epoch": 0.1235356762513312, "grad_norm": 0.22174575924873352, "learning_rate": 0.0001964705299102376, "loss": 0.4385, "step": 29 }, { "epoch": 0.12779552715654952, "grad_norm": 0.244963139295578, "learning_rate": 0.00019609173219450998, "loss": 0.7168, "step": 30 }, { "epoch": 0.13205537806176784, "grad_norm": 0.32758575677871704, "learning_rate": 0.0001956940335732209, "loss": 0.7231, "step": 31 }, { "epoch": 0.13631522896698617, "grad_norm": 0.21992172300815582, "learning_rate": 0.00019527751227228963, "loss": 0.662, "step": 32 }, { "epoch": 0.14057507987220447, "grad_norm": 0.2899262309074402, "learning_rate": 0.0001948422502199903, "loss": 0.4651, "step": 33 }, { "epoch": 0.1448349307774228, "grad_norm": 0.23878340423107147, "learning_rate": 0.00019438833303083678, "loss": 0.5367, "step": 34 }, { "epoch": 0.14909478168264112, "grad_norm": 0.20475314557552338, "learning_rate": 0.0001939158499887428, "loss": 0.4024, "step": 35 }, { "epoch": 0.15335463258785942, "grad_norm": 0.25068745017051697, "learning_rate": 0.00019342489402945998, "loss": 0.6575, "step": 36 }, { "epoch": 0.15761448349307774, "grad_norm": 0.3811924159526825, "learning_rate": 0.00019291556172229785, "loss": 0.6405, "step": 37 }, { "epoch": 0.16187433439829607, "grad_norm": 0.2627577483654022, "learning_rate": 0.0001923879532511287, "loss": 0.6961, "step": 38 }, { "epoch": 0.16613418530351437, "grad_norm": 0.32665154337882996, "learning_rate": 0.00019184217239468212, "loss": 0.6983, "step": 39 }, { "epoch": 0.1703940362087327, "grad_norm": 0.24597743153572083, "learning_rate": 0.00019127832650613189, "loss": 0.54, "step": 40 }, { "epoch": 0.17465388711395102, "grad_norm": 0.2611660361289978, "learning_rate": 0.00019069652649198005, "loss": 0.6281, "step": 41 }, { "epoch": 0.17891373801916932, "grad_norm": 0.2969326078891754, "learning_rate": 0.0001900968867902419, "loss": 0.6817, "step": 42 }, { "epoch": 0.18317358892438765, "grad_norm": 0.27561935782432556, "learning_rate": 0.00018947952534793661, "loss": 0.626, "step": 43 }, { "epoch": 0.18743343982960597, "grad_norm": 0.33468887209892273, "learning_rate": 0.00018884456359788724, "loss": 0.7383, "step": 44 }, { "epoch": 0.19169329073482427, "grad_norm": 0.2937297224998474, "learning_rate": 0.0001881921264348355, "loss": 0.6972, "step": 45 }, { "epoch": 0.1959531416400426, "grad_norm": 0.33218011260032654, "learning_rate": 0.00018752234219087538, "loss": 0.6749, "step": 46 }, { "epoch": 0.20021299254526093, "grad_norm": 0.2661404311656952, "learning_rate": 0.00018683534261021057, "loss": 0.4882, "step": 47 }, { "epoch": 0.20447284345047922, "grad_norm": 0.2451002150774002, "learning_rate": 0.00018613126282324092, "loss": 0.637, "step": 48 }, { "epoch": 0.20873269435569755, "grad_norm": 0.27517661452293396, "learning_rate": 0.00018541024131998274, "loss": 0.5483, "step": 49 }, { "epoch": 0.21299254526091588, "grad_norm": 0.24373459815979004, "learning_rate": 0.00018467241992282843, "loss": 0.5112, "step": 50 }, { "epoch": 0.21725239616613418, "grad_norm": 0.3239864408969879, "learning_rate": 0.00018391794375865024, "loss": 0.8005, "step": 51 }, { "epoch": 0.2215122470713525, "grad_norm": 0.29262682795524597, "learning_rate": 0.00018314696123025454, "loss": 0.6769, "step": 52 }, { "epoch": 0.22577209797657083, "grad_norm": 0.28277888894081116, "learning_rate": 0.00018235962398719147, "loss": 0.6892, "step": 53 }, { "epoch": 0.23003194888178913, "grad_norm": 0.41741546988487244, "learning_rate": 0.00018155608689592604, "loss": 0.6763, "step": 54 }, { "epoch": 0.23429179978700745, "grad_norm": 0.2734082043170929, "learning_rate": 0.00018073650800937624, "loss": 0.697, "step": 55 }, { "epoch": 0.23855165069222578, "grad_norm": 0.2646290957927704, "learning_rate": 0.00017990104853582493, "loss": 0.5936, "step": 56 }, { "epoch": 0.24281150159744408, "grad_norm": 0.27723610401153564, "learning_rate": 0.00017904987280721035, "loss": 0.5875, "step": 57 }, { "epoch": 0.2470713525026624, "grad_norm": 0.2668153643608093, "learning_rate": 0.000178183148246803, "loss": 0.5219, "step": 58 }, { "epoch": 0.25133120340788073, "grad_norm": 0.29033368825912476, "learning_rate": 0.0001773010453362737, "loss": 0.5997, "step": 59 }, { "epoch": 0.25133120340788073, "eval_loss": 0.5784963965415955, "eval_runtime": 17.4317, "eval_samples_per_second": 22.66, "eval_steps_per_second": 2.868, "step": 59 }, { "epoch": 0.25559105431309903, "grad_norm": 0.2783537209033966, "learning_rate": 0.00017640373758216077, "loss": 0.483, "step": 60 }, { "epoch": 0.2598509052183174, "grad_norm": 0.31082215905189514, "learning_rate": 0.0001754914014817416, "loss": 0.6473, "step": 61 }, { "epoch": 0.2641107561235357, "grad_norm": 0.3206618130207062, "learning_rate": 0.00017456421648831655, "loss": 0.6289, "step": 62 }, { "epoch": 0.268370607028754, "grad_norm": 0.2875254154205322, "learning_rate": 0.00017362236497591094, "loss": 0.594, "step": 63 }, { "epoch": 0.27263045793397234, "grad_norm": 0.22950579226016998, "learning_rate": 0.0001726660322034027, "loss": 0.3886, "step": 64 }, { "epoch": 0.27689030883919064, "grad_norm": 0.24293649196624756, "learning_rate": 0.00017169540627808274, "loss": 0.6129, "step": 65 }, { "epoch": 0.28115015974440893, "grad_norm": 0.2611636519432068, "learning_rate": 0.00017071067811865476, "loss": 0.6891, "step": 66 }, { "epoch": 0.2854100106496273, "grad_norm": 0.284407377243042, "learning_rate": 0.00016971204141768233, "loss": 0.516, "step": 67 }, { "epoch": 0.2896698615548456, "grad_norm": 0.21485944092273712, "learning_rate": 0.00016869969260349018, "loss": 0.3826, "step": 68 }, { "epoch": 0.2939297124600639, "grad_norm": 0.29337963461875916, "learning_rate": 0.00016767383080152742, "loss": 0.5696, "step": 69 }, { "epoch": 0.29818956336528224, "grad_norm": 0.27099764347076416, "learning_rate": 0.0001666346577952004, "loss": 0.4708, "step": 70 }, { "epoch": 0.30244941427050054, "grad_norm": 0.29055824875831604, "learning_rate": 0.00016558237798618245, "loss": 0.5844, "step": 71 }, { "epoch": 0.30670926517571884, "grad_norm": 0.22874757647514343, "learning_rate": 0.00016451719835420877, "loss": 0.4412, "step": 72 }, { "epoch": 0.3109691160809372, "grad_norm": 0.2926221489906311, "learning_rate": 0.00016343932841636456, "loss": 0.5757, "step": 73 }, { "epoch": 0.3152289669861555, "grad_norm": 0.30070438981056213, "learning_rate": 0.00016234898018587337, "loss": 0.6063, "step": 74 }, { "epoch": 0.3194888178913738, "grad_norm": 0.2475481927394867, "learning_rate": 0.00016124636813039502, "loss": 0.5056, "step": 75 }, { "epoch": 0.32374866879659214, "grad_norm": 0.2851349711418152, "learning_rate": 0.00016013170912984058, "loss": 0.7547, "step": 76 }, { "epoch": 0.32800851970181044, "grad_norm": 0.25569260120391846, "learning_rate": 0.00015900522243371282, "loss": 0.5168, "step": 77 }, { "epoch": 0.33226837060702874, "grad_norm": 0.3774610757827759, "learning_rate": 0.0001578671296179806, "loss": 0.6691, "step": 78 }, { "epoch": 0.3365282215122471, "grad_norm": 0.2339468151330948, "learning_rate": 0.00015671765454149559, "loss": 0.5021, "step": 79 }, { "epoch": 0.3407880724174654, "grad_norm": 0.3066350519657135, "learning_rate": 0.00015555702330196023, "loss": 0.6838, "step": 80 }, { "epoch": 0.3450479233226837, "grad_norm": 0.271908164024353, "learning_rate": 0.00015438546419145488, "loss": 0.4837, "step": 81 }, { "epoch": 0.34930777422790205, "grad_norm": 0.304290771484375, "learning_rate": 0.00015320320765153367, "loss": 0.6768, "step": 82 }, { "epoch": 0.35356762513312034, "grad_norm": 0.25685280561447144, "learning_rate": 0.00015201048622789747, "loss": 0.4335, "step": 83 }, { "epoch": 0.35782747603833864, "grad_norm": 0.3003567159175873, "learning_rate": 0.00015080753452465296, "loss": 0.5836, "step": 84 }, { "epoch": 0.362087326943557, "grad_norm": 0.2585873007774353, "learning_rate": 0.0001495945891581668, "loss": 0.5391, "step": 85 }, { "epoch": 0.3663471778487753, "grad_norm": 0.30791282653808594, "learning_rate": 0.000148371888710524, "loss": 0.5103, "step": 86 }, { "epoch": 0.3706070287539936, "grad_norm": 0.23016773164272308, "learning_rate": 0.0001471396736825998, "loss": 0.4269, "step": 87 }, { "epoch": 0.37486687965921195, "grad_norm": 0.3137454390525818, "learning_rate": 0.00014589818644675378, "loss": 0.5116, "step": 88 }, { "epoch": 0.37912673056443025, "grad_norm": 0.28078484535217285, "learning_rate": 0.00014464767119915629, "loss": 0.4388, "step": 89 }, { "epoch": 0.38338658146964855, "grad_norm": 0.3163893222808838, "learning_rate": 0.00014338837391175582, "loss": 0.6122, "step": 90 }, { "epoch": 0.3876464323748669, "grad_norm": 0.34674668312072754, "learning_rate": 0.0001421205422838971, "loss": 0.7114, "step": 91 }, { "epoch": 0.3919062832800852, "grad_norm": 0.2210942953824997, "learning_rate": 0.00014084442569359964, "loss": 0.3351, "step": 92 }, { "epoch": 0.3961661341853035, "grad_norm": 0.30586308240890503, "learning_rate": 0.0001395602751485059, "loss": 0.4845, "step": 93 }, { "epoch": 0.40042598509052185, "grad_norm": 0.2695784568786621, "learning_rate": 0.000138268343236509, "loss": 0.4992, "step": 94 }, { "epoch": 0.40468583599574015, "grad_norm": 0.2989813983440399, "learning_rate": 0.00013696888407606952, "loss": 0.585, "step": 95 }, { "epoch": 0.40894568690095845, "grad_norm": 0.2759920656681061, "learning_rate": 0.0001356621532662313, "loss": 0.4492, "step": 96 }, { "epoch": 0.4132055378061768, "grad_norm": 0.33117353916168213, "learning_rate": 0.0001343484078363461, "loss": 0.5606, "step": 97 }, { "epoch": 0.4174653887113951, "grad_norm": 0.24572253227233887, "learning_rate": 0.00013302790619551674, "loss": 0.3261, "step": 98 }, { "epoch": 0.4217252396166134, "grad_norm": 0.322480171918869, "learning_rate": 0.00013170090808176883, "loss": 0.5527, "step": 99 }, { "epoch": 0.42598509052183176, "grad_norm": 0.3101179301738739, "learning_rate": 0.00013036767451096148, "loss": 0.5419, "step": 100 }, { "epoch": 0.43024494142705005, "grad_norm": 0.3218703269958496, "learning_rate": 0.00012902846772544624, "loss": 0.5441, "step": 101 }, { "epoch": 0.43450479233226835, "grad_norm": 0.26214686036109924, "learning_rate": 0.00012768355114248494, "loss": 0.5388, "step": 102 }, { "epoch": 0.4387646432374867, "grad_norm": 0.421612411737442, "learning_rate": 0.00012633318930243648, "loss": 0.7557, "step": 103 }, { "epoch": 0.443024494142705, "grad_norm": 0.5120344758033752, "learning_rate": 0.0001249776478167227, "loss": 0.7028, "step": 104 }, { "epoch": 0.4472843450479233, "grad_norm": 0.27614736557006836, "learning_rate": 0.00012361719331558345, "loss": 0.3954, "step": 105 }, { "epoch": 0.45154419595314166, "grad_norm": 0.269520103931427, "learning_rate": 0.00012225209339563145, "loss": 0.4851, "step": 106 }, { "epoch": 0.45580404685835996, "grad_norm": 0.2739225924015045, "learning_rate": 0.000120882616567217, "loss": 0.4907, "step": 107 }, { "epoch": 0.46006389776357826, "grad_norm": 0.33920663595199585, "learning_rate": 0.00011950903220161285, "loss": 0.6288, "step": 108 }, { "epoch": 0.4643237486687966, "grad_norm": 0.279832124710083, "learning_rate": 0.00011813161047802985, "loss": 0.447, "step": 109 }, { "epoch": 0.4685835995740149, "grad_norm": 0.31790605187416077, "learning_rate": 0.00011675062233047364, "loss": 0.5933, "step": 110 }, { "epoch": 0.4728434504792332, "grad_norm": 0.24926939606666565, "learning_rate": 0.000115366339394453, "loss": 0.4061, "step": 111 }, { "epoch": 0.47710330138445156, "grad_norm": 0.3327280282974243, "learning_rate": 0.00011397903395354996, "loss": 0.484, "step": 112 }, { "epoch": 0.48136315228966986, "grad_norm": 0.37822094559669495, "learning_rate": 0.00011258897888586255, "loss": 0.6416, "step": 113 }, { "epoch": 0.48562300319488816, "grad_norm": 0.35605669021606445, "learning_rate": 0.00011119644761033078, "loss": 0.6136, "step": 114 }, { "epoch": 0.4898828541001065, "grad_norm": 0.3513132929801941, "learning_rate": 0.0001098017140329561, "loss": 0.6299, "step": 115 }, { "epoch": 0.4941427050053248, "grad_norm": 0.3040708899497986, "learning_rate": 0.00010840505249292476, "loss": 0.4658, "step": 116 }, { "epoch": 0.4984025559105431, "grad_norm": 0.19006308913230896, "learning_rate": 0.00010700673770864673, "loss": 0.2694, "step": 117 }, { "epoch": 0.5026624068157615, "grad_norm": 0.30643633008003235, "learning_rate": 0.00010560704472371919, "loss": 0.4492, "step": 118 }, { "epoch": 0.5026624068157615, "eval_loss": 0.5326976180076599, "eval_runtime": 17.5872, "eval_samples_per_second": 22.46, "eval_steps_per_second": 2.843, "step": 118 }, { "epoch": 0.5069222577209798, "grad_norm": 0.3698013722896576, "learning_rate": 0.00010420624885282653, "loss": 0.6993, "step": 119 }, { "epoch": 0.5111821086261981, "grad_norm": 0.2801634967327118, "learning_rate": 0.0001028046256275869, "loss": 0.4059, "step": 120 }, { "epoch": 0.5154419595314164, "grad_norm": 0.2864643931388855, "learning_rate": 0.00010140245074235624, "loss": 0.5024, "step": 121 }, { "epoch": 0.5197018104366348, "grad_norm": 0.30105265974998474, "learning_rate": 0.0001, "loss": 0.6774, "step": 122 }, { "epoch": 0.5239616613418531, "grad_norm": 0.39152050018310547, "learning_rate": 9.859754925764378e-05, "loss": 0.625, "step": 123 }, { "epoch": 0.5282215122470714, "grad_norm": 0.3618883192539215, "learning_rate": 9.719537437241312e-05, "loss": 0.6978, "step": 124 }, { "epoch": 0.5324813631522897, "grad_norm": 0.23670899868011475, "learning_rate": 9.579375114717351e-05, "loss": 0.3379, "step": 125 }, { "epoch": 0.536741214057508, "grad_norm": 0.3124864101409912, "learning_rate": 9.439295527628081e-05, "loss": 0.525, "step": 126 }, { "epoch": 0.5410010649627263, "grad_norm": 0.3667398989200592, "learning_rate": 9.299326229135326e-05, "loss": 0.6164, "step": 127 }, { "epoch": 0.5452609158679447, "grad_norm": 0.2894105613231659, "learning_rate": 9.159494750707526e-05, "loss": 0.4335, "step": 128 }, { "epoch": 0.549520766773163, "grad_norm": 0.30680200457572937, "learning_rate": 9.019828596704394e-05, "loss": 0.4507, "step": 129 }, { "epoch": 0.5537806176783813, "grad_norm": 0.3676758110523224, "learning_rate": 8.880355238966923e-05, "loss": 0.5955, "step": 130 }, { "epoch": 0.5580404685835996, "grad_norm": 0.3194178342819214, "learning_rate": 8.741102111413748e-05, "loss": 0.5675, "step": 131 }, { "epoch": 0.5623003194888179, "grad_norm": 0.29750558733940125, "learning_rate": 8.602096604645009e-05, "loss": 0.5785, "step": 132 }, { "epoch": 0.5665601703940362, "grad_norm": 0.37204545736312866, "learning_rate": 8.463366060554698e-05, "loss": 0.612, "step": 133 }, { "epoch": 0.5708200212992546, "grad_norm": 0.36891940236091614, "learning_rate": 8.324937766952638e-05, "loss": 0.5463, "step": 134 }, { "epoch": 0.5750798722044729, "grad_norm": 0.2863575518131256, "learning_rate": 8.186838952197018e-05, "loss": 0.4884, "step": 135 }, { "epoch": 0.5793397231096912, "grad_norm": 0.354523241519928, "learning_rate": 8.049096779838719e-05, "loss": 0.7727, "step": 136 }, { "epoch": 0.5835995740149095, "grad_norm": 0.30339759588241577, "learning_rate": 7.911738343278304e-05, "loss": 0.5543, "step": 137 }, { "epoch": 0.5878594249201278, "grad_norm": 0.27778202295303345, "learning_rate": 7.774790660436858e-05, "loss": 0.4716, "step": 138 }, { "epoch": 0.5921192758253461, "grad_norm": 0.38618960976600647, "learning_rate": 7.63828066844166e-05, "loss": 0.6519, "step": 139 }, { "epoch": 0.5963791267305645, "grad_norm": 0.3573627769947052, "learning_rate": 7.502235218327731e-05, "loss": 0.5128, "step": 140 }, { "epoch": 0.6006389776357828, "grad_norm": 0.30529165267944336, "learning_rate": 7.366681069756352e-05, "loss": 0.5184, "step": 141 }, { "epoch": 0.6048988285410011, "grad_norm": 0.2819828987121582, "learning_rate": 7.231644885751507e-05, "loss": 0.4259, "step": 142 }, { "epoch": 0.6091586794462194, "grad_norm": 0.32307252287864685, "learning_rate": 7.097153227455379e-05, "loss": 0.6048, "step": 143 }, { "epoch": 0.6134185303514377, "grad_norm": 0.31262722611427307, "learning_rate": 6.963232548903853e-05, "loss": 0.4834, "step": 144 }, { "epoch": 0.617678381256656, "grad_norm": 0.318851500749588, "learning_rate": 6.829909191823121e-05, "loss": 0.5011, "step": 145 }, { "epoch": 0.6219382321618744, "grad_norm": 0.44246405363082886, "learning_rate": 6.697209380448333e-05, "loss": 0.4384, "step": 146 }, { "epoch": 0.6261980830670927, "grad_norm": 0.3459945023059845, "learning_rate": 6.565159216365389e-05, "loss": 0.5657, "step": 147 }, { "epoch": 0.630457933972311, "grad_norm": 0.33843329548835754, "learning_rate": 6.43378467337687e-05, "loss": 0.5711, "step": 148 }, { "epoch": 0.6347177848775293, "grad_norm": 0.3812694549560547, "learning_rate": 6.30311159239305e-05, "loss": 0.6142, "step": 149 }, { "epoch": 0.6389776357827476, "grad_norm": 0.29333916306495667, "learning_rate": 6.173165676349103e-05, "loss": 0.585, "step": 150 }, { "epoch": 0.6432374866879659, "grad_norm": 0.2884041666984558, "learning_rate": 6.043972485149414e-05, "loss": 0.4866, "step": 151 }, { "epoch": 0.6474973375931843, "grad_norm": 0.33954814076423645, "learning_rate": 5.9155574306400395e-05, "loss": 0.571, "step": 152 }, { "epoch": 0.6517571884984026, "grad_norm": 0.33935782313346863, "learning_rate": 5.787945771610296e-05, "loss": 0.5037, "step": 153 }, { "epoch": 0.6560170394036209, "grad_norm": 0.27371054887771606, "learning_rate": 5.6611626088244194e-05, "loss": 0.3322, "step": 154 }, { "epoch": 0.6602768903088392, "grad_norm": 0.30788496136665344, "learning_rate": 5.5352328800843724e-05, "loss": 0.4454, "step": 155 }, { "epoch": 0.6645367412140575, "grad_norm": 0.34366151690483093, "learning_rate": 5.410181355324622e-05, "loss": 0.5788, "step": 156 }, { "epoch": 0.6687965921192758, "grad_norm": 0.33698371052742004, "learning_rate": 5.286032631740023e-05, "loss": 0.4378, "step": 157 }, { "epoch": 0.6730564430244942, "grad_norm": 0.4181162416934967, "learning_rate": 5.162811128947602e-05, "loss": 0.5367, "step": 158 }, { "epoch": 0.6773162939297125, "grad_norm": 0.4480881690979004, "learning_rate": 5.0405410841833253e-05, "loss": 0.6633, "step": 159 }, { "epoch": 0.6815761448349308, "grad_norm": 0.37488028407096863, "learning_rate": 4.919246547534708e-05, "loss": 0.5402, "step": 160 }, { "epoch": 0.6858359957401491, "grad_norm": 0.2964366376399994, "learning_rate": 4.7989513772102537e-05, "loss": 0.4109, "step": 161 }, { "epoch": 0.6900958466453674, "grad_norm": 0.35376259684562683, "learning_rate": 4.6796792348466356e-05, "loss": 0.636, "step": 162 }, { "epoch": 0.6943556975505857, "grad_norm": 0.3158915638923645, "learning_rate": 4.561453580854516e-05, "loss": 0.4893, "step": 163 }, { "epoch": 0.6986155484558041, "grad_norm": 0.420785516500473, "learning_rate": 4.444297669803981e-05, "loss": 0.7147, "step": 164 }, { "epoch": 0.7028753993610224, "grad_norm": 0.3272782564163208, "learning_rate": 4.328234545850442e-05, "loss": 0.3444, "step": 165 }, { "epoch": 0.7071352502662407, "grad_norm": 0.30052492022514343, "learning_rate": 4.213287038201943e-05, "loss": 0.5209, "step": 166 }, { "epoch": 0.711395101171459, "grad_norm": 0.37648481130599976, "learning_rate": 4.0994777566287204e-05, "loss": 0.684, "step": 167 }, { "epoch": 0.7156549520766773, "grad_norm": 0.3135606646537781, "learning_rate": 3.9868290870159405e-05, "loss": 0.4871, "step": 168 }, { "epoch": 0.7199148029818956, "grad_norm": 0.33847576379776, "learning_rate": 3.875363186960499e-05, "loss": 0.5294, "step": 169 }, { "epoch": 0.724174653887114, "grad_norm": 0.3337070047855377, "learning_rate": 3.7651019814126654e-05, "loss": 0.4425, "step": 170 }, { "epoch": 0.7284345047923323, "grad_norm": 0.4173165261745453, "learning_rate": 3.6560671583635467e-05, "loss": 0.637, "step": 171 }, { "epoch": 0.7326943556975506, "grad_norm": 0.41098451614379883, "learning_rate": 3.548280164579126e-05, "loss": 0.52, "step": 172 }, { "epoch": 0.7369542066027689, "grad_norm": 0.3789665699005127, "learning_rate": 3.4417622013817595e-05, "loss": 0.5995, "step": 173 }, { "epoch": 0.7412140575079872, "grad_norm": 0.3996846675872803, "learning_rate": 3.336534220479961e-05, "loss": 0.6237, "step": 174 }, { "epoch": 0.7454739084132055, "grad_norm": 0.3990687131881714, "learning_rate": 3.2326169198472556e-05, "loss": 0.555, "step": 175 }, { "epoch": 0.7497337593184239, "grad_norm": 0.32280924916267395, "learning_rate": 3.130030739650983e-05, "loss": 0.4742, "step": 176 }, { "epoch": 0.7539936102236422, "grad_norm": 0.4192362129688263, "learning_rate": 3.0287958582317676e-05, "loss": 0.6569, "step": 177 }, { "epoch": 0.7539936102236422, "eval_loss": 0.5110090970993042, "eval_runtime": 20.8066, "eval_samples_per_second": 18.984, "eval_steps_per_second": 2.403, "step": 177 }, { "epoch": 0.7582534611288605, "grad_norm": 0.35410746932029724, "learning_rate": 2.9289321881345254e-05, "loss": 0.4828, "step": 178 }, { "epoch": 0.7625133120340788, "grad_norm": 0.4463326036930084, "learning_rate": 2.8304593721917285e-05, "loss": 0.6976, "step": 179 }, { "epoch": 0.7667731629392971, "grad_norm": 0.29797378182411194, "learning_rate": 2.7333967796597315e-05, "loss": 0.564, "step": 180 }, { "epoch": 0.7710330138445154, "grad_norm": 0.31337812542915344, "learning_rate": 2.6377635024089087e-05, "loss": 0.5607, "step": 181 }, { "epoch": 0.7752928647497338, "grad_norm": 0.40470513701438904, "learning_rate": 2.5435783511683443e-05, "loss": 0.6428, "step": 182 }, { "epoch": 0.7795527156549521, "grad_norm": 0.413817822933197, "learning_rate": 2.450859851825842e-05, "loss": 0.7303, "step": 183 }, { "epoch": 0.7838125665601704, "grad_norm": 0.2931414842605591, "learning_rate": 2.3596262417839255e-05, "loss": 0.4051, "step": 184 }, { "epoch": 0.7880724174653887, "grad_norm": 0.34086865186691284, "learning_rate": 2.26989546637263e-05, "loss": 0.4329, "step": 185 }, { "epoch": 0.792332268370607, "grad_norm": 0.40336307883262634, "learning_rate": 2.181685175319702e-05, "loss": 0.5791, "step": 186 }, { "epoch": 0.7965921192758253, "grad_norm": 0.30092838406562805, "learning_rate": 2.095012719278966e-05, "loss": 0.4491, "step": 187 }, { "epoch": 0.8008519701810437, "grad_norm": 0.31043168902397156, "learning_rate": 2.009895146417512e-05, "loss": 0.4681, "step": 188 }, { "epoch": 0.805111821086262, "grad_norm": 0.3712119162082672, "learning_rate": 1.926349199062376e-05, "loss": 0.549, "step": 189 }, { "epoch": 0.8093716719914803, "grad_norm": 0.3679051995277405, "learning_rate": 1.8443913104073983e-05, "loss": 0.4971, "step": 190 }, { "epoch": 0.8136315228966986, "grad_norm": 0.3244669735431671, "learning_rate": 1.7640376012808536e-05, "loss": 0.4732, "step": 191 }, { "epoch": 0.8178913738019169, "grad_norm": 0.28653696179389954, "learning_rate": 1.6853038769745467e-05, "loss": 0.3469, "step": 192 }, { "epoch": 0.8221512247071352, "grad_norm": 0.3144218325614929, "learning_rate": 1.6082056241349786e-05, "loss": 0.5127, "step": 193 }, { "epoch": 0.8264110756123536, "grad_norm": 0.3801470994949341, "learning_rate": 1.5327580077171587e-05, "loss": 0.5178, "step": 194 }, { "epoch": 0.8306709265175719, "grad_norm": 0.37223386764526367, "learning_rate": 1.4589758680017263e-05, "loss": 0.5114, "step": 195 }, { "epoch": 0.8349307774227902, "grad_norm": 0.4167802333831787, "learning_rate": 1.3868737176759106e-05, "loss": 0.6949, "step": 196 }, { "epoch": 0.8391906283280085, "grad_norm": 0.620794951915741, "learning_rate": 1.3164657389789458e-05, "loss": 0.7015, "step": 197 }, { "epoch": 0.8434504792332268, "grad_norm": 0.32053133845329285, "learning_rate": 1.2477657809124631e-05, "loss": 0.5328, "step": 198 }, { "epoch": 0.8477103301384451, "grad_norm": 0.41892528533935547, "learning_rate": 1.1807873565164506e-05, "loss": 0.5929, "step": 199 }, { "epoch": 0.8519701810436635, "grad_norm": 0.2980664372444153, "learning_rate": 1.1155436402112785e-05, "loss": 0.4209, "step": 200 }, { "epoch": 0.8562300319488818, "grad_norm": 0.3290930986404419, "learning_rate": 1.0520474652063394e-05, "loss": 0.4423, "step": 201 }, { "epoch": 0.8604898828541001, "grad_norm": 0.3246372640132904, "learning_rate": 9.903113209758096e-06, "loss": 0.4631, "step": 202 }, { "epoch": 0.8647497337593184, "grad_norm": 0.3644905388355255, "learning_rate": 9.303473508019944e-06, "loss": 0.552, "step": 203 }, { "epoch": 0.8690095846645367, "grad_norm": 0.49974295496940613, "learning_rate": 8.72167349386811e-06, "loss": 0.7516, "step": 204 }, { "epoch": 0.873269435569755, "grad_norm": 0.3242340087890625, "learning_rate": 8.157827605317892e-06, "loss": 0.412, "step": 205 }, { "epoch": 0.8775292864749734, "grad_norm": 0.33690881729125977, "learning_rate": 7.612046748871327e-06, "loss": 0.4841, "step": 206 }, { "epoch": 0.8817891373801917, "grad_norm": 0.3246766924858093, "learning_rate": 7.084438277702188e-06, "loss": 0.4341, "step": 207 }, { "epoch": 0.88604898828541, "grad_norm": 0.4262131452560425, "learning_rate": 6.5751059705400295e-06, "loss": 0.6306, "step": 208 }, { "epoch": 0.8903088391906283, "grad_norm": 0.32158687710762024, "learning_rate": 6.084150011257239e-06, "loss": 0.4687, "step": 209 }, { "epoch": 0.8945686900958466, "grad_norm": 0.377208948135376, "learning_rate": 5.611666969163243e-06, "loss": 0.5849, "step": 210 }, { "epoch": 0.898828541001065, "grad_norm": 0.30956804752349854, "learning_rate": 5.157749780009735e-06, "loss": 0.4355, "step": 211 }, { "epoch": 0.9030883919062833, "grad_norm": 0.4885202944278717, "learning_rate": 4.722487727710368e-06, "loss": 0.6129, "step": 212 }, { "epoch": 0.9073482428115016, "grad_norm": 0.3384571075439453, "learning_rate": 4.305966426779118e-06, "loss": 0.4345, "step": 213 }, { "epoch": 0.9116080937167199, "grad_norm": 0.4629303514957428, "learning_rate": 3.908267805490051e-06, "loss": 0.6158, "step": 214 }, { "epoch": 0.9158679446219382, "grad_norm": 0.3206894099712372, "learning_rate": 3.529470089762421e-06, "loss": 0.4689, "step": 215 }, { "epoch": 0.9201277955271565, "grad_norm": 0.41424816846847534, "learning_rate": 3.169647787773866e-06, "loss": 0.5235, "step": 216 }, { "epoch": 0.9243876464323749, "grad_norm": 0.3189912736415863, "learning_rate": 2.8288716753049005e-06, "loss": 0.4262, "step": 217 }, { "epoch": 0.9286474973375932, "grad_norm": 0.3202993869781494, "learning_rate": 2.5072087818176382e-06, "loss": 0.493, "step": 218 }, { "epoch": 0.9329073482428115, "grad_norm": 0.32974228262901306, "learning_rate": 2.20472237727124e-06, "loss": 0.5512, "step": 219 }, { "epoch": 0.9371671991480298, "grad_norm": 0.27346375584602356, "learning_rate": 1.921471959676957e-06, "loss": 0.4154, "step": 220 }, { "epoch": 0.9414270500532481, "grad_norm": 0.29039615392684937, "learning_rate": 1.657513243395159e-06, "loss": 0.4367, "step": 221 }, { "epoch": 0.9456869009584664, "grad_norm": 0.3864074647426605, "learning_rate": 1.4128981481764115e-06, "loss": 0.5531, "step": 222 }, { "epoch": 0.9499467518636848, "grad_norm": 0.4020756185054779, "learning_rate": 1.1876747889491223e-06, "loss": 0.6631, "step": 223 }, { "epoch": 0.9542066027689031, "grad_norm": 0.37475478649139404, "learning_rate": 9.818874663554357e-07, "loss": 0.4979, "step": 224 }, { "epoch": 0.9584664536741214, "grad_norm": 0.3484041690826416, "learning_rate": 7.955766580375335e-07, "loss": 0.5207, "step": 225 }, { "epoch": 0.9627263045793397, "grad_norm": 0.3385999798774719, "learning_rate": 6.287790106757396e-07, "loss": 0.4832, "step": 226 }, { "epoch": 0.966986155484558, "grad_norm": 0.27909693121910095, "learning_rate": 4.815273327803182e-07, "loss": 0.3506, "step": 227 }, { "epoch": 0.9712460063897763, "grad_norm": 0.34196606278419495, "learning_rate": 3.5385058823809156e-07, "loss": 0.5283, "step": 228 }, { "epoch": 0.9755058572949947, "grad_norm": 0.39571547508239746, "learning_rate": 2.457738906153972e-07, "loss": 0.5099, "step": 229 }, { "epoch": 0.979765708200213, "grad_norm": 0.4107287526130676, "learning_rate": 1.5731849821833954e-07, "loss": 0.5042, "step": 230 }, { "epoch": 0.9840255591054313, "grad_norm": 0.3254135251045227, "learning_rate": 8.850180991131219e-08, "loss": 0.4929, "step": 231 }, { "epoch": 0.9882854100106496, "grad_norm": 0.2778495252132416, "learning_rate": 3.933736169471347e-08, "loss": 0.3571, "step": 232 }, { "epoch": 0.9925452609158679, "grad_norm": 0.4703886806964874, "learning_rate": 9.834824042498358e-09, "loss": 0.7013, "step": 233 }, { "epoch": 0.9968051118210862, "grad_norm": 0.3349379599094391, "learning_rate": 0.0, "loss": 0.4725, "step": 234 } ], "logging_steps": 1, "max_steps": 234, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.1191352754700288e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }