{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9730290456431536, "eval_steps": 500, "global_step": 240, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.008298755186721992, "grad_norm": 0.80859375, "learning_rate": 8.695652173913044e-06, "loss": 2.3455, "step": 1 }, { "epoch": 0.016597510373443983, "grad_norm": 0.78125, "learning_rate": 1.739130434782609e-05, "loss": 2.2601, "step": 2 }, { "epoch": 0.024896265560165973, "grad_norm": 0.75390625, "learning_rate": 2.608695652173913e-05, "loss": 2.2494, "step": 3 }, { "epoch": 0.03319502074688797, "grad_norm": 0.8046875, "learning_rate": 3.478260869565218e-05, "loss": 2.2988, "step": 4 }, { "epoch": 0.04149377593360996, "grad_norm": 0.95703125, "learning_rate": 4.347826086956522e-05, "loss": 3.0938, "step": 5 }, { "epoch": 0.04979253112033195, "grad_norm": 0.7578125, "learning_rate": 5.217391304347826e-05, "loss": 2.1633, "step": 6 }, { "epoch": 0.058091286307053944, "grad_norm": 0.7421875, "learning_rate": 6.086956521739131e-05, "loss": 2.1488, "step": 7 }, { "epoch": 0.06639004149377593, "grad_norm": 0.62109375, "learning_rate": 6.956521739130436e-05, "loss": 2.0031, "step": 8 }, { "epoch": 0.07468879668049792, "grad_norm": 0.52734375, "learning_rate": 7.82608695652174e-05, "loss": 1.9347, "step": 9 }, { "epoch": 0.08298755186721991, "grad_norm": 0.369140625, "learning_rate": 8.695652173913044e-05, "loss": 1.8457, "step": 10 }, { "epoch": 0.0912863070539419, "grad_norm": 0.357421875, "learning_rate": 9.565217391304348e-05, "loss": 1.7273, "step": 11 }, { "epoch": 0.0995850622406639, "grad_norm": 0.376953125, "learning_rate": 0.00010434782608695653, "loss": 1.7584, "step": 12 }, { "epoch": 0.1078838174273859, "grad_norm": 0.39453125, "learning_rate": 0.00011304347826086956, "loss": 1.6387, "step": 13 }, { "epoch": 0.11618257261410789, "grad_norm": 0.458984375, "learning_rate": 0.00012173913043478263, "loss": 1.6614, "step": 14 }, { "epoch": 0.12448132780082988, "grad_norm": 0.482421875, "learning_rate": 0.00013043478260869567, "loss": 1.648, "step": 15 }, { "epoch": 0.13278008298755187, "grad_norm": 0.447265625, "learning_rate": 0.0001391304347826087, "loss": 1.6337, "step": 16 }, { "epoch": 0.14107883817427386, "grad_norm": 0.37109375, "learning_rate": 0.00014782608695652173, "loss": 1.6004, "step": 17 }, { "epoch": 0.14937759336099585, "grad_norm": 0.41796875, "learning_rate": 0.0001565217391304348, "loss": 1.6146, "step": 18 }, { "epoch": 0.15767634854771784, "grad_norm": 0.38671875, "learning_rate": 0.00016521739130434784, "loss": 1.4754, "step": 19 }, { "epoch": 0.16597510373443983, "grad_norm": 1.890625, "learning_rate": 0.00017391304347826088, "loss": 2.4256, "step": 20 }, { "epoch": 0.17427385892116182, "grad_norm": 0.296875, "learning_rate": 0.00018260869565217392, "loss": 1.4601, "step": 21 }, { "epoch": 0.1825726141078838, "grad_norm": 0.291015625, "learning_rate": 0.00019130434782608697, "loss": 1.4737, "step": 22 }, { "epoch": 0.1908713692946058, "grad_norm": 0.27734375, "learning_rate": 0.0002, "loss": 1.4942, "step": 23 }, { "epoch": 0.1991701244813278, "grad_norm": 0.259765625, "learning_rate": 0.00019998952044849376, "loss": 1.42, "step": 24 }, { "epoch": 0.2074688796680498, "grad_norm": 0.255859375, "learning_rate": 0.00019995808399039496, "loss": 1.4399, "step": 25 }, { "epoch": 0.2157676348547718, "grad_norm": 0.41796875, "learning_rate": 0.00019990569721450326, "loss": 2.1457, "step": 26 }, { "epoch": 0.22406639004149378, "grad_norm": 0.302734375, "learning_rate": 0.00019983237110061697, "loss": 1.4462, "step": 27 }, { "epoch": 0.23236514522821577, "grad_norm": 0.2890625, "learning_rate": 0.00019973812101723188, "loss": 1.3847, "step": 28 }, { "epoch": 0.24066390041493776, "grad_norm": 0.2412109375, "learning_rate": 0.00019962296671832003, "loss": 1.4013, "step": 29 }, { "epoch": 0.24896265560165975, "grad_norm": 0.2578125, "learning_rate": 0.00019948693233918952, "loss": 1.3528, "step": 30 }, { "epoch": 0.2572614107883817, "grad_norm": 0.255859375, "learning_rate": 0.00019933004639142605, "loss": 1.3523, "step": 31 }, { "epoch": 0.26556016597510373, "grad_norm": 0.265625, "learning_rate": 0.000199152341756917, "loss": 1.36, "step": 32 }, { "epoch": 0.27385892116182575, "grad_norm": 0.26953125, "learning_rate": 0.00019895385568095982, "loss": 1.3316, "step": 33 }, { "epoch": 0.2821576763485477, "grad_norm": 0.265625, "learning_rate": 0.00019873462976445553, "loss": 1.371, "step": 34 }, { "epoch": 0.29045643153526973, "grad_norm": 0.2578125, "learning_rate": 0.00019849470995518992, "loss": 1.4089, "step": 35 }, { "epoch": 0.2987551867219917, "grad_norm": 0.255859375, "learning_rate": 0.0001982341465382029, "loss": 1.3663, "step": 36 }, { "epoch": 0.3070539419087137, "grad_norm": 0.267578125, "learning_rate": 0.00019795299412524945, "loss": 1.3573, "step": 37 }, { "epoch": 0.3153526970954357, "grad_norm": 0.24609375, "learning_rate": 0.00019765131164335345, "loss": 1.3048, "step": 38 }, { "epoch": 0.3236514522821577, "grad_norm": 0.7734375, "learning_rate": 0.000197329162322457, "loss": 2.0409, "step": 39 }, { "epoch": 0.33195020746887965, "grad_norm": 0.25390625, "learning_rate": 0.00019698661368216817, "loss": 1.3583, "step": 40 }, { "epoch": 0.34024896265560167, "grad_norm": 0.2490234375, "learning_rate": 0.00019662373751760934, "loss": 1.2915, "step": 41 }, { "epoch": 0.34854771784232363, "grad_norm": 0.2578125, "learning_rate": 0.00019624060988436966, "loss": 1.3238, "step": 42 }, { "epoch": 0.35684647302904565, "grad_norm": 0.2294921875, "learning_rate": 0.0001958373110825644, "loss": 1.3412, "step": 43 }, { "epoch": 0.3651452282157676, "grad_norm": 0.236328125, "learning_rate": 0.00019541392564000488, "loss": 1.2947, "step": 44 }, { "epoch": 0.37344398340248963, "grad_norm": 0.2451171875, "learning_rate": 0.00019497054229448223, "loss": 1.3453, "step": 45 }, { "epoch": 0.3817427385892116, "grad_norm": 0.248046875, "learning_rate": 0.0001945072539751685, "loss": 1.3186, "step": 46 }, { "epoch": 0.3900414937759336, "grad_norm": 0.2431640625, "learning_rate": 0.00019402415778313977, "loss": 1.3524, "step": 47 }, { "epoch": 0.3983402489626556, "grad_norm": 0.267578125, "learning_rate": 0.00019352135497102463, "loss": 1.3403, "step": 48 }, { "epoch": 0.4066390041493776, "grad_norm": 0.251953125, "learning_rate": 0.0001929989509217824, "loss": 1.3266, "step": 49 }, { "epoch": 0.4149377593360996, "grad_norm": 0.251953125, "learning_rate": 0.0001924570551266159, "loss": 1.3373, "step": 50 }, { "epoch": 0.42323651452282157, "grad_norm": 0.2412109375, "learning_rate": 0.00019189578116202307, "loss": 1.268, "step": 51 }, { "epoch": 0.4315352697095436, "grad_norm": 0.23828125, "learning_rate": 0.00019131524666599233, "loss": 1.2878, "step": 52 }, { "epoch": 0.43983402489626555, "grad_norm": 0.236328125, "learning_rate": 0.00019071557331334669, "loss": 1.2968, "step": 53 }, { "epoch": 0.44813278008298757, "grad_norm": 0.2353515625, "learning_rate": 0.0001900968867902419, "loss": 1.3174, "step": 54 }, { "epoch": 0.45643153526970953, "grad_norm": 0.2412109375, "learning_rate": 0.00018945931676782373, "loss": 1.283, "step": 55 }, { "epoch": 0.46473029045643155, "grad_norm": 0.248046875, "learning_rate": 0.0001888029968750498, "loss": 1.279, "step": 56 }, { "epoch": 0.4730290456431535, "grad_norm": 0.240234375, "learning_rate": 0.00018812806467068268, "loss": 1.3151, "step": 57 }, { "epoch": 0.48132780082987553, "grad_norm": 0.244140625, "learning_rate": 0.00018743466161445823, "loss": 1.2763, "step": 58 }, { "epoch": 0.4896265560165975, "grad_norm": 0.25390625, "learning_rate": 0.00018672293303743738, "loss": 1.3259, "step": 59 }, { "epoch": 0.4979253112033195, "grad_norm": 0.255859375, "learning_rate": 0.00018599302811154572, "loss": 1.287, "step": 60 }, { "epoch": 0.5062240663900415, "grad_norm": 0.267578125, "learning_rate": 0.00018524509981830852, "loss": 1.2794, "step": 61 }, { "epoch": 0.5145228215767634, "grad_norm": 0.255859375, "learning_rate": 0.00018447930491678733, "loss": 1.3154, "step": 62 }, { "epoch": 0.5228215767634855, "grad_norm": 0.248046875, "learning_rate": 0.00018369580391072433, "loss": 1.2979, "step": 63 }, { "epoch": 0.5311203319502075, "grad_norm": 0.2421875, "learning_rate": 0.00018289476101490256, "loss": 1.326, "step": 64 }, { "epoch": 0.5394190871369294, "grad_norm": 0.24609375, "learning_rate": 0.00018207634412072764, "loss": 1.2843, "step": 65 }, { "epoch": 0.5477178423236515, "grad_norm": 0.2490234375, "learning_rate": 0.00018124072476103956, "loss": 1.3233, "step": 66 }, { "epoch": 0.5560165975103735, "grad_norm": 0.248046875, "learning_rate": 0.00018038807807416068, "loss": 1.3208, "step": 67 }, { "epoch": 0.5643153526970954, "grad_norm": 0.2431640625, "learning_rate": 0.00017951858276718844, "loss": 1.2949, "step": 68 }, { "epoch": 0.5726141078838174, "grad_norm": 0.412109375, "learning_rate": 0.00017863242107853995, "loss": 1.9205, "step": 69 }, { "epoch": 0.5809128630705395, "grad_norm": 0.2578125, "learning_rate": 0.0001777297787397563, "loss": 1.2847, "step": 70 }, { "epoch": 0.5892116182572614, "grad_norm": 0.2578125, "learning_rate": 0.00017681084493657525, "loss": 1.298, "step": 71 }, { "epoch": 0.5975103734439834, "grad_norm": 0.255859375, "learning_rate": 0.0001758758122692791, "loss": 1.2805, "step": 72 }, { "epoch": 0.6058091286307054, "grad_norm": 0.25390625, "learning_rate": 0.00017492487671232784, "loss": 1.2829, "step": 73 }, { "epoch": 0.6141078838174274, "grad_norm": 0.2470703125, "learning_rate": 0.00017395823757328444, "loss": 1.2523, "step": 74 }, { "epoch": 0.6224066390041494, "grad_norm": 0.25, "learning_rate": 0.00017297609745104184, "loss": 1.2977, "step": 75 }, { "epoch": 0.6307053941908713, "grad_norm": 0.25, "learning_rate": 0.0001719786621933599, "loss": 1.3103, "step": 76 }, { "epoch": 0.6390041493775933, "grad_norm": 0.25390625, "learning_rate": 0.00017096614085372185, "loss": 1.3006, "step": 77 }, { "epoch": 0.6473029045643154, "grad_norm": 0.435546875, "learning_rate": 0.00016993874564751822, "loss": 1.8113, "step": 78 }, { "epoch": 0.6556016597510373, "grad_norm": 0.6953125, "learning_rate": 0.00016889669190756868, "loss": 2.3268, "step": 79 }, { "epoch": 0.6639004149377593, "grad_norm": 0.2734375, "learning_rate": 0.00016784019803899, "loss": 1.2574, "step": 80 }, { "epoch": 0.6721991701244814, "grad_norm": 0.259765625, "learning_rate": 0.0001667694854734204, "loss": 1.2365, "step": 81 }, { "epoch": 0.6804979253112033, "grad_norm": 0.26953125, "learning_rate": 0.0001656847786226095, "loss": 1.2974, "step": 82 }, { "epoch": 0.6887966804979253, "grad_norm": 0.267578125, "learning_rate": 0.00016458630483138356, "loss": 1.2225, "step": 83 }, { "epoch": 0.6970954356846473, "grad_norm": 0.2734375, "learning_rate": 0.00016347429432999602, "loss": 1.2477, "step": 84 }, { "epoch": 0.7053941908713693, "grad_norm": 0.28125, "learning_rate": 0.00016234898018587337, "loss": 1.2872, "step": 85 }, { "epoch": 0.7136929460580913, "grad_norm": 0.271484375, "learning_rate": 0.0001612105982547663, "loss": 1.2824, "step": 86 }, { "epoch": 0.7219917012448133, "grad_norm": 0.26171875, "learning_rate": 0.00016005938713131642, "loss": 1.3056, "step": 87 }, { "epoch": 0.7302904564315352, "grad_norm": 0.265625, "learning_rate": 0.00015889558809904902, "loss": 1.244, "step": 88 }, { "epoch": 0.7385892116182573, "grad_norm": 0.267578125, "learning_rate": 0.00015771944507980207, "loss": 1.2742, "step": 89 }, { "epoch": 0.7468879668049793, "grad_norm": 0.259765625, "learning_rate": 0.00015653120458260263, "loss": 1.3265, "step": 90 }, { "epoch": 0.7551867219917012, "grad_norm": 0.25, "learning_rate": 0.00015533111565200044, "loss": 1.2373, "step": 91 }, { "epoch": 0.7634854771784232, "grad_norm": 0.259765625, "learning_rate": 0.0001541194298158708, "loss": 1.3201, "step": 92 }, { "epoch": 0.7717842323651453, "grad_norm": 0.27734375, "learning_rate": 0.00015289640103269625, "loss": 1.2052, "step": 93 }, { "epoch": 0.7800829875518672, "grad_norm": 0.283203125, "learning_rate": 0.00015166228563833934, "loss": 1.2638, "step": 94 }, { "epoch": 0.7883817427385892, "grad_norm": 0.2578125, "learning_rate": 0.00015041734229231688, "loss": 1.2334, "step": 95 }, { "epoch": 0.7966804979253111, "grad_norm": 0.263671875, "learning_rate": 0.00014916183192358718, "loss": 1.2396, "step": 96 }, { "epoch": 0.8049792531120332, "grad_norm": 0.2578125, "learning_rate": 0.00014789601767586173, "loss": 1.2272, "step": 97 }, { "epoch": 0.8132780082987552, "grad_norm": 0.28125, "learning_rate": 0.00014662016485245274, "loss": 1.2784, "step": 98 }, { "epoch": 0.8215767634854771, "grad_norm": 0.275390625, "learning_rate": 0.00014533454086066772, "loss": 1.2512, "step": 99 }, { "epoch": 0.8298755186721992, "grad_norm": 0.275390625, "learning_rate": 0.00014403941515576344, "loss": 1.2562, "step": 100 }, { "epoch": 0.8381742738589212, "grad_norm": 0.267578125, "learning_rate": 0.00014273505918447054, "loss": 1.3054, "step": 101 }, { "epoch": 0.8464730290456431, "grad_norm": 0.263671875, "learning_rate": 0.00014142174632810072, "loss": 1.2831, "step": 102 }, { "epoch": 0.8547717842323651, "grad_norm": 0.279296875, "learning_rate": 0.0001400997518452484, "loss": 1.31, "step": 103 }, { "epoch": 0.8630705394190872, "grad_norm": 0.25, "learning_rate": 0.00013876935281409907, "loss": 1.2472, "step": 104 }, { "epoch": 0.8713692946058091, "grad_norm": 0.28515625, "learning_rate": 0.00013743082807435615, "loss": 1.2792, "step": 105 }, { "epoch": 0.8796680497925311, "grad_norm": 0.267578125, "learning_rate": 0.00013608445816879866, "loss": 1.2853, "step": 106 }, { "epoch": 0.8879668049792531, "grad_norm": 0.265625, "learning_rate": 0.00013473052528448201, "loss": 1.2923, "step": 107 }, { "epoch": 0.8962655601659751, "grad_norm": 0.275390625, "learning_rate": 0.00013336931319359426, "loss": 1.2768, "step": 108 }, { "epoch": 0.9045643153526971, "grad_norm": 0.267578125, "learning_rate": 0.00013200110719397968, "loss": 1.2701, "step": 109 }, { "epoch": 0.9128630705394191, "grad_norm": 0.25390625, "learning_rate": 0.00013062619404934317, "loss": 1.21, "step": 110 }, { "epoch": 0.921161825726141, "grad_norm": 0.255859375, "learning_rate": 0.00012924486192914705, "loss": 1.2808, "step": 111 }, { "epoch": 0.9294605809128631, "grad_norm": 0.26171875, "learning_rate": 0.00012785740034821329, "loss": 1.2616, "step": 112 }, { "epoch": 0.9377593360995851, "grad_norm": 0.265625, "learning_rate": 0.00012646410010604397, "loss": 1.2094, "step": 113 }, { "epoch": 0.946058091286307, "grad_norm": 0.267578125, "learning_rate": 0.00012506525322587207, "loss": 1.2128, "step": 114 }, { "epoch": 0.9543568464730291, "grad_norm": 0.267578125, "learning_rate": 0.0001236611528934562, "loss": 1.2573, "step": 115 }, { "epoch": 0.9626556016597511, "grad_norm": 0.259765625, "learning_rate": 0.00012225209339563145, "loss": 1.2367, "step": 116 }, { "epoch": 0.970954356846473, "grad_norm": 0.263671875, "learning_rate": 0.00012083837005862946, "loss": 1.2313, "step": 117 }, { "epoch": 0.979253112033195, "grad_norm": 0.265625, "learning_rate": 0.00011942027918618074, "loss": 1.2495, "step": 118 }, { "epoch": 0.9875518672199171, "grad_norm": 0.259765625, "learning_rate": 0.0001179981179974121, "loss": 1.2225, "step": 119 }, { "epoch": 0.995850622406639, "grad_norm": 0.28125, "learning_rate": 0.00011657218456455206, "loss": 1.2869, "step": 120 }, { "epoch": 1.004149377593361, "grad_norm": 0.267578125, "learning_rate": 0.00011514277775045768, "loss": 1.2421, "step": 121 }, { "epoch": 1.012448132780083, "grad_norm": 0.267578125, "learning_rate": 0.00011371019714597562, "loss": 1.2361, "step": 122 }, { "epoch": 1.0020746887966805, "grad_norm": 0.26953125, "learning_rate": 0.00011227474300715055, "loss": 1.2266, "step": 123 }, { "epoch": 1.0103734439834025, "grad_norm": 0.26953125, "learning_rate": 0.00011083671619229408, "loss": 1.1711, "step": 124 }, { "epoch": 1.0186721991701244, "grad_norm": 0.26171875, "learning_rate": 0.00010939641809892767, "loss": 1.1194, "step": 125 }, { "epoch": 1.0269709543568464, "grad_norm": 0.267578125, "learning_rate": 0.00010795415060061243, "loss": 1.2046, "step": 126 }, { "epoch": 1.0352697095435686, "grad_norm": 1.1796875, "learning_rate": 0.00010651021598367906, "loss": 1.7307, "step": 127 }, { "epoch": 1.0435684647302905, "grad_norm": 0.26953125, "learning_rate": 0.00010506491688387127, "loss": 1.1314, "step": 128 }, { "epoch": 1.0518672199170125, "grad_norm": 0.26953125, "learning_rate": 0.00010361855622291637, "loss": 1.1103, "step": 129 }, { "epoch": 1.0601659751037344, "grad_norm": 0.287109375, "learning_rate": 0.00010217143714503508, "loss": 1.1383, "step": 130 }, { "epoch": 1.0684647302904564, "grad_norm": 0.306640625, "learning_rate": 0.00010072386295340572, "loss": 1.1588, "step": 131 }, { "epoch": 1.0767634854771784, "grad_norm": 0.291015625, "learning_rate": 9.927613704659429e-05, "loss": 1.1423, "step": 132 }, { "epoch": 1.0850622406639003, "grad_norm": 0.298828125, "learning_rate": 9.782856285496495e-05, "loss": 1.103, "step": 133 }, { "epoch": 1.0933609958506225, "grad_norm": 0.302734375, "learning_rate": 9.638144377708367e-05, "loss": 1.1682, "step": 134 }, { "epoch": 1.1016597510373445, "grad_norm": 0.296875, "learning_rate": 9.493508311612874e-05, "loss": 1.1089, "step": 135 }, { "epoch": 1.1099585062240664, "grad_norm": 0.291015625, "learning_rate": 9.348978401632101e-05, "loss": 1.072, "step": 136 }, { "epoch": 1.1182572614107884, "grad_norm": 0.296875, "learning_rate": 9.204584939938762e-05, "loss": 1.1051, "step": 137 }, { "epoch": 1.1265560165975104, "grad_norm": 0.29296875, "learning_rate": 9.060358190107234e-05, "loss": 1.1506, "step": 138 }, { "epoch": 1.1348547717842323, "grad_norm": 0.298828125, "learning_rate": 8.916328380770595e-05, "loss": 1.0714, "step": 139 }, { "epoch": 1.1431535269709543, "grad_norm": 0.294921875, "learning_rate": 8.772525699284946e-05, "loss": 1.1131, "step": 140 }, { "epoch": 1.1514522821576763, "grad_norm": 0.30078125, "learning_rate": 8.628980285402439e-05, "loss": 1.1183, "step": 141 }, { "epoch": 1.1597510373443982, "grad_norm": 0.3046875, "learning_rate": 8.485722224954237e-05, "loss": 1.1111, "step": 142 }, { "epoch": 1.1680497925311204, "grad_norm": 0.3125, "learning_rate": 8.342781543544798e-05, "loss": 1.0872, "step": 143 }, { "epoch": 1.1763485477178424, "grad_norm": 0.31640625, "learning_rate": 8.200188200258791e-05, "loss": 1.1321, "step": 144 }, { "epoch": 1.1846473029045643, "grad_norm": 0.322265625, "learning_rate": 8.057972081381927e-05, "loss": 1.1097, "step": 145 }, { "epoch": 1.1929460580912863, "grad_norm": 0.318359375, "learning_rate": 7.916162994137056e-05, "loss": 1.0885, "step": 146 }, { "epoch": 1.2012448132780082, "grad_norm": 0.31640625, "learning_rate": 7.774790660436858e-05, "loss": 1.105, "step": 147 }, { "epoch": 1.2095435684647302, "grad_norm": 0.314453125, "learning_rate": 7.633884710654383e-05, "loss": 1.0775, "step": 148 }, { "epoch": 1.2178423236514524, "grad_norm": 0.322265625, "learning_rate": 7.493474677412794e-05, "loss": 1.1075, "step": 149 }, { "epoch": 1.2261410788381744, "grad_norm": 0.328125, "learning_rate": 7.353589989395604e-05, "loss": 1.1398, "step": 150 }, { "epoch": 1.2344398340248963, "grad_norm": 0.333984375, "learning_rate": 7.214259965178674e-05, "loss": 1.1224, "step": 151 }, { "epoch": 1.2427385892116183, "grad_norm": 0.314453125, "learning_rate": 7.075513807085299e-05, "loss": 1.0898, "step": 152 }, { "epoch": 1.2510373443983402, "grad_norm": 0.33984375, "learning_rate": 6.937380595065685e-05, "loss": 1.0957, "step": 153 }, { "epoch": 1.2593360995850622, "grad_norm": 0.32421875, "learning_rate": 6.799889280602031e-05, "loss": 1.134, "step": 154 }, { "epoch": 1.2676348547717842, "grad_norm": 0.32421875, "learning_rate": 6.663068680640574e-05, "loss": 1.1226, "step": 155 }, { "epoch": 1.2759336099585061, "grad_norm": 0.318359375, "learning_rate": 6.526947471551798e-05, "loss": 1.1155, "step": 156 }, { "epoch": 1.284232365145228, "grad_norm": 0.3203125, "learning_rate": 6.391554183120138e-05, "loss": 1.1167, "step": 157 }, { "epoch": 1.2925311203319503, "grad_norm": 0.32421875, "learning_rate": 6.25691719256439e-05, "loss": 1.0482, "step": 158 }, { "epoch": 1.3008298755186722, "grad_norm": 0.326171875, "learning_rate": 6.123064718590099e-05, "loss": 1.0664, "step": 159 }, { "epoch": 1.3091286307053942, "grad_norm": 0.333984375, "learning_rate": 5.9900248154751616e-05, "loss": 1.1055, "step": 160 }, { "epoch": 1.3174273858921162, "grad_norm": 0.318359375, "learning_rate": 5.857825367189931e-05, "loss": 1.0993, "step": 161 }, { "epoch": 1.3257261410788381, "grad_norm": 0.333984375, "learning_rate": 5.7264940815529485e-05, "loss": 1.086, "step": 162 }, { "epoch": 1.33402489626556, "grad_norm": 0.328125, "learning_rate": 5.596058484423656e-05, "loss": 1.0708, "step": 163 }, { "epoch": 1.3423236514522823, "grad_norm": 1.0234375, "learning_rate": 5.46654591393323e-05, "loss": 1.7514, "step": 164 }, { "epoch": 1.3506224066390042, "grad_norm": 0.3359375, "learning_rate": 5.337983514754723e-05, "loss": 1.0925, "step": 165 }, { "epoch": 1.3589211618257262, "grad_norm": 0.333984375, "learning_rate": 5.2103982324138244e-05, "loss": 1.122, "step": 166 }, { "epoch": 1.3672199170124482, "grad_norm": 0.345703125, "learning_rate": 5.083816807641284e-05, "loss": 1.1124, "step": 167 }, { "epoch": 1.3755186721991701, "grad_norm": 0.330078125, "learning_rate": 4.958265770768316e-05, "loss": 1.0847, "step": 168 }, { "epoch": 1.383817427385892, "grad_norm": 0.33984375, "learning_rate": 4.833771436166069e-05, "loss": 1.1018, "step": 169 }, { "epoch": 1.392116182572614, "grad_norm": 0.34375, "learning_rate": 4.710359896730379e-05, "loss": 1.0666, "step": 170 }, { "epoch": 1.400414937759336, "grad_norm": 0.337890625, "learning_rate": 4.5880570184129215e-05, "loss": 1.0682, "step": 171 }, { "epoch": 1.408713692946058, "grad_norm": 0.33984375, "learning_rate": 4.466888434799958e-05, "loss": 1.0828, "step": 172 }, { "epoch": 1.4170124481327802, "grad_norm": 0.349609375, "learning_rate": 4.34687954173974e-05, "loss": 1.118, "step": 173 }, { "epoch": 1.4253112033195021, "grad_norm": 0.337890625, "learning_rate": 4.2280554920197936e-05, "loss": 1.0306, "step": 174 }, { "epoch": 1.433609958506224, "grad_norm": 0.3359375, "learning_rate": 4.1104411900951015e-05, "loss": 1.085, "step": 175 }, { "epoch": 1.441908713692946, "grad_norm": 0.349609375, "learning_rate": 3.994061286868361e-05, "loss": 1.1037, "step": 176 }, { "epoch": 1.450207468879668, "grad_norm": 0.67578125, "learning_rate": 3.878940174523371e-05, "loss": 1.5781, "step": 177 }, { "epoch": 1.45850622406639, "grad_norm": 0.3359375, "learning_rate": 3.7651019814126654e-05, "loss": 1.0691, "step": 178 }, { "epoch": 1.4668049792531122, "grad_norm": 0.345703125, "learning_rate": 3.652570567000402e-05, "loss": 1.0805, "step": 179 }, { "epoch": 1.4751037344398341, "grad_norm": 0.341796875, "learning_rate": 3.541369516861648e-05, "loss": 1.0789, "step": 180 }, { "epoch": 1.483402489626556, "grad_norm": 0.34765625, "learning_rate": 3.431522137739049e-05, "loss": 1.1362, "step": 181 }, { "epoch": 1.491701244813278, "grad_norm": 0.34375, "learning_rate": 3.323051452657961e-05, "loss": 1.0381, "step": 182 }, { "epoch": 1.5, "grad_norm": 0.3359375, "learning_rate": 3.215980196101002e-05, "loss": 1.0476, "step": 183 }, { "epoch": 1.508298755186722, "grad_norm": 0.333984375, "learning_rate": 3.110330809243134e-05, "loss": 1.0587, "step": 184 }, { "epoch": 1.516597510373444, "grad_norm": 0.349609375, "learning_rate": 3.0061254352481804e-05, "loss": 1.0961, "step": 185 }, { "epoch": 1.5248962655601659, "grad_norm": 0.353515625, "learning_rate": 2.9033859146278197e-05, "loss": 1.0974, "step": 186 }, { "epoch": 1.5331950207468878, "grad_norm": 0.3515625, "learning_rate": 2.8021337806640135e-05, "loss": 1.0793, "step": 187 }, { "epoch": 1.5414937759336098, "grad_norm": 0.34765625, "learning_rate": 2.702390254895819e-05, "loss": 1.1351, "step": 188 }, { "epoch": 1.549792531120332, "grad_norm": 0.33984375, "learning_rate": 2.6041762426715566e-05, "loss": 1.0445, "step": 189 }, { "epoch": 1.558091286307054, "grad_norm": 0.341796875, "learning_rate": 2.5075123287672175e-05, "loss": 1.0722, "step": 190 }, { "epoch": 1.566390041493776, "grad_norm": 0.33984375, "learning_rate": 2.4124187730720917e-05, "loss": 1.1048, "step": 191 }, { "epoch": 1.5746887966804979, "grad_norm": 0.33984375, "learning_rate": 2.3189155063424782e-05, "loss": 1.0925, "step": 192 }, { "epoch": 1.58298755186722, "grad_norm": 0.3515625, "learning_rate": 2.2270221260243673e-05, "loss": 1.1181, "step": 193 }, { "epoch": 1.591286307053942, "grad_norm": 0.34765625, "learning_rate": 2.1367578921460074e-05, "loss": 1.0998, "step": 194 }, { "epoch": 1.599585062240664, "grad_norm": 0.34765625, "learning_rate": 2.0481417232811573e-05, "loss": 1.1088, "step": 195 }, { "epoch": 1.607883817427386, "grad_norm": 0.34375, "learning_rate": 1.961192192583934e-05, "loss": 1.0283, "step": 196 }, { "epoch": 1.616182572614108, "grad_norm": 0.349609375, "learning_rate": 1.8759275238960473e-05, "loss": 1.0841, "step": 197 }, { "epoch": 1.6244813278008299, "grad_norm": 0.330078125, "learning_rate": 1.7923655879272393e-05, "loss": 1.0461, "step": 198 }, { "epoch": 1.6327800829875518, "grad_norm": 0.6328125, "learning_rate": 1.7105238985097472e-05, "loss": 1.5407, "step": 199 }, { "epoch": 1.6410788381742738, "grad_norm": 0.34375, "learning_rate": 1.6304196089275658e-05, "loss": 1.0827, "step": 200 }, { "epoch": 1.6493775933609958, "grad_norm": 0.353515625, "learning_rate": 1.5520695083212678e-05, "loss": 1.1022, "step": 201 }, { "epoch": 1.6576763485477177, "grad_norm": 0.34375, "learning_rate": 1.4754900181691467e-05, "loss": 1.0796, "step": 202 }, { "epoch": 1.6659751037344397, "grad_norm": 0.345703125, "learning_rate": 1.4006971888454323e-05, "loss": 1.0797, "step": 203 }, { "epoch": 1.6742738589211619, "grad_norm": 0.345703125, "learning_rate": 1.3277066962562645e-05, "loss": 1.0898, "step": 204 }, { "epoch": 1.6825726141078838, "grad_norm": 0.35546875, "learning_rate": 1.2565338385541792e-05, "loss": 1.0774, "step": 205 }, { "epoch": 1.6908713692946058, "grad_norm": 0.345703125, "learning_rate": 1.1871935329317363e-05, "loss": 1.0741, "step": 206 }, { "epoch": 1.6991701244813278, "grad_norm": 0.361328125, "learning_rate": 1.1197003124950222e-05, "loss": 1.0568, "step": 207 }, { "epoch": 1.70746887966805, "grad_norm": 0.361328125, "learning_rate": 1.0540683232176307e-05, "loss": 1.125, "step": 208 }, { "epoch": 1.715767634854772, "grad_norm": 0.34765625, "learning_rate": 9.903113209758096e-06, "loss": 1.0842, "step": 209 }, { "epoch": 1.7240663900414939, "grad_norm": 0.3359375, "learning_rate": 9.284426686653303e-06, "loss": 1.086, "step": 210 }, { "epoch": 1.7323651452282158, "grad_norm": 0.3515625, "learning_rate": 8.68475333400769e-06, "loss": 1.0743, "step": 211 }, { "epoch": 1.7406639004149378, "grad_norm": 0.345703125, "learning_rate": 8.10421883797694e-06, "loss": 1.1064, "step": 212 }, { "epoch": 1.7489626556016598, "grad_norm": 0.5546875, "learning_rate": 7.542944873384106e-06, "loss": 1.4331, "step": 213 }, { "epoch": 1.7572614107883817, "grad_norm": 0.349609375, "learning_rate": 7.0010490782176145e-06, "loss": 1.0565, "step": 214 }, { "epoch": 1.7655601659751037, "grad_norm": 0.34765625, "learning_rate": 6.4786450289753715e-06, "loss": 1.1227, "step": 215 }, { "epoch": 1.7738589211618256, "grad_norm": 0.345703125, "learning_rate": 5.975842216860239e-06, "loss": 1.0866, "step": 216 }, { "epoch": 1.7821576763485476, "grad_norm": 0.345703125, "learning_rate": 5.492746024831541e-06, "loss": 1.0861, "step": 217 }, { "epoch": 1.7904564315352696, "grad_norm": 0.337890625, "learning_rate": 5.029457705517793e-06, "loss": 1.0773, "step": 218 }, { "epoch": 1.7987551867219918, "grad_norm": 0.359375, "learning_rate": 4.586074359995119e-06, "loss": 1.0457, "step": 219 }, { "epoch": 1.8070539419087137, "grad_norm": 0.345703125, "learning_rate": 4.162688917435631e-06, "loss": 1.0953, "step": 220 }, { "epoch": 1.8153526970954357, "grad_norm": 0.353515625, "learning_rate": 3.7593901156303566e-06, "loss": 1.0966, "step": 221 }, { "epoch": 1.8236514522821576, "grad_norm": 0.341796875, "learning_rate": 3.3762624823906573e-06, "loss": 1.0517, "step": 222 }, { "epoch": 1.8319502074688798, "grad_norm": 0.33984375, "learning_rate": 3.0133863178318232e-06, "loss": 1.0669, "step": 223 }, { "epoch": 1.8402489626556018, "grad_norm": 0.349609375, "learning_rate": 2.6708376775430033e-06, "loss": 1.091, "step": 224 }, { "epoch": 1.8485477178423237, "grad_norm": 0.34765625, "learning_rate": 2.3486883566465777e-06, "loss": 1.1095, "step": 225 }, { "epoch": 1.8568464730290457, "grad_norm": 0.60546875, "learning_rate": 2.0470058747505516e-06, "loss": 1.4381, "step": 226 }, { "epoch": 1.8651452282157677, "grad_norm": 0.3359375, "learning_rate": 1.7658534617971067e-06, "loss": 1.0979, "step": 227 }, { "epoch": 1.8734439834024896, "grad_norm": 0.35546875, "learning_rate": 1.5052900448100815e-06, "loss": 1.0746, "step": 228 }, { "epoch": 1.8817427385892116, "grad_norm": 0.34765625, "learning_rate": 1.2653702355444608e-06, "loss": 1.0931, "step": 229 }, { "epoch": 1.8900414937759336, "grad_norm": 0.357421875, "learning_rate": 1.0461443190402099e-06, "loss": 1.1229, "step": 230 }, { "epoch": 1.8983402489626555, "grad_norm": 0.345703125, "learning_rate": 8.476582430830049e-07, "loss": 1.1039, "step": 231 }, { "epoch": 1.9066390041493775, "grad_norm": 0.333984375, "learning_rate": 6.699536085739588e-07, "loss": 1.07, "step": 232 }, { "epoch": 1.9149377593360994, "grad_norm": 0.34375, "learning_rate": 5.130676608104845e-07, "loss": 1.1166, "step": 233 }, { "epoch": 1.9232365145228216, "grad_norm": 0.34375, "learning_rate": 3.7703328167999485e-07, "loss": 1.0579, "step": 234 }, { "epoch": 1.9315352697095436, "grad_norm": 0.33984375, "learning_rate": 2.6187898276813784e-07, "loss": 1.0934, "step": 235 }, { "epoch": 1.9398340248962656, "grad_norm": 0.37109375, "learning_rate": 1.6762889938303217e-07, "loss": 1.0792, "step": 236 }, { "epoch": 1.9481327800829875, "grad_norm": 0.337890625, "learning_rate": 9.430278549675819e-08, "loss": 1.1108, "step": 237 }, { "epoch": 1.9564315352697097, "grad_norm": 0.353515625, "learning_rate": 4.191600960505859e-08, "loss": 1.1207, "step": 238 }, { "epoch": 1.9647302904564317, "grad_norm": 0.349609375, "learning_rate": 1.0479551506259456e-08, "loss": 1.0674, "step": 239 }, { "epoch": 1.9730290456431536, "grad_norm": 0.357421875, "learning_rate": 0.0, "loss": 1.0868, "step": 240 } ], "logging_steps": 1, "max_steps": 240, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 120, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9.94707278659584e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }