{ "best_metric": null, "best_model_checkpoint": null, "epoch": 200.0, "eval_steps": 500, "global_step": 145800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.6858710562414266, "grad_norm": 1.2456575632095337, "learning_rate": 0.000996570644718793, "loss": 2.6328, "step": 500 }, { "epoch": 1.3717421124828533, "grad_norm": 0.6728461384773254, "learning_rate": 0.0009931412894375858, "loss": 1.9106, "step": 1000 }, { "epoch": 2.05761316872428, "grad_norm": 0.7172486782073975, "learning_rate": 0.0009897119341563787, "loss": 1.5385, "step": 1500 }, { "epoch": 2.7434842249657065, "grad_norm": 0.5451925992965698, "learning_rate": 0.0009862825788751715, "loss": 1.1861, "step": 2000 }, { "epoch": 3.4293552812071333, "grad_norm": 1.0264705419540405, "learning_rate": 0.0009828532235939644, "loss": 0.9558, "step": 2500 }, { "epoch": 4.11522633744856, "grad_norm": 0.5801027417182922, "learning_rate": 0.0009794238683127573, "loss": 0.7722, "step": 3000 }, { "epoch": 4.801097393689986, "grad_norm": 0.6234860420227051, "learning_rate": 0.0009759945130315501, "loss": 0.5497, "step": 3500 }, { "epoch": 5.486968449931413, "grad_norm": 0.4896549880504608, "learning_rate": 0.000972565157750343, "loss": 0.42, "step": 4000 }, { "epoch": 6.172839506172839, "grad_norm": 0.5927340984344482, "learning_rate": 0.0009691358024691358, "loss": 0.3612, "step": 4500 }, { "epoch": 6.858710562414267, "grad_norm": 0.5274905562400818, "learning_rate": 0.0009657064471879287, "loss": 0.2707, "step": 5000 }, { "epoch": 7.544581618655693, "grad_norm": 0.46353018283843994, "learning_rate": 0.0009622770919067215, "loss": 0.2088, "step": 5500 }, { "epoch": 8.23045267489712, "grad_norm": 0.7881788611412048, "learning_rate": 0.0009588477366255144, "loss": 0.2041, "step": 6000 }, { "epoch": 8.916323731138545, "grad_norm": 0.6398069262504578, "learning_rate": 0.0009554183813443072, "loss": 0.1744, "step": 6500 }, { "epoch": 9.602194787379972, "grad_norm": 0.44406837224960327, "learning_rate": 0.0009519890260631001, "loss": 0.1369, "step": 7000 }, { "epoch": 10.2880658436214, "grad_norm": 0.3870859742164612, "learning_rate": 0.000948559670781893, "loss": 0.1408, "step": 7500 }, { "epoch": 10.973936899862826, "grad_norm": 0.6403707265853882, "learning_rate": 0.0009451303155006859, "loss": 0.1307, "step": 8000 }, { "epoch": 11.659807956104252, "grad_norm": 0.406328022480011, "learning_rate": 0.0009417009602194788, "loss": 0.1105, "step": 8500 }, { "epoch": 12.345679012345679, "grad_norm": 0.4761105179786682, "learning_rate": 0.0009382716049382715, "loss": 0.1075, "step": 9000 }, { "epoch": 13.031550068587105, "grad_norm": 0.39505085349082947, "learning_rate": 0.0009348422496570644, "loss": 0.1101, "step": 9500 }, { "epoch": 13.717421124828533, "grad_norm": 0.3096862733364105, "learning_rate": 0.0009314128943758574, "loss": 0.0889, "step": 10000 }, { "epoch": 14.40329218106996, "grad_norm": 0.681281328201294, "learning_rate": 0.0009279835390946503, "loss": 0.097, "step": 10500 }, { "epoch": 15.089163237311386, "grad_norm": 0.23753629624843597, "learning_rate": 0.0009245541838134432, "loss": 0.0927, "step": 11000 }, { "epoch": 15.775034293552812, "grad_norm": 0.464749813079834, "learning_rate": 0.000921124828532236, "loss": 0.0793, "step": 11500 }, { "epoch": 16.46090534979424, "grad_norm": 0.3283621668815613, "learning_rate": 0.0009176954732510289, "loss": 0.0764, "step": 12000 }, { "epoch": 17.146776406035666, "grad_norm": 0.297809898853302, "learning_rate": 0.0009142661179698217, "loss": 0.0788, "step": 12500 }, { "epoch": 17.83264746227709, "grad_norm": 0.37593135237693787, "learning_rate": 0.0009108367626886146, "loss": 0.0746, "step": 13000 }, { "epoch": 18.51851851851852, "grad_norm": 0.3363408148288727, "learning_rate": 0.0009074074074074074, "loss": 0.0764, "step": 13500 }, { "epoch": 19.204389574759944, "grad_norm": 0.27103549242019653, "learning_rate": 0.0009039780521262003, "loss": 0.0751, "step": 14000 }, { "epoch": 19.89026063100137, "grad_norm": 0.23534435033798218, "learning_rate": 0.0009005486968449932, "loss": 0.0643, "step": 14500 }, { "epoch": 20.5761316872428, "grad_norm": 0.3298335671424866, "learning_rate": 0.0008971193415637861, "loss": 0.0621, "step": 15000 }, { "epoch": 21.262002743484224, "grad_norm": 0.28924617171287537, "learning_rate": 0.000893689986282579, "loss": 0.0661, "step": 15500 }, { "epoch": 21.947873799725652, "grad_norm": 0.34902724623680115, "learning_rate": 0.0008902606310013717, "loss": 0.0693, "step": 16000 }, { "epoch": 22.633744855967077, "grad_norm": 0.24693663418293, "learning_rate": 0.0008868312757201646, "loss": 0.0591, "step": 16500 }, { "epoch": 23.319615912208505, "grad_norm": 0.2739815413951874, "learning_rate": 0.0008834019204389575, "loss": 0.0576, "step": 17000 }, { "epoch": 24.005486968449933, "grad_norm": 0.36366912722587585, "learning_rate": 0.0008799725651577504, "loss": 0.0555, "step": 17500 }, { "epoch": 24.691358024691358, "grad_norm": 0.3027900159358978, "learning_rate": 0.0008765432098765433, "loss": 0.0507, "step": 18000 }, { "epoch": 25.377229080932786, "grad_norm": 0.18507389724254608, "learning_rate": 0.0008731138545953361, "loss": 0.0525, "step": 18500 }, { "epoch": 26.06310013717421, "grad_norm": 0.23404183983802795, "learning_rate": 0.000869684499314129, "loss": 0.0579, "step": 19000 }, { "epoch": 26.74897119341564, "grad_norm": 0.3586121201515198, "learning_rate": 0.0008662551440329218, "loss": 0.048, "step": 19500 }, { "epoch": 27.434842249657063, "grad_norm": 0.2888980805873871, "learning_rate": 0.0008628257887517147, "loss": 0.0515, "step": 20000 }, { "epoch": 28.12071330589849, "grad_norm": 0.23876026272773743, "learning_rate": 0.0008593964334705075, "loss": 0.0491, "step": 20500 }, { "epoch": 28.80658436213992, "grad_norm": 0.2646074891090393, "learning_rate": 0.0008559670781893004, "loss": 0.0466, "step": 21000 }, { "epoch": 29.492455418381343, "grad_norm": 0.1991817206144333, "learning_rate": 0.0008525377229080933, "loss": 0.0451, "step": 21500 }, { "epoch": 30.17832647462277, "grad_norm": 0.218049556016922, "learning_rate": 0.0008491083676268862, "loss": 0.0445, "step": 22000 }, { "epoch": 30.864197530864196, "grad_norm": 0.2530564069747925, "learning_rate": 0.0008456790123456791, "loss": 0.0427, "step": 22500 }, { "epoch": 31.550068587105624, "grad_norm": 0.2068854421377182, "learning_rate": 0.0008422496570644718, "loss": 0.0421, "step": 23000 }, { "epoch": 32.23593964334705, "grad_norm": 0.18602465093135834, "learning_rate": 0.0008388203017832647, "loss": 0.0437, "step": 23500 }, { "epoch": 32.92181069958848, "grad_norm": 0.21352776885032654, "learning_rate": 0.0008353909465020576, "loss": 0.0462, "step": 24000 }, { "epoch": 33.6076817558299, "grad_norm": 0.2586299777030945, "learning_rate": 0.0008319615912208505, "loss": 0.0417, "step": 24500 }, { "epoch": 34.29355281207133, "grad_norm": 0.2829551696777344, "learning_rate": 0.0008285322359396434, "loss": 0.0381, "step": 25000 }, { "epoch": 34.97942386831276, "grad_norm": 0.23624148964881897, "learning_rate": 0.0008251028806584362, "loss": 0.0361, "step": 25500 }, { "epoch": 35.66529492455418, "grad_norm": 0.21780389547348022, "learning_rate": 0.0008216735253772291, "loss": 0.0362, "step": 26000 }, { "epoch": 36.351165980795614, "grad_norm": 0.3541390299797058, "learning_rate": 0.0008182441700960219, "loss": 0.0395, "step": 26500 }, { "epoch": 37.03703703703704, "grad_norm": 0.15323896706104279, "learning_rate": 0.0008148148148148148, "loss": 0.0382, "step": 27000 }, { "epoch": 37.72290809327846, "grad_norm": 0.16792021691799164, "learning_rate": 0.0008113854595336076, "loss": 0.0319, "step": 27500 }, { "epoch": 38.40877914951989, "grad_norm": 0.19167844951152802, "learning_rate": 0.0008079561042524005, "loss": 0.0367, "step": 28000 }, { "epoch": 39.09465020576132, "grad_norm": 0.21890634298324585, "learning_rate": 0.0008045267489711934, "loss": 0.037, "step": 28500 }, { "epoch": 39.78052126200274, "grad_norm": 0.14012588560581207, "learning_rate": 0.0008010973936899864, "loss": 0.0325, "step": 29000 }, { "epoch": 40.46639231824417, "grad_norm": 1.626105546951294, "learning_rate": 0.0007976680384087793, "loss": 0.0315, "step": 29500 }, { "epoch": 41.1522633744856, "grad_norm": 0.1850096881389618, "learning_rate": 0.000794238683127572, "loss": 0.0379, "step": 30000 }, { "epoch": 41.838134430727024, "grad_norm": 0.20383605360984802, "learning_rate": 0.0007908093278463649, "loss": 0.0315, "step": 30500 }, { "epoch": 42.52400548696845, "grad_norm": 0.1643492877483368, "learning_rate": 0.0007873799725651578, "loss": 0.0276, "step": 31000 }, { "epoch": 43.20987654320987, "grad_norm": 0.15405911207199097, "learning_rate": 0.0007839506172839507, "loss": 0.0312, "step": 31500 }, { "epoch": 43.895747599451305, "grad_norm": 0.2370378077030182, "learning_rate": 0.0007805212620027436, "loss": 0.0337, "step": 32000 }, { "epoch": 44.58161865569273, "grad_norm": 0.14176137745380402, "learning_rate": 0.0007770919067215364, "loss": 0.0283, "step": 32500 }, { "epoch": 45.267489711934154, "grad_norm": 0.21307243406772614, "learning_rate": 0.0007736625514403293, "loss": 0.0277, "step": 33000 }, { "epoch": 45.953360768175585, "grad_norm": 0.2646368145942688, "learning_rate": 0.0007702331961591221, "loss": 0.03, "step": 33500 }, { "epoch": 46.63923182441701, "grad_norm": 0.22391417622566223, "learning_rate": 0.000766803840877915, "loss": 0.0298, "step": 34000 }, { "epoch": 47.325102880658434, "grad_norm": 0.15177056193351746, "learning_rate": 0.0007633744855967078, "loss": 0.0272, "step": 34500 }, { "epoch": 48.010973936899866, "grad_norm": 0.20461246371269226, "learning_rate": 0.0007599451303155007, "loss": 0.0278, "step": 35000 }, { "epoch": 48.69684499314129, "grad_norm": 0.12500979006290436, "learning_rate": 0.0007565157750342936, "loss": 0.0268, "step": 35500 }, { "epoch": 49.382716049382715, "grad_norm": 0.4091637134552002, "learning_rate": 0.0007530864197530865, "loss": 0.0267, "step": 36000 }, { "epoch": 50.06858710562414, "grad_norm": 0.22375427186489105, "learning_rate": 0.0007496570644718793, "loss": 0.0327, "step": 36500 }, { "epoch": 50.75445816186557, "grad_norm": 0.11407098174095154, "learning_rate": 0.0007462277091906721, "loss": 0.0245, "step": 37000 }, { "epoch": 51.440329218106996, "grad_norm": 0.1025354415178299, "learning_rate": 0.000742798353909465, "loss": 0.0246, "step": 37500 }, { "epoch": 52.12620027434842, "grad_norm": 0.20578157901763916, "learning_rate": 0.0007393689986282579, "loss": 0.0258, "step": 38000 }, { "epoch": 52.81207133058985, "grad_norm": 0.1874309480190277, "learning_rate": 0.0007359396433470508, "loss": 0.0276, "step": 38500 }, { "epoch": 53.49794238683128, "grad_norm": 0.1793011724948883, "learning_rate": 0.0007325102880658437, "loss": 0.0232, "step": 39000 }, { "epoch": 54.1838134430727, "grad_norm": 0.18910464644432068, "learning_rate": 0.0007290809327846365, "loss": 0.0237, "step": 39500 }, { "epoch": 54.869684499314126, "grad_norm": 0.26874855160713196, "learning_rate": 0.0007256515775034293, "loss": 0.0249, "step": 40000 }, { "epoch": 55.55555555555556, "grad_norm": 0.12961339950561523, "learning_rate": 0.0007222222222222222, "loss": 0.0262, "step": 40500 }, { "epoch": 56.24142661179698, "grad_norm": 0.14723250269889832, "learning_rate": 0.0007187928669410151, "loss": 0.0224, "step": 41000 }, { "epoch": 56.927297668038406, "grad_norm": 0.17689082026481628, "learning_rate": 0.000715363511659808, "loss": 0.0226, "step": 41500 }, { "epoch": 57.61316872427984, "grad_norm": 0.14181144535541534, "learning_rate": 0.0007119341563786008, "loss": 0.0214, "step": 42000 }, { "epoch": 58.29903978052126, "grad_norm": 0.5437673330307007, "learning_rate": 0.0007085048010973937, "loss": 0.0234, "step": 42500 }, { "epoch": 58.98491083676269, "grad_norm": 0.0570339597761631, "learning_rate": 0.0007050754458161866, "loss": 0.0238, "step": 43000 }, { "epoch": 59.67078189300412, "grad_norm": 0.14648722112178802, "learning_rate": 0.0007016460905349794, "loss": 0.0209, "step": 43500 }, { "epoch": 60.35665294924554, "grad_norm": 0.06610771268606186, "learning_rate": 0.0006982167352537722, "loss": 0.0203, "step": 44000 }, { "epoch": 61.04252400548697, "grad_norm": 0.092800073325634, "learning_rate": 0.0006947873799725651, "loss": 0.0198, "step": 44500 }, { "epoch": 61.72839506172839, "grad_norm": 0.12414117157459259, "learning_rate": 0.000691358024691358, "loss": 0.0195, "step": 45000 }, { "epoch": 62.414266117969824, "grad_norm": 0.10231557488441467, "learning_rate": 0.0006879286694101509, "loss": 0.0225, "step": 45500 }, { "epoch": 63.10013717421125, "grad_norm": 0.14670057594776154, "learning_rate": 0.0006844993141289438, "loss": 0.0213, "step": 46000 }, { "epoch": 63.78600823045267, "grad_norm": 0.1486993432044983, "learning_rate": 0.0006810699588477366, "loss": 0.0225, "step": 46500 }, { "epoch": 64.4718792866941, "grad_norm": 0.11352519690990448, "learning_rate": 0.0006776406035665294, "loss": 0.0187, "step": 47000 }, { "epoch": 65.15775034293553, "grad_norm": 0.08721095323562622, "learning_rate": 0.0006742112482853223, "loss": 0.0186, "step": 47500 }, { "epoch": 65.84362139917695, "grad_norm": 0.2794257402420044, "learning_rate": 0.0006707818930041153, "loss": 0.0184, "step": 48000 }, { "epoch": 66.52949245541838, "grad_norm": 0.16597363352775574, "learning_rate": 0.0006673525377229082, "loss": 0.0203, "step": 48500 }, { "epoch": 67.2153635116598, "grad_norm": 0.1067240834236145, "learning_rate": 0.000663923182441701, "loss": 0.0208, "step": 49000 }, { "epoch": 67.90123456790124, "grad_norm": 0.13352781534194946, "learning_rate": 0.0006604938271604939, "loss": 0.0181, "step": 49500 }, { "epoch": 68.58710562414267, "grad_norm": 0.15161901712417603, "learning_rate": 0.0006570644718792868, "loss": 0.02, "step": 50000 }, { "epoch": 69.27297668038409, "grad_norm": 0.19234149158000946, "learning_rate": 0.0006536351165980796, "loss": 0.0179, "step": 50500 }, { "epoch": 69.95884773662551, "grad_norm": 0.11277095228433609, "learning_rate": 0.0006502057613168724, "loss": 0.0181, "step": 51000 }, { "epoch": 70.64471879286694, "grad_norm": 0.10098372399806976, "learning_rate": 0.0006467764060356653, "loss": 0.0187, "step": 51500 }, { "epoch": 71.33058984910836, "grad_norm": 0.08452719449996948, "learning_rate": 0.0006433470507544582, "loss": 0.0167, "step": 52000 }, { "epoch": 72.01646090534979, "grad_norm": 0.11851578205823898, "learning_rate": 0.0006399176954732511, "loss": 0.0189, "step": 52500 }, { "epoch": 72.70233196159123, "grad_norm": 0.2409876585006714, "learning_rate": 0.000636488340192044, "loss": 0.018, "step": 53000 }, { "epoch": 73.38820301783265, "grad_norm": 0.1037708967924118, "learning_rate": 0.0006330589849108368, "loss": 0.0159, "step": 53500 }, { "epoch": 74.07407407407408, "grad_norm": 0.1006944552063942, "learning_rate": 0.0006296296296296296, "loss": 0.018, "step": 54000 }, { "epoch": 74.7599451303155, "grad_norm": 0.11186862736940384, "learning_rate": 0.0006262002743484225, "loss": 0.0169, "step": 54500 }, { "epoch": 75.44581618655693, "grad_norm": 0.08759860694408417, "learning_rate": 0.0006227709190672154, "loss": 0.0165, "step": 55000 }, { "epoch": 76.13168724279835, "grad_norm": 0.14316201210021973, "learning_rate": 0.0006193415637860083, "loss": 0.0176, "step": 55500 }, { "epoch": 76.81755829903977, "grad_norm": 0.1392996907234192, "learning_rate": 0.0006159122085048011, "loss": 0.0161, "step": 56000 }, { "epoch": 77.50342935528121, "grad_norm": 0.14493365585803986, "learning_rate": 0.000612482853223594, "loss": 0.0157, "step": 56500 }, { "epoch": 78.18930041152264, "grad_norm": 0.0993877425789833, "learning_rate": 0.0006090534979423869, "loss": 0.0168, "step": 57000 }, { "epoch": 78.87517146776406, "grad_norm": 0.09831307828426361, "learning_rate": 0.0006056241426611797, "loss": 0.0164, "step": 57500 }, { "epoch": 79.56104252400549, "grad_norm": 0.09205558151006699, "learning_rate": 0.0006021947873799725, "loss": 0.0157, "step": 58000 }, { "epoch": 80.24691358024691, "grad_norm": 0.10848256945610046, "learning_rate": 0.0005987654320987654, "loss": 0.014, "step": 58500 }, { "epoch": 80.93278463648834, "grad_norm": 0.14594706892967224, "learning_rate": 0.0005953360768175583, "loss": 0.0144, "step": 59000 }, { "epoch": 81.61865569272976, "grad_norm": 0.057823847979307175, "learning_rate": 0.0005919067215363512, "loss": 0.0143, "step": 59500 }, { "epoch": 82.3045267489712, "grad_norm": 0.3916853368282318, "learning_rate": 0.0005884773662551441, "loss": 0.0159, "step": 60000 }, { "epoch": 82.99039780521262, "grad_norm": 0.11421903222799301, "learning_rate": 0.0005850480109739369, "loss": 0.017, "step": 60500 }, { "epoch": 83.67626886145405, "grad_norm": 0.10148236900568008, "learning_rate": 0.0005816186556927297, "loss": 0.014, "step": 61000 }, { "epoch": 84.36213991769547, "grad_norm": 0.12611427903175354, "learning_rate": 0.0005781893004115226, "loss": 0.0139, "step": 61500 }, { "epoch": 85.0480109739369, "grad_norm": 0.07960600405931473, "learning_rate": 0.0005747599451303155, "loss": 0.0145, "step": 62000 }, { "epoch": 85.73388203017832, "grad_norm": 0.09838591516017914, "learning_rate": 0.0005713305898491084, "loss": 0.0131, "step": 62500 }, { "epoch": 86.41975308641975, "grad_norm": 0.06399769335985184, "learning_rate": 0.0005679012345679012, "loss": 0.0131, "step": 63000 }, { "epoch": 87.10562414266118, "grad_norm": 0.1615062952041626, "learning_rate": 0.0005644718792866941, "loss": 0.0159, "step": 63500 }, { "epoch": 87.79149519890261, "grad_norm": 0.11928918957710266, "learning_rate": 0.000561042524005487, "loss": 0.0153, "step": 64000 }, { "epoch": 88.47736625514403, "grad_norm": 0.0789957344532013, "learning_rate": 0.0005576131687242798, "loss": 0.0128, "step": 64500 }, { "epoch": 89.16323731138546, "grad_norm": 0.10119163990020752, "learning_rate": 0.0005541838134430726, "loss": 0.0134, "step": 65000 }, { "epoch": 89.84910836762688, "grad_norm": 0.10709987580776215, "learning_rate": 0.0005507544581618655, "loss": 0.0129, "step": 65500 }, { "epoch": 90.53497942386831, "grad_norm": 0.1411323994398117, "learning_rate": 0.0005473251028806584, "loss": 0.0135, "step": 66000 }, { "epoch": 91.22085048010975, "grad_norm": 0.1025325134396553, "learning_rate": 0.0005438957475994513, "loss": 0.0128, "step": 66500 }, { "epoch": 91.90672153635117, "grad_norm": 0.11264779418706894, "learning_rate": 0.0005404663923182442, "loss": 0.0143, "step": 67000 }, { "epoch": 92.5925925925926, "grad_norm": 0.11864063143730164, "learning_rate": 0.0005370370370370371, "loss": 0.0124, "step": 67500 }, { "epoch": 93.27846364883402, "grad_norm": 0.07646331936120987, "learning_rate": 0.0005336076817558299, "loss": 0.0126, "step": 68000 }, { "epoch": 93.96433470507544, "grad_norm": 0.06260576099157333, "learning_rate": 0.0005301783264746228, "loss": 0.0116, "step": 68500 }, { "epoch": 94.65020576131687, "grad_norm": 0.10267277806997299, "learning_rate": 0.0005267489711934157, "loss": 0.0117, "step": 69000 }, { "epoch": 95.3360768175583, "grad_norm": 0.07879356294870377, "learning_rate": 0.0005233196159122086, "loss": 0.0133, "step": 69500 }, { "epoch": 96.02194787379973, "grad_norm": 0.06801025569438934, "learning_rate": 0.0005198902606310014, "loss": 0.0121, "step": 70000 }, { "epoch": 96.70781893004116, "grad_norm": 0.1383305788040161, "learning_rate": 0.0005164609053497943, "loss": 0.0118, "step": 70500 }, { "epoch": 97.39368998628258, "grad_norm": 0.13461919128894806, "learning_rate": 0.0005130315500685872, "loss": 0.0121, "step": 71000 }, { "epoch": 98.079561042524, "grad_norm": 0.07046571373939514, "learning_rate": 0.00050960219478738, "loss": 0.0119, "step": 71500 }, { "epoch": 98.76543209876543, "grad_norm": 0.12527473270893097, "learning_rate": 0.0005061728395061728, "loss": 0.0116, "step": 72000 }, { "epoch": 99.45130315500685, "grad_norm": 0.08155812323093414, "learning_rate": 0.0005027434842249657, "loss": 0.0108, "step": 72500 }, { "epoch": 100.13717421124828, "grad_norm": 0.10491594672203064, "learning_rate": 0.0004993141289437586, "loss": 0.0112, "step": 73000 }, { "epoch": 100.82304526748972, "grad_norm": 0.10411892831325531, "learning_rate": 0.0004958847736625515, "loss": 0.0105, "step": 73500 }, { "epoch": 101.50891632373114, "grad_norm": 0.049623072147369385, "learning_rate": 0.0004924554183813444, "loss": 0.0111, "step": 74000 }, { "epoch": 102.19478737997257, "grad_norm": 0.11287267506122589, "learning_rate": 0.0004890260631001372, "loss": 0.0113, "step": 74500 }, { "epoch": 102.88065843621399, "grad_norm": 0.09406940639019012, "learning_rate": 0.00048559670781893007, "loss": 0.0113, "step": 75000 }, { "epoch": 103.56652949245542, "grad_norm": 0.05741545557975769, "learning_rate": 0.00048216735253772295, "loss": 0.0116, "step": 75500 }, { "epoch": 104.25240054869684, "grad_norm": 0.06398554146289825, "learning_rate": 0.0004787379972565158, "loss": 0.0124, "step": 76000 }, { "epoch": 104.93827160493827, "grad_norm": 0.05331519991159439, "learning_rate": 0.00047530864197530866, "loss": 0.0105, "step": 76500 }, { "epoch": 105.6241426611797, "grad_norm": 0.040314387530088425, "learning_rate": 0.00047187928669410154, "loss": 0.0091, "step": 77000 }, { "epoch": 106.31001371742113, "grad_norm": 0.0661318302154541, "learning_rate": 0.00046844993141289436, "loss": 0.0096, "step": 77500 }, { "epoch": 106.99588477366255, "grad_norm": 0.13146652281284332, "learning_rate": 0.00046502057613168724, "loss": 0.0112, "step": 78000 }, { "epoch": 107.68175582990398, "grad_norm": 0.060037847608327866, "learning_rate": 0.0004615912208504801, "loss": 0.0096, "step": 78500 }, { "epoch": 108.3676268861454, "grad_norm": 0.10393790900707245, "learning_rate": 0.00045816186556927295, "loss": 0.0107, "step": 79000 }, { "epoch": 109.05349794238683, "grad_norm": 0.053112734109163284, "learning_rate": 0.0004547325102880658, "loss": 0.0096, "step": 79500 }, { "epoch": 109.73936899862825, "grad_norm": 0.05577271804213524, "learning_rate": 0.0004513031550068587, "loss": 0.0091, "step": 80000 }, { "epoch": 110.42524005486969, "grad_norm": 0.03647785261273384, "learning_rate": 0.0004478737997256516, "loss": 0.009, "step": 80500 }, { "epoch": 111.11111111111111, "grad_norm": 0.09830909222364426, "learning_rate": 0.0004444444444444444, "loss": 0.0098, "step": 81000 }, { "epoch": 111.79698216735254, "grad_norm": 0.025291450321674347, "learning_rate": 0.0004410150891632373, "loss": 0.0099, "step": 81500 }, { "epoch": 112.48285322359396, "grad_norm": 0.0518038235604763, "learning_rate": 0.0004375857338820302, "loss": 0.0093, "step": 82000 }, { "epoch": 113.16872427983539, "grad_norm": 0.08746583759784698, "learning_rate": 0.00043415637860082305, "loss": 0.0096, "step": 82500 }, { "epoch": 113.85459533607681, "grad_norm": 0.0944758877158165, "learning_rate": 0.00043072702331961593, "loss": 0.0098, "step": 83000 }, { "epoch": 114.54046639231825, "grad_norm": 0.10082614421844482, "learning_rate": 0.0004272976680384088, "loss": 0.0087, "step": 83500 }, { "epoch": 115.22633744855968, "grad_norm": 0.05366198346018791, "learning_rate": 0.0004238683127572017, "loss": 0.0088, "step": 84000 }, { "epoch": 115.9122085048011, "grad_norm": 0.04216461256146431, "learning_rate": 0.0004204389574759945, "loss": 0.0083, "step": 84500 }, { "epoch": 116.59807956104252, "grad_norm": 0.061591554433107376, "learning_rate": 0.0004170096021947874, "loss": 0.0098, "step": 85000 }, { "epoch": 117.28395061728395, "grad_norm": 0.10498243570327759, "learning_rate": 0.0004135802469135803, "loss": 0.0093, "step": 85500 }, { "epoch": 117.96982167352537, "grad_norm": 0.041311949491500854, "learning_rate": 0.0004101508916323731, "loss": 0.0087, "step": 86000 }, { "epoch": 118.6556927297668, "grad_norm": 0.09587587416172028, "learning_rate": 0.000406721536351166, "loss": 0.0098, "step": 86500 }, { "epoch": 119.34156378600824, "grad_norm": 0.05686916410923004, "learning_rate": 0.00040329218106995886, "loss": 0.0084, "step": 87000 }, { "epoch": 120.02743484224966, "grad_norm": 0.06297193467617035, "learning_rate": 0.00039986282578875174, "loss": 0.0082, "step": 87500 }, { "epoch": 120.71330589849109, "grad_norm": 0.13432051241397858, "learning_rate": 0.00039643347050754456, "loss": 0.0075, "step": 88000 }, { "epoch": 121.39917695473251, "grad_norm": 0.03966566175222397, "learning_rate": 0.00039300411522633744, "loss": 0.0133, "step": 88500 }, { "epoch": 122.08504801097394, "grad_norm": 0.09373347461223602, "learning_rate": 0.0003895747599451303, "loss": 0.0082, "step": 89000 }, { "epoch": 122.77091906721536, "grad_norm": 0.06179581582546234, "learning_rate": 0.00038614540466392315, "loss": 0.0081, "step": 89500 }, { "epoch": 123.45679012345678, "grad_norm": 0.050256237387657166, "learning_rate": 0.00038271604938271603, "loss": 0.0081, "step": 90000 }, { "epoch": 124.14266117969822, "grad_norm": 0.03073493391275406, "learning_rate": 0.0003792866941015089, "loss": 0.0076, "step": 90500 }, { "epoch": 124.82853223593965, "grad_norm": 0.07006064057350159, "learning_rate": 0.0003758573388203018, "loss": 0.0075, "step": 91000 }, { "epoch": 125.51440329218107, "grad_norm": 0.05567869916558266, "learning_rate": 0.00037242798353909467, "loss": 0.0079, "step": 91500 }, { "epoch": 126.2002743484225, "grad_norm": 0.03926622495055199, "learning_rate": 0.00036899862825788755, "loss": 0.008, "step": 92000 }, { "epoch": 126.88614540466392, "grad_norm": 0.05135864019393921, "learning_rate": 0.0003655692729766804, "loss": 0.0075, "step": 92500 }, { "epoch": 127.57201646090535, "grad_norm": 0.09010568261146545, "learning_rate": 0.00036213991769547325, "loss": 0.0068, "step": 93000 }, { "epoch": 128.25788751714677, "grad_norm": 0.043294016271829605, "learning_rate": 0.00035871056241426613, "loss": 0.0072, "step": 93500 }, { "epoch": 128.9437585733882, "grad_norm": 0.07277141511440277, "learning_rate": 0.000355281207133059, "loss": 0.0073, "step": 94000 }, { "epoch": 129.62962962962962, "grad_norm": 0.06138594448566437, "learning_rate": 0.0003518518518518519, "loss": 0.007, "step": 94500 }, { "epoch": 130.31550068587106, "grad_norm": 0.06205645576119423, "learning_rate": 0.0003484224965706447, "loss": 0.0078, "step": 95000 }, { "epoch": 131.0013717421125, "grad_norm": 0.0695485770702362, "learning_rate": 0.0003449931412894376, "loss": 0.0074, "step": 95500 }, { "epoch": 131.6872427983539, "grad_norm": 0.027826467528939247, "learning_rate": 0.0003415637860082305, "loss": 0.0063, "step": 96000 }, { "epoch": 132.37311385459535, "grad_norm": 0.05079466477036476, "learning_rate": 0.0003381344307270233, "loss": 0.0068, "step": 96500 }, { "epoch": 133.05898491083676, "grad_norm": 0.031004609540104866, "learning_rate": 0.0003347050754458162, "loss": 0.0076, "step": 97000 }, { "epoch": 133.7448559670782, "grad_norm": 0.04684276878833771, "learning_rate": 0.00033127572016460906, "loss": 0.0066, "step": 97500 }, { "epoch": 134.4307270233196, "grad_norm": 0.04819253832101822, "learning_rate": 0.00032784636488340194, "loss": 0.0078, "step": 98000 }, { "epoch": 135.11659807956104, "grad_norm": 0.04323802888393402, "learning_rate": 0.00032441700960219477, "loss": 0.0066, "step": 98500 }, { "epoch": 135.80246913580248, "grad_norm": 0.0720980316400528, "learning_rate": 0.00032098765432098765, "loss": 0.006, "step": 99000 }, { "epoch": 136.4883401920439, "grad_norm": 0.03386896476149559, "learning_rate": 0.0003175582990397805, "loss": 0.0061, "step": 99500 }, { "epoch": 137.17421124828533, "grad_norm": 0.033124495297670364, "learning_rate": 0.00031412894375857335, "loss": 0.0062, "step": 100000 }, { "epoch": 137.86008230452674, "grad_norm": 0.05887928605079651, "learning_rate": 0.00031069958847736623, "loss": 0.0061, "step": 100500 }, { "epoch": 138.54595336076818, "grad_norm": 0.04949059709906578, "learning_rate": 0.00030727023319615916, "loss": 0.0064, "step": 101000 }, { "epoch": 139.2318244170096, "grad_norm": 0.020787570625543594, "learning_rate": 0.00030384087791495204, "loss": 0.0057, "step": 101500 }, { "epoch": 139.91769547325103, "grad_norm": 0.02130681276321411, "learning_rate": 0.00030041152263374487, "loss": 0.0062, "step": 102000 }, { "epoch": 140.60356652949247, "grad_norm": 0.025823501870036125, "learning_rate": 0.00029698216735253775, "loss": 0.0062, "step": 102500 }, { "epoch": 141.28943758573388, "grad_norm": 0.06032751873135567, "learning_rate": 0.00029355281207133063, "loss": 0.0059, "step": 103000 }, { "epoch": 141.97530864197532, "grad_norm": 0.05369709059596062, "learning_rate": 0.00029012345679012345, "loss": 0.0067, "step": 103500 }, { "epoch": 142.66117969821673, "grad_norm": 0.012244037352502346, "learning_rate": 0.00028669410150891633, "loss": 0.0055, "step": 104000 }, { "epoch": 143.34705075445817, "grad_norm": 0.03182597830891609, "learning_rate": 0.0002832647462277092, "loss": 0.0057, "step": 104500 }, { "epoch": 144.03292181069958, "grad_norm": 0.11879345774650574, "learning_rate": 0.0002798353909465021, "loss": 0.0058, "step": 105000 }, { "epoch": 144.71879286694102, "grad_norm": 0.06132959946990013, "learning_rate": 0.0002764060356652949, "loss": 0.0059, "step": 105500 }, { "epoch": 145.40466392318245, "grad_norm": 0.03873404487967491, "learning_rate": 0.0002729766803840878, "loss": 0.0054, "step": 106000 }, { "epoch": 146.09053497942386, "grad_norm": 0.05853896215558052, "learning_rate": 0.0002695473251028807, "loss": 0.0055, "step": 106500 }, { "epoch": 146.7764060356653, "grad_norm": 0.022752197459340096, "learning_rate": 0.0002661179698216735, "loss": 0.0054, "step": 107000 }, { "epoch": 147.4622770919067, "grad_norm": 0.013148725032806396, "learning_rate": 0.0002626886145404664, "loss": 0.0052, "step": 107500 }, { "epoch": 148.14814814814815, "grad_norm": 0.0366031639277935, "learning_rate": 0.00025925925925925926, "loss": 0.0057, "step": 108000 }, { "epoch": 148.83401920438956, "grad_norm": 0.08972814679145813, "learning_rate": 0.0002558299039780521, "loss": 0.0055, "step": 108500 }, { "epoch": 149.519890260631, "grad_norm": 0.034281570464372635, "learning_rate": 0.00025240054869684497, "loss": 0.0051, "step": 109000 }, { "epoch": 150.20576131687244, "grad_norm": 0.010903775691986084, "learning_rate": 0.00024897119341563785, "loss": 0.005, "step": 109500 }, { "epoch": 150.89163237311385, "grad_norm": 0.015872234478592873, "learning_rate": 0.0002455418381344307, "loss": 0.0048, "step": 110000 }, { "epoch": 151.5775034293553, "grad_norm": 0.03331177309155464, "learning_rate": 0.0002421124828532236, "loss": 0.0053, "step": 110500 }, { "epoch": 152.2633744855967, "grad_norm": 0.018091242760419846, "learning_rate": 0.00023868312757201646, "loss": 0.0052, "step": 111000 }, { "epoch": 152.94924554183814, "grad_norm": 0.03916322439908981, "learning_rate": 0.00023525377229080934, "loss": 0.0051, "step": 111500 }, { "epoch": 153.63511659807955, "grad_norm": 0.023662962019443512, "learning_rate": 0.0002318244170096022, "loss": 0.0048, "step": 112000 }, { "epoch": 154.320987654321, "grad_norm": 0.014827713370323181, "learning_rate": 0.00022839506172839504, "loss": 0.0046, "step": 112500 }, { "epoch": 155.00685871056243, "grad_norm": 0.03849729895591736, "learning_rate": 0.00022496570644718795, "loss": 0.0047, "step": 113000 }, { "epoch": 155.69272976680384, "grad_norm": 0.057328708469867706, "learning_rate": 0.0002215363511659808, "loss": 0.0046, "step": 113500 }, { "epoch": 156.37860082304528, "grad_norm": 0.017304692417383194, "learning_rate": 0.00021810699588477368, "loss": 0.0047, "step": 114000 }, { "epoch": 157.06447187928669, "grad_norm": 0.02947048842906952, "learning_rate": 0.00021467764060356654, "loss": 0.0047, "step": 114500 }, { "epoch": 157.75034293552812, "grad_norm": 0.02040564827620983, "learning_rate": 0.0002112482853223594, "loss": 0.0048, "step": 115000 }, { "epoch": 158.43621399176953, "grad_norm": 0.05288396403193474, "learning_rate": 0.00020781893004115227, "loss": 0.0047, "step": 115500 }, { "epoch": 159.12208504801097, "grad_norm": 0.026866160333156586, "learning_rate": 0.00020438957475994512, "loss": 0.0047, "step": 116000 }, { "epoch": 159.8079561042524, "grad_norm": 0.04552701115608215, "learning_rate": 0.000200960219478738, "loss": 0.0043, "step": 116500 }, { "epoch": 160.49382716049382, "grad_norm": 0.020935742184519768, "learning_rate": 0.00019753086419753085, "loss": 0.0045, "step": 117000 }, { "epoch": 161.17969821673526, "grad_norm": 0.03535682335495949, "learning_rate": 0.00019410150891632373, "loss": 0.0044, "step": 117500 }, { "epoch": 161.86556927297667, "grad_norm": 0.019771935418248177, "learning_rate": 0.0001906721536351166, "loss": 0.0044, "step": 118000 }, { "epoch": 162.5514403292181, "grad_norm": 0.013906066305935383, "learning_rate": 0.00018724279835390946, "loss": 0.004, "step": 118500 }, { "epoch": 163.23731138545952, "grad_norm": 0.012126780115067959, "learning_rate": 0.00018381344307270234, "loss": 0.0043, "step": 119000 }, { "epoch": 163.92318244170096, "grad_norm": 0.021324895322322845, "learning_rate": 0.0001803840877914952, "loss": 0.0043, "step": 119500 }, { "epoch": 164.6090534979424, "grad_norm": 0.013377720490098, "learning_rate": 0.00017695473251028808, "loss": 0.0043, "step": 120000 }, { "epoch": 165.2949245541838, "grad_norm": 0.08576662838459015, "learning_rate": 0.00017352537722908093, "loss": 0.0043, "step": 120500 }, { "epoch": 165.98079561042525, "grad_norm": 0.008204947225749493, "learning_rate": 0.0001700960219478738, "loss": 0.004, "step": 121000 }, { "epoch": 166.66666666666666, "grad_norm": 0.01228815782815218, "learning_rate": 0.00016666666666666666, "loss": 0.0038, "step": 121500 }, { "epoch": 167.3525377229081, "grad_norm": 0.007976936176419258, "learning_rate": 0.00016323731138545951, "loss": 0.004, "step": 122000 }, { "epoch": 168.0384087791495, "grad_norm": 0.06643296033143997, "learning_rate": 0.00015980795610425242, "loss": 0.0042, "step": 122500 }, { "epoch": 168.72427983539094, "grad_norm": 0.040761686861515045, "learning_rate": 0.00015637860082304527, "loss": 0.0041, "step": 123000 }, { "epoch": 169.41015089163238, "grad_norm": 0.006533100735396147, "learning_rate": 0.00015294924554183815, "loss": 0.0039, "step": 123500 }, { "epoch": 170.0960219478738, "grad_norm": 0.015169362537562847, "learning_rate": 0.000149519890260631, "loss": 0.0039, "step": 124000 }, { "epoch": 170.78189300411523, "grad_norm": 0.03795445337891579, "learning_rate": 0.00014609053497942388, "loss": 0.0039, "step": 124500 }, { "epoch": 171.46776406035664, "grad_norm": 0.03991511091589928, "learning_rate": 0.00014266117969821674, "loss": 0.0038, "step": 125000 }, { "epoch": 172.15363511659808, "grad_norm": 0.008595280349254608, "learning_rate": 0.0001392318244170096, "loss": 0.0039, "step": 125500 }, { "epoch": 172.8395061728395, "grad_norm": 0.01920117437839508, "learning_rate": 0.00013580246913580247, "loss": 0.0036, "step": 126000 }, { "epoch": 173.52537722908093, "grad_norm": 0.00689229741692543, "learning_rate": 0.00013237311385459532, "loss": 0.0035, "step": 126500 }, { "epoch": 174.21124828532237, "grad_norm": 0.007589759770780802, "learning_rate": 0.0001289437585733882, "loss": 0.0037, "step": 127000 }, { "epoch": 174.89711934156378, "grad_norm": 0.026523003354668617, "learning_rate": 0.00012551440329218108, "loss": 0.0037, "step": 127500 }, { "epoch": 175.58299039780522, "grad_norm": 0.009044776670634747, "learning_rate": 0.00012208504801097393, "loss": 0.0035, "step": 128000 }, { "epoch": 176.26886145404663, "grad_norm": 0.007540062535554171, "learning_rate": 0.00011865569272976681, "loss": 0.0038, "step": 128500 }, { "epoch": 176.95473251028807, "grad_norm": 0.00613565556704998, "learning_rate": 0.00011522633744855968, "loss": 0.0036, "step": 129000 }, { "epoch": 177.6406035665295, "grad_norm": 0.005932086147367954, "learning_rate": 0.00011179698216735255, "loss": 0.0036, "step": 129500 }, { "epoch": 178.32647462277092, "grad_norm": 0.0038564407732337713, "learning_rate": 0.0001083676268861454, "loss": 0.0035, "step": 130000 }, { "epoch": 179.01234567901236, "grad_norm": 0.10642554610967636, "learning_rate": 0.00010493827160493826, "loss": 0.0035, "step": 130500 }, { "epoch": 179.69821673525377, "grad_norm": 0.010230828076601028, "learning_rate": 0.00010150891632373114, "loss": 0.0034, "step": 131000 }, { "epoch": 180.3840877914952, "grad_norm": 0.02526956982910633, "learning_rate": 9.807956104252401e-05, "loss": 0.0034, "step": 131500 }, { "epoch": 181.06995884773661, "grad_norm": 0.00888708233833313, "learning_rate": 9.465020576131688e-05, "loss": 0.0035, "step": 132000 }, { "epoch": 181.75582990397805, "grad_norm": 0.005969726946204901, "learning_rate": 9.122085048010974e-05, "loss": 0.0033, "step": 132500 }, { "epoch": 182.4417009602195, "grad_norm": 0.004369193222373724, "learning_rate": 8.77914951989026e-05, "loss": 0.0033, "step": 133000 }, { "epoch": 183.1275720164609, "grad_norm": 0.03928361088037491, "learning_rate": 8.436213991769547e-05, "loss": 0.0035, "step": 133500 }, { "epoch": 183.81344307270234, "grad_norm": 0.008007598109543324, "learning_rate": 8.093278463648834e-05, "loss": 0.0032, "step": 134000 }, { "epoch": 184.49931412894375, "grad_norm": 0.00541963754221797, "learning_rate": 7.750342935528121e-05, "loss": 0.0032, "step": 134500 }, { "epoch": 185.1851851851852, "grad_norm": 0.010783227160573006, "learning_rate": 7.407407407407407e-05, "loss": 0.0033, "step": 135000 }, { "epoch": 185.8710562414266, "grad_norm": 0.005976933054625988, "learning_rate": 7.064471879286695e-05, "loss": 0.0033, "step": 135500 }, { "epoch": 186.55692729766804, "grad_norm": 0.010424752719700336, "learning_rate": 6.721536351165982e-05, "loss": 0.0031, "step": 136000 }, { "epoch": 187.24279835390948, "grad_norm": 0.00834636203944683, "learning_rate": 6.378600823045267e-05, "loss": 0.0032, "step": 136500 }, { "epoch": 187.9286694101509, "grad_norm": 0.0064740655943751335, "learning_rate": 6.0356652949245544e-05, "loss": 0.0032, "step": 137000 }, { "epoch": 188.61454046639233, "grad_norm": 0.007856756448745728, "learning_rate": 5.692729766803841e-05, "loss": 0.0031, "step": 137500 }, { "epoch": 189.30041152263374, "grad_norm": 0.006533577106893063, "learning_rate": 5.3497942386831277e-05, "loss": 0.0031, "step": 138000 }, { "epoch": 189.98628257887518, "grad_norm": 0.005114428699016571, "learning_rate": 5.006858710562415e-05, "loss": 0.0032, "step": 138500 }, { "epoch": 190.6721536351166, "grad_norm": 0.00717920670285821, "learning_rate": 4.663923182441701e-05, "loss": 0.0031, "step": 139000 }, { "epoch": 191.35802469135803, "grad_norm": 0.008158246986567974, "learning_rate": 4.3209876543209875e-05, "loss": 0.0031, "step": 139500 }, { "epoch": 192.04389574759946, "grad_norm": 0.005865829065442085, "learning_rate": 3.978052126200275e-05, "loss": 0.0031, "step": 140000 }, { "epoch": 192.72976680384087, "grad_norm": 0.0036265423987060785, "learning_rate": 3.635116598079561e-05, "loss": 0.003, "step": 140500 }, { "epoch": 193.4156378600823, "grad_norm": 0.005760482046753168, "learning_rate": 3.292181069958848e-05, "loss": 0.0029, "step": 141000 }, { "epoch": 194.10150891632372, "grad_norm": 0.00619177520275116, "learning_rate": 2.9492455418381346e-05, "loss": 0.0031, "step": 141500 }, { "epoch": 194.78737997256516, "grad_norm": 0.00745520880445838, "learning_rate": 2.6063100137174212e-05, "loss": 0.0029, "step": 142000 }, { "epoch": 195.47325102880657, "grad_norm": 0.006654828321188688, "learning_rate": 2.2633744855967078e-05, "loss": 0.003, "step": 142500 }, { "epoch": 196.159122085048, "grad_norm": 0.007228308357298374, "learning_rate": 1.9204389574759948e-05, "loss": 0.003, "step": 143000 }, { "epoch": 196.84499314128945, "grad_norm": 0.007839037105441093, "learning_rate": 1.577503429355281e-05, "loss": 0.0029, "step": 143500 }, { "epoch": 197.53086419753086, "grad_norm": 0.005645914003252983, "learning_rate": 1.2345679012345678e-05, "loss": 0.0029, "step": 144000 }, { "epoch": 198.2167352537723, "grad_norm": 0.007110149599611759, "learning_rate": 8.916323731138546e-06, "loss": 0.0029, "step": 144500 }, { "epoch": 198.9026063100137, "grad_norm": 0.007023398298770189, "learning_rate": 5.486968449931412e-06, "loss": 0.003, "step": 145000 }, { "epoch": 199.58847736625515, "grad_norm": 0.006341323722153902, "learning_rate": 2.05761316872428e-06, "loss": 0.0028, "step": 145500 }, { "epoch": 200.0, "step": 145800, "total_flos": 2.53654993993728e+18, "train_loss": 0.006510871606680919, "train_runtime": 172601.0106, "train_samples_per_second": 3.377, "train_steps_per_second": 0.845 } ], "logging_steps": 500, "max_steps": 145800, "num_input_tokens_seen": 0, "num_train_epochs": 200, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.53654993993728e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }