{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 200.0,
  "eval_steps": 500,
  "global_step": 145800,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.6858710562414266,
      "grad_norm": 1.2456575632095337,
      "learning_rate": 0.000996570644718793,
      "loss": 2.6328,
      "step": 500
    },
    {
      "epoch": 1.3717421124828533,
      "grad_norm": 0.6728461384773254,
      "learning_rate": 0.0009931412894375858,
      "loss": 1.9106,
      "step": 1000
    },
    {
      "epoch": 2.05761316872428,
      "grad_norm": 0.7172486782073975,
      "learning_rate": 0.0009897119341563787,
      "loss": 1.5385,
      "step": 1500
    },
    {
      "epoch": 2.7434842249657065,
      "grad_norm": 0.5451925992965698,
      "learning_rate": 0.0009862825788751715,
      "loss": 1.1861,
      "step": 2000
    },
    {
      "epoch": 3.4293552812071333,
      "grad_norm": 1.0264705419540405,
      "learning_rate": 0.0009828532235939644,
      "loss": 0.9558,
      "step": 2500
    },
    {
      "epoch": 4.11522633744856,
      "grad_norm": 0.5801027417182922,
      "learning_rate": 0.0009794238683127573,
      "loss": 0.7722,
      "step": 3000
    },
    {
      "epoch": 4.801097393689986,
      "grad_norm": 0.6234860420227051,
      "learning_rate": 0.0009759945130315501,
      "loss": 0.5497,
      "step": 3500
    },
    {
      "epoch": 5.486968449931413,
      "grad_norm": 0.4896549880504608,
      "learning_rate": 0.000972565157750343,
      "loss": 0.42,
      "step": 4000
    },
    {
      "epoch": 6.172839506172839,
      "grad_norm": 0.5927340984344482,
      "learning_rate": 0.0009691358024691358,
      "loss": 0.3612,
      "step": 4500
    },
    {
      "epoch": 6.858710562414267,
      "grad_norm": 0.5274905562400818,
      "learning_rate": 0.0009657064471879287,
      "loss": 0.2707,
      "step": 5000
    },
    {
      "epoch": 7.544581618655693,
      "grad_norm": 0.46353018283843994,
      "learning_rate": 0.0009622770919067215,
      "loss": 0.2088,
      "step": 5500
    },
    {
      "epoch": 8.23045267489712,
      "grad_norm": 0.7881788611412048,
      "learning_rate": 0.0009588477366255144,
      "loss": 0.2041,
      "step": 6000
    },
    {
      "epoch": 8.916323731138545,
      "grad_norm": 0.6398069262504578,
      "learning_rate": 0.0009554183813443072,
      "loss": 0.1744,
      "step": 6500
    },
    {
      "epoch": 9.602194787379972,
      "grad_norm": 0.44406837224960327,
      "learning_rate": 0.0009519890260631001,
      "loss": 0.1369,
      "step": 7000
    },
    {
      "epoch": 10.2880658436214,
      "grad_norm": 0.3870859742164612,
      "learning_rate": 0.000948559670781893,
      "loss": 0.1408,
      "step": 7500
    },
    {
      "epoch": 10.973936899862826,
      "grad_norm": 0.6403707265853882,
      "learning_rate": 0.0009451303155006859,
      "loss": 0.1307,
      "step": 8000
    },
    {
      "epoch": 11.659807956104252,
      "grad_norm": 0.406328022480011,
      "learning_rate": 0.0009417009602194788,
      "loss": 0.1105,
      "step": 8500
    },
    {
      "epoch": 12.345679012345679,
      "grad_norm": 0.4761105179786682,
      "learning_rate": 0.0009382716049382715,
      "loss": 0.1075,
      "step": 9000
    },
    {
      "epoch": 13.031550068587105,
      "grad_norm": 0.39505085349082947,
      "learning_rate": 0.0009348422496570644,
      "loss": 0.1101,
      "step": 9500
    },
    {
      "epoch": 13.717421124828533,
      "grad_norm": 0.3096862733364105,
      "learning_rate": 0.0009314128943758574,
      "loss": 0.0889,
      "step": 10000
    },
    {
      "epoch": 14.40329218106996,
      "grad_norm": 0.681281328201294,
      "learning_rate": 0.0009279835390946503,
      "loss": 0.097,
      "step": 10500
    },
    {
      "epoch": 15.089163237311386,
      "grad_norm": 0.23753629624843597,
      "learning_rate": 0.0009245541838134432,
      "loss": 0.0927,
      "step": 11000
    },
    {
      "epoch": 15.775034293552812,
      "grad_norm": 0.464749813079834,
      "learning_rate": 0.000921124828532236,
      "loss": 0.0793,
      "step": 11500
    },
    {
      "epoch": 16.46090534979424,
      "grad_norm": 0.3283621668815613,
      "learning_rate": 0.0009176954732510289,
      "loss": 0.0764,
      "step": 12000
    },
    {
      "epoch": 17.146776406035666,
      "grad_norm": 0.297809898853302,
      "learning_rate": 0.0009142661179698217,
      "loss": 0.0788,
      "step": 12500
    },
    {
      "epoch": 17.83264746227709,
      "grad_norm": 0.37593135237693787,
      "learning_rate": 0.0009108367626886146,
      "loss": 0.0746,
      "step": 13000
    },
    {
      "epoch": 18.51851851851852,
      "grad_norm": 0.3363408148288727,
      "learning_rate": 0.0009074074074074074,
      "loss": 0.0764,
      "step": 13500
    },
    {
      "epoch": 19.204389574759944,
      "grad_norm": 0.27103549242019653,
      "learning_rate": 0.0009039780521262003,
      "loss": 0.0751,
      "step": 14000
    },
    {
      "epoch": 19.89026063100137,
      "grad_norm": 0.23534435033798218,
      "learning_rate": 0.0009005486968449932,
      "loss": 0.0643,
      "step": 14500
    },
    {
      "epoch": 20.5761316872428,
      "grad_norm": 0.3298335671424866,
      "learning_rate": 0.0008971193415637861,
      "loss": 0.0621,
      "step": 15000
    },
    {
      "epoch": 21.262002743484224,
      "grad_norm": 0.28924617171287537,
      "learning_rate": 0.000893689986282579,
      "loss": 0.0661,
      "step": 15500
    },
    {
      "epoch": 21.947873799725652,
      "grad_norm": 0.34902724623680115,
      "learning_rate": 0.0008902606310013717,
      "loss": 0.0693,
      "step": 16000
    },
    {
      "epoch": 22.633744855967077,
      "grad_norm": 0.24693663418293,
      "learning_rate": 0.0008868312757201646,
      "loss": 0.0591,
      "step": 16500
    },
    {
      "epoch": 23.319615912208505,
      "grad_norm": 0.2739815413951874,
      "learning_rate": 0.0008834019204389575,
      "loss": 0.0576,
      "step": 17000
    },
    {
      "epoch": 24.005486968449933,
      "grad_norm": 0.36366912722587585,
      "learning_rate": 0.0008799725651577504,
      "loss": 0.0555,
      "step": 17500
    },
    {
      "epoch": 24.691358024691358,
      "grad_norm": 0.3027900159358978,
      "learning_rate": 0.0008765432098765433,
      "loss": 0.0507,
      "step": 18000
    },
    {
      "epoch": 25.377229080932786,
      "grad_norm": 0.18507389724254608,
      "learning_rate": 0.0008731138545953361,
      "loss": 0.0525,
      "step": 18500
    },
    {
      "epoch": 26.06310013717421,
      "grad_norm": 0.23404183983802795,
      "learning_rate": 0.000869684499314129,
      "loss": 0.0579,
      "step": 19000
    },
    {
      "epoch": 26.74897119341564,
      "grad_norm": 0.3586121201515198,
      "learning_rate": 0.0008662551440329218,
      "loss": 0.048,
      "step": 19500
    },
    {
      "epoch": 27.434842249657063,
      "grad_norm": 0.2888980805873871,
      "learning_rate": 0.0008628257887517147,
      "loss": 0.0515,
      "step": 20000
    },
    {
      "epoch": 28.12071330589849,
      "grad_norm": 0.23876026272773743,
      "learning_rate": 0.0008593964334705075,
      "loss": 0.0491,
      "step": 20500
    },
    {
      "epoch": 28.80658436213992,
      "grad_norm": 0.2646074891090393,
      "learning_rate": 0.0008559670781893004,
      "loss": 0.0466,
      "step": 21000
    },
    {
      "epoch": 29.492455418381343,
      "grad_norm": 0.1991817206144333,
      "learning_rate": 0.0008525377229080933,
      "loss": 0.0451,
      "step": 21500
    },
    {
      "epoch": 30.17832647462277,
      "grad_norm": 0.218049556016922,
      "learning_rate": 0.0008491083676268862,
      "loss": 0.0445,
      "step": 22000
    },
    {
      "epoch": 30.864197530864196,
      "grad_norm": 0.2530564069747925,
      "learning_rate": 0.0008456790123456791,
      "loss": 0.0427,
      "step": 22500
    },
    {
      "epoch": 31.550068587105624,
      "grad_norm": 0.2068854421377182,
      "learning_rate": 0.0008422496570644718,
      "loss": 0.0421,
      "step": 23000
    },
    {
      "epoch": 32.23593964334705,
      "grad_norm": 0.18602465093135834,
      "learning_rate": 0.0008388203017832647,
      "loss": 0.0437,
      "step": 23500
    },
    {
      "epoch": 32.92181069958848,
      "grad_norm": 0.21352776885032654,
      "learning_rate": 0.0008353909465020576,
      "loss": 0.0462,
      "step": 24000
    },
    {
      "epoch": 33.6076817558299,
      "grad_norm": 0.2586299777030945,
      "learning_rate": 0.0008319615912208505,
      "loss": 0.0417,
      "step": 24500
    },
    {
      "epoch": 34.29355281207133,
      "grad_norm": 0.2829551696777344,
      "learning_rate": 0.0008285322359396434,
      "loss": 0.0381,
      "step": 25000
    },
    {
      "epoch": 34.97942386831276,
      "grad_norm": 0.23624148964881897,
      "learning_rate": 0.0008251028806584362,
      "loss": 0.0361,
      "step": 25500
    },
    {
      "epoch": 35.66529492455418,
      "grad_norm": 0.21780389547348022,
      "learning_rate": 0.0008216735253772291,
      "loss": 0.0362,
      "step": 26000
    },
    {
      "epoch": 36.351165980795614,
      "grad_norm": 0.3541390299797058,
      "learning_rate": 0.0008182441700960219,
      "loss": 0.0395,
      "step": 26500
    },
    {
      "epoch": 37.03703703703704,
      "grad_norm": 0.15323896706104279,
      "learning_rate": 0.0008148148148148148,
      "loss": 0.0382,
      "step": 27000
    },
    {
      "epoch": 37.72290809327846,
      "grad_norm": 0.16792021691799164,
      "learning_rate": 0.0008113854595336076,
      "loss": 0.0319,
      "step": 27500
    },
    {
      "epoch": 38.40877914951989,
      "grad_norm": 0.19167844951152802,
      "learning_rate": 0.0008079561042524005,
      "loss": 0.0367,
      "step": 28000
    },
    {
      "epoch": 39.09465020576132,
      "grad_norm": 0.21890634298324585,
      "learning_rate": 0.0008045267489711934,
      "loss": 0.037,
      "step": 28500
    },
    {
      "epoch": 39.78052126200274,
      "grad_norm": 0.14012588560581207,
      "learning_rate": 0.0008010973936899864,
      "loss": 0.0325,
      "step": 29000
    },
    {
      "epoch": 40.46639231824417,
      "grad_norm": 1.626105546951294,
      "learning_rate": 0.0007976680384087793,
      "loss": 0.0315,
      "step": 29500
    },
    {
      "epoch": 41.1522633744856,
      "grad_norm": 0.1850096881389618,
      "learning_rate": 0.000794238683127572,
      "loss": 0.0379,
      "step": 30000
    },
    {
      "epoch": 41.838134430727024,
      "grad_norm": 0.20383605360984802,
      "learning_rate": 0.0007908093278463649,
      "loss": 0.0315,
      "step": 30500
    },
    {
      "epoch": 42.52400548696845,
      "grad_norm": 0.1643492877483368,
      "learning_rate": 0.0007873799725651578,
      "loss": 0.0276,
      "step": 31000
    },
    {
      "epoch": 43.20987654320987,
      "grad_norm": 0.15405911207199097,
      "learning_rate": 0.0007839506172839507,
      "loss": 0.0312,
      "step": 31500
    },
    {
      "epoch": 43.895747599451305,
      "grad_norm": 0.2370378077030182,
      "learning_rate": 0.0007805212620027436,
      "loss": 0.0337,
      "step": 32000
    },
    {
      "epoch": 44.58161865569273,
      "grad_norm": 0.14176137745380402,
      "learning_rate": 0.0007770919067215364,
      "loss": 0.0283,
      "step": 32500
    },
    {
      "epoch": 45.267489711934154,
      "grad_norm": 0.21307243406772614,
      "learning_rate": 0.0007736625514403293,
      "loss": 0.0277,
      "step": 33000
    },
    {
      "epoch": 45.953360768175585,
      "grad_norm": 0.2646368145942688,
      "learning_rate": 0.0007702331961591221,
      "loss": 0.03,
      "step": 33500
    },
    {
      "epoch": 46.63923182441701,
      "grad_norm": 0.22391417622566223,
      "learning_rate": 0.000766803840877915,
      "loss": 0.0298,
      "step": 34000
    },
    {
      "epoch": 47.325102880658434,
      "grad_norm": 0.15177056193351746,
      "learning_rate": 0.0007633744855967078,
      "loss": 0.0272,
      "step": 34500
    },
    {
      "epoch": 48.010973936899866,
      "grad_norm": 0.20461246371269226,
      "learning_rate": 0.0007599451303155007,
      "loss": 0.0278,
      "step": 35000
    },
    {
      "epoch": 48.69684499314129,
      "grad_norm": 0.12500979006290436,
      "learning_rate": 0.0007565157750342936,
      "loss": 0.0268,
      "step": 35500
    },
    {
      "epoch": 49.382716049382715,
      "grad_norm": 0.4091637134552002,
      "learning_rate": 0.0007530864197530865,
      "loss": 0.0267,
      "step": 36000
    },
    {
      "epoch": 50.06858710562414,
      "grad_norm": 0.22375427186489105,
      "learning_rate": 0.0007496570644718793,
      "loss": 0.0327,
      "step": 36500
    },
    {
      "epoch": 50.75445816186557,
      "grad_norm": 0.11407098174095154,
      "learning_rate": 0.0007462277091906721,
      "loss": 0.0245,
      "step": 37000
    },
    {
      "epoch": 51.440329218106996,
      "grad_norm": 0.1025354415178299,
      "learning_rate": 0.000742798353909465,
      "loss": 0.0246,
      "step": 37500
    },
    {
      "epoch": 52.12620027434842,
      "grad_norm": 0.20578157901763916,
      "learning_rate": 0.0007393689986282579,
      "loss": 0.0258,
      "step": 38000
    },
    {
      "epoch": 52.81207133058985,
      "grad_norm": 0.1874309480190277,
      "learning_rate": 0.0007359396433470508,
      "loss": 0.0276,
      "step": 38500
    },
    {
      "epoch": 53.49794238683128,
      "grad_norm": 0.1793011724948883,
      "learning_rate": 0.0007325102880658437,
      "loss": 0.0232,
      "step": 39000
    },
    {
      "epoch": 54.1838134430727,
      "grad_norm": 0.18910464644432068,
      "learning_rate": 0.0007290809327846365,
      "loss": 0.0237,
      "step": 39500
    },
    {
      "epoch": 54.869684499314126,
      "grad_norm": 0.26874855160713196,
      "learning_rate": 0.0007256515775034293,
      "loss": 0.0249,
      "step": 40000
    },
    {
      "epoch": 55.55555555555556,
      "grad_norm": 0.12961339950561523,
      "learning_rate": 0.0007222222222222222,
      "loss": 0.0262,
      "step": 40500
    },
    {
      "epoch": 56.24142661179698,
      "grad_norm": 0.14723250269889832,
      "learning_rate": 0.0007187928669410151,
      "loss": 0.0224,
      "step": 41000
    },
    {
      "epoch": 56.927297668038406,
      "grad_norm": 0.17689082026481628,
      "learning_rate": 0.000715363511659808,
      "loss": 0.0226,
      "step": 41500
    },
    {
      "epoch": 57.61316872427984,
      "grad_norm": 0.14181144535541534,
      "learning_rate": 0.0007119341563786008,
      "loss": 0.0214,
      "step": 42000
    },
    {
      "epoch": 58.29903978052126,
      "grad_norm": 0.5437673330307007,
      "learning_rate": 0.0007085048010973937,
      "loss": 0.0234,
      "step": 42500
    },
    {
      "epoch": 58.98491083676269,
      "grad_norm": 0.0570339597761631,
      "learning_rate": 0.0007050754458161866,
      "loss": 0.0238,
      "step": 43000
    },
    {
      "epoch": 59.67078189300412,
      "grad_norm": 0.14648722112178802,
      "learning_rate": 0.0007016460905349794,
      "loss": 0.0209,
      "step": 43500
    },
    {
      "epoch": 60.35665294924554,
      "grad_norm": 0.06610771268606186,
      "learning_rate": 0.0006982167352537722,
      "loss": 0.0203,
      "step": 44000
    },
    {
      "epoch": 61.04252400548697,
      "grad_norm": 0.092800073325634,
      "learning_rate": 0.0006947873799725651,
      "loss": 0.0198,
      "step": 44500
    },
    {
      "epoch": 61.72839506172839,
      "grad_norm": 0.12414117157459259,
      "learning_rate": 0.000691358024691358,
      "loss": 0.0195,
      "step": 45000
    },
    {
      "epoch": 62.414266117969824,
      "grad_norm": 0.10231557488441467,
      "learning_rate": 0.0006879286694101509,
      "loss": 0.0225,
      "step": 45500
    },
    {
      "epoch": 63.10013717421125,
      "grad_norm": 0.14670057594776154,
      "learning_rate": 0.0006844993141289438,
      "loss": 0.0213,
      "step": 46000
    },
    {
      "epoch": 63.78600823045267,
      "grad_norm": 0.1486993432044983,
      "learning_rate": 0.0006810699588477366,
      "loss": 0.0225,
      "step": 46500
    },
    {
      "epoch": 64.4718792866941,
      "grad_norm": 0.11352519690990448,
      "learning_rate": 0.0006776406035665294,
      "loss": 0.0187,
      "step": 47000
    },
    {
      "epoch": 65.15775034293553,
      "grad_norm": 0.08721095323562622,
      "learning_rate": 0.0006742112482853223,
      "loss": 0.0186,
      "step": 47500
    },
    {
      "epoch": 65.84362139917695,
      "grad_norm": 0.2794257402420044,
      "learning_rate": 0.0006707818930041153,
      "loss": 0.0184,
      "step": 48000
    },
    {
      "epoch": 66.52949245541838,
      "grad_norm": 0.16597363352775574,
      "learning_rate": 0.0006673525377229082,
      "loss": 0.0203,
      "step": 48500
    },
    {
      "epoch": 67.2153635116598,
      "grad_norm": 0.1067240834236145,
      "learning_rate": 0.000663923182441701,
      "loss": 0.0208,
      "step": 49000
    },
    {
      "epoch": 67.90123456790124,
      "grad_norm": 0.13352781534194946,
      "learning_rate": 0.0006604938271604939,
      "loss": 0.0181,
      "step": 49500
    },
    {
      "epoch": 68.58710562414267,
      "grad_norm": 0.15161901712417603,
      "learning_rate": 0.0006570644718792868,
      "loss": 0.02,
      "step": 50000
    },
    {
      "epoch": 69.27297668038409,
      "grad_norm": 0.19234149158000946,
      "learning_rate": 0.0006536351165980796,
      "loss": 0.0179,
      "step": 50500
    },
    {
      "epoch": 69.95884773662551,
      "grad_norm": 0.11277095228433609,
      "learning_rate": 0.0006502057613168724,
      "loss": 0.0181,
      "step": 51000
    },
    {
      "epoch": 70.64471879286694,
      "grad_norm": 0.10098372399806976,
      "learning_rate": 0.0006467764060356653,
      "loss": 0.0187,
      "step": 51500
    },
    {
      "epoch": 71.33058984910836,
      "grad_norm": 0.08452719449996948,
      "learning_rate": 0.0006433470507544582,
      "loss": 0.0167,
      "step": 52000
    },
    {
      "epoch": 72.01646090534979,
      "grad_norm": 0.11851578205823898,
      "learning_rate": 0.0006399176954732511,
      "loss": 0.0189,
      "step": 52500
    },
    {
      "epoch": 72.70233196159123,
      "grad_norm": 0.2409876585006714,
      "learning_rate": 0.000636488340192044,
      "loss": 0.018,
      "step": 53000
    },
    {
      "epoch": 73.38820301783265,
      "grad_norm": 0.1037708967924118,
      "learning_rate": 0.0006330589849108368,
      "loss": 0.0159,
      "step": 53500
    },
    {
      "epoch": 74.07407407407408,
      "grad_norm": 0.1006944552063942,
      "learning_rate": 0.0006296296296296296,
      "loss": 0.018,
      "step": 54000
    },
    {
      "epoch": 74.7599451303155,
      "grad_norm": 0.11186862736940384,
      "learning_rate": 0.0006262002743484225,
      "loss": 0.0169,
      "step": 54500
    },
    {
      "epoch": 75.44581618655693,
      "grad_norm": 0.08759860694408417,
      "learning_rate": 0.0006227709190672154,
      "loss": 0.0165,
      "step": 55000
    },
    {
      "epoch": 76.13168724279835,
      "grad_norm": 0.14316201210021973,
      "learning_rate": 0.0006193415637860083,
      "loss": 0.0176,
      "step": 55500
    },
    {
      "epoch": 76.81755829903977,
      "grad_norm": 0.1392996907234192,
      "learning_rate": 0.0006159122085048011,
      "loss": 0.0161,
      "step": 56000
    },
    {
      "epoch": 77.50342935528121,
      "grad_norm": 0.14493365585803986,
      "learning_rate": 0.000612482853223594,
      "loss": 0.0157,
      "step": 56500
    },
    {
      "epoch": 78.18930041152264,
      "grad_norm": 0.0993877425789833,
      "learning_rate": 0.0006090534979423869,
      "loss": 0.0168,
      "step": 57000
    },
    {
      "epoch": 78.87517146776406,
      "grad_norm": 0.09831307828426361,
      "learning_rate": 0.0006056241426611797,
      "loss": 0.0164,
      "step": 57500
    },
    {
      "epoch": 79.56104252400549,
      "grad_norm": 0.09205558151006699,
      "learning_rate": 0.0006021947873799725,
      "loss": 0.0157,
      "step": 58000
    },
    {
      "epoch": 80.24691358024691,
      "grad_norm": 0.10848256945610046,
      "learning_rate": 0.0005987654320987654,
      "loss": 0.014,
      "step": 58500
    },
    {
      "epoch": 80.93278463648834,
      "grad_norm": 0.14594706892967224,
      "learning_rate": 0.0005953360768175583,
      "loss": 0.0144,
      "step": 59000
    },
    {
      "epoch": 81.61865569272976,
      "grad_norm": 0.057823847979307175,
      "learning_rate": 0.0005919067215363512,
      "loss": 0.0143,
      "step": 59500
    },
    {
      "epoch": 82.3045267489712,
      "grad_norm": 0.3916853368282318,
      "learning_rate": 0.0005884773662551441,
      "loss": 0.0159,
      "step": 60000
    },
    {
      "epoch": 82.99039780521262,
      "grad_norm": 0.11421903222799301,
      "learning_rate": 0.0005850480109739369,
      "loss": 0.017,
      "step": 60500
    },
    {
      "epoch": 83.67626886145405,
      "grad_norm": 0.10148236900568008,
      "learning_rate": 0.0005816186556927297,
      "loss": 0.014,
      "step": 61000
    },
    {
      "epoch": 84.36213991769547,
      "grad_norm": 0.12611427903175354,
      "learning_rate": 0.0005781893004115226,
      "loss": 0.0139,
      "step": 61500
    },
    {
      "epoch": 85.0480109739369,
      "grad_norm": 0.07960600405931473,
      "learning_rate": 0.0005747599451303155,
      "loss": 0.0145,
      "step": 62000
    },
    {
      "epoch": 85.73388203017832,
      "grad_norm": 0.09838591516017914,
      "learning_rate": 0.0005713305898491084,
      "loss": 0.0131,
      "step": 62500
    },
    {
      "epoch": 86.41975308641975,
      "grad_norm": 0.06399769335985184,
      "learning_rate": 0.0005679012345679012,
      "loss": 0.0131,
      "step": 63000
    },
    {
      "epoch": 87.10562414266118,
      "grad_norm": 0.1615062952041626,
      "learning_rate": 0.0005644718792866941,
      "loss": 0.0159,
      "step": 63500
    },
    {
      "epoch": 87.79149519890261,
      "grad_norm": 0.11928918957710266,
      "learning_rate": 0.000561042524005487,
      "loss": 0.0153,
      "step": 64000
    },
    {
      "epoch": 88.47736625514403,
      "grad_norm": 0.0789957344532013,
      "learning_rate": 0.0005576131687242798,
      "loss": 0.0128,
      "step": 64500
    },
    {
      "epoch": 89.16323731138546,
      "grad_norm": 0.10119163990020752,
      "learning_rate": 0.0005541838134430726,
      "loss": 0.0134,
      "step": 65000
    },
    {
      "epoch": 89.84910836762688,
      "grad_norm": 0.10709987580776215,
      "learning_rate": 0.0005507544581618655,
      "loss": 0.0129,
      "step": 65500
    },
    {
      "epoch": 90.53497942386831,
      "grad_norm": 0.1411323994398117,
      "learning_rate": 0.0005473251028806584,
      "loss": 0.0135,
      "step": 66000
    },
    {
      "epoch": 91.22085048010975,
      "grad_norm": 0.1025325134396553,
      "learning_rate": 0.0005438957475994513,
      "loss": 0.0128,
      "step": 66500
    },
    {
      "epoch": 91.90672153635117,
      "grad_norm": 0.11264779418706894,
      "learning_rate": 0.0005404663923182442,
      "loss": 0.0143,
      "step": 67000
    },
    {
      "epoch": 92.5925925925926,
      "grad_norm": 0.11864063143730164,
      "learning_rate": 0.0005370370370370371,
      "loss": 0.0124,
      "step": 67500
    },
    {
      "epoch": 93.27846364883402,
      "grad_norm": 0.07646331936120987,
      "learning_rate": 0.0005336076817558299,
      "loss": 0.0126,
      "step": 68000
    },
    {
      "epoch": 93.96433470507544,
      "grad_norm": 0.06260576099157333,
      "learning_rate": 0.0005301783264746228,
      "loss": 0.0116,
      "step": 68500
    },
    {
      "epoch": 94.65020576131687,
      "grad_norm": 0.10267277806997299,
      "learning_rate": 0.0005267489711934157,
      "loss": 0.0117,
      "step": 69000
    },
    {
      "epoch": 95.3360768175583,
      "grad_norm": 0.07879356294870377,
      "learning_rate": 0.0005233196159122086,
      "loss": 0.0133,
      "step": 69500
    },
    {
      "epoch": 96.02194787379973,
      "grad_norm": 0.06801025569438934,
      "learning_rate": 0.0005198902606310014,
      "loss": 0.0121,
      "step": 70000
    },
    {
      "epoch": 96.70781893004116,
      "grad_norm": 0.1383305788040161,
      "learning_rate": 0.0005164609053497943,
      "loss": 0.0118,
      "step": 70500
    },
    {
      "epoch": 97.39368998628258,
      "grad_norm": 0.13461919128894806,
      "learning_rate": 0.0005130315500685872,
      "loss": 0.0121,
      "step": 71000
    },
    {
      "epoch": 98.079561042524,
      "grad_norm": 0.07046571373939514,
      "learning_rate": 0.00050960219478738,
      "loss": 0.0119,
      "step": 71500
    },
    {
      "epoch": 98.76543209876543,
      "grad_norm": 0.12527473270893097,
      "learning_rate": 0.0005061728395061728,
      "loss": 0.0116,
      "step": 72000
    },
    {
      "epoch": 99.45130315500685,
      "grad_norm": 0.08155812323093414,
      "learning_rate": 0.0005027434842249657,
      "loss": 0.0108,
      "step": 72500
    },
    {
      "epoch": 100.13717421124828,
      "grad_norm": 0.10491594672203064,
      "learning_rate": 0.0004993141289437586,
      "loss": 0.0112,
      "step": 73000
    },
    {
      "epoch": 100.82304526748972,
      "grad_norm": 0.10411892831325531,
      "learning_rate": 0.0004958847736625515,
      "loss": 0.0105,
      "step": 73500
    },
    {
      "epoch": 101.50891632373114,
      "grad_norm": 0.049623072147369385,
      "learning_rate": 0.0004924554183813444,
      "loss": 0.0111,
      "step": 74000
    },
    {
      "epoch": 102.19478737997257,
      "grad_norm": 0.11287267506122589,
      "learning_rate": 0.0004890260631001372,
      "loss": 0.0113,
      "step": 74500
    },
    {
      "epoch": 102.88065843621399,
      "grad_norm": 0.09406940639019012,
      "learning_rate": 0.00048559670781893007,
      "loss": 0.0113,
      "step": 75000
    },
    {
      "epoch": 103.56652949245542,
      "grad_norm": 0.05741545557975769,
      "learning_rate": 0.00048216735253772295,
      "loss": 0.0116,
      "step": 75500
    },
    {
      "epoch": 104.25240054869684,
      "grad_norm": 0.06398554146289825,
      "learning_rate": 0.0004787379972565158,
      "loss": 0.0124,
      "step": 76000
    },
    {
      "epoch": 104.93827160493827,
      "grad_norm": 0.05331519991159439,
      "learning_rate": 0.00047530864197530866,
      "loss": 0.0105,
      "step": 76500
    },
    {
      "epoch": 105.6241426611797,
      "grad_norm": 0.040314387530088425,
      "learning_rate": 0.00047187928669410154,
      "loss": 0.0091,
      "step": 77000
    },
    {
      "epoch": 106.31001371742113,
      "grad_norm": 0.0661318302154541,
      "learning_rate": 0.00046844993141289436,
      "loss": 0.0096,
      "step": 77500
    },
    {
      "epoch": 106.99588477366255,
      "grad_norm": 0.13146652281284332,
      "learning_rate": 0.00046502057613168724,
      "loss": 0.0112,
      "step": 78000
    },
    {
      "epoch": 107.68175582990398,
      "grad_norm": 0.060037847608327866,
      "learning_rate": 0.0004615912208504801,
      "loss": 0.0096,
      "step": 78500
    },
    {
      "epoch": 108.3676268861454,
      "grad_norm": 0.10393790900707245,
      "learning_rate": 0.00045816186556927295,
      "loss": 0.0107,
      "step": 79000
    },
    {
      "epoch": 109.05349794238683,
      "grad_norm": 0.053112734109163284,
      "learning_rate": 0.0004547325102880658,
      "loss": 0.0096,
      "step": 79500
    },
    {
      "epoch": 109.73936899862825,
      "grad_norm": 0.05577271804213524,
      "learning_rate": 0.0004513031550068587,
      "loss": 0.0091,
      "step": 80000
    },
    {
      "epoch": 110.42524005486969,
      "grad_norm": 0.03647785261273384,
      "learning_rate": 0.0004478737997256516,
      "loss": 0.009,
      "step": 80500
    },
    {
      "epoch": 111.11111111111111,
      "grad_norm": 0.09830909222364426,
      "learning_rate": 0.0004444444444444444,
      "loss": 0.0098,
      "step": 81000
    },
    {
      "epoch": 111.79698216735254,
      "grad_norm": 0.025291450321674347,
      "learning_rate": 0.0004410150891632373,
      "loss": 0.0099,
      "step": 81500
    },
    {
      "epoch": 112.48285322359396,
      "grad_norm": 0.0518038235604763,
      "learning_rate": 0.0004375857338820302,
      "loss": 0.0093,
      "step": 82000
    },
    {
      "epoch": 113.16872427983539,
      "grad_norm": 0.08746583759784698,
      "learning_rate": 0.00043415637860082305,
      "loss": 0.0096,
      "step": 82500
    },
    {
      "epoch": 113.85459533607681,
      "grad_norm": 0.0944758877158165,
      "learning_rate": 0.00043072702331961593,
      "loss": 0.0098,
      "step": 83000
    },
    {
      "epoch": 114.54046639231825,
      "grad_norm": 0.10082614421844482,
      "learning_rate": 0.0004272976680384088,
      "loss": 0.0087,
      "step": 83500
    },
    {
      "epoch": 115.22633744855968,
      "grad_norm": 0.05366198346018791,
      "learning_rate": 0.0004238683127572017,
      "loss": 0.0088,
      "step": 84000
    },
    {
      "epoch": 115.9122085048011,
      "grad_norm": 0.04216461256146431,
      "learning_rate": 0.0004204389574759945,
      "loss": 0.0083,
      "step": 84500
    },
    {
      "epoch": 116.59807956104252,
      "grad_norm": 0.061591554433107376,
      "learning_rate": 0.0004170096021947874,
      "loss": 0.0098,
      "step": 85000
    },
    {
      "epoch": 117.28395061728395,
      "grad_norm": 0.10498243570327759,
      "learning_rate": 0.0004135802469135803,
      "loss": 0.0093,
      "step": 85500
    },
    {
      "epoch": 117.96982167352537,
      "grad_norm": 0.041311949491500854,
      "learning_rate": 0.0004101508916323731,
      "loss": 0.0087,
      "step": 86000
    },
    {
      "epoch": 118.6556927297668,
      "grad_norm": 0.09587587416172028,
      "learning_rate": 0.000406721536351166,
      "loss": 0.0098,
      "step": 86500
    },
    {
      "epoch": 119.34156378600824,
      "grad_norm": 0.05686916410923004,
      "learning_rate": 0.00040329218106995886,
      "loss": 0.0084,
      "step": 87000
    },
    {
      "epoch": 120.02743484224966,
      "grad_norm": 0.06297193467617035,
      "learning_rate": 0.00039986282578875174,
      "loss": 0.0082,
      "step": 87500
    },
    {
      "epoch": 120.71330589849109,
      "grad_norm": 0.13432051241397858,
      "learning_rate": 0.00039643347050754456,
      "loss": 0.0075,
      "step": 88000
    },
    {
      "epoch": 121.39917695473251,
      "grad_norm": 0.03966566175222397,
      "learning_rate": 0.00039300411522633744,
      "loss": 0.0133,
      "step": 88500
    },
    {
      "epoch": 122.08504801097394,
      "grad_norm": 0.09373347461223602,
      "learning_rate": 0.0003895747599451303,
      "loss": 0.0082,
      "step": 89000
    },
    {
      "epoch": 122.77091906721536,
      "grad_norm": 0.06179581582546234,
      "learning_rate": 0.00038614540466392315,
      "loss": 0.0081,
      "step": 89500
    },
    {
      "epoch": 123.45679012345678,
      "grad_norm": 0.050256237387657166,
      "learning_rate": 0.00038271604938271603,
      "loss": 0.0081,
      "step": 90000
    },
    {
      "epoch": 124.14266117969822,
      "grad_norm": 0.03073493391275406,
      "learning_rate": 0.0003792866941015089,
      "loss": 0.0076,
      "step": 90500
    },
    {
      "epoch": 124.82853223593965,
      "grad_norm": 0.07006064057350159,
      "learning_rate": 0.0003758573388203018,
      "loss": 0.0075,
      "step": 91000
    },
    {
      "epoch": 125.51440329218107,
      "grad_norm": 0.05567869916558266,
      "learning_rate": 0.00037242798353909467,
      "loss": 0.0079,
      "step": 91500
    },
    {
      "epoch": 126.2002743484225,
      "grad_norm": 0.03926622495055199,
      "learning_rate": 0.00036899862825788755,
      "loss": 0.008,
      "step": 92000
    },
    {
      "epoch": 126.88614540466392,
      "grad_norm": 0.05135864019393921,
      "learning_rate": 0.0003655692729766804,
      "loss": 0.0075,
      "step": 92500
    },
    {
      "epoch": 127.57201646090535,
      "grad_norm": 0.09010568261146545,
      "learning_rate": 0.00036213991769547325,
      "loss": 0.0068,
      "step": 93000
    },
    {
      "epoch": 128.25788751714677,
      "grad_norm": 0.043294016271829605,
      "learning_rate": 0.00035871056241426613,
      "loss": 0.0072,
      "step": 93500
    },
    {
      "epoch": 128.9437585733882,
      "grad_norm": 0.07277141511440277,
      "learning_rate": 0.000355281207133059,
      "loss": 0.0073,
      "step": 94000
    },
    {
      "epoch": 129.62962962962962,
      "grad_norm": 0.06138594448566437,
      "learning_rate": 0.0003518518518518519,
      "loss": 0.007,
      "step": 94500
    },
    {
      "epoch": 130.31550068587106,
      "grad_norm": 0.06205645576119423,
      "learning_rate": 0.0003484224965706447,
      "loss": 0.0078,
      "step": 95000
    },
    {
      "epoch": 131.0013717421125,
      "grad_norm": 0.0695485770702362,
      "learning_rate": 0.0003449931412894376,
      "loss": 0.0074,
      "step": 95500
    },
    {
      "epoch": 131.6872427983539,
      "grad_norm": 0.027826467528939247,
      "learning_rate": 0.0003415637860082305,
      "loss": 0.0063,
      "step": 96000
    },
    {
      "epoch": 132.37311385459535,
      "grad_norm": 0.05079466477036476,
      "learning_rate": 0.0003381344307270233,
      "loss": 0.0068,
      "step": 96500
    },
    {
      "epoch": 133.05898491083676,
      "grad_norm": 0.031004609540104866,
      "learning_rate": 0.0003347050754458162,
      "loss": 0.0076,
      "step": 97000
    },
    {
      "epoch": 133.7448559670782,
      "grad_norm": 0.04684276878833771,
      "learning_rate": 0.00033127572016460906,
      "loss": 0.0066,
      "step": 97500
    },
    {
      "epoch": 134.4307270233196,
      "grad_norm": 0.04819253832101822,
      "learning_rate": 0.00032784636488340194,
      "loss": 0.0078,
      "step": 98000
    },
    {
      "epoch": 135.11659807956104,
      "grad_norm": 0.04323802888393402,
      "learning_rate": 0.00032441700960219477,
      "loss": 0.0066,
      "step": 98500
    },
    {
      "epoch": 135.80246913580248,
      "grad_norm": 0.0720980316400528,
      "learning_rate": 0.00032098765432098765,
      "loss": 0.006,
      "step": 99000
    },
    {
      "epoch": 136.4883401920439,
      "grad_norm": 0.03386896476149559,
      "learning_rate": 0.0003175582990397805,
      "loss": 0.0061,
      "step": 99500
    },
    {
      "epoch": 137.17421124828533,
      "grad_norm": 0.033124495297670364,
      "learning_rate": 0.00031412894375857335,
      "loss": 0.0062,
      "step": 100000
    },
    {
      "epoch": 137.86008230452674,
      "grad_norm": 0.05887928605079651,
      "learning_rate": 0.00031069958847736623,
      "loss": 0.0061,
      "step": 100500
    },
    {
      "epoch": 138.54595336076818,
      "grad_norm": 0.04949059709906578,
      "learning_rate": 0.00030727023319615916,
      "loss": 0.0064,
      "step": 101000
    },
    {
      "epoch": 139.2318244170096,
      "grad_norm": 0.020787570625543594,
      "learning_rate": 0.00030384087791495204,
      "loss": 0.0057,
      "step": 101500
    },
    {
      "epoch": 139.91769547325103,
      "grad_norm": 0.02130681276321411,
      "learning_rate": 0.00030041152263374487,
      "loss": 0.0062,
      "step": 102000
    },
    {
      "epoch": 140.60356652949247,
      "grad_norm": 0.025823501870036125,
      "learning_rate": 0.00029698216735253775,
      "loss": 0.0062,
      "step": 102500
    },
    {
      "epoch": 141.28943758573388,
      "grad_norm": 0.06032751873135567,
      "learning_rate": 0.00029355281207133063,
      "loss": 0.0059,
      "step": 103000
    },
    {
      "epoch": 141.97530864197532,
      "grad_norm": 0.05369709059596062,
      "learning_rate": 0.00029012345679012345,
      "loss": 0.0067,
      "step": 103500
    },
    {
      "epoch": 142.66117969821673,
      "grad_norm": 0.012244037352502346,
      "learning_rate": 0.00028669410150891633,
      "loss": 0.0055,
      "step": 104000
    },
    {
      "epoch": 143.34705075445817,
      "grad_norm": 0.03182597830891609,
      "learning_rate": 0.0002832647462277092,
      "loss": 0.0057,
      "step": 104500
    },
    {
      "epoch": 144.03292181069958,
      "grad_norm": 0.11879345774650574,
      "learning_rate": 0.0002798353909465021,
      "loss": 0.0058,
      "step": 105000
    },
    {
      "epoch": 144.71879286694102,
      "grad_norm": 0.06132959946990013,
      "learning_rate": 0.0002764060356652949,
      "loss": 0.0059,
      "step": 105500
    },
    {
      "epoch": 145.40466392318245,
      "grad_norm": 0.03873404487967491,
      "learning_rate": 0.0002729766803840878,
      "loss": 0.0054,
      "step": 106000
    },
    {
      "epoch": 146.09053497942386,
      "grad_norm": 0.05853896215558052,
      "learning_rate": 0.0002695473251028807,
      "loss": 0.0055,
      "step": 106500
    },
    {
      "epoch": 146.7764060356653,
      "grad_norm": 0.022752197459340096,
      "learning_rate": 0.0002661179698216735,
      "loss": 0.0054,
      "step": 107000
    },
    {
      "epoch": 147.4622770919067,
      "grad_norm": 0.013148725032806396,
      "learning_rate": 0.0002626886145404664,
      "loss": 0.0052,
      "step": 107500
    },
    {
      "epoch": 148.14814814814815,
      "grad_norm": 0.0366031639277935,
      "learning_rate": 0.00025925925925925926,
      "loss": 0.0057,
      "step": 108000
    },
    {
      "epoch": 148.83401920438956,
      "grad_norm": 0.08972814679145813,
      "learning_rate": 0.0002558299039780521,
      "loss": 0.0055,
      "step": 108500
    },
    {
      "epoch": 149.519890260631,
      "grad_norm": 0.034281570464372635,
      "learning_rate": 0.00025240054869684497,
      "loss": 0.0051,
      "step": 109000
    },
    {
      "epoch": 150.20576131687244,
      "grad_norm": 0.010903775691986084,
      "learning_rate": 0.00024897119341563785,
      "loss": 0.005,
      "step": 109500
    },
    {
      "epoch": 150.89163237311385,
      "grad_norm": 0.015872234478592873,
      "learning_rate": 0.0002455418381344307,
      "loss": 0.0048,
      "step": 110000
    },
    {
      "epoch": 151.5775034293553,
      "grad_norm": 0.03331177309155464,
      "learning_rate": 0.0002421124828532236,
      "loss": 0.0053,
      "step": 110500
    },
    {
      "epoch": 152.2633744855967,
      "grad_norm": 0.018091242760419846,
      "learning_rate": 0.00023868312757201646,
      "loss": 0.0052,
      "step": 111000
    },
    {
      "epoch": 152.94924554183814,
      "grad_norm": 0.03916322439908981,
      "learning_rate": 0.00023525377229080934,
      "loss": 0.0051,
      "step": 111500
    },
    {
      "epoch": 153.63511659807955,
      "grad_norm": 0.023662962019443512,
      "learning_rate": 0.0002318244170096022,
      "loss": 0.0048,
      "step": 112000
    },
    {
      "epoch": 154.320987654321,
      "grad_norm": 0.014827713370323181,
      "learning_rate": 0.00022839506172839504,
      "loss": 0.0046,
      "step": 112500
    },
    {
      "epoch": 155.00685871056243,
      "grad_norm": 0.03849729895591736,
      "learning_rate": 0.00022496570644718795,
      "loss": 0.0047,
      "step": 113000
    },
    {
      "epoch": 155.69272976680384,
      "grad_norm": 0.057328708469867706,
      "learning_rate": 0.0002215363511659808,
      "loss": 0.0046,
      "step": 113500
    },
    {
      "epoch": 156.37860082304528,
      "grad_norm": 0.017304692417383194,
      "learning_rate": 0.00021810699588477368,
      "loss": 0.0047,
      "step": 114000
    },
    {
      "epoch": 157.06447187928669,
      "grad_norm": 0.02947048842906952,
      "learning_rate": 0.00021467764060356654,
      "loss": 0.0047,
      "step": 114500
    },
    {
      "epoch": 157.75034293552812,
      "grad_norm": 0.02040564827620983,
      "learning_rate": 0.0002112482853223594,
      "loss": 0.0048,
      "step": 115000
    },
    {
      "epoch": 158.43621399176953,
      "grad_norm": 0.05288396403193474,
      "learning_rate": 0.00020781893004115227,
      "loss": 0.0047,
      "step": 115500
    },
    {
      "epoch": 159.12208504801097,
      "grad_norm": 0.026866160333156586,
      "learning_rate": 0.00020438957475994512,
      "loss": 0.0047,
      "step": 116000
    },
    {
      "epoch": 159.8079561042524,
      "grad_norm": 0.04552701115608215,
      "learning_rate": 0.000200960219478738,
      "loss": 0.0043,
      "step": 116500
    },
    {
      "epoch": 160.49382716049382,
      "grad_norm": 0.020935742184519768,
      "learning_rate": 0.00019753086419753085,
      "loss": 0.0045,
      "step": 117000
    },
    {
      "epoch": 161.17969821673526,
      "grad_norm": 0.03535682335495949,
      "learning_rate": 0.00019410150891632373,
      "loss": 0.0044,
      "step": 117500
    },
    {
      "epoch": 161.86556927297667,
      "grad_norm": 0.019771935418248177,
      "learning_rate": 0.0001906721536351166,
      "loss": 0.0044,
      "step": 118000
    },
    {
      "epoch": 162.5514403292181,
      "grad_norm": 0.013906066305935383,
      "learning_rate": 0.00018724279835390946,
      "loss": 0.004,
      "step": 118500
    },
    {
      "epoch": 163.23731138545952,
      "grad_norm": 0.012126780115067959,
      "learning_rate": 0.00018381344307270234,
      "loss": 0.0043,
      "step": 119000
    },
    {
      "epoch": 163.92318244170096,
      "grad_norm": 0.021324895322322845,
      "learning_rate": 0.0001803840877914952,
      "loss": 0.0043,
      "step": 119500
    },
    {
      "epoch": 164.6090534979424,
      "grad_norm": 0.013377720490098,
      "learning_rate": 0.00017695473251028808,
      "loss": 0.0043,
      "step": 120000
    },
    {
      "epoch": 165.2949245541838,
      "grad_norm": 0.08576662838459015,
      "learning_rate": 0.00017352537722908093,
      "loss": 0.0043,
      "step": 120500
    },
    {
      "epoch": 165.98079561042525,
      "grad_norm": 0.008204947225749493,
      "learning_rate": 0.0001700960219478738,
      "loss": 0.004,
      "step": 121000
    },
    {
      "epoch": 166.66666666666666,
      "grad_norm": 0.01228815782815218,
      "learning_rate": 0.00016666666666666666,
      "loss": 0.0038,
      "step": 121500
    },
    {
      "epoch": 167.3525377229081,
      "grad_norm": 0.007976936176419258,
      "learning_rate": 0.00016323731138545951,
      "loss": 0.004,
      "step": 122000
    },
    {
      "epoch": 168.0384087791495,
      "grad_norm": 0.06643296033143997,
      "learning_rate": 0.00015980795610425242,
      "loss": 0.0042,
      "step": 122500
    },
    {
      "epoch": 168.72427983539094,
      "grad_norm": 0.040761686861515045,
      "learning_rate": 0.00015637860082304527,
      "loss": 0.0041,
      "step": 123000
    },
    {
      "epoch": 169.41015089163238,
      "grad_norm": 0.006533100735396147,
      "learning_rate": 0.00015294924554183815,
      "loss": 0.0039,
      "step": 123500
    },
    {
      "epoch": 170.0960219478738,
      "grad_norm": 0.015169362537562847,
      "learning_rate": 0.000149519890260631,
      "loss": 0.0039,
      "step": 124000
    },
    {
      "epoch": 170.78189300411523,
      "grad_norm": 0.03795445337891579,
      "learning_rate": 0.00014609053497942388,
      "loss": 0.0039,
      "step": 124500
    },
    {
      "epoch": 171.46776406035664,
      "grad_norm": 0.03991511091589928,
      "learning_rate": 0.00014266117969821674,
      "loss": 0.0038,
      "step": 125000
    },
    {
      "epoch": 172.15363511659808,
      "grad_norm": 0.008595280349254608,
      "learning_rate": 0.0001392318244170096,
      "loss": 0.0039,
      "step": 125500
    },
    {
      "epoch": 172.8395061728395,
      "grad_norm": 0.01920117437839508,
      "learning_rate": 0.00013580246913580247,
      "loss": 0.0036,
      "step": 126000
    },
    {
      "epoch": 173.52537722908093,
      "grad_norm": 0.00689229741692543,
      "learning_rate": 0.00013237311385459532,
      "loss": 0.0035,
      "step": 126500
    },
    {
      "epoch": 174.21124828532237,
      "grad_norm": 0.007589759770780802,
      "learning_rate": 0.0001289437585733882,
      "loss": 0.0037,
      "step": 127000
    },
    {
      "epoch": 174.89711934156378,
      "grad_norm": 0.026523003354668617,
      "learning_rate": 0.00012551440329218108,
      "loss": 0.0037,
      "step": 127500
    },
    {
      "epoch": 175.58299039780522,
      "grad_norm": 0.009044776670634747,
      "learning_rate": 0.00012208504801097393,
      "loss": 0.0035,
      "step": 128000
    },
    {
      "epoch": 176.26886145404663,
      "grad_norm": 0.007540062535554171,
      "learning_rate": 0.00011865569272976681,
      "loss": 0.0038,
      "step": 128500
    },
    {
      "epoch": 176.95473251028807,
      "grad_norm": 0.00613565556704998,
      "learning_rate": 0.00011522633744855968,
      "loss": 0.0036,
      "step": 129000
    },
    {
      "epoch": 177.6406035665295,
      "grad_norm": 0.005932086147367954,
      "learning_rate": 0.00011179698216735255,
      "loss": 0.0036,
      "step": 129500
    },
    {
      "epoch": 178.32647462277092,
      "grad_norm": 0.0038564407732337713,
      "learning_rate": 0.0001083676268861454,
      "loss": 0.0035,
      "step": 130000
    },
    {
      "epoch": 179.01234567901236,
      "grad_norm": 0.10642554610967636,
      "learning_rate": 0.00010493827160493826,
      "loss": 0.0035,
      "step": 130500
    },
    {
      "epoch": 179.69821673525377,
      "grad_norm": 0.010230828076601028,
      "learning_rate": 0.00010150891632373114,
      "loss": 0.0034,
      "step": 131000
    },
    {
      "epoch": 180.3840877914952,
      "grad_norm": 0.02526956982910633,
      "learning_rate": 9.807956104252401e-05,
      "loss": 0.0034,
      "step": 131500
    },
    {
      "epoch": 181.06995884773661,
      "grad_norm": 0.00888708233833313,
      "learning_rate": 9.465020576131688e-05,
      "loss": 0.0035,
      "step": 132000
    },
    {
      "epoch": 181.75582990397805,
      "grad_norm": 0.005969726946204901,
      "learning_rate": 9.122085048010974e-05,
      "loss": 0.0033,
      "step": 132500
    },
    {
      "epoch": 182.4417009602195,
      "grad_norm": 0.004369193222373724,
      "learning_rate": 8.77914951989026e-05,
      "loss": 0.0033,
      "step": 133000
    },
    {
      "epoch": 183.1275720164609,
      "grad_norm": 0.03928361088037491,
      "learning_rate": 8.436213991769547e-05,
      "loss": 0.0035,
      "step": 133500
    },
    {
      "epoch": 183.81344307270234,
      "grad_norm": 0.008007598109543324,
      "learning_rate": 8.093278463648834e-05,
      "loss": 0.0032,
      "step": 134000
    },
    {
      "epoch": 184.49931412894375,
      "grad_norm": 0.00541963754221797,
      "learning_rate": 7.750342935528121e-05,
      "loss": 0.0032,
      "step": 134500
    },
    {
      "epoch": 185.1851851851852,
      "grad_norm": 0.010783227160573006,
      "learning_rate": 7.407407407407407e-05,
      "loss": 0.0033,
      "step": 135000
    },
    {
      "epoch": 185.8710562414266,
      "grad_norm": 0.005976933054625988,
      "learning_rate": 7.064471879286695e-05,
      "loss": 0.0033,
      "step": 135500
    },
    {
      "epoch": 186.55692729766804,
      "grad_norm": 0.010424752719700336,
      "learning_rate": 6.721536351165982e-05,
      "loss": 0.0031,
      "step": 136000
    },
    {
      "epoch": 187.24279835390948,
      "grad_norm": 0.00834636203944683,
      "learning_rate": 6.378600823045267e-05,
      "loss": 0.0032,
      "step": 136500
    },
    {
      "epoch": 187.9286694101509,
      "grad_norm": 0.0064740655943751335,
      "learning_rate": 6.0356652949245544e-05,
      "loss": 0.0032,
      "step": 137000
    },
    {
      "epoch": 188.61454046639233,
      "grad_norm": 0.007856756448745728,
      "learning_rate": 5.692729766803841e-05,
      "loss": 0.0031,
      "step": 137500
    },
    {
      "epoch": 189.30041152263374,
      "grad_norm": 0.006533577106893063,
      "learning_rate": 5.3497942386831277e-05,
      "loss": 0.0031,
      "step": 138000
    },
    {
      "epoch": 189.98628257887518,
      "grad_norm": 0.005114428699016571,
      "learning_rate": 5.006858710562415e-05,
      "loss": 0.0032,
      "step": 138500
    },
    {
      "epoch": 190.6721536351166,
      "grad_norm": 0.00717920670285821,
      "learning_rate": 4.663923182441701e-05,
      "loss": 0.0031,
      "step": 139000
    },
    {
      "epoch": 191.35802469135803,
      "grad_norm": 0.008158246986567974,
      "learning_rate": 4.3209876543209875e-05,
      "loss": 0.0031,
      "step": 139500
    },
    {
      "epoch": 192.04389574759946,
      "grad_norm": 0.005865829065442085,
      "learning_rate": 3.978052126200275e-05,
      "loss": 0.0031,
      "step": 140000
    },
    {
      "epoch": 192.72976680384087,
      "grad_norm": 0.0036265423987060785,
      "learning_rate": 3.635116598079561e-05,
      "loss": 0.003,
      "step": 140500
    },
    {
      "epoch": 193.4156378600823,
      "grad_norm": 0.005760482046753168,
      "learning_rate": 3.292181069958848e-05,
      "loss": 0.0029,
      "step": 141000
    },
    {
      "epoch": 194.10150891632372,
      "grad_norm": 0.00619177520275116,
      "learning_rate": 2.9492455418381346e-05,
      "loss": 0.0031,
      "step": 141500
    },
    {
      "epoch": 194.78737997256516,
      "grad_norm": 0.00745520880445838,
      "learning_rate": 2.6063100137174212e-05,
      "loss": 0.0029,
      "step": 142000
    },
    {
      "epoch": 195.47325102880657,
      "grad_norm": 0.006654828321188688,
      "learning_rate": 2.2633744855967078e-05,
      "loss": 0.003,
      "step": 142500
    },
    {
      "epoch": 196.159122085048,
      "grad_norm": 0.007228308357298374,
      "learning_rate": 1.9204389574759948e-05,
      "loss": 0.003,
      "step": 143000
    },
    {
      "epoch": 196.84499314128945,
      "grad_norm": 0.007839037105441093,
      "learning_rate": 1.577503429355281e-05,
      "loss": 0.0029,
      "step": 143500
    },
    {
      "epoch": 197.53086419753086,
      "grad_norm": 0.005645914003252983,
      "learning_rate": 1.2345679012345678e-05,
      "loss": 0.0029,
      "step": 144000
    },
    {
      "epoch": 198.2167352537723,
      "grad_norm": 0.007110149599611759,
      "learning_rate": 8.916323731138546e-06,
      "loss": 0.0029,
      "step": 144500
    },
    {
      "epoch": 198.9026063100137,
      "grad_norm": 0.007023398298770189,
      "learning_rate": 5.486968449931412e-06,
      "loss": 0.003,
      "step": 145000
    },
    {
      "epoch": 199.58847736625515,
      "grad_norm": 0.006341323722153902,
      "learning_rate": 2.05761316872428e-06,
      "loss": 0.0028,
      "step": 145500
    },
    {
      "epoch": 200.0,
      "step": 145800,
      "total_flos": 2.53654993993728e+18,
      "train_loss": 0.006510871606680919,
      "train_runtime": 172601.0106,
      "train_samples_per_second": 3.377,
      "train_steps_per_second": 0.845
    }
  ],
  "logging_steps": 500,
  "max_steps": 145800,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 200,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 2.53654993993728e+18,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}