{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9993800371977681, "eval_steps": 200, "global_step": 806, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0012399256044637321, "grad_norm": 0.6019122951437104, "learning_rate": 2.469135802469136e-06, "loss": 0.4222, "step": 1 }, { "epoch": 0.006199628022318661, "grad_norm": 1.5495790328852515, "learning_rate": 1.2345679012345678e-05, "loss": 0.471, "step": 5 }, { "epoch": 0.012399256044637322, "grad_norm": 0.9274998509045178, "learning_rate": 2.4691358024691357e-05, "loss": 0.4844, "step": 10 }, { "epoch": 0.018598884066955982, "grad_norm": 0.4319451186674129, "learning_rate": 3.7037037037037037e-05, "loss": 0.4111, "step": 15 }, { "epoch": 0.024798512089274645, "grad_norm": 0.5038186551413125, "learning_rate": 4.938271604938271e-05, "loss": 0.3561, "step": 20 }, { "epoch": 0.030998140111593304, "grad_norm": 0.44167588919852313, "learning_rate": 6.17283950617284e-05, "loss": 0.347, "step": 25 }, { "epoch": 0.037197768133911964, "grad_norm": 0.33451826071357504, "learning_rate": 7.407407407407407e-05, "loss": 0.3322, "step": 30 }, { "epoch": 0.04339739615623062, "grad_norm": 0.34868148088717704, "learning_rate": 8.641975308641975e-05, "loss": 0.3399, "step": 35 }, { "epoch": 0.04959702417854929, "grad_norm": 0.3356513986019191, "learning_rate": 9.876543209876543e-05, "loss": 0.3602, "step": 40 }, { "epoch": 0.05579665220086795, "grad_norm": 0.37808175632129626, "learning_rate": 0.00011111111111111112, "loss": 0.321, "step": 45 }, { "epoch": 0.06199628022318661, "grad_norm": 0.41761376999084465, "learning_rate": 0.0001234567901234568, "loss": 0.3504, "step": 50 }, { "epoch": 0.06819590824550527, "grad_norm": 0.35639408007374757, "learning_rate": 0.00013580246913580247, "loss": 0.3108, "step": 55 }, { "epoch": 0.07439553626782393, "grad_norm": 0.34681060518951073, "learning_rate": 0.00014814814814814815, "loss": 0.305, "step": 60 }, { "epoch": 0.08059516429014259, "grad_norm": 0.29992433196050555, "learning_rate": 0.00016049382716049385, "loss": 0.3127, "step": 65 }, { "epoch": 0.08679479231246125, "grad_norm": 0.37053951666137375, "learning_rate": 0.0001728395061728395, "loss": 0.2985, "step": 70 }, { "epoch": 0.0929944203347799, "grad_norm": 0.3241692211944496, "learning_rate": 0.0001851851851851852, "loss": 0.3187, "step": 75 }, { "epoch": 0.09919404835709858, "grad_norm": 0.3336770081253457, "learning_rate": 0.00019753086419753085, "loss": 0.3013, "step": 80 }, { "epoch": 0.10539367637941724, "grad_norm": 0.340347188617851, "learning_rate": 0.0001999849788616454, "loss": 0.3127, "step": 85 }, { "epoch": 0.1115933044017359, "grad_norm": 0.24966989402744666, "learning_rate": 0.00019992396322115213, "loss": 0.3132, "step": 90 }, { "epoch": 0.11779293242405456, "grad_norm": 0.36652120591883697, "learning_rate": 0.00019981604287632102, "loss": 0.2848, "step": 95 }, { "epoch": 0.12399256044637322, "grad_norm": 0.37152546200115255, "learning_rate": 0.0001996612684853896, "loss": 0.3074, "step": 100 }, { "epoch": 0.13019218846869188, "grad_norm": 0.37937877581455376, "learning_rate": 0.00019945971270007043, "loss": 0.2966, "step": 105 }, { "epoch": 0.13639181649101054, "grad_norm": 0.32819913975467047, "learning_rate": 0.0001992114701314478, "loss": 0.2973, "step": 110 }, { "epoch": 0.1425914445133292, "grad_norm": 0.2497292935586131, "learning_rate": 0.00019891665730556725, "loss": 0.3072, "step": 115 }, { "epoch": 0.14879107253564786, "grad_norm": 0.3383083276284943, "learning_rate": 0.0001985754126087376, "loss": 0.2657, "step": 120 }, { "epoch": 0.15499070055796652, "grad_norm": 0.4050699278620848, "learning_rate": 0.00019818789622257196, "loss": 0.2882, "step": 125 }, { "epoch": 0.16119032858028517, "grad_norm": 0.3123213984265953, "learning_rate": 0.0001977542900487977, "loss": 0.2765, "step": 130 }, { "epoch": 0.16738995660260383, "grad_norm": 0.33824727842613916, "learning_rate": 0.00019727479762387116, "loss": 0.3042, "step": 135 }, { "epoch": 0.1735895846249225, "grad_norm": 0.2424449276469738, "learning_rate": 0.00019674964402343684, "loss": 0.3264, "step": 140 }, { "epoch": 0.17978921264724115, "grad_norm": 0.3067543075934479, "learning_rate": 0.00019617907575667602, "loss": 0.2886, "step": 145 }, { "epoch": 0.1859888406695598, "grad_norm": 0.362577361435356, "learning_rate": 0.00019556336065059432, "loss": 0.3086, "step": 150 }, { "epoch": 0.1921884686918785, "grad_norm": 0.3238274733762567, "learning_rate": 0.00019490278772430256, "loss": 0.2963, "step": 155 }, { "epoch": 0.19838809671419716, "grad_norm": 0.3597604834062663, "learning_rate": 0.00019419766705335026, "loss": 0.313, "step": 160 }, { "epoch": 0.20458772473651582, "grad_norm": 0.27740278103597305, "learning_rate": 0.00019344832962417475, "loss": 0.3236, "step": 165 }, { "epoch": 0.21078735275883448, "grad_norm": 0.31990061265265674, "learning_rate": 0.00019265512717873498, "loss": 0.2907, "step": 170 }, { "epoch": 0.21698698078115314, "grad_norm": 0.3521599467310119, "learning_rate": 0.00019181843204940232, "loss": 0.2933, "step": 175 }, { "epoch": 0.2231866088034718, "grad_norm": 0.32261509567312185, "learning_rate": 0.00019093863698418627, "loss": 0.2852, "step": 180 }, { "epoch": 0.22938623682579046, "grad_norm": 0.29712733472005987, "learning_rate": 0.00019001615496237712, "loss": 0.2868, "step": 185 }, { "epoch": 0.23558586484810912, "grad_norm": 0.24705896150819331, "learning_rate": 0.00018905141900069178, "loss": 0.314, "step": 190 }, { "epoch": 0.24178549287042778, "grad_norm": 0.31936683972288543, "learning_rate": 0.00018804488195001392, "loss": 0.2696, "step": 195 }, { "epoch": 0.24798512089274644, "grad_norm": 0.32950608272624643, "learning_rate": 0.00018699701628282407, "loss": 0.2844, "step": 200 }, { "epoch": 0.24798512089274644, "eval_loss": 0.3114877939224243, "eval_runtime": 1124.0065, "eval_samples_per_second": 3.559, "eval_steps_per_second": 0.111, "step": 200 }, { "epoch": 0.2541847489150651, "grad_norm": 0.37057110121612846, "learning_rate": 0.0001859083138714191, "loss": 0.2852, "step": 205 }, { "epoch": 0.26038437693738375, "grad_norm": 0.2869667980293731, "learning_rate": 0.0001847792857570255, "loss": 0.2905, "step": 210 }, { "epoch": 0.2665840049597024, "grad_norm": 0.24148834606008088, "learning_rate": 0.00018361046190991455, "loss": 0.2841, "step": 215 }, { "epoch": 0.2727836329820211, "grad_norm": 0.3669480816784026, "learning_rate": 0.0001824023909806322, "loss": 0.2745, "step": 220 }, { "epoch": 0.27898326100433973, "grad_norm": 0.3054046475344961, "learning_rate": 0.00018115564004246023, "loss": 0.2803, "step": 225 }, { "epoch": 0.2851828890266584, "grad_norm": 0.3589718511823372, "learning_rate": 0.00017987079432522996, "loss": 0.2834, "step": 230 }, { "epoch": 0.29138251704897705, "grad_norm": 0.32815022889743933, "learning_rate": 0.00017854845694061292, "loss": 0.2956, "step": 235 }, { "epoch": 0.2975821450712957, "grad_norm": 0.22811456346947673, "learning_rate": 0.00017718924859901793, "loss": 0.2992, "step": 240 }, { "epoch": 0.30378177309361437, "grad_norm": 0.3261921325400897, "learning_rate": 0.00017579380731822712, "loss": 0.2696, "step": 245 }, { "epoch": 0.30998140111593303, "grad_norm": 0.3265151909802919, "learning_rate": 0.00017436278812390786, "loss": 0.2762, "step": 250 }, { "epoch": 0.3161810291382517, "grad_norm": 0.25119517956050147, "learning_rate": 0.00017289686274214118, "loss": 0.2758, "step": 255 }, { "epoch": 0.32238065716057035, "grad_norm": 0.30203405562074537, "learning_rate": 0.00017139671928411072, "loss": 0.2888, "step": 260 }, { "epoch": 0.328580285182889, "grad_norm": 0.231597052058556, "learning_rate": 0.00016986306192310084, "loss": 0.2994, "step": 265 }, { "epoch": 0.33477991320520767, "grad_norm": 0.3462354390170321, "learning_rate": 0.00016829661056395474, "loss": 0.273, "step": 270 }, { "epoch": 0.34097954122752633, "grad_norm": 0.3416446609085483, "learning_rate": 0.00016669810050514827, "loss": 0.2794, "step": 275 }, { "epoch": 0.347179169249845, "grad_norm": 0.3004081195597297, "learning_rate": 0.00016506828209363796, "loss": 0.2785, "step": 280 }, { "epoch": 0.35337879727216365, "grad_norm": 0.3372149956322208, "learning_rate": 0.00016340792037264527, "loss": 0.2924, "step": 285 }, { "epoch": 0.3595784252944823, "grad_norm": 0.2773063250024901, "learning_rate": 0.00016171779472254206, "loss": 0.2824, "step": 290 }, { "epoch": 0.36577805331680097, "grad_norm": 0.29747563137485017, "learning_rate": 0.0001599986984950065, "loss": 0.2787, "step": 295 }, { "epoch": 0.3719776813391196, "grad_norm": 0.3237809552896745, "learning_rate": 0.0001582514386406206, "loss": 0.2821, "step": 300 }, { "epoch": 0.37817730936143834, "grad_norm": 0.29633580280256444, "learning_rate": 0.00015647683533008455, "loss": 0.2777, "step": 305 }, { "epoch": 0.384376937383757, "grad_norm": 0.3580023136614935, "learning_rate": 0.00015467572156922503, "loss": 0.2717, "step": 310 }, { "epoch": 0.39057656540607566, "grad_norm": 0.2483260531032943, "learning_rate": 0.0001528489428079793, "loss": 0.3054, "step": 315 }, { "epoch": 0.3967761934283943, "grad_norm": 0.3058237506306256, "learning_rate": 0.00015099735654353747, "loss": 0.2724, "step": 320 }, { "epoch": 0.402975821450713, "grad_norm": 0.3309948471867418, "learning_rate": 0.00014912183191782995, "loss": 0.2607, "step": 325 }, { "epoch": 0.40917544947303164, "grad_norm": 0.3163723023278254, "learning_rate": 0.00014722324930954885, "loss": 0.2881, "step": 330 }, { "epoch": 0.4153750774953503, "grad_norm": 0.30350103196436556, "learning_rate": 0.0001453024999208946, "loss": 0.2817, "step": 335 }, { "epoch": 0.42157470551766896, "grad_norm": 0.22626437910720568, "learning_rate": 0.00014336048535924223, "loss": 0.2838, "step": 340 }, { "epoch": 0.4277743335399876, "grad_norm": 0.3445629260554824, "learning_rate": 0.00014139811721392324, "loss": 0.2552, "step": 345 }, { "epoch": 0.4339739615623063, "grad_norm": 0.33697169212335193, "learning_rate": 0.00013941631662832199, "loss": 0.2738, "step": 350 }, { "epoch": 0.44017358958462494, "grad_norm": 0.27110424764249635, "learning_rate": 0.00013741601386748728, "loss": 0.2837, "step": 355 }, { "epoch": 0.4463732176069436, "grad_norm": 0.3176558105393564, "learning_rate": 0.00013539814788146235, "loss": 0.277, "step": 360 }, { "epoch": 0.45257284562926225, "grad_norm": 0.21215927511567548, "learning_rate": 0.00013336366586453783, "loss": 0.2827, "step": 365 }, { "epoch": 0.4587724736515809, "grad_norm": 0.31096775561405154, "learning_rate": 0.0001313135228106353, "loss": 0.2651, "step": 370 }, { "epoch": 0.4649721016738996, "grad_norm": 0.31585451791739527, "learning_rate": 0.0001292486810650289, "loss": 0.2773, "step": 375 }, { "epoch": 0.47117172969621823, "grad_norm": 0.3080807025298427, "learning_rate": 0.00012717010987261715, "loss": 0.2873, "step": 380 }, { "epoch": 0.4773713577185369, "grad_norm": 0.3591626086115439, "learning_rate": 0.0001250787849229552, "loss": 0.2827, "step": 385 }, { "epoch": 0.48357098574085555, "grad_norm": 0.25352609144718685, "learning_rate": 0.00012297568789226238, "loss": 0.2958, "step": 390 }, { "epoch": 0.4897706137631742, "grad_norm": 0.30645533387883167, "learning_rate": 0.00012086180598261956, "loss": 0.2597, "step": 395 }, { "epoch": 0.49597024178549287, "grad_norm": 0.33773747309079233, "learning_rate": 0.00011873813145857249, "loss": 0.2638, "step": 400 }, { "epoch": 0.49597024178549287, "eval_loss": 0.29208001494407654, "eval_runtime": 1119.5912, "eval_samples_per_second": 3.573, "eval_steps_per_second": 0.112, "step": 400 }, { "epoch": 0.5021698698078115, "grad_norm": 0.3152505586487655, "learning_rate": 0.00011660566118135894, "loss": 0.2656, "step": 405 }, { "epoch": 0.5083694978301302, "grad_norm": 0.29454612946664416, "learning_rate": 0.00011446539614097813, "loss": 0.27, "step": 410 }, { "epoch": 0.5145691258524488, "grad_norm": 0.1892793890960918, "learning_rate": 0.0001123183409863219, "loss": 0.2978, "step": 415 }, { "epoch": 0.5207687538747675, "grad_norm": 0.3335006508167189, "learning_rate": 0.00011016550355358872, "loss": 0.2538, "step": 420 }, { "epoch": 0.5269683818970862, "grad_norm": 0.32122639513136675, "learning_rate": 0.00010800789439320128, "loss": 0.2778, "step": 425 }, { "epoch": 0.5331680099194048, "grad_norm": 0.2771952164280048, "learning_rate": 0.00010584652629545011, "loss": 0.2634, "step": 430 }, { "epoch": 0.5393676379417235, "grad_norm": 0.34208078840367095, "learning_rate": 0.0001036824138150859, "loss": 0.2759, "step": 435 }, { "epoch": 0.5455672659640421, "grad_norm": 0.21374327433641943, "learning_rate": 0.00010151657279508336, "loss": 0.2818, "step": 440 }, { "epoch": 0.5517668939863608, "grad_norm": 0.28462533877594465, "learning_rate": 9.935001988980061e-05, "loss": 0.2661, "step": 445 }, { "epoch": 0.5579665220086795, "grad_norm": 0.3047348276188714, "learning_rate": 9.718377208775744e-05, "loss": 0.2677, "step": 450 }, { "epoch": 0.5641661500309981, "grad_norm": 0.30754878782389855, "learning_rate": 9.50188462342571e-05, "loss": 0.2614, "step": 455 }, { "epoch": 0.5703657780533168, "grad_norm": 0.31116409658881644, "learning_rate": 9.285625855407484e-05, "loss": 0.2673, "step": 460 }, { "epoch": 0.5765654060756354, "grad_norm": 0.2651806274866606, "learning_rate": 9.069702417443821e-05, "loss": 0.2869, "step": 465 }, { "epoch": 0.5827650340979541, "grad_norm": 0.2909752165016893, "learning_rate": 8.854215664852206e-05, "loss": 0.2669, "step": 470 }, { "epoch": 0.5889646621202728, "grad_norm": 0.30706363315844515, "learning_rate": 8.63926674796829e-05, "loss": 0.2674, "step": 475 }, { "epoch": 0.5951642901425914, "grad_norm": 0.30444202963242784, "learning_rate": 8.424956564665508e-05, "loss": 0.2802, "step": 480 }, { "epoch": 0.6013639181649101, "grad_norm": 0.2789382860436405, "learning_rate": 8.211385712993218e-05, "loss": 0.2545, "step": 485 }, { "epoch": 0.6075635461872287, "grad_norm": 0.23185325685334424, "learning_rate": 7.998654443955586e-05, "loss": 0.266, "step": 490 }, { "epoch": 0.6137631742095474, "grad_norm": 0.3567624100649805, "learning_rate": 7.786862614453355e-05, "loss": 0.2665, "step": 495 }, { "epoch": 0.6199628022318661, "grad_norm": 0.31757337819796233, "learning_rate": 7.576109640410633e-05, "loss": 0.2692, "step": 500 }, { "epoch": 0.6261624302541847, "grad_norm": 0.26049216875661646, "learning_rate": 7.366494450108659e-05, "loss": 0.253, "step": 505 }, { "epoch": 0.6323620582765034, "grad_norm": 0.3279613920140163, "learning_rate": 7.158115437748468e-05, "loss": 0.2664, "step": 510 }, { "epoch": 0.638561686298822, "grad_norm": 0.24958219949066954, "learning_rate": 6.951070417264277e-05, "loss": 0.282, "step": 515 }, { "epoch": 0.6447613143211407, "grad_norm": 0.3076230934858842, "learning_rate": 6.745456576409227e-05, "loss": 0.2541, "step": 520 }, { "epoch": 0.6509609423434594, "grad_norm": 0.3039043398796431, "learning_rate": 6.541370431135072e-05, "loss": 0.2648, "step": 525 }, { "epoch": 0.657160570365778, "grad_norm": 0.2823892989212078, "learning_rate": 6.338907780287197e-05, "loss": 0.2628, "step": 530 }, { "epoch": 0.6633601983880967, "grad_norm": 0.336100443174608, "learning_rate": 6.138163660636284e-05, "loss": 0.2609, "step": 535 }, { "epoch": 0.6695598264104153, "grad_norm": 0.22183756693967047, "learning_rate": 5.9392323022676454e-05, "loss": 0.2679, "step": 540 }, { "epoch": 0.675759454432734, "grad_norm": 0.30501277104459634, "learning_rate": 5.7422070843492734e-05, "loss": 0.2551, "step": 545 }, { "epoch": 0.6819590824550527, "grad_norm": 0.35227300737647005, "learning_rate": 5.547180491299279e-05, "loss": 0.2715, "step": 550 }, { "epoch": 0.6881587104773713, "grad_norm": 0.2645763624948684, "learning_rate": 5.35424406937333e-05, "loss": 0.2633, "step": 555 }, { "epoch": 0.69435833849969, "grad_norm": 0.2707425078158492, "learning_rate": 5.163488383692499e-05, "loss": 0.2677, "step": 560 }, { "epoch": 0.7005579665220086, "grad_norm": 0.24608132828527818, "learning_rate": 4.975002975731613e-05, "loss": 0.2763, "step": 565 }, { "epoch": 0.7067575945443273, "grad_norm": 0.2726783082526773, "learning_rate": 4.78887632128814e-05, "loss": 0.2428, "step": 570 }, { "epoch": 0.712957222566646, "grad_norm": 0.2981132504685635, "learning_rate": 4.6051957889513e-05, "loss": 0.2608, "step": 575 }, { "epoch": 0.7191568505889646, "grad_norm": 0.28491523081281545, "learning_rate": 4.4240475990909106e-05, "loss": 0.2569, "step": 580 }, { "epoch": 0.7253564786112833, "grad_norm": 0.3291243871750603, "learning_rate": 4.24551678338518e-05, "loss": 0.2612, "step": 585 }, { "epoch": 0.7315561066336019, "grad_norm": 0.2259584584201018, "learning_rate": 4.069687144906532e-05, "loss": 0.2696, "step": 590 }, { "epoch": 0.7377557346559206, "grad_norm": 0.330497624658356, "learning_rate": 3.8966412187840804e-05, "loss": 0.2478, "step": 595 }, { "epoch": 0.7439553626782393, "grad_norm": 0.2741703824563164, "learning_rate": 3.726460233461339e-05, "loss": 0.2596, "step": 600 }, { "epoch": 0.7439553626782393, "eval_loss": 0.2790033519268036, "eval_runtime": 1119.6634, "eval_samples_per_second": 3.573, "eval_steps_per_second": 0.112, "step": 600 }, { "epoch": 0.750154990700558, "grad_norm": 0.25878146862151746, "learning_rate": 3.5592240725672476e-05, "loss": 0.2631, "step": 605 }, { "epoch": 0.7563546187228767, "grad_norm": 0.30340990736911533, "learning_rate": 3.395011237418494e-05, "loss": 0.2649, "step": 610 }, { "epoch": 0.7625542467451953, "grad_norm": 0.21196981539625306, "learning_rate": 3.2338988101706726e-05, "loss": 0.2519, "step": 615 }, { "epoch": 0.768753874767514, "grad_norm": 0.2881373960219632, "learning_rate": 3.075962417635634e-05, "loss": 0.2371, "step": 620 }, { "epoch": 0.7749535027898327, "grad_norm": 0.31620268309069777, "learning_rate": 2.9212761957819347e-05, "loss": 0.2726, "step": 625 }, { "epoch": 0.7811531308121513, "grad_norm": 0.2675293914015534, "learning_rate": 2.7699127549351456e-05, "loss": 0.2723, "step": 630 }, { "epoch": 0.78735275883447, "grad_norm": 0.3208257587309821, "learning_rate": 2.6219431456942535e-05, "loss": 0.2662, "step": 635 }, { "epoch": 0.7935523868567886, "grad_norm": 0.20931592628778878, "learning_rate": 2.4774368255802483e-05, "loss": 0.2662, "step": 640 }, { "epoch": 0.7997520148791073, "grad_norm": 0.2722392147736918, "learning_rate": 2.3364616264324722e-05, "loss": 0.2528, "step": 645 }, { "epoch": 0.805951642901426, "grad_norm": 0.29854156869280446, "learning_rate": 2.199083722568095e-05, "loss": 0.2612, "step": 650 }, { "epoch": 0.8121512709237446, "grad_norm": 0.2760187147243271, "learning_rate": 2.065367599719621e-05, "loss": 0.2646, "step": 655 }, { "epoch": 0.8183508989460633, "grad_norm": 0.29187099026880386, "learning_rate": 1.93537602476504e-05, "loss": 0.2574, "step": 660 }, { "epoch": 0.8245505269683819, "grad_norm": 0.21386029056799546, "learning_rate": 1.809170016264794e-05, "loss": 0.2563, "step": 665 }, { "epoch": 0.8307501549907006, "grad_norm": 0.2823252915248677, "learning_rate": 1.686808815819435e-05, "loss": 0.2669, "step": 670 }, { "epoch": 0.8369497830130193, "grad_norm": 0.33535005943488133, "learning_rate": 1.568349860261369e-05, "loss": 0.2507, "step": 675 }, { "epoch": 0.8431494110353379, "grad_norm": 0.29068903551261105, "learning_rate": 1.4538487546937952e-05, "loss": 0.2679, "step": 680 }, { "epoch": 0.8493490390576566, "grad_norm": 0.31428561712386854, "learning_rate": 1.3433592463894373e-05, "loss": 0.256, "step": 685 }, { "epoch": 0.8555486670799752, "grad_norm": 0.2492283063735335, "learning_rate": 1.2369331995613665e-05, "loss": 0.2701, "step": 690 }, { "epoch": 0.8617482951022939, "grad_norm": 0.29516035417057684, "learning_rate": 1.1346205710177304e-05, "loss": 0.2317, "step": 695 }, { "epoch": 0.8679479231246126, "grad_norm": 0.34660496611894304, "learning_rate": 1.0364693867118424e-05, "loss": 0.2581, "step": 700 }, { "epoch": 0.8741475511469312, "grad_norm": 0.27935452297059005, "learning_rate": 9.425257191985859e-06, "loss": 0.2501, "step": 705 }, { "epoch": 0.8803471791692499, "grad_norm": 0.2975961434950699, "learning_rate": 8.528336660077974e-06, "loss": 0.2598, "step": 710 }, { "epoch": 0.8865468071915685, "grad_norm": 0.21801979336484165, "learning_rate": 7.674353289446945e-06, "loss": 0.2662, "step": 715 }, { "epoch": 0.8927464352138872, "grad_norm": 0.34419068158586147, "learning_rate": 6.863707943271325e-06, "loss": 0.235, "step": 720 }, { "epoch": 0.8989460632362059, "grad_norm": 0.3428004067461644, "learning_rate": 6.096781141689223e-06, "loss": 0.2548, "step": 725 }, { "epoch": 0.9051456912585245, "grad_norm": 0.25288813676576166, "learning_rate": 5.373932883180655e-06, "loss": 0.248, "step": 730 }, { "epoch": 0.9113453192808432, "grad_norm": 0.279501011714134, "learning_rate": 4.695502475582814e-06, "loss": 0.2486, "step": 735 }, { "epoch": 0.9175449473031618, "grad_norm": 0.2244165751957654, "learning_rate": 4.0618083768177e-06, "loss": 0.2594, "step": 740 }, { "epoch": 0.9237445753254805, "grad_norm": 0.30028321537895275, "learning_rate": 3.4731480454065825e-06, "loss": 0.243, "step": 745 }, { "epoch": 0.9299442033477991, "grad_norm": 0.34186025440033796, "learning_rate": 2.92979780084196e-06, "loss": 0.2454, "step": 750 }, { "epoch": 0.9361438313701178, "grad_norm": 0.27131508037063784, "learning_rate": 2.4320126938819023e-06, "loss": 0.2464, "step": 755 }, { "epoch": 0.9423434593924365, "grad_norm": 0.30864982789113204, "learning_rate": 1.980026386828371e-06, "loss": 0.2577, "step": 760 }, { "epoch": 0.9485430874147551, "grad_norm": 0.22799699963033782, "learning_rate": 1.5740510438451371e-06, "loss": 0.2658, "step": 765 }, { "epoch": 0.9547427154370738, "grad_norm": 0.2796812629005932, "learning_rate": 1.214277231367078e-06, "loss": 0.2615, "step": 770 }, { "epoch": 0.9609423434593924, "grad_norm": 0.2906770373909308, "learning_rate": 9.008738286475749e-07, "loss": 0.2557, "step": 775 }, { "epoch": 0.9671419714817111, "grad_norm": 0.30664325433457473, "learning_rate": 6.339879484858924e-07, "loss": 0.2485, "step": 780 }, { "epoch": 0.9733415995040298, "grad_norm": 0.3437865129781069, "learning_rate": 4.1374486817183924e-07, "loss": 0.2514, "step": 785 }, { "epoch": 0.9795412275263484, "grad_norm": 0.2189635610441498, "learning_rate": 2.402479706801741e-07, "loss": 0.2845, "step": 790 }, { "epoch": 0.9857408555486671, "grad_norm": 0.30929816396563364, "learning_rate": 1.1357869614212479e-07, "loss": 0.2564, "step": 795 }, { "epoch": 0.9919404835709857, "grad_norm": 0.31984749573339155, "learning_rate": 3.379650361716724e-08, "loss": 0.2458, "step": 800 }, { "epoch": 0.9919404835709857, "eval_loss": 0.2757934629917145, "eval_runtime": 1122.4404, "eval_samples_per_second": 3.564, "eval_steps_per_second": 0.111, "step": 800 }, { "epoch": 0.9981401115933044, "grad_norm": 0.28178798746795625, "learning_rate": 9.388431826629073e-10, "loss": 0.2537, "step": 805 }, { "epoch": 0.9993800371977681, "step": 806, "total_flos": 2.076935220048691e+16, "train_loss": 0.28037568409153013, "train_runtime": 38684.7769, "train_samples_per_second": 1.334, "train_steps_per_second": 0.021 } ], "logging_steps": 5, "max_steps": 806, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.076935220048691e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }