{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9996830427892235, "eval_steps": 200, "global_step": 1577, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0006339144215530904, "grad_norm": 0.1844951284226507, "learning_rate": 1.2658227848101265e-06, "loss": 0.2445, "step": 1 }, { "epoch": 0.003169572107765452, "grad_norm": 0.4228474323435137, "learning_rate": 6.329113924050633e-06, "loss": 0.5751, "step": 5 }, { "epoch": 0.006339144215530904, "grad_norm": 0.44943947868261674, "learning_rate": 1.2658227848101267e-05, "loss": 0.6355, "step": 10 }, { "epoch": 0.009508716323296355, "grad_norm": 0.4388677072263688, "learning_rate": 1.89873417721519e-05, "loss": 0.5644, "step": 15 }, { "epoch": 0.012678288431061807, "grad_norm": 0.42271873762000256, "learning_rate": 2.5316455696202533e-05, "loss": 0.6074, "step": 20 }, { "epoch": 0.01584786053882726, "grad_norm": 0.394168038759961, "learning_rate": 3.1645569620253167e-05, "loss": 0.5618, "step": 25 }, { "epoch": 0.01901743264659271, "grad_norm": 0.3572253194331872, "learning_rate": 3.79746835443038e-05, "loss": 0.3782, "step": 30 }, { "epoch": 0.022187004754358162, "grad_norm": 0.412555556410598, "learning_rate": 4.430379746835443e-05, "loss": 0.5004, "step": 35 }, { "epoch": 0.025356576862123614, "grad_norm": 0.3794217410787016, "learning_rate": 5.0632911392405066e-05, "loss": 0.4466, "step": 40 }, { "epoch": 0.028526148969889066, "grad_norm": 0.35048440162730615, "learning_rate": 5.69620253164557e-05, "loss": 0.4673, "step": 45 }, { "epoch": 0.03169572107765452, "grad_norm": 0.5944470916433041, "learning_rate": 6.329113924050633e-05, "loss": 0.4321, "step": 50 }, { "epoch": 0.03486529318541997, "grad_norm": 0.42565747209046223, "learning_rate": 6.962025316455697e-05, "loss": 0.3458, "step": 55 }, { "epoch": 0.03803486529318542, "grad_norm": 0.4086754614850085, "learning_rate": 7.59493670886076e-05, "loss": 0.3853, "step": 60 }, { "epoch": 0.04120443740095087, "grad_norm": 0.4100402315211435, "learning_rate": 8.227848101265824e-05, "loss": 0.3248, "step": 65 }, { "epoch": 0.044374009508716325, "grad_norm": 0.439691902474589, "learning_rate": 8.860759493670887e-05, "loss": 0.3902, "step": 70 }, { "epoch": 0.04754358161648178, "grad_norm": 0.4789983229510449, "learning_rate": 9.493670886075949e-05, "loss": 0.4375, "step": 75 }, { "epoch": 0.05071315372424723, "grad_norm": 0.4326295423185557, "learning_rate": 0.00010126582278481013, "loss": 0.344, "step": 80 }, { "epoch": 0.05388272583201268, "grad_norm": 0.4082745764246507, "learning_rate": 0.00010759493670886076, "loss": 0.373, "step": 85 }, { "epoch": 0.05705229793977813, "grad_norm": 0.44917869809554406, "learning_rate": 0.0001139240506329114, "loss": 0.3366, "step": 90 }, { "epoch": 0.060221870047543584, "grad_norm": 0.4099898854278625, "learning_rate": 0.00012025316455696203, "loss": 0.3827, "step": 95 }, { "epoch": 0.06339144215530904, "grad_norm": 0.5173612120396457, "learning_rate": 0.00012658227848101267, "loss": 0.3913, "step": 100 }, { "epoch": 0.06656101426307448, "grad_norm": 0.4695908910723305, "learning_rate": 0.0001329113924050633, "loss": 0.3285, "step": 105 }, { "epoch": 0.06973058637083994, "grad_norm": 0.34610029250066504, "learning_rate": 0.00013924050632911395, "loss": 0.3542, "step": 110 }, { "epoch": 0.07290015847860538, "grad_norm": 0.3833563232036365, "learning_rate": 0.00014556962025316457, "loss": 0.3442, "step": 115 }, { "epoch": 0.07606973058637084, "grad_norm": 0.38597736664868315, "learning_rate": 0.0001518987341772152, "loss": 0.3499, "step": 120 }, { "epoch": 0.07923930269413629, "grad_norm": 0.4555824320300245, "learning_rate": 0.00015822784810126583, "loss": 0.3843, "step": 125 }, { "epoch": 0.08240887480190175, "grad_norm": 0.44058959604469733, "learning_rate": 0.00016455696202531648, "loss": 0.3321, "step": 130 }, { "epoch": 0.08557844690966719, "grad_norm": 0.37513672150754146, "learning_rate": 0.0001708860759493671, "loss": 0.3409, "step": 135 }, { "epoch": 0.08874801901743265, "grad_norm": 0.3532888739409051, "learning_rate": 0.00017721518987341773, "loss": 0.3388, "step": 140 }, { "epoch": 0.0919175911251981, "grad_norm": 0.31398944959900404, "learning_rate": 0.00018354430379746836, "loss": 0.3407, "step": 145 }, { "epoch": 0.09508716323296355, "grad_norm": 0.48473648286443866, "learning_rate": 0.00018987341772151899, "loss": 0.4109, "step": 150 }, { "epoch": 0.098256735340729, "grad_norm": 0.3832743712760423, "learning_rate": 0.00019620253164556964, "loss": 0.2894, "step": 155 }, { "epoch": 0.10142630744849446, "grad_norm": 0.3576599310136604, "learning_rate": 0.00019999901968817678, "loss": 0.3685, "step": 160 }, { "epoch": 0.1045958795562599, "grad_norm": 0.4041268184733326, "learning_rate": 0.0001999879914008964, "loss": 0.3103, "step": 165 }, { "epoch": 0.10776545166402536, "grad_norm": 0.348710082889974, "learning_rate": 0.00019996471079244477, "loss": 0.3686, "step": 170 }, { "epoch": 0.1109350237717908, "grad_norm": 0.3641139077278622, "learning_rate": 0.0001999291807155794, "loss": 0.3672, "step": 175 }, { "epoch": 0.11410459587955626, "grad_norm": 0.34875291735749603, "learning_rate": 0.0001998814055240823, "loss": 0.3289, "step": 180 }, { "epoch": 0.11727416798732171, "grad_norm": 0.35868082118594846, "learning_rate": 0.00019982139107222632, "loss": 0.3843, "step": 185 }, { "epoch": 0.12044374009508717, "grad_norm": 0.2975053354861811, "learning_rate": 0.000199749144714058, "loss": 0.3187, "step": 190 }, { "epoch": 0.12361331220285261, "grad_norm": 0.3926097041806586, "learning_rate": 0.00019966467530249627, "loss": 0.3711, "step": 195 }, { "epoch": 0.12678288431061807, "grad_norm": 0.39235636818547276, "learning_rate": 0.00019956799318824776, "loss": 0.3599, "step": 200 }, { "epoch": 0.12678288431061807, "eval_loss": 0.31717613339424133, "eval_runtime": 878.4135, "eval_samples_per_second": 4.554, "eval_steps_per_second": 0.569, "step": 200 }, { "epoch": 0.12995245641838352, "grad_norm": 0.32366959300654363, "learning_rate": 0.00019945911021853818, "loss": 0.2671, "step": 205 }, { "epoch": 0.13312202852614896, "grad_norm": 0.34183927553766114, "learning_rate": 0.00019933803973566102, "loss": 0.3491, "step": 210 }, { "epoch": 0.13629160063391443, "grad_norm": 0.355629049879592, "learning_rate": 0.0001992047965753422, "loss": 0.2778, "step": 215 }, { "epoch": 0.13946117274167988, "grad_norm": 0.31194706241410036, "learning_rate": 0.00019905939706492238, "loss": 0.3278, "step": 220 }, { "epoch": 0.14263074484944532, "grad_norm": 0.37190501088914274, "learning_rate": 0.0001989018590213561, "loss": 0.3757, "step": 225 }, { "epoch": 0.14580031695721077, "grad_norm": 0.30859177154159206, "learning_rate": 0.00019873220174902858, "loss": 0.2952, "step": 230 }, { "epoch": 0.14896988906497624, "grad_norm": 0.4072493051692793, "learning_rate": 0.0001985504460373903, "loss": 0.3576, "step": 235 }, { "epoch": 0.15213946117274169, "grad_norm": 0.3117614582623609, "learning_rate": 0.00019835661415840928, "loss": 0.3127, "step": 240 }, { "epoch": 0.15530903328050713, "grad_norm": 0.3433870206019631, "learning_rate": 0.00019815072986384218, "loss": 0.3424, "step": 245 }, { "epoch": 0.15847860538827258, "grad_norm": 0.3252374107324197, "learning_rate": 0.0001979328183823236, "loss": 0.3509, "step": 250 }, { "epoch": 0.16164817749603805, "grad_norm": 0.32574757253252834, "learning_rate": 0.00019770290641627468, "loss": 0.2913, "step": 255 }, { "epoch": 0.1648177496038035, "grad_norm": 0.37343408069668577, "learning_rate": 0.00019746102213863114, "loss": 0.3524, "step": 260 }, { "epoch": 0.16798732171156894, "grad_norm": 0.30197216412790706, "learning_rate": 0.00019720719518939083, "loss": 0.295, "step": 265 }, { "epoch": 0.17115689381933438, "grad_norm": 0.37750434171669517, "learning_rate": 0.00019694145667198195, "loss": 0.3215, "step": 270 }, { "epoch": 0.17432646592709986, "grad_norm": 0.3368196048030473, "learning_rate": 0.0001966638391494514, "loss": 0.35, "step": 275 }, { "epoch": 0.1774960380348653, "grad_norm": 0.3232595651729065, "learning_rate": 0.0001963743766404749, "loss": 0.2637, "step": 280 }, { "epoch": 0.18066561014263074, "grad_norm": 0.32199548202560035, "learning_rate": 0.00019607310461518818, "loss": 0.3262, "step": 285 }, { "epoch": 0.1838351822503962, "grad_norm": 0.29117926540088634, "learning_rate": 0.0001957600599908406, "loss": 0.3129, "step": 290 }, { "epoch": 0.18700475435816163, "grad_norm": 0.2836794081153409, "learning_rate": 0.00019543528112727146, "loss": 0.3207, "step": 295 }, { "epoch": 0.1901743264659271, "grad_norm": 0.37478385305484463, "learning_rate": 0.0001950988078222093, "loss": 0.3503, "step": 300 }, { "epoch": 0.19334389857369255, "grad_norm": 0.3323790483161259, "learning_rate": 0.00019475068130639543, "loss": 0.2873, "step": 305 }, { "epoch": 0.196513470681458, "grad_norm": 0.31045326503955184, "learning_rate": 0.0001943909442385313, "loss": 0.3379, "step": 310 }, { "epoch": 0.19968304278922344, "grad_norm": 0.295428110940092, "learning_rate": 0.00019401964070005144, "loss": 0.2913, "step": 315 }, { "epoch": 0.20285261489698891, "grad_norm": 0.31381749704770145, "learning_rate": 0.00019363681618972164, "loss": 0.3167, "step": 320 }, { "epoch": 0.20602218700475436, "grad_norm": 0.3799683908480184, "learning_rate": 0.00019324251761806374, "loss": 0.3203, "step": 325 }, { "epoch": 0.2091917591125198, "grad_norm": 0.25669447806119594, "learning_rate": 0.00019283679330160726, "loss": 0.2598, "step": 330 }, { "epoch": 0.21236133122028525, "grad_norm": 0.3253285501894849, "learning_rate": 0.00019241969295696879, "loss": 0.321, "step": 335 }, { "epoch": 0.21553090332805072, "grad_norm": 0.3015776648780859, "learning_rate": 0.0001919912676947598, "loss": 0.2912, "step": 340 }, { "epoch": 0.21870047543581617, "grad_norm": 0.3548152436637532, "learning_rate": 0.00019155157001332374, "loss": 0.3398, "step": 345 }, { "epoch": 0.2218700475435816, "grad_norm": 0.3562179525646546, "learning_rate": 0.00019110065379230289, "loss": 0.3575, "step": 350 }, { "epoch": 0.22503961965134706, "grad_norm": 0.33759944051182883, "learning_rate": 0.00019063857428603615, "loss": 0.2644, "step": 355 }, { "epoch": 0.22820919175911253, "grad_norm": 0.3478332359179607, "learning_rate": 0.00019016538811678823, "loss": 0.3421, "step": 360 }, { "epoch": 0.23137876386687797, "grad_norm": 0.3107602080624315, "learning_rate": 0.0001896811532678113, "loss": 0.262, "step": 365 }, { "epoch": 0.23454833597464342, "grad_norm": 0.26971775917740104, "learning_rate": 0.00018918592907623985, "loss": 0.3378, "step": 370 }, { "epoch": 0.23771790808240886, "grad_norm": 0.32413332448217697, "learning_rate": 0.00018867977622581957, "loss": 0.3316, "step": 375 }, { "epoch": 0.24088748019017434, "grad_norm": 0.3522975093101741, "learning_rate": 0.00018816275673947148, "loss": 0.2678, "step": 380 }, { "epoch": 0.24405705229793978, "grad_norm": 0.31661852350790726, "learning_rate": 0.00018763493397169146, "loss": 0.3275, "step": 385 }, { "epoch": 0.24722662440570523, "grad_norm": 0.27090727261610936, "learning_rate": 0.00018709637260078729, "loss": 0.2858, "step": 390 }, { "epoch": 0.25039619651347067, "grad_norm": 0.3143474617991223, "learning_rate": 0.0001865471386209527, "loss": 0.3317, "step": 395 }, { "epoch": 0.25356576862123614, "grad_norm": 0.48811153855723693, "learning_rate": 0.000185987299334181, "loss": 0.3295, "step": 400 }, { "epoch": 0.25356576862123614, "eval_loss": 0.29194891452789307, "eval_runtime": 872.9978, "eval_samples_per_second": 4.582, "eval_steps_per_second": 0.573, "step": 400 }, { "epoch": 0.25673534072900156, "grad_norm": 0.31755342222995686, "learning_rate": 0.00018541692334201771, "loss": 0.2643, "step": 405 }, { "epoch": 0.25990491283676703, "grad_norm": 0.34778059073770806, "learning_rate": 0.0001848360805371544, "loss": 0.3339, "step": 410 }, { "epoch": 0.2630744849445325, "grad_norm": 0.3183073063986642, "learning_rate": 0.00018424484209486416, "loss": 0.2673, "step": 415 }, { "epoch": 0.2662440570522979, "grad_norm": 0.2788199901083398, "learning_rate": 0.00018364328046428, "loss": 0.3272, "step": 420 }, { "epoch": 0.2694136291600634, "grad_norm": 0.3666143727147526, "learning_rate": 0.00018303146935951689, "loss": 0.3247, "step": 425 }, { "epoch": 0.27258320126782887, "grad_norm": 0.28586548327038175, "learning_rate": 0.00018240948375063926, "loss": 0.2792, "step": 430 }, { "epoch": 0.2757527733755943, "grad_norm": 0.9727255846044429, "learning_rate": 0.00018177739985447412, "loss": 0.3485, "step": 435 }, { "epoch": 0.27892234548335976, "grad_norm": 0.29065854553956355, "learning_rate": 0.0001811352951252717, "loss": 0.2729, "step": 440 }, { "epoch": 0.2820919175911252, "grad_norm": 0.320575993183303, "learning_rate": 0.0001804832482452142, "loss": 0.3354, "step": 445 }, { "epoch": 0.28526148969889065, "grad_norm": 0.34869737354697955, "learning_rate": 0.0001798213391147746, "loss": 0.3385, "step": 450 }, { "epoch": 0.2884310618066561, "grad_norm": 0.31478642211651564, "learning_rate": 0.00017914964884292544, "loss": 0.3133, "step": 455 }, { "epoch": 0.29160063391442154, "grad_norm": 0.36834278711947965, "learning_rate": 0.0001784682597372, "loss": 0.3593, "step": 460 }, { "epoch": 0.294770206022187, "grad_norm": 0.2791902388221146, "learning_rate": 0.00017777725529360676, "loss": 0.3005, "step": 465 }, { "epoch": 0.2979397781299525, "grad_norm": 0.30096452678752406, "learning_rate": 0.00017707672018639758, "loss": 0.3354, "step": 470 }, { "epoch": 0.3011093502377179, "grad_norm": 0.3708048891578612, "learning_rate": 0.00017636674025769215, "loss": 0.3147, "step": 475 }, { "epoch": 0.30427892234548337, "grad_norm": 0.305209122691005, "learning_rate": 0.00017564740250695904, "loss": 0.2713, "step": 480 }, { "epoch": 0.3074484944532488, "grad_norm": 0.3018873391630076, "learning_rate": 0.0001749187950803549, "loss": 0.3202, "step": 485 }, { "epoch": 0.31061806656101426, "grad_norm": 0.3464422287874134, "learning_rate": 0.00017418100725992316, "loss": 0.3042, "step": 490 }, { "epoch": 0.31378763866877973, "grad_norm": 0.31036543367721087, "learning_rate": 0.00017343412945265382, "loss": 0.3105, "step": 495 }, { "epoch": 0.31695721077654515, "grad_norm": 0.3090116757558095, "learning_rate": 0.00017267825317940493, "loss": 0.3086, "step": 500 }, { "epoch": 0.3201267828843106, "grad_norm": 0.32015559999952525, "learning_rate": 0.00017191347106368797, "loss": 0.2595, "step": 505 }, { "epoch": 0.3232963549920761, "grad_norm": 0.28242640929152685, "learning_rate": 0.0001711398768203178, "loss": 0.3171, "step": 510 }, { "epoch": 0.3264659270998415, "grad_norm": 0.3373697781712397, "learning_rate": 0.00017035756524392924, "loss": 0.2897, "step": 515 }, { "epoch": 0.329635499207607, "grad_norm": 0.3187883343723006, "learning_rate": 0.0001695666321973609, "loss": 0.303, "step": 520 }, { "epoch": 0.3328050713153724, "grad_norm": 0.4060972163443389, "learning_rate": 0.00016876717459990862, "loss": 0.3273, "step": 525 }, { "epoch": 0.3359746434231379, "grad_norm": 0.2709960074426642, "learning_rate": 0.0001679592904154489, "loss": 0.2629, "step": 530 }, { "epoch": 0.33914421553090335, "grad_norm": 0.2828719972128079, "learning_rate": 0.00016714307864043487, "loss": 0.2946, "step": 535 }, { "epoch": 0.34231378763866877, "grad_norm": 0.29485357171410065, "learning_rate": 0.00016631863929176524, "loss": 0.2704, "step": 540 }, { "epoch": 0.34548335974643424, "grad_norm": 0.3140677978027709, "learning_rate": 0.00016548607339452853, "loss": 0.3211, "step": 545 }, { "epoch": 0.3486529318541997, "grad_norm": 0.30224374704766904, "learning_rate": 0.00016464548296962373, "loss": 0.3289, "step": 550 }, { "epoch": 0.3518225039619651, "grad_norm": 0.3015178734291492, "learning_rate": 0.0001637969710212588, "loss": 0.262, "step": 555 }, { "epoch": 0.3549920760697306, "grad_norm": 0.3261808476280464, "learning_rate": 0.00016294064152432879, "loss": 0.3524, "step": 560 }, { "epoch": 0.358161648177496, "grad_norm": 0.30420040263110554, "learning_rate": 0.00016207659941167485, "loss": 0.2888, "step": 565 }, { "epoch": 0.3613312202852615, "grad_norm": 0.29855740633395794, "learning_rate": 0.00016120495056122622, "loss": 0.3075, "step": 570 }, { "epoch": 0.36450079239302696, "grad_norm": 0.3775755682614953, "learning_rate": 0.00016032580178302583, "loss": 0.3452, "step": 575 }, { "epoch": 0.3676703645007924, "grad_norm": 0.3189277602131783, "learning_rate": 0.00015943926080614235, "loss": 0.2643, "step": 580 }, { "epoch": 0.37083993660855785, "grad_norm": 0.32115548282274786, "learning_rate": 0.00015854543626546915, "loss": 0.3126, "step": 585 }, { "epoch": 0.37400950871632327, "grad_norm": 0.29230296850863174, "learning_rate": 0.00015764443768841234, "loss": 0.2949, "step": 590 }, { "epoch": 0.37717908082408874, "grad_norm": 0.32187057297721217, "learning_rate": 0.0001567363754814696, "loss": 0.3166, "step": 595 }, { "epoch": 0.3803486529318542, "grad_norm": 0.3766752931165212, "learning_rate": 0.0001558213609167012, "loss": 0.323, "step": 600 }, { "epoch": 0.3803486529318542, "eval_loss": 0.2788923680782318, "eval_runtime": 873.5171, "eval_samples_per_second": 4.579, "eval_steps_per_second": 0.572, "step": 600 }, { "epoch": 0.38351822503961963, "grad_norm": 0.31877960462977273, "learning_rate": 0.00015489950611809484, "loss": 0.2803, "step": 605 }, { "epoch": 0.3866877971473851, "grad_norm": 0.2903622851026156, "learning_rate": 0.00015397092404782642, "loss": 0.3178, "step": 610 }, { "epoch": 0.3898573692551506, "grad_norm": 0.2639727101749139, "learning_rate": 0.00015303572849241764, "loss": 0.2703, "step": 615 }, { "epoch": 0.393026941362916, "grad_norm": 0.3491709894849581, "learning_rate": 0.00015209403404879303, "loss": 0.3049, "step": 620 }, { "epoch": 0.39619651347068147, "grad_norm": 0.3651420024997032, "learning_rate": 0.00015114595611023744, "loss": 0.3265, "step": 625 }, { "epoch": 0.3993660855784469, "grad_norm": 0.3071330073578763, "learning_rate": 0.0001501916108522558, "loss": 0.2645, "step": 630 }, { "epoch": 0.40253565768621236, "grad_norm": 0.2739471545543727, "learning_rate": 0.00014923111521833758, "loss": 0.3035, "step": 635 }, { "epoch": 0.40570522979397783, "grad_norm": 0.30630113259525843, "learning_rate": 0.00014826458690562642, "loss": 0.2606, "step": 640 }, { "epoch": 0.40887480190174325, "grad_norm": 0.2988843883769528, "learning_rate": 0.00014729214435049793, "loss": 0.3111, "step": 645 }, { "epoch": 0.4120443740095087, "grad_norm": 0.3110979862585215, "learning_rate": 0.0001463139067140468, "loss": 0.2948, "step": 650 }, { "epoch": 0.4152139461172742, "grad_norm": 0.30767657253531316, "learning_rate": 0.0001453299938674849, "loss": 0.2638, "step": 655 }, { "epoch": 0.4183835182250396, "grad_norm": 0.27014842841388653, "learning_rate": 0.00014434052637745257, "loss": 0.2819, "step": 660 }, { "epoch": 0.4215530903328051, "grad_norm": 0.2739393681355767, "learning_rate": 0.00014334562549124467, "loss": 0.2466, "step": 665 }, { "epoch": 0.4247226624405705, "grad_norm": 0.31758998023523244, "learning_rate": 0.00014234541312195323, "loss": 0.2873, "step": 670 }, { "epoch": 0.42789223454833597, "grad_norm": 0.39847849128188423, "learning_rate": 0.00014134001183352832, "loss": 0.2979, "step": 675 }, { "epoch": 0.43106180665610144, "grad_norm": 0.30950118355401873, "learning_rate": 0.00014032954482575937, "loss": 0.2617, "step": 680 }, { "epoch": 0.43423137876386686, "grad_norm": 0.3260587574739946, "learning_rate": 0.0001393141359191787, "loss": 0.3109, "step": 685 }, { "epoch": 0.43740095087163233, "grad_norm": 0.3114375419997854, "learning_rate": 0.00013829390953988853, "loss": 0.2845, "step": 690 }, { "epoch": 0.4405705229793978, "grad_norm": 0.30019871836883555, "learning_rate": 0.00013726899070431423, "loss": 0.324, "step": 695 }, { "epoch": 0.4437400950871632, "grad_norm": 0.38021042516470643, "learning_rate": 0.00013623950500388506, "loss": 0.3269, "step": 700 }, { "epoch": 0.4469096671949287, "grad_norm": 0.3089060241706131, "learning_rate": 0.00013520557858964446, "loss": 0.2584, "step": 705 }, { "epoch": 0.4500792393026941, "grad_norm": 0.27984586622582663, "learning_rate": 0.00013416733815679166, "loss": 0.2909, "step": 710 }, { "epoch": 0.4532488114104596, "grad_norm": 0.2923559292409706, "learning_rate": 0.00013312491092915682, "loss": 0.2489, "step": 715 }, { "epoch": 0.45641838351822506, "grad_norm": 0.29223045315786345, "learning_rate": 0.00013207842464361125, "loss": 0.3135, "step": 720 }, { "epoch": 0.4595879556259905, "grad_norm": 0.33907899924090856, "learning_rate": 0.00013102800753441487, "loss": 0.3148, "step": 725 }, { "epoch": 0.46275752773375595, "grad_norm": 0.26110455456342696, "learning_rate": 0.00012997378831750242, "loss": 0.2505, "step": 730 }, { "epoch": 0.4659270998415214, "grad_norm": 0.2855563878095534, "learning_rate": 0.00012891589617471122, "loss": 0.322, "step": 735 }, { "epoch": 0.46909667194928684, "grad_norm": 0.27089962197787903, "learning_rate": 0.00012785446073795118, "loss": 0.2629, "step": 740 }, { "epoch": 0.4722662440570523, "grad_norm": 0.2787588891548799, "learning_rate": 0.00012678961207332015, "loss": 0.3071, "step": 745 }, { "epoch": 0.4754358161648177, "grad_norm": 0.35249049637057156, "learning_rate": 0.00012572148066516584, "loss": 0.3265, "step": 750 }, { "epoch": 0.4786053882725832, "grad_norm": 0.33307560406452336, "learning_rate": 0.00012465019740009662, "loss": 0.2403, "step": 755 }, { "epoch": 0.48177496038034867, "grad_norm": 0.3035753509057755, "learning_rate": 0.00012357589355094275, "loss": 0.3057, "step": 760 }, { "epoch": 0.4849445324881141, "grad_norm": 0.2950972689886197, "learning_rate": 0.00012249870076067067, "loss": 0.2637, "step": 765 }, { "epoch": 0.48811410459587956, "grad_norm": 0.2713040409786771, "learning_rate": 0.00012141875102625167, "loss": 0.3196, "step": 770 }, { "epoch": 0.49128367670364503, "grad_norm": 0.37005187803966516, "learning_rate": 0.00012033617668248723, "loss": 0.3265, "step": 775 }, { "epoch": 0.49445324881141045, "grad_norm": 0.3678796577106568, "learning_rate": 0.00011925111038579309, "loss": 0.2283, "step": 780 }, { "epoch": 0.4976228209191759, "grad_norm": 0.3021844529595635, "learning_rate": 0.00011816368509794364, "loss": 0.2967, "step": 785 }, { "epoch": 0.5007923930269413, "grad_norm": 0.3028161473676034, "learning_rate": 0.00011707403406977928, "loss": 0.2841, "step": 790 }, { "epoch": 0.5039619651347068, "grad_norm": 0.27418964538735746, "learning_rate": 0.00011598229082487784, "loss": 0.2803, "step": 795 }, { "epoch": 0.5071315372424723, "grad_norm": 0.3426638434156249, "learning_rate": 0.0001148885891431932, "loss": 0.3274, "step": 800 }, { "epoch": 0.5071315372424723, "eval_loss": 0.26855266094207764, "eval_runtime": 873.628, "eval_samples_per_second": 4.579, "eval_steps_per_second": 0.572, "step": 800 }, { "epoch": 0.5103011093502378, "grad_norm": 0.2681269338020656, "learning_rate": 0.00011379306304466198, "loss": 0.2381, "step": 805 }, { "epoch": 0.5134706814580031, "grad_norm": 0.2987060218422062, "learning_rate": 0.00011269584677278102, "loss": 0.3076, "step": 810 }, { "epoch": 0.5166402535657686, "grad_norm": 0.2804222341073312, "learning_rate": 0.00011159707477815755, "loss": 0.2395, "step": 815 }, { "epoch": 0.5198098256735341, "grad_norm": 0.25835895356413513, "learning_rate": 0.00011049688170203383, "loss": 0.3041, "step": 820 }, { "epoch": 0.5229793977812995, "grad_norm": 0.3313190058494361, "learning_rate": 0.00010939540235978845, "loss": 0.297, "step": 825 }, { "epoch": 0.526148969889065, "grad_norm": 0.2564972143294916, "learning_rate": 0.00010829277172441648, "loss": 0.2359, "step": 830 }, { "epoch": 0.5293185419968305, "grad_norm": 0.31632766018739716, "learning_rate": 0.00010718912490998991, "loss": 0.3112, "step": 835 }, { "epoch": 0.5324881141045958, "grad_norm": 0.2738970193614327, "learning_rate": 0.00010608459715510139, "loss": 0.2416, "step": 840 }, { "epoch": 0.5356576862123613, "grad_norm": 0.35306801364530893, "learning_rate": 0.00010497932380629207, "loss": 0.3334, "step": 845 }, { "epoch": 0.5388272583201268, "grad_norm": 0.3617753781992424, "learning_rate": 0.00010387344030146665, "loss": 0.3071, "step": 850 }, { "epoch": 0.5419968304278923, "grad_norm": 0.284695185318866, "learning_rate": 0.0001027670821532971, "loss": 0.2516, "step": 855 }, { "epoch": 0.5451664025356577, "grad_norm": 0.28641499966999695, "learning_rate": 0.00010166038493261722, "loss": 0.3268, "step": 860 }, { "epoch": 0.5483359746434231, "grad_norm": 0.29940254299061986, "learning_rate": 0.00010055348425181, "loss": 0.2667, "step": 865 }, { "epoch": 0.5515055467511886, "grad_norm": 0.33784906825030664, "learning_rate": 9.944651574819003e-05, "loss": 0.3006, "step": 870 }, { "epoch": 0.554675118858954, "grad_norm": 0.33800198210916443, "learning_rate": 9.83396150673828e-05, "loss": 0.3009, "step": 875 }, { "epoch": 0.5578446909667195, "grad_norm": 0.27814752259908526, "learning_rate": 9.72329178467029e-05, "loss": 0.25, "step": 880 }, { "epoch": 0.561014263074485, "grad_norm": 0.3120985607406773, "learning_rate": 9.612655969853336e-05, "loss": 0.3079, "step": 885 }, { "epoch": 0.5641838351822503, "grad_norm": 0.32270045792226343, "learning_rate": 9.502067619370794e-05, "loss": 0.2465, "step": 890 }, { "epoch": 0.5673534072900158, "grad_norm": 0.2522429392869884, "learning_rate": 9.391540284489862e-05, "loss": 0.3049, "step": 895 }, { "epoch": 0.5705229793977813, "grad_norm": 0.32479021947356745, "learning_rate": 9.281087509001011e-05, "loss": 0.3109, "step": 900 }, { "epoch": 0.5736925515055468, "grad_norm": 0.3071871099500722, "learning_rate": 9.170722827558358e-05, "loss": 0.2566, "step": 905 }, { "epoch": 0.5768621236133122, "grad_norm": 0.2808358292017096, "learning_rate": 9.060459764021156e-05, "loss": 0.2981, "step": 910 }, { "epoch": 0.5800316957210776, "grad_norm": 0.36613518181258947, "learning_rate": 8.950311829796619e-05, "loss": 0.2812, "step": 915 }, { "epoch": 0.5832012678288431, "grad_norm": 0.29120302112196544, "learning_rate": 8.840292522184247e-05, "loss": 0.2958, "step": 920 }, { "epoch": 0.5863708399366085, "grad_norm": 0.3008146054202439, "learning_rate": 8.730415322721897e-05, "loss": 0.3119, "step": 925 }, { "epoch": 0.589540412044374, "grad_norm": 0.30809505125548203, "learning_rate": 8.620693695533803e-05, "loss": 0.2603, "step": 930 }, { "epoch": 0.5927099841521395, "grad_norm": 0.3464042931932695, "learning_rate": 8.511141085680683e-05, "loss": 0.3217, "step": 935 }, { "epoch": 0.595879556259905, "grad_norm": 0.28395404105986655, "learning_rate": 8.401770917512221e-05, "loss": 0.2339, "step": 940 }, { "epoch": 0.5990491283676703, "grad_norm": 0.32456815689823176, "learning_rate": 8.292596593022075e-05, "loss": 0.2761, "step": 945 }, { "epoch": 0.6022187004754358, "grad_norm": 0.35814205267620147, "learning_rate": 8.183631490205637e-05, "loss": 0.3064, "step": 950 }, { "epoch": 0.6053882725832013, "grad_norm": 0.3307025804465351, "learning_rate": 8.074888961420695e-05, "loss": 0.2317, "step": 955 }, { "epoch": 0.6085578446909667, "grad_norm": 0.3035093202164917, "learning_rate": 7.966382331751277e-05, "loss": 0.3024, "step": 960 }, { "epoch": 0.6117274167987322, "grad_norm": 0.23483953416505404, "learning_rate": 7.858124897374837e-05, "loss": 0.2616, "step": 965 }, { "epoch": 0.6148969889064976, "grad_norm": 0.24795445024402282, "learning_rate": 7.750129923932939e-05, "loss": 0.2889, "step": 970 }, { "epoch": 0.618066561014263, "grad_norm": 0.39470726118892546, "learning_rate": 7.642410644905726e-05, "loss": 0.3255, "step": 975 }, { "epoch": 0.6212361331220285, "grad_norm": 0.28578857562483734, "learning_rate": 7.534980259990341e-05, "loss": 0.2177, "step": 980 }, { "epoch": 0.624405705229794, "grad_norm": 0.293120691065387, "learning_rate": 7.427851933483418e-05, "loss": 0.3008, "step": 985 }, { "epoch": 0.6275752773375595, "grad_norm": 0.28050824031198807, "learning_rate": 7.321038792667987e-05, "loss": 0.2617, "step": 990 }, { "epoch": 0.6307448494453248, "grad_norm": 0.3421819179459905, "learning_rate": 7.214553926204883e-05, "loss": 0.2827, "step": 995 }, { "epoch": 0.6339144215530903, "grad_norm": 0.3825000717076991, "learning_rate": 7.108410382528879e-05, "loss": 0.3171, "step": 1000 }, { "epoch": 0.6339144215530903, "eval_loss": 0.2597305178642273, "eval_runtime": 873.3574, "eval_samples_per_second": 4.58, "eval_steps_per_second": 0.573, "step": 1000 }, { "epoch": 0.6370839936608558, "grad_norm": 0.293460396656183, "learning_rate": 7.002621168249759e-05, "loss": 0.2297, "step": 1005 }, { "epoch": 0.6402535657686212, "grad_norm": 0.3006160040194, "learning_rate": 6.897199246558514e-05, "loss": 0.2956, "step": 1010 }, { "epoch": 0.6434231378763867, "grad_norm": 0.2791223126874652, "learning_rate": 6.792157535638874e-05, "loss": 0.2496, "step": 1015 }, { "epoch": 0.6465927099841522, "grad_norm": 0.2894662197144813, "learning_rate": 6.687508907084319e-05, "loss": 0.2866, "step": 1020 }, { "epoch": 0.6497622820919176, "grad_norm": 0.33156274133370534, "learning_rate": 6.583266184320836e-05, "loss": 0.32, "step": 1025 }, { "epoch": 0.652931854199683, "grad_norm": 0.3447301699746775, "learning_rate": 6.479442141035556e-05, "loss": 0.2555, "step": 1030 }, { "epoch": 0.6561014263074485, "grad_norm": 0.3019937172628048, "learning_rate": 6.376049499611496e-05, "loss": 0.2632, "step": 1035 }, { "epoch": 0.659270998415214, "grad_norm": 0.25047087286035274, "learning_rate": 6.273100929568578e-05, "loss": 0.2472, "step": 1040 }, { "epoch": 0.6624405705229794, "grad_norm": 0.31801398649186896, "learning_rate": 6.170609046011151e-05, "loss": 0.2793, "step": 1045 }, { "epoch": 0.6656101426307448, "grad_norm": 0.3464523898432614, "learning_rate": 6.068586408082133e-05, "loss": 0.3138, "step": 1050 }, { "epoch": 0.6687797147385103, "grad_norm": 0.2919062799416737, "learning_rate": 5.9670455174240614e-05, "loss": 0.2427, "step": 1055 }, { "epoch": 0.6719492868462758, "grad_norm": 0.29267872629520425, "learning_rate": 5.865998816647171e-05, "loss": 0.3038, "step": 1060 }, { "epoch": 0.6751188589540412, "grad_norm": 0.27361822239828004, "learning_rate": 5.765458687804679e-05, "loss": 0.2566, "step": 1065 }, { "epoch": 0.6782884310618067, "grad_norm": 0.3050132066017946, "learning_rate": 5.665437450875534e-05, "loss": 0.2752, "step": 1070 }, { "epoch": 0.6814580031695721, "grad_norm": 0.3580338711915158, "learning_rate": 5.565947362254746e-05, "loss": 0.3331, "step": 1075 }, { "epoch": 0.6846275752773375, "grad_norm": 0.26747930377415474, "learning_rate": 5.467000613251516e-05, "loss": 0.2429, "step": 1080 }, { "epoch": 0.687797147385103, "grad_norm": 0.32226567868782413, "learning_rate": 5.368609328595323e-05, "loss": 0.3208, "step": 1085 }, { "epoch": 0.6909667194928685, "grad_norm": 0.27314417996148593, "learning_rate": 5.270785564950208e-05, "loss": 0.2351, "step": 1090 }, { "epoch": 0.694136291600634, "grad_norm": 0.31179553442460595, "learning_rate": 5.1735413094373594e-05, "loss": 0.2791, "step": 1095 }, { "epoch": 0.6973058637083994, "grad_norm": 0.2983027582550753, "learning_rate": 5.0768884781662465e-05, "loss": 0.3123, "step": 1100 }, { "epoch": 0.7004754358161648, "grad_norm": 0.268619063810808, "learning_rate": 4.9808389147744195e-05, "loss": 0.2675, "step": 1105 }, { "epoch": 0.7036450079239303, "grad_norm": 0.34151620569667657, "learning_rate": 4.885404388976261e-05, "loss": 0.3171, "step": 1110 }, { "epoch": 0.7068145800316957, "grad_norm": 0.25963093128586956, "learning_rate": 4.790596595120699e-05, "loss": 0.2533, "step": 1115 }, { "epoch": 0.7099841521394612, "grad_norm": 0.3373621924020373, "learning_rate": 4.696427150758238e-05, "loss": 0.3017, "step": 1120 }, { "epoch": 0.7131537242472267, "grad_norm": 0.32633352666577314, "learning_rate": 4.6029075952173596e-05, "loss": 0.3052, "step": 1125 }, { "epoch": 0.716323296354992, "grad_norm": 0.24971258370165642, "learning_rate": 4.510049388190518e-05, "loss": 0.2044, "step": 1130 }, { "epoch": 0.7194928684627575, "grad_norm": 0.29602844393415106, "learning_rate": 4.417863908329884e-05, "loss": 0.2959, "step": 1135 }, { "epoch": 0.722662440570523, "grad_norm": 0.23146594836780063, "learning_rate": 4.32636245185304e-05, "loss": 0.2252, "step": 1140 }, { "epoch": 0.7258320126782885, "grad_norm": 0.2744736835188008, "learning_rate": 4.235556231158765e-05, "loss": 0.2884, "step": 1145 }, { "epoch": 0.7290015847860539, "grad_norm": 0.27538990975844047, "learning_rate": 4.145456373453087e-05, "loss": 0.2981, "step": 1150 }, { "epoch": 0.7321711568938193, "grad_norm": 0.3032208366026702, "learning_rate": 4.0560739193857625e-05, "loss": 0.2158, "step": 1155 }, { "epoch": 0.7353407290015848, "grad_norm": 0.27204457210068295, "learning_rate": 3.96741982169742e-05, "loss": 0.3028, "step": 1160 }, { "epoch": 0.7385103011093502, "grad_norm": 0.28301662262727184, "learning_rate": 3.8795049438773825e-05, "loss": 0.2946, "step": 1165 }, { "epoch": 0.7416798732171157, "grad_norm": 0.2884264535746388, "learning_rate": 3.7923400588325155e-05, "loss": 0.3015, "step": 1170 }, { "epoch": 0.7448494453248812, "grad_norm": 0.3186549926460967, "learning_rate": 3.7059358475671224e-05, "loss": 0.2773, "step": 1175 }, { "epoch": 0.7480190174326465, "grad_norm": 0.2997708530371057, "learning_rate": 3.6203028978741226e-05, "loss": 0.2469, "step": 1180 }, { "epoch": 0.751188589540412, "grad_norm": 0.32430776300917263, "learning_rate": 3.535451703037626e-05, "loss": 0.2726, "step": 1185 }, { "epoch": 0.7543581616481775, "grad_norm": 0.2946578935656507, "learning_rate": 3.45139266054715e-05, "loss": 0.2645, "step": 1190 }, { "epoch": 0.757527733755943, "grad_norm": 0.26638481808591286, "learning_rate": 3.368136070823478e-05, "loss": 0.2465, "step": 1195 }, { "epoch": 0.7606973058637084, "grad_norm": 0.3677636374426017, "learning_rate": 3.285692135956515e-05, "loss": 0.3034, "step": 1200 }, { "epoch": 0.7606973058637084, "eval_loss": 0.2539891302585602, "eval_runtime": 873.4669, "eval_samples_per_second": 4.579, "eval_steps_per_second": 0.572, "step": 1200 }, { "epoch": 0.7638668779714739, "grad_norm": 0.29762017072344943, "learning_rate": 3.2040709584551095e-05, "loss": 0.2547, "step": 1205 }, { "epoch": 0.7670364500792393, "grad_norm": 0.35066724794986226, "learning_rate": 3.123282540009139e-05, "loss": 0.3043, "step": 1210 }, { "epoch": 0.7702060221870047, "grad_norm": 0.27108651599825634, "learning_rate": 3.0433367802639112e-05, "loss": 0.2195, "step": 1215 }, { "epoch": 0.7733755942947702, "grad_norm": 0.24030479810127725, "learning_rate": 2.9642434756070793e-05, "loss": 0.2545, "step": 1220 }, { "epoch": 0.7765451664025357, "grad_norm": 0.288327556838552, "learning_rate": 2.8860123179682242e-05, "loss": 0.2942, "step": 1225 }, { "epoch": 0.7797147385103012, "grad_norm": 0.29997783643544385, "learning_rate": 2.8086528936312073e-05, "loss": 0.2407, "step": 1230 }, { "epoch": 0.7828843106180665, "grad_norm": 0.2665313932594352, "learning_rate": 2.7321746820595086e-05, "loss": 0.2863, "step": 1235 }, { "epoch": 0.786053882725832, "grad_norm": 0.24138106294481415, "learning_rate": 2.6565870547346196e-05, "loss": 0.2443, "step": 1240 }, { "epoch": 0.7892234548335975, "grad_norm": 0.27410565336257203, "learning_rate": 2.5818992740076873e-05, "loss": 0.2714, "step": 1245 }, { "epoch": 0.7923930269413629, "grad_norm": 0.3607807135248553, "learning_rate": 2.508120491964512e-05, "loss": 0.3131, "step": 1250 }, { "epoch": 0.7955625990491284, "grad_norm": 0.2752324746545014, "learning_rate": 2.435259749304096e-05, "loss": 0.2352, "step": 1255 }, { "epoch": 0.7987321711568938, "grad_norm": 0.33701412326580854, "learning_rate": 2.3633259742307844e-05, "loss": 0.3121, "step": 1260 }, { "epoch": 0.8019017432646592, "grad_norm": 0.2719696587030905, "learning_rate": 2.292327981360245e-05, "loss": 0.2569, "step": 1265 }, { "epoch": 0.8050713153724247, "grad_norm": 0.321470064394813, "learning_rate": 2.222274470639324e-05, "loss": 0.2903, "step": 1270 }, { "epoch": 0.8082408874801902, "grad_norm": 0.33376441935823614, "learning_rate": 2.1531740262800004e-05, "loss": 0.2712, "step": 1275 }, { "epoch": 0.8114104595879557, "grad_norm": 0.3559808478292093, "learning_rate": 2.0850351157074598e-05, "loss": 0.2485, "step": 1280 }, { "epoch": 0.8145800316957211, "grad_norm": 0.3006799560470683, "learning_rate": 2.017866088522541e-05, "loss": 0.2735, "step": 1285 }, { "epoch": 0.8177496038034865, "grad_norm": 0.27868991819615774, "learning_rate": 1.951675175478579e-05, "loss": 0.2479, "step": 1290 }, { "epoch": 0.820919175911252, "grad_norm": 0.30796745550467525, "learning_rate": 1.8864704874728346e-05, "loss": 0.2693, "step": 1295 }, { "epoch": 0.8240887480190174, "grad_norm": 0.327384705590186, "learning_rate": 1.822260014552587e-05, "loss": 0.2787, "step": 1300 }, { "epoch": 0.8272583201267829, "grad_norm": 0.2993843751525639, "learning_rate": 1.7590516249360754e-05, "loss": 0.2455, "step": 1305 }, { "epoch": 0.8304278922345484, "grad_norm": 0.2979918507317238, "learning_rate": 1.6968530640483127e-05, "loss": 0.2889, "step": 1310 }, { "epoch": 0.8335974643423137, "grad_norm": 0.2942240760065363, "learning_rate": 1.6356719535720056e-05, "loss": 0.2557, "step": 1315 }, { "epoch": 0.8367670364500792, "grad_norm": 0.31698805935759067, "learning_rate": 1.5755157905135843e-05, "loss": 0.2842, "step": 1320 }, { "epoch": 0.8399366085578447, "grad_norm": 0.3795639487558114, "learning_rate": 1.5163919462845622e-05, "loss": 0.2979, "step": 1325 }, { "epoch": 0.8431061806656102, "grad_norm": 0.2933950396246441, "learning_rate": 1.4583076657982297e-05, "loss": 0.2291, "step": 1330 }, { "epoch": 0.8462757527733756, "grad_norm": 0.25934135222761445, "learning_rate": 1.401270066581899e-05, "loss": 0.2981, "step": 1335 }, { "epoch": 0.849445324881141, "grad_norm": 0.2512793866151091, "learning_rate": 1.3452861379047287e-05, "loss": 0.2299, "step": 1340 }, { "epoch": 0.8526148969889065, "grad_norm": 0.27890392188122143, "learning_rate": 1.2903627399212747e-05, "loss": 0.2714, "step": 1345 }, { "epoch": 0.8557844690966719, "grad_norm": 0.3540435753559853, "learning_rate": 1.2365066028308547e-05, "loss": 0.3208, "step": 1350 }, { "epoch": 0.8589540412044374, "grad_norm": 0.3170188652169802, "learning_rate": 1.183724326052854e-05, "loss": 0.261, "step": 1355 }, { "epoch": 0.8621236133122029, "grad_norm": 0.287259110452561, "learning_rate": 1.1320223774180428e-05, "loss": 0.2918, "step": 1360 }, { "epoch": 0.8652931854199684, "grad_norm": 0.3145063929825825, "learning_rate": 1.0814070923760178e-05, "loss": 0.2562, "step": 1365 }, { "epoch": 0.8684627575277337, "grad_norm": 0.29883537499670176, "learning_rate": 1.0318846732188737e-05, "loss": 0.2585, "step": 1370 }, { "epoch": 0.8716323296354992, "grad_norm": 0.33602754178177113, "learning_rate": 9.834611883211797e-06, "loss": 0.303, "step": 1375 }, { "epoch": 0.8748019017432647, "grad_norm": 0.27917699955310804, "learning_rate": 9.361425713963878e-06, "loss": 0.2399, "step": 1380 }, { "epoch": 0.8779714738510301, "grad_norm": 0.29322424380757633, "learning_rate": 8.899346207697134e-06, "loss": 0.3192, "step": 1385 }, { "epoch": 0.8811410459587956, "grad_norm": 0.32716078301472046, "learning_rate": 8.448429986676298e-06, "loss": 0.256, "step": 1390 }, { "epoch": 0.884310618066561, "grad_norm": 0.28468261231564157, "learning_rate": 8.00873230524023e-06, "loss": 0.2864, "step": 1395 }, { "epoch": 0.8874801901743264, "grad_norm": 0.3481974787604397, "learning_rate": 7.580307043031232e-06, "loss": 0.265, "step": 1400 }, { "epoch": 0.8874801901743264, "eval_loss": 0.25099214911460876, "eval_runtime": 873.7854, "eval_samples_per_second": 4.578, "eval_steps_per_second": 0.572, "step": 1400 }, { "epoch": 0.8906497622820919, "grad_norm": 0.2756744352775957, "learning_rate": 7.163206698392744e-06, "loss": 0.2392, "step": 1405 }, { "epoch": 0.8938193343898574, "grad_norm": 0.3070714015760399, "learning_rate": 6.757482381936264e-06, "loss": 0.2722, "step": 1410 }, { "epoch": 0.8969889064976229, "grad_norm": 0.2719682030351016, "learning_rate": 6.36318381027835e-06, "loss": 0.2553, "step": 1415 }, { "epoch": 0.9001584786053882, "grad_norm": 0.30754515515844727, "learning_rate": 5.980359299948568e-06, "loss": 0.2763, "step": 1420 }, { "epoch": 0.9033280507131537, "grad_norm": 0.3599613866897873, "learning_rate": 5.609055761468707e-06, "loss": 0.2987, "step": 1425 }, { "epoch": 0.9064976228209192, "grad_norm": 0.26662442413818216, "learning_rate": 5.249318693604577e-06, "loss": 0.2632, "step": 1430 }, { "epoch": 0.9096671949286846, "grad_norm": 0.2965993748242227, "learning_rate": 4.901192177790692e-06, "loss": 0.2799, "step": 1435 }, { "epoch": 0.9128367670364501, "grad_norm": 0.2923839300339188, "learning_rate": 4.564718872728568e-06, "loss": 0.2464, "step": 1440 }, { "epoch": 0.9160063391442155, "grad_norm": 0.3004256474409844, "learning_rate": 4.2399400091594154e-06, "loss": 0.2775, "step": 1445 }, { "epoch": 0.919175911251981, "grad_norm": 0.30636844288189197, "learning_rate": 3.926895384811835e-06, "loss": 0.2917, "step": 1450 }, { "epoch": 0.9223454833597464, "grad_norm": 0.27018058178290905, "learning_rate": 3.625623359525099e-06, "loss": 0.2522, "step": 1455 }, { "epoch": 0.9255150554675119, "grad_norm": 0.3069766309513976, "learning_rate": 3.33616085054862e-06, "loss": 0.2722, "step": 1460 }, { "epoch": 0.9286846275752774, "grad_norm": 0.2673579253849767, "learning_rate": 3.0585433280180707e-06, "loss": 0.2561, "step": 1465 }, { "epoch": 0.9318541996830428, "grad_norm": 0.2688001276727079, "learning_rate": 2.792804810609173e-06, "loss": 0.2718, "step": 1470 }, { "epoch": 0.9350237717908082, "grad_norm": 0.3331860222359942, "learning_rate": 2.538977861368874e-06, "loss": 0.3163, "step": 1475 }, { "epoch": 0.9381933438985737, "grad_norm": 0.2668325932813764, "learning_rate": 2.2970935837253182e-06, "loss": 0.2393, "step": 1480 }, { "epoch": 0.9413629160063391, "grad_norm": 0.3285498156618503, "learning_rate": 2.0671816176764058e-06, "loss": 0.2862, "step": 1485 }, { "epoch": 0.9445324881141046, "grad_norm": 0.36573862269188245, "learning_rate": 1.8492701361578324e-06, "loss": 0.2447, "step": 1490 }, { "epoch": 0.9477020602218701, "grad_norm": 0.2864139944423568, "learning_rate": 1.6433858415907278e-06, "loss": 0.2777, "step": 1495 }, { "epoch": 0.9508716323296355, "grad_norm": 0.323741034773291, "learning_rate": 1.4495539626097288e-06, "loss": 0.3086, "step": 1500 }, { "epoch": 0.9540412044374009, "grad_norm": 0.2857388007026186, "learning_rate": 1.2677982509714415e-06, "loss": 0.2175, "step": 1505 }, { "epoch": 0.9572107765451664, "grad_norm": 0.2813011213045847, "learning_rate": 1.0981409786439355e-06, "loss": 0.2882, "step": 1510 }, { "epoch": 0.9603803486529319, "grad_norm": 0.27685594779071976, "learning_rate": 9.40602935077639e-07, "loss": 0.23, "step": 1515 }, { "epoch": 0.9635499207606973, "grad_norm": 0.278082958417837, "learning_rate": 7.952034246577977e-07, "loss": 0.2814, "step": 1520 }, { "epoch": 0.9667194928684627, "grad_norm": 0.332411253150925, "learning_rate": 6.619602643389899e-07, "loss": 0.2772, "step": 1525 }, { "epoch": 0.9698890649762282, "grad_norm": 0.28541188324654354, "learning_rate": 5.408897814618175e-07, "loss": 0.2456, "step": 1530 }, { "epoch": 0.9730586370839936, "grad_norm": 0.289051402982161, "learning_rate": 4.320068117522835e-07, "loss": 0.2659, "step": 1535 }, { "epoch": 0.9762282091917591, "grad_norm": 0.2896831822321737, "learning_rate": 3.35324697503725e-07, "loss": 0.2721, "step": 1540 }, { "epoch": 0.9793977812995246, "grad_norm": 0.31950029347694936, "learning_rate": 2.508552859419977e-07, "loss": 0.2622, "step": 1545 }, { "epoch": 0.9825673534072901, "grad_norm": 0.33661523682392047, "learning_rate": 1.7860892777367133e-07, "loss": 0.2731, "step": 1550 }, { "epoch": 0.9857369255150554, "grad_norm": 0.2522879758084615, "learning_rate": 1.1859447591769934e-07, "loss": 0.2291, "step": 1555 }, { "epoch": 0.9889064976228209, "grad_norm": 0.2923729662272973, "learning_rate": 7.081928442057573e-08, "loss": 0.2972, "step": 1560 }, { "epoch": 0.9920760697305864, "grad_norm": 0.24814229174923821, "learning_rate": 3.5289207555233573e-08, "loss": 0.2586, "step": 1565 }, { "epoch": 0.9952456418383518, "grad_norm": 0.24322900794711846, "learning_rate": 1.2008599103618956e-08, "loss": 0.2751, "step": 1570 }, { "epoch": 0.9984152139461173, "grad_norm": 0.4374899080765362, "learning_rate": 9.803118232398768e-10, "loss": 0.2981, "step": 1575 }, { "epoch": 0.9996830427892235, "step": 1577, "total_flos": 8013042675351552.0, "train_loss": 0.3010184336819827, "train_runtime": 18281.3669, "train_samples_per_second": 1.381, "train_steps_per_second": 0.086 } ], "logging_steps": 5, "max_steps": 1577, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8013042675351552.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }