diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,22533 +1,8005 @@ { - "epoch": 1.0, - "global_step": 15617, - "max_steps": 15617, - "logging_steps": 5, - "eval_steps": 200, - "save_steps": 0, + "epoch": 0.9991671471586905, + "global_step": 1114, + "max_steps": 1114, + "logging_steps": 1, + "eval_steps": 50, + "save_steps": 50, "train_batch_size": 8, "num_train_epochs": 1, "num_input_tokens_seen": 0, - "total_flos": 5.2748789856731136e+17, + "total_flos": 6.811715592467251e+17, "log_history": [ { - "loss": 2.6996, - "grad_norm": 2.458204984664917, - "learning_rate": 1.6005121638924457e-07, - "epoch": 0.0003201639239290517, - "step": 5 - }, - { - "loss": 2.6896, - "grad_norm": 2.7179408073425293, - "learning_rate": 3.2010243277848913e-07, - "epoch": 0.0006403278478581034, - "step": 10 - }, - { - "loss": 2.6812, - "grad_norm": 2.403977870941162, - "learning_rate": 4.801536491677337e-07, - "epoch": 0.0009604917717871551, - "step": 15 - }, - { - "loss": 2.6922, - "grad_norm": 2.4200124740600586, - "learning_rate": 6.402048655569783e-07, - "epoch": 0.0012806556957162068, - "step": 20 - }, - { - "loss": 2.6932, - "grad_norm": 2.451019048690796, - "learning_rate": 8.002560819462229e-07, - "epoch": 0.0016008196196452584, - "step": 25 - }, - { - "loss": 2.6732, - "grad_norm": 2.2547831535339355, - "learning_rate": 9.603072983354673e-07, - "epoch": 0.0019209835435743102, - "step": 30 - }, - { - "loss": 2.7105, - "grad_norm": 2.2860751152038574, - "learning_rate": 1.120358514724712e-06, - "epoch": 0.0022411474675033617, - "step": 35 - }, - { - "loss": 2.6946, - "grad_norm": 2.1596930027008057, - "learning_rate": 1.2804097311139565e-06, - "epoch": 0.0025613113914324135, - "step": 40 - }, - { - "loss": 2.697, - "grad_norm": 2.220602512359619, - "learning_rate": 1.4404609475032012e-06, - "epoch": 0.002881475315361465, - "step": 45 - }, - { - "loss": 2.6972, - "grad_norm": 2.0607404708862305, - "learning_rate": 1.6005121638924457e-06, - "epoch": 0.0032016392392905167, - "step": 50 - }, - { - "loss": 2.6861, - "grad_norm": 2.138319492340088, - "learning_rate": 1.7605633802816904e-06, - "epoch": 0.0035218031632195685, - "step": 55 - }, - { - "loss": 2.6478, - "grad_norm": 2.244121789932251, - "learning_rate": 1.9206145966709347e-06, - "epoch": 0.0038419670871486203, - "step": 60 - }, - { - "loss": 2.6676, - "grad_norm": 2.0738816261291504, - "learning_rate": 2.0806658130601794e-06, - "epoch": 0.004162131011077672, - "step": 65 - }, - { - "loss": 2.6839, - "grad_norm": 2.0179197788238525, - "learning_rate": 2.240717029449424e-06, - "epoch": 0.0044822949350067235, - "step": 70 - }, - { - "loss": 2.6514, - "grad_norm": 1.970077633857727, - "learning_rate": 2.4007682458386688e-06, - "epoch": 0.004802458858935775, - "step": 75 - }, - { - "loss": 2.6685, - "grad_norm": 1.9740138053894043, - "learning_rate": 2.560819462227913e-06, - "epoch": 0.005122622782864827, - "step": 80 - }, - { - "loss": 2.6835, - "grad_norm": 2.0440573692321777, - "learning_rate": 2.7208706786171577e-06, - "epoch": 0.0054427867067938784, - "step": 85 - }, - { - "loss": 2.6463, - "grad_norm": 1.9804933071136475, - "learning_rate": 2.8809218950064024e-06, - "epoch": 0.00576295063072293, - "step": 90 - }, - { - "loss": 2.6683, - "grad_norm": 1.9438477754592896, - "learning_rate": 3.0409731113956467e-06, - "epoch": 0.006083114554651982, - "step": 95 - }, - { - "loss": 2.6281, - "grad_norm": 2.0071446895599365, - "learning_rate": 3.2010243277848914e-06, - "epoch": 0.006403278478581033, - "step": 100 - }, - { - "loss": 2.6636, - "grad_norm": 1.9696836471557617, - "learning_rate": 3.361075544174136e-06, - "epoch": 0.006723442402510085, - "step": 105 - }, - { - "loss": 2.6637, - "grad_norm": 2.0100696086883545, - "learning_rate": 3.521126760563381e-06, - "epoch": 0.007043606326439137, - "step": 110 - }, - { - "loss": 2.6533, - "grad_norm": 1.9581183195114136, - "learning_rate": 3.681177976952625e-06, - "epoch": 0.007363770250368188, - "step": 115 - }, - { - "loss": 2.6481, - "grad_norm": 1.921207070350647, - "learning_rate": 3.841229193341869e-06, - "epoch": 0.007683934174297241, - "step": 120 - }, - { - "loss": 2.6398, - "grad_norm": 1.9562214612960815, - "learning_rate": 4.001280409731114e-06, - "epoch": 0.008004098098226291, - "step": 125 - }, - { - "loss": 2.6512, - "grad_norm": 2.040691614151001, - "learning_rate": 4.161331626120359e-06, - "epoch": 0.008324262022155344, - "step": 130 - }, - { - "loss": 2.6453, - "grad_norm": 2.003002643585205, - "learning_rate": 4.321382842509603e-06, - "epoch": 0.008644425946084396, - "step": 135 - }, - { - "loss": 2.6442, - "grad_norm": 1.9360718727111816, - "learning_rate": 4.481434058898848e-06, - "epoch": 0.008964589870013447, - "step": 140 - }, - { - "loss": 2.6393, - "grad_norm": 2.054797410964966, - "learning_rate": 4.641485275288092e-06, - "epoch": 0.009284753793942498, - "step": 145 - }, - { - "loss": 2.6241, - "grad_norm": 1.9937611818313599, - "learning_rate": 4.8015364916773375e-06, - "epoch": 0.00960491771787155, - "step": 150 - }, - { - "loss": 2.6592, - "grad_norm": 2.0823230743408203, - "learning_rate": 4.961587708066581e-06, - "epoch": 0.009925081641800603, - "step": 155 - }, - { - "loss": 2.6511, - "grad_norm": 2.1629271507263184, - "learning_rate": 5.121638924455826e-06, - "epoch": 0.010245245565729654, - "step": 160 - }, - { - "loss": 2.6291, - "grad_norm": 1.968585729598999, - "learning_rate": 5.28169014084507e-06, - "epoch": 0.010565409489658706, - "step": 165 - }, - { - "loss": 2.632, - "grad_norm": 2.0315310955047607, - "learning_rate": 5.4417413572343155e-06, - "epoch": 0.010885573413587757, - "step": 170 - }, - { - "loss": 2.6488, - "grad_norm": 2.0342774391174316, - "learning_rate": 5.60179257362356e-06, - "epoch": 0.011205737337516808, - "step": 175 - }, - { - "loss": 2.6346, - "grad_norm": 1.9805340766906738, - "learning_rate": 5.761843790012805e-06, - "epoch": 0.01152590126144586, - "step": 180 - }, - { - "loss": 2.6129, - "grad_norm": 2.0554709434509277, - "learning_rate": 5.921895006402049e-06, - "epoch": 0.011846065185374913, - "step": 185 - }, - { - "loss": 2.5925, - "grad_norm": 1.9576430320739746, - "learning_rate": 6.0819462227912934e-06, - "epoch": 0.012166229109303964, - "step": 190 - }, - { - "loss": 2.6392, - "grad_norm": 2.046264410018921, - "learning_rate": 6.241997439180538e-06, - "epoch": 0.012486393033233015, - "step": 195 - }, - { - "loss": 2.6122, - "grad_norm": 1.9459033012390137, - "learning_rate": 6.402048655569783e-06, - "epoch": 0.012806556957162067, - "step": 200 - }, - { - "eval_loss": 2.469697952270508, - "eval_runtime": 13.9916, - "eval_samples_per_second": 146.373, - "eval_steps_per_second": 18.297, - "epoch": 0.012806556957162067, - "step": 200 - }, - { - "loss": 2.6529, - "grad_norm": 1.9425408840179443, - "learning_rate": 6.562099871959026e-06, - "epoch": 0.013126720881091118, - "step": 205 - }, - { - "loss": 2.624, - "grad_norm": 1.9245175123214722, - "learning_rate": 6.722151088348272e-06, - "epoch": 0.01344688480502017, - "step": 210 - }, - { - "loss": 2.6275, - "grad_norm": 2.075512647628784, - "learning_rate": 6.882202304737516e-06, - "epoch": 0.013767048728949223, - "step": 215 - }, - { - "loss": 2.6339, - "grad_norm": 2.0762271881103516, - "learning_rate": 7.042253521126762e-06, - "epoch": 0.014087212652878274, - "step": 220 - }, - { - "loss": 2.615, - "grad_norm": 1.9588446617126465, - "learning_rate": 7.202304737516005e-06, - "epoch": 0.014407376576807325, - "step": 225 - }, - { - "loss": 2.615, - "grad_norm": 2.0142860412597656, - "learning_rate": 7.36235595390525e-06, - "epoch": 0.014727540500736377, - "step": 230 - }, - { - "loss": 2.6257, - "grad_norm": 2.0105693340301514, - "learning_rate": 7.5224071702944944e-06, - "epoch": 0.015047704424665428, - "step": 235 - }, - { - "loss": 2.6421, - "grad_norm": 2.0586354732513428, - "learning_rate": 7.682458386683739e-06, - "epoch": 0.015367868348594481, - "step": 240 - }, - { - "loss": 2.6181, - "grad_norm": 1.9821962118148804, - "learning_rate": 7.842509603072984e-06, - "epoch": 0.015688032272523533, - "step": 245 - }, - { - "loss": 2.593, - "grad_norm": 2.0167949199676514, - "learning_rate": 8.002560819462227e-06, - "epoch": 0.016008196196452582, - "step": 250 - }, - { - "loss": 2.6102, - "grad_norm": 2.1412761211395264, - "learning_rate": 8.162612035851472e-06, - "epoch": 0.016328360120381635, - "step": 255 - }, - { - "loss": 2.6243, - "grad_norm": 1.926281213760376, - "learning_rate": 8.322663252240718e-06, - "epoch": 0.01664852404431069, - "step": 260 - }, - { - "loss": 2.6286, - "grad_norm": 1.9839813709259033, - "learning_rate": 8.482714468629963e-06, - "epoch": 0.016968687968239738, - "step": 265 - }, - { - "loss": 2.5883, - "grad_norm": 2.0237507820129395, - "learning_rate": 8.642765685019206e-06, - "epoch": 0.01728885189216879, - "step": 270 - }, - { - "loss": 2.6162, - "grad_norm": 1.937261939048767, - "learning_rate": 8.802816901408451e-06, - "epoch": 0.01760901581609784, - "step": 275 - }, - { - "loss": 2.6302, - "grad_norm": 2.0588278770446777, - "learning_rate": 8.962868117797696e-06, - "epoch": 0.017929179740026894, - "step": 280 - }, - { - "loss": 2.6054, - "grad_norm": 2.097682476043701, - "learning_rate": 9.12291933418694e-06, - "epoch": 0.018249343663955947, - "step": 285 - }, - { - "loss": 2.6023, - "grad_norm": 2.114814043045044, - "learning_rate": 9.282970550576185e-06, - "epoch": 0.018569507587884997, - "step": 290 - }, - { - "loss": 2.5885, - "grad_norm": 1.9583766460418701, - "learning_rate": 9.44302176696543e-06, - "epoch": 0.01888967151181405, - "step": 295 - }, - { - "loss": 2.6237, - "grad_norm": 2.047853708267212, - "learning_rate": 9.603072983354675e-06, - "epoch": 0.0192098354357431, - "step": 300 - }, - { - "loss": 2.5634, - "grad_norm": 2.0519795417785645, - "learning_rate": 9.763124199743919e-06, - "epoch": 0.019529999359672152, - "step": 305 - }, - { - "loss": 2.624, - "grad_norm": 2.057596445083618, - "learning_rate": 9.923175416133162e-06, - "epoch": 0.019850163283601206, - "step": 310 - }, - { - "loss": 2.6016, - "grad_norm": 2.060136079788208, - "learning_rate": 1.0083226632522407e-05, - "epoch": 0.020170327207530255, - "step": 315 - }, - { - "loss": 2.6114, - "grad_norm": 2.0268754959106445, - "learning_rate": 1.0243277848911652e-05, - "epoch": 0.02049049113145931, - "step": 320 - }, - { - "loss": 2.5783, - "grad_norm": 2.0680699348449707, - "learning_rate": 1.0403329065300897e-05, - "epoch": 0.020810655055388358, - "step": 325 - }, - { - "loss": 2.5616, - "grad_norm": 1.9991204738616943, - "learning_rate": 1.056338028169014e-05, - "epoch": 0.02113081897931741, - "step": 330 - }, - { - "loss": 2.6128, - "grad_norm": 2.09833025932312, - "learning_rate": 1.0723431498079386e-05, - "epoch": 0.02145098290324646, - "step": 335 - }, - { - "loss": 2.6025, - "grad_norm": 1.9758498668670654, - "learning_rate": 1.0883482714468631e-05, - "epoch": 0.021771146827175514, - "step": 340 - }, - { - "loss": 2.5805, - "grad_norm": 2.0436413288116455, - "learning_rate": 1.1043533930857874e-05, - "epoch": 0.022091310751104567, - "step": 345 - }, - { - "loss": 2.5944, - "grad_norm": 2.0317132472991943, - "learning_rate": 1.120358514724712e-05, - "epoch": 0.022411474675033616, - "step": 350 - }, - { - "loss": 2.5963, - "grad_norm": 2.199904680252075, - "learning_rate": 1.1363636363636365e-05, - "epoch": 0.02273163859896267, - "step": 355 - }, - { - "loss": 2.5799, - "grad_norm": 1.9787744283676147, - "learning_rate": 1.152368758002561e-05, - "epoch": 0.02305180252289172, - "step": 360 - }, - { - "loss": 2.6212, - "grad_norm": 1.890915870666504, - "learning_rate": 1.1683738796414853e-05, - "epoch": 0.023371966446820772, - "step": 365 - }, - { - "loss": 2.5704, - "grad_norm": 2.013899087905884, - "learning_rate": 1.1843790012804098e-05, - "epoch": 0.023692130370749825, - "step": 370 - }, - { - "loss": 2.6101, - "grad_norm": 2.0689878463745117, - "learning_rate": 1.2003841229193342e-05, - "epoch": 0.024012294294678875, - "step": 375 - }, - { - "loss": 2.599, - "grad_norm": 2.028207540512085, - "learning_rate": 1.2163892445582587e-05, - "epoch": 0.024332458218607928, - "step": 380 - }, - { - "loss": 2.6125, - "grad_norm": 1.9936445951461792, - "learning_rate": 1.2323943661971832e-05, - "epoch": 0.024652622142536978, - "step": 385 - }, - { - "loss": 2.5983, - "grad_norm": 2.112257719039917, - "learning_rate": 1.2483994878361075e-05, - "epoch": 0.02497278606646603, - "step": 390 - }, - { - "loss": 2.6074, - "grad_norm": 2.079145669937134, - "learning_rate": 1.264404609475032e-05, - "epoch": 0.025292949990395084, - "step": 395 - }, - { - "loss": 2.5697, - "grad_norm": 2.074415922164917, - "learning_rate": 1.2804097311139566e-05, - "epoch": 0.025613113914324134, - "step": 400 - }, - { - "eval_loss": 2.420924663543701, - "eval_runtime": 14.9165, - "eval_samples_per_second": 137.297, - "eval_steps_per_second": 17.162, - "epoch": 0.025613113914324134, - "step": 400 - }, - { - "loss": 2.5931, - "grad_norm": 1.9997014999389648, - "learning_rate": 1.296414852752881e-05, - "epoch": 0.025933277838253187, - "step": 405 - }, - { - "loss": 2.5684, - "grad_norm": 2.0531606674194336, - "learning_rate": 1.3124199743918053e-05, - "epoch": 0.026253441762182236, - "step": 410 - }, - { - "loss": 2.5935, - "grad_norm": 2.0724613666534424, - "learning_rate": 1.3284250960307298e-05, - "epoch": 0.02657360568611129, - "step": 415 - }, - { - "loss": 2.5658, - "grad_norm": 2.071101188659668, - "learning_rate": 1.3444302176696544e-05, - "epoch": 0.02689376961004034, - "step": 420 - }, - { - "loss": 2.5874, - "grad_norm": 1.9947307109832764, - "learning_rate": 1.360435339308579e-05, - "epoch": 0.027213933533969392, - "step": 425 - }, - { - "loss": 2.5713, - "grad_norm": 2.066715717315674, - "learning_rate": 1.3764404609475031e-05, - "epoch": 0.027534097457898445, - "step": 430 - }, - { - "loss": 2.5831, - "grad_norm": 2.0885956287384033, - "learning_rate": 1.3924455825864276e-05, - "epoch": 0.027854261381827495, - "step": 435 - }, - { - "loss": 2.5972, - "grad_norm": 2.0391199588775635, - "learning_rate": 1.4084507042253523e-05, - "epoch": 0.028174425305756548, - "step": 440 - }, - { - "loss": 2.5874, - "grad_norm": 1.9865524768829346, - "learning_rate": 1.4244558258642765e-05, - "epoch": 0.028494589229685598, - "step": 445 - }, - { - "loss": 2.5465, - "grad_norm": 2.0678815841674805, - "learning_rate": 1.440460947503201e-05, - "epoch": 0.02881475315361465, - "step": 450 - }, - { - "loss": 2.5946, - "grad_norm": 2.057670831680298, - "learning_rate": 1.4564660691421255e-05, - "epoch": 0.029134917077543704, - "step": 455 - }, - { - "loss": 2.6092, - "grad_norm": 2.101698875427246, - "learning_rate": 1.47247119078105e-05, - "epoch": 0.029455081001472753, - "step": 460 - }, - { - "loss": 2.5654, - "grad_norm": 2.0881927013397217, - "learning_rate": 1.4884763124199744e-05, - "epoch": 0.029775244925401807, - "step": 465 - }, - { - "loss": 2.5596, - "grad_norm": 2.091878890991211, - "learning_rate": 1.5044814340588989e-05, - "epoch": 0.030095408849330856, - "step": 470 - }, - { - "loss": 2.5909, - "grad_norm": 2.0245919227600098, - "learning_rate": 1.5204865556978234e-05, - "epoch": 0.03041557277325991, - "step": 475 - }, - { - "loss": 2.5902, - "grad_norm": 2.0498111248016357, - "learning_rate": 1.5364916773367477e-05, - "epoch": 0.030735736697188962, - "step": 480 - }, - { - "loss": 2.5611, - "grad_norm": 2.0705480575561523, - "learning_rate": 1.5524967989756723e-05, - "epoch": 0.031055900621118012, - "step": 485 - }, - { - "loss": 2.5305, - "grad_norm": 2.0246481895446777, - "learning_rate": 1.5685019206145968e-05, - "epoch": 0.031376064545047065, - "step": 490 - }, - { - "loss": 2.5752, - "grad_norm": 1.9090466499328613, - "learning_rate": 1.5845070422535213e-05, - "epoch": 0.031696228468976115, - "step": 495 - }, - { - "loss": 2.5908, - "grad_norm": 2.044546604156494, - "learning_rate": 1.6005121638924455e-05, - "epoch": 0.032016392392905164, - "step": 500 - }, - { - "loss": 2.5695, - "grad_norm": 2.096444845199585, - "learning_rate": 1.61651728553137e-05, - "epoch": 0.03233655631683422, - "step": 505 - }, - { - "loss": 2.5942, - "grad_norm": 2.1051011085510254, - "learning_rate": 1.6325224071702945e-05, - "epoch": 0.03265672024076327, - "step": 510 - }, - { - "loss": 2.5629, - "grad_norm": 1.9321959018707275, - "learning_rate": 1.648527528809219e-05, - "epoch": 0.03297688416469232, - "step": 515 - }, - { - "loss": 2.5745, - "grad_norm": 2.038756847381592, - "learning_rate": 1.6645326504481435e-05, - "epoch": 0.03329704808862138, - "step": 520 - }, - { - "loss": 2.5618, - "grad_norm": 2.018385410308838, - "learning_rate": 1.680537772087068e-05, - "epoch": 0.033617212012550426, - "step": 525 - }, - { - "loss": 2.5884, - "grad_norm": 2.459459066390991, - "learning_rate": 1.6965428937259925e-05, - "epoch": 0.033937375936479476, - "step": 530 - }, - { - "loss": 2.5639, - "grad_norm": 1.96848726272583, - "learning_rate": 1.7125480153649167e-05, - "epoch": 0.034257539860408526, - "step": 535 - }, - { - "loss": 2.56, - "grad_norm": 2.0582144260406494, - "learning_rate": 1.7285531370038412e-05, - "epoch": 0.03457770378433758, - "step": 540 - }, - { - "loss": 2.5354, - "grad_norm": 2.1106033325195312, - "learning_rate": 1.7445582586427657e-05, - "epoch": 0.03489786770826663, - "step": 545 - }, - { - "loss": 2.5768, - "grad_norm": 2.137942314147949, - "learning_rate": 1.7605633802816902e-05, - "epoch": 0.03521803163219568, - "step": 550 - }, - { - "loss": 2.5491, - "grad_norm": 2.1352152824401855, - "learning_rate": 1.7765685019206147e-05, - "epoch": 0.03553819555612474, - "step": 555 - }, - { - "loss": 2.5545, - "grad_norm": 2.0633294582366943, - "learning_rate": 1.7925736235595393e-05, - "epoch": 0.03585835948005379, - "step": 560 - }, - { - "loss": 2.5579, - "grad_norm": 2.02809476852417, - "learning_rate": 1.8085787451984638e-05, - "epoch": 0.03617852340398284, - "step": 565 - }, - { - "loss": 2.5585, - "grad_norm": 2.1206002235412598, - "learning_rate": 1.824583866837388e-05, - "epoch": 0.036498687327911894, - "step": 570 - }, - { - "loss": 2.5356, - "grad_norm": 2.072930335998535, - "learning_rate": 1.8405889884763125e-05, - "epoch": 0.036818851251840944, - "step": 575 - }, - { - "loss": 2.5238, - "grad_norm": 1.958531379699707, - "learning_rate": 1.856594110115237e-05, - "epoch": 0.03713901517576999, - "step": 580 - }, - { - "loss": 2.5486, - "grad_norm": 2.0069925785064697, - "learning_rate": 1.872599231754161e-05, - "epoch": 0.03745917909969904, - "step": 585 - }, - { - "loss": 2.5567, - "grad_norm": 2.0546441078186035, - "learning_rate": 1.888604353393086e-05, - "epoch": 0.0377793430236281, - "step": 590 - }, - { - "loss": 2.5517, - "grad_norm": 2.076535940170288, - "learning_rate": 1.9046094750320105e-05, - "epoch": 0.03809950694755715, - "step": 595 - }, - { - "loss": 2.5513, - "grad_norm": 2.17091703414917, - "learning_rate": 1.920614596670935e-05, - "epoch": 0.0384196708714862, - "step": 600 - }, - { - "eval_loss": 2.400339126586914, - "eval_runtime": 10.2049, - "eval_samples_per_second": 200.687, - "eval_steps_per_second": 25.086, - "epoch": 0.0384196708714862, - "step": 600 - }, - { - "loss": 2.5444, - "grad_norm": 1.9744805097579956, - "learning_rate": 1.9366197183098592e-05, - "epoch": 0.038739834795415255, - "step": 605 - }, - { - "loss": 2.5378, - "grad_norm": 1.9619749784469604, - "learning_rate": 1.9526248399487837e-05, - "epoch": 0.039059998719344305, - "step": 610 - }, - { - "loss": 2.5362, - "grad_norm": 2.023552656173706, - "learning_rate": 1.9686299615877082e-05, - "epoch": 0.039380162643273355, - "step": 615 - }, - { - "loss": 2.563, - "grad_norm": 2.148352861404419, - "learning_rate": 1.9846350832266324e-05, - "epoch": 0.03970032656720241, - "step": 620 - }, - { - "loss": 2.5317, - "grad_norm": 2.052964925765991, - "learning_rate": 2.000640204865557e-05, - "epoch": 0.04002049049113146, - "step": 625 - }, - { - "loss": 2.5463, - "grad_norm": 2.0874581336975098, - "learning_rate": 2.0166453265044814e-05, - "epoch": 0.04034065441506051, - "step": 630 - }, - { - "loss": 2.5432, - "grad_norm": 2.1276135444641113, - "learning_rate": 2.0326504481434063e-05, - "epoch": 0.04066081833898956, - "step": 635 - }, - { - "loss": 2.5441, - "grad_norm": 2.120331048965454, - "learning_rate": 2.0486555697823304e-05, - "epoch": 0.04098098226291862, - "step": 640 - }, - { - "loss": 2.5594, - "grad_norm": 1.9808118343353271, - "learning_rate": 2.064660691421255e-05, - "epoch": 0.041301146186847666, - "step": 645 - }, - { - "loss": 2.5496, - "grad_norm": 2.133842706680298, - "learning_rate": 2.0806658130601795e-05, - "epoch": 0.041621310110776716, - "step": 650 - }, - { - "loss": 2.5293, - "grad_norm": 2.060401439666748, - "learning_rate": 2.0966709346991036e-05, - "epoch": 0.04194147403470577, - "step": 655 - }, - { - "loss": 2.5548, - "grad_norm": 2.127145767211914, - "learning_rate": 2.112676056338028e-05, - "epoch": 0.04226163795863482, - "step": 660 - }, - { - "loss": 2.5335, - "grad_norm": 2.0574846267700195, - "learning_rate": 2.1286811779769527e-05, - "epoch": 0.04258180188256387, - "step": 665 - }, - { - "loss": 2.569, - "grad_norm": 2.057927131652832, - "learning_rate": 2.1446862996158772e-05, - "epoch": 0.04290196580649292, - "step": 670 - }, - { - "loss": 2.5401, - "grad_norm": 2.064457654953003, - "learning_rate": 2.1606914212548017e-05, - "epoch": 0.04322212973042198, - "step": 675 - }, - { - "loss": 2.5279, - "grad_norm": 2.1458826065063477, - "learning_rate": 2.1766965428937262e-05, - "epoch": 0.04354229365435103, - "step": 680 - }, - { - "loss": 2.5387, - "grad_norm": 1.951903223991394, - "learning_rate": 2.1927016645326507e-05, - "epoch": 0.04386245757828008, - "step": 685 - }, - { - "loss": 2.4947, - "grad_norm": 2.019552707672119, - "learning_rate": 2.208706786171575e-05, - "epoch": 0.044182621502209134, - "step": 690 - }, - { - "loss": 2.5334, - "grad_norm": 2.1926770210266113, - "learning_rate": 2.2247119078104994e-05, - "epoch": 0.04450278542613818, - "step": 695 - }, - { - "loss": 2.5333, - "grad_norm": 2.108576774597168, - "learning_rate": 2.240717029449424e-05, - "epoch": 0.04482294935006723, - "step": 700 - }, - { - "loss": 2.5361, - "grad_norm": 2.0758907794952393, - "learning_rate": 2.2567221510883484e-05, - "epoch": 0.04514311327399629, - "step": 705 - }, - { - "loss": 2.5382, - "grad_norm": 2.120901107788086, - "learning_rate": 2.272727272727273e-05, - "epoch": 0.04546327719792534, - "step": 710 - }, - { - "loss": 2.5182, - "grad_norm": 2.098022699356079, - "learning_rate": 2.2887323943661974e-05, - "epoch": 0.04578344112185439, - "step": 715 - }, - { - "loss": 2.5062, - "grad_norm": 2.1501033306121826, - "learning_rate": 2.304737516005122e-05, - "epoch": 0.04610360504578344, - "step": 720 - }, - { - "loss": 2.5476, - "grad_norm": 2.16194748878479, - "learning_rate": 2.320742637644046e-05, - "epoch": 0.046423768969712495, - "step": 725 - }, - { - "loss": 2.5253, - "grad_norm": 2.200193405151367, - "learning_rate": 2.3367477592829706e-05, - "epoch": 0.046743932893641545, - "step": 730 - }, - { - "loss": 2.5261, - "grad_norm": 2.143402099609375, - "learning_rate": 2.352752880921895e-05, - "epoch": 0.047064096817570594, - "step": 735 - }, - { - "loss": 2.5514, - "grad_norm": 2.0588419437408447, - "learning_rate": 2.3687580025608197e-05, - "epoch": 0.04738426074149965, - "step": 740 - }, - { - "loss": 2.5248, - "grad_norm": 2.039383888244629, - "learning_rate": 2.384763124199744e-05, - "epoch": 0.0477044246654287, - "step": 745 - }, - { - "loss": 2.5464, - "grad_norm": 2.1919689178466797, - "learning_rate": 2.4007682458386683e-05, - "epoch": 0.04802458858935775, - "step": 750 - }, - { - "loss": 2.5142, - "grad_norm": 2.0702781677246094, - "learning_rate": 2.4167733674775932e-05, - "epoch": 0.0483447525132868, - "step": 755 - }, - { - "loss": 2.5399, - "grad_norm": 2.0351169109344482, - "learning_rate": 2.4327784891165174e-05, - "epoch": 0.048664916437215856, - "step": 760 - }, - { - "loss": 2.5392, - "grad_norm": 2.276874542236328, - "learning_rate": 2.448783610755442e-05, - "epoch": 0.048985080361144906, - "step": 765 - }, - { - "loss": 2.5221, - "grad_norm": 2.1543283462524414, - "learning_rate": 2.4647887323943664e-05, - "epoch": 0.049305244285073956, - "step": 770 - }, - { - "loss": 2.5136, - "grad_norm": 2.0453758239746094, - "learning_rate": 2.480793854033291e-05, - "epoch": 0.04962540820900301, - "step": 775 - }, - { - "loss": 2.5164, - "grad_norm": 2.0810751914978027, - "learning_rate": 2.496798975672215e-05, - "epoch": 0.04994557213293206, - "step": 780 - }, - { - "loss": 2.5324, - "grad_norm": 2.0781326293945312, - "learning_rate": 2.5128040973111393e-05, - "epoch": 0.05026573605686111, - "step": 785 - }, - { - "loss": 2.4946, - "grad_norm": 1.9677348136901855, - "learning_rate": 2.528809218950064e-05, - "epoch": 0.05058589998079017, - "step": 790 - }, - { - "loss": 2.5258, - "grad_norm": 2.114290475845337, - "learning_rate": 2.5448143405889886e-05, - "epoch": 0.05090606390471922, - "step": 795 - }, - { - "loss": 2.5322, - "grad_norm": 2.0765063762664795, - "learning_rate": 2.560819462227913e-05, - "epoch": 0.05122622782864827, - "step": 800 - }, - { - "eval_loss": 2.371161460876465, - "eval_runtime": 12.4134, - "eval_samples_per_second": 164.983, - "eval_steps_per_second": 20.623, - "epoch": 0.05122622782864827, - "step": 800 - }, - { - "loss": 2.5383, - "grad_norm": 2.069668769836426, - "learning_rate": 2.5768245838668376e-05, - "epoch": 0.05154639175257732, - "step": 805 - }, - { - "loss": 2.5004, - "grad_norm": 2.1110806465148926, - "learning_rate": 2.592829705505762e-05, - "epoch": 0.05186655567650637, - "step": 810 - }, - { - "loss": 2.5093, - "grad_norm": 2.0620675086975098, - "learning_rate": 2.6088348271446867e-05, - "epoch": 0.05218671960043542, - "step": 815 - }, - { - "loss": 2.4999, - "grad_norm": 1.9995859861373901, - "learning_rate": 2.6248399487836105e-05, - "epoch": 0.05250688352436447, - "step": 820 - }, - { - "loss": 2.5001, - "grad_norm": 2.050431489944458, - "learning_rate": 2.640845070422535e-05, - "epoch": 0.05282704744829353, - "step": 825 - }, - { - "loss": 2.5435, - "grad_norm": 2.142716646194458, - "learning_rate": 2.6568501920614595e-05, - "epoch": 0.05314721137222258, - "step": 830 - }, - { - "loss": 2.5513, - "grad_norm": 2.053705930709839, - "learning_rate": 2.6728553137003844e-05, - "epoch": 0.05346737529615163, - "step": 835 - }, - { - "loss": 2.5534, - "grad_norm": 1.9456514120101929, - "learning_rate": 2.688860435339309e-05, - "epoch": 0.05378753922008068, - "step": 840 - }, - { - "loss": 2.5351, - "grad_norm": 2.099213123321533, - "learning_rate": 2.7048655569782334e-05, - "epoch": 0.054107703144009735, - "step": 845 - }, - { - "loss": 2.5077, - "grad_norm": 2.0750980377197266, - "learning_rate": 2.720870678617158e-05, - "epoch": 0.054427867067938784, - "step": 850 - }, - { - "loss": 2.4956, - "grad_norm": 2.1118557453155518, - "learning_rate": 2.7368758002560817e-05, - "epoch": 0.054748030991867834, - "step": 855 - }, - { - "loss": 2.5411, - "grad_norm": 2.021570920944214, - "learning_rate": 2.7528809218950063e-05, - "epoch": 0.05506819491579689, - "step": 860 - }, - { - "loss": 2.5294, - "grad_norm": 2.095647096633911, - "learning_rate": 2.7688860435339308e-05, - "epoch": 0.05538835883972594, - "step": 865 - }, - { - "loss": 2.5224, - "grad_norm": 2.172281503677368, - "learning_rate": 2.7848911651728553e-05, - "epoch": 0.05570852276365499, - "step": 870 - }, - { - "loss": 2.51, - "grad_norm": 2.0782620906829834, - "learning_rate": 2.8008962868117798e-05, - "epoch": 0.056028686687584046, - "step": 875 - }, - { - "loss": 2.5158, - "grad_norm": 2.1138572692871094, - "learning_rate": 2.8169014084507046e-05, - "epoch": 0.056348850611513096, - "step": 880 - }, - { - "loss": 2.5186, - "grad_norm": 1.9800282716751099, - "learning_rate": 2.832906530089629e-05, - "epoch": 0.056669014535442146, - "step": 885 - }, - { - "loss": 2.4717, - "grad_norm": 2.043164014816284, - "learning_rate": 2.848911651728553e-05, - "epoch": 0.056989178459371195, - "step": 890 - }, - { - "loss": 2.5061, - "grad_norm": 2.0079712867736816, - "learning_rate": 2.8649167733674775e-05, - "epoch": 0.05730934238330025, - "step": 895 - }, - { - "loss": 2.4861, - "grad_norm": 2.0949175357818604, - "learning_rate": 2.880921895006402e-05, - "epoch": 0.0576295063072293, - "step": 900 - }, - { - "loss": 2.5082, - "grad_norm": 2.1614928245544434, - "learning_rate": 2.8969270166453265e-05, - "epoch": 0.05794967023115835, - "step": 905 - }, - { - "loss": 2.5003, - "grad_norm": 2.0658435821533203, - "learning_rate": 2.912932138284251e-05, - "epoch": 0.05826983415508741, - "step": 910 - }, - { - "loss": 2.5126, - "grad_norm": 2.056453227996826, - "learning_rate": 2.9289372599231756e-05, - "epoch": 0.05858999807901646, - "step": 915 - }, - { - "loss": 2.5079, - "grad_norm": 2.0968542098999023, - "learning_rate": 2.9449423815621e-05, - "epoch": 0.05891016200294551, - "step": 920 - }, - { - "loss": 2.5189, - "grad_norm": 2.0200068950653076, - "learning_rate": 2.9609475032010242e-05, - "epoch": 0.05923032592687456, - "step": 925 - }, - { - "loss": 2.531, - "grad_norm": 2.081430435180664, - "learning_rate": 2.9769526248399488e-05, - "epoch": 0.05955048985080361, - "step": 930 - }, - { - "loss": 2.4866, - "grad_norm": 2.0429458618164062, - "learning_rate": 2.9929577464788733e-05, - "epoch": 0.05987065377473266, - "step": 935 - }, - { - "loss": 2.51, - "grad_norm": 2.0854263305664062, - "learning_rate": 3.0089628681177978e-05, - "epoch": 0.06019081769866171, - "step": 940 - }, - { - "loss": 2.5245, - "grad_norm": 2.192448854446411, - "learning_rate": 3.0249679897567223e-05, - "epoch": 0.06051098162259077, - "step": 945 - }, - { - "loss": 2.504, - "grad_norm": 2.0920021533966064, - "learning_rate": 3.0409731113956468e-05, - "epoch": 0.06083114554651982, - "step": 950 - }, - { - "loss": 2.4903, - "grad_norm": 2.0515296459198, - "learning_rate": 3.056978233034571e-05, - "epoch": 0.06115130947044887, - "step": 955 - }, - { - "loss": 2.5079, - "grad_norm": 2.128044605255127, - "learning_rate": 3.0729833546734955e-05, - "epoch": 0.061471473394377925, - "step": 960 - }, - { - "loss": 2.4882, - "grad_norm": 2.0168440341949463, - "learning_rate": 3.0889884763124197e-05, - "epoch": 0.061791637318306974, - "step": 965 - }, - { - "loss": 2.5079, - "grad_norm": 2.090272903442383, - "learning_rate": 3.1049935979513445e-05, - "epoch": 0.062111801242236024, - "step": 970 - }, - { - "loss": 2.5153, - "grad_norm": 2.0438127517700195, - "learning_rate": 3.120998719590269e-05, - "epoch": 0.062431965166165074, - "step": 975 - }, - { - "loss": 2.4814, - "grad_norm": 2.051866292953491, - "learning_rate": 3.1370038412291935e-05, - "epoch": 0.06275212909009413, - "step": 980 - }, - { - "loss": 2.5105, - "grad_norm": 2.1288902759552, - "learning_rate": 3.1530089628681184e-05, - "epoch": 0.06307229301402317, - "step": 985 - }, - { - "loss": 2.5006, - "grad_norm": 2.1590287685394287, - "learning_rate": 3.1690140845070426e-05, - "epoch": 0.06339245693795223, - "step": 990 - }, - { - "loss": 2.5005, - "grad_norm": 2.182297945022583, - "learning_rate": 3.185019206145967e-05, - "epoch": 0.06371262086188129, - "step": 995 - }, - { - "loss": 2.5104, - "grad_norm": 1.9559736251831055, - "learning_rate": 3.201024327784891e-05, - "epoch": 0.06403278478581033, - "step": 1000 - }, - { - "eval_loss": 2.3531105518341064, - "eval_runtime": 14.4683, - "eval_samples_per_second": 141.551, - "eval_steps_per_second": 17.694, - "epoch": 0.06403278478581033, - "step": 1000 - }, - { - "loss": 2.4771, - "grad_norm": 2.445621967315674, - "learning_rate": 3.217029449423816e-05, - "epoch": 0.06435294870973939, - "step": 1005 - }, - { - "loss": 2.4956, - "grad_norm": 2.1934397220611572, - "learning_rate": 3.23303457106274e-05, - "epoch": 0.06467311263366844, - "step": 1010 - }, - { - "loss": 2.4993, - "grad_norm": 2.0829265117645264, - "learning_rate": 3.249039692701665e-05, - "epoch": 0.06499327655759748, - "step": 1015 - }, - { - "loss": 2.5011, - "grad_norm": 2.163093328475952, - "learning_rate": 3.265044814340589e-05, - "epoch": 0.06531344048152654, - "step": 1020 - }, - { - "loss": 2.4802, - "grad_norm": 2.2062385082244873, - "learning_rate": 3.281049935979514e-05, - "epoch": 0.0656336044054556, - "step": 1025 - }, - { - "loss": 2.5182, - "grad_norm": 2.199197769165039, - "learning_rate": 3.297055057618438e-05, - "epoch": 0.06595376832938464, - "step": 1030 - }, - { - "loss": 2.4656, - "grad_norm": 2.0707991123199463, - "learning_rate": 3.313060179257362e-05, - "epoch": 0.0662739322533137, - "step": 1035 - }, - { - "loss": 2.5002, - "grad_norm": 2.182140588760376, - "learning_rate": 3.329065300896287e-05, - "epoch": 0.06659409617724275, - "step": 1040 - }, - { - "loss": 2.5177, - "grad_norm": 1.9117063283920288, - "learning_rate": 3.345070422535211e-05, - "epoch": 0.0669142601011718, - "step": 1045 - }, - { - "loss": 2.4955, - "grad_norm": 2.077578067779541, - "learning_rate": 3.361075544174136e-05, - "epoch": 0.06723442402510085, - "step": 1050 - }, - { - "loss": 2.5032, - "grad_norm": 2.046825408935547, - "learning_rate": 3.37708066581306e-05, - "epoch": 0.06755458794902991, - "step": 1055 - }, - { - "loss": 2.51, - "grad_norm": 2.127065420150757, - "learning_rate": 3.393085787451985e-05, - "epoch": 0.06787475187295895, - "step": 1060 - }, - { - "loss": 2.4983, - "grad_norm": 2.076838493347168, - "learning_rate": 3.409090909090909e-05, - "epoch": 0.06819491579688801, - "step": 1065 - }, - { - "loss": 2.4759, - "grad_norm": 1.964388370513916, - "learning_rate": 3.4250960307298334e-05, - "epoch": 0.06851507972081705, - "step": 1070 - }, - { - "loss": 2.4878, - "grad_norm": 2.0563740730285645, - "learning_rate": 3.441101152368758e-05, - "epoch": 0.06883524364474611, - "step": 1075 - }, - { - "loss": 2.5031, - "grad_norm": 2.236644744873047, - "learning_rate": 3.4571062740076824e-05, - "epoch": 0.06915540756867516, - "step": 1080 - }, - { - "loss": 2.5151, - "grad_norm": 2.1143178939819336, - "learning_rate": 3.473111395646607e-05, - "epoch": 0.06947557149260421, - "step": 1085 - }, - { - "loss": 2.4976, - "grad_norm": 2.1225712299346924, - "learning_rate": 3.4891165172855314e-05, - "epoch": 0.06979573541653326, - "step": 1090 - }, - { - "loss": 2.4847, - "grad_norm": 2.148134231567383, - "learning_rate": 3.505121638924456e-05, - "epoch": 0.07011589934046232, - "step": 1095 - }, - { - "loss": 2.5044, - "grad_norm": 2.070889949798584, - "learning_rate": 3.5211267605633805e-05, - "epoch": 0.07043606326439136, - "step": 1100 - }, - { - "loss": 2.4932, - "grad_norm": 2.0513927936553955, - "learning_rate": 3.5371318822023046e-05, - "epoch": 0.07075622718832042, - "step": 1105 - }, - { - "loss": 2.4801, - "grad_norm": 2.2668299674987793, - "learning_rate": 3.5531370038412295e-05, - "epoch": 0.07107639111224948, - "step": 1110 - }, - { - "loss": 2.4892, - "grad_norm": 2.1977014541625977, - "learning_rate": 3.569142125480154e-05, - "epoch": 0.07139655503617852, - "step": 1115 - }, - { - "loss": 2.4956, - "grad_norm": 2.03102707862854, - "learning_rate": 3.5851472471190785e-05, - "epoch": 0.07171671896010758, - "step": 1120 - }, - { - "loss": 2.4646, - "grad_norm": 2.058772325515747, - "learning_rate": 3.601152368758003e-05, - "epoch": 0.07203688288403663, - "step": 1125 - }, - { - "loss": 2.4888, - "grad_norm": 2.179579019546509, - "learning_rate": 3.6171574903969275e-05, - "epoch": 0.07235704680796567, - "step": 1130 - }, - { - "loss": 2.4983, - "grad_norm": 2.1978416442871094, - "learning_rate": 3.633162612035852e-05, - "epoch": 0.07267721073189473, - "step": 1135 - }, - { - "loss": 2.4855, - "grad_norm": 1.9466743469238281, - "learning_rate": 3.649167733674776e-05, - "epoch": 0.07299737465582379, - "step": 1140 - }, - { - "loss": 2.4661, - "grad_norm": 2.139958143234253, - "learning_rate": 3.665172855313701e-05, - "epoch": 0.07331753857975283, - "step": 1145 - }, - { - "loss": 2.4822, - "grad_norm": 2.0397911071777344, - "learning_rate": 3.681177976952625e-05, - "epoch": 0.07363770250368189, - "step": 1150 - }, - { - "loss": 2.5217, - "grad_norm": 2.008603811264038, - "learning_rate": 3.69718309859155e-05, - "epoch": 0.07395786642761093, - "step": 1155 - }, - { - "loss": 2.4926, - "grad_norm": 2.2945339679718018, - "learning_rate": 3.713188220230474e-05, - "epoch": 0.07427803035153999, - "step": 1160 - }, - { - "loss": 2.5163, - "grad_norm": 2.1897239685058594, - "learning_rate": 3.729193341869399e-05, - "epoch": 0.07459819427546904, - "step": 1165 - }, - { - "loss": 2.4827, - "grad_norm": 2.0864007472991943, - "learning_rate": 3.745198463508322e-05, - "epoch": 0.07491835819939809, - "step": 1170 - }, - { - "loss": 2.4885, - "grad_norm": 2.1951565742492676, - "learning_rate": 3.761203585147247e-05, - "epoch": 0.07523852212332714, - "step": 1175 - }, - { - "loss": 2.4403, - "grad_norm": 2.176609516143799, - "learning_rate": 3.777208706786172e-05, - "epoch": 0.0755586860472562, - "step": 1180 - }, - { - "loss": 2.4691, - "grad_norm": 2.2460696697235107, - "learning_rate": 3.793213828425096e-05, - "epoch": 0.07587884997118524, - "step": 1185 - }, - { - "loss": 2.4858, - "grad_norm": 2.059447765350342, - "learning_rate": 3.809218950064021e-05, - "epoch": 0.0761990138951143, - "step": 1190 - }, - { - "loss": 2.4878, - "grad_norm": 2.062699794769287, - "learning_rate": 3.825224071702945e-05, - "epoch": 0.07651917781904335, - "step": 1195 - }, - { - "loss": 2.483, - "grad_norm": 2.070650815963745, - "learning_rate": 3.84122919334187e-05, - "epoch": 0.0768393417429724, - "step": 1200 - }, - { - "eval_loss": 2.3345463275909424, - "eval_runtime": 11.3977, - "eval_samples_per_second": 179.686, - "eval_steps_per_second": 22.461, - "epoch": 0.0768393417429724, - "step": 1200 - }, - { - "loss": 2.4838, - "grad_norm": 2.119915723800659, - "learning_rate": 3.8572343149807935e-05, - "epoch": 0.07715950566690145, - "step": 1205 - }, - { - "loss": 2.4591, - "grad_norm": 2.0714828968048096, - "learning_rate": 3.8732394366197184e-05, - "epoch": 0.07747966959083051, - "step": 1210 - }, - { - "loss": 2.472, - "grad_norm": 2.186169385910034, - "learning_rate": 3.8892445582586426e-05, - "epoch": 0.07779983351475955, - "step": 1215 - }, - { - "loss": 2.4773, - "grad_norm": 2.181817054748535, - "learning_rate": 3.9052496798975674e-05, - "epoch": 0.07811999743868861, - "step": 1220 - }, - { - "loss": 2.4565, - "grad_norm": 2.1009023189544678, - "learning_rate": 3.921254801536492e-05, - "epoch": 0.07844016136261767, - "step": 1225 - }, - { - "loss": 2.4827, - "grad_norm": 1.9727615118026733, - "learning_rate": 3.9372599231754164e-05, - "epoch": 0.07876032528654671, - "step": 1230 - }, - { - "loss": 2.494, - "grad_norm": 2.114440679550171, - "learning_rate": 3.953265044814341e-05, - "epoch": 0.07908048921047577, - "step": 1235 - }, - { - "loss": 2.4776, - "grad_norm": 2.6610660552978516, - "learning_rate": 3.969270166453265e-05, - "epoch": 0.07940065313440482, - "step": 1240 - }, - { - "loss": 2.4643, - "grad_norm": 2.1695549488067627, - "learning_rate": 3.9852752880921896e-05, - "epoch": 0.07972081705833386, - "step": 1245 - }, - { - "loss": 2.4708, - "grad_norm": 2.2169156074523926, - "learning_rate": 4.001280409731114e-05, - "epoch": 0.08004098098226292, - "step": 1250 - }, - { - "loss": 2.4964, - "grad_norm": 2.0154330730438232, - "learning_rate": 4.0172855313700387e-05, - "epoch": 0.08036114490619196, - "step": 1255 - }, - { - "loss": 2.4564, - "grad_norm": 2.2851929664611816, - "learning_rate": 4.033290653008963e-05, - "epoch": 0.08068130883012102, - "step": 1260 - }, - { - "loss": 2.4764, - "grad_norm": 2.229935646057129, - "learning_rate": 4.049295774647888e-05, - "epoch": 0.08100147275405008, - "step": 1265 - }, - { - "loss": 2.4936, - "grad_norm": 2.0593361854553223, - "learning_rate": 4.0653008962868125e-05, - "epoch": 0.08132163667797912, - "step": 1270 - }, - { - "loss": 2.4756, - "grad_norm": 1.9973433017730713, - "learning_rate": 4.081306017925736e-05, - "epoch": 0.08164180060190818, - "step": 1275 - }, - { - "loss": 2.4594, - "grad_norm": 2.005742073059082, - "learning_rate": 4.097311139564661e-05, - "epoch": 0.08196196452583723, - "step": 1280 - }, - { - "loss": 2.4543, - "grad_norm": 2.015453577041626, - "learning_rate": 4.113316261203585e-05, - "epoch": 0.08228212844976628, - "step": 1285 - }, - { - "loss": 2.4474, - "grad_norm": 2.0268640518188477, - "learning_rate": 4.12932138284251e-05, - "epoch": 0.08260229237369533, - "step": 1290 - }, - { - "loss": 2.4646, - "grad_norm": 2.077282428741455, - "learning_rate": 4.145326504481434e-05, - "epoch": 0.08292245629762439, - "step": 1295 - }, - { - "loss": 2.4894, - "grad_norm": 2.0141093730926514, - "learning_rate": 4.161331626120359e-05, - "epoch": 0.08324262022155343, - "step": 1300 - }, - { - "loss": 2.46, - "grad_norm": 2.0732734203338623, - "learning_rate": 4.177336747759283e-05, - "epoch": 0.08356278414548249, - "step": 1305 - }, - { - "loss": 2.4604, - "grad_norm": 2.012782573699951, - "learning_rate": 4.193341869398207e-05, - "epoch": 0.08388294806941154, - "step": 1310 - }, - { - "loss": 2.439, - "grad_norm": 2.024240255355835, - "learning_rate": 4.209346991037132e-05, - "epoch": 0.08420311199334059, - "step": 1315 - }, - { - "loss": 2.4635, - "grad_norm": 2.0818872451782227, - "learning_rate": 4.225352112676056e-05, - "epoch": 0.08452327591726964, - "step": 1320 - }, - { - "loss": 2.4801, - "grad_norm": 2.048849105834961, - "learning_rate": 4.241357234314981e-05, - "epoch": 0.0848434398411987, - "step": 1325 - }, - { - "loss": 2.4625, - "grad_norm": 2.284207344055176, - "learning_rate": 4.257362355953905e-05, - "epoch": 0.08516360376512774, - "step": 1330 - }, - { - "loss": 2.4771, - "grad_norm": 2.018928050994873, - "learning_rate": 4.27336747759283e-05, - "epoch": 0.0854837676890568, - "step": 1335 - }, - { - "loss": 2.4864, - "grad_norm": 2.1332316398620605, - "learning_rate": 4.2893725992317543e-05, - "epoch": 0.08580393161298584, - "step": 1340 - }, - { - "loss": 2.4726, - "grad_norm": 2.0433480739593506, - "learning_rate": 4.3053777208706785e-05, - "epoch": 0.0861240955369149, - "step": 1345 - }, - { - "loss": 2.4597, - "grad_norm": 2.058560609817505, - "learning_rate": 4.3213828425096034e-05, - "epoch": 0.08644425946084396, - "step": 1350 - }, - { - "loss": 2.4773, - "grad_norm": 2.096250534057617, - "learning_rate": 4.3373879641485275e-05, - "epoch": 0.086764423384773, - "step": 1355 - }, - { - "loss": 2.4619, - "grad_norm": 2.168686866760254, - "learning_rate": 4.3533930857874524e-05, - "epoch": 0.08708458730870205, - "step": 1360 - }, - { - "loss": 2.4256, - "grad_norm": 2.0486621856689453, - "learning_rate": 4.3693982074263766e-05, - "epoch": 0.08740475123263111, - "step": 1365 - }, - { - "loss": 2.4488, - "grad_norm": 2.1706786155700684, - "learning_rate": 4.3854033290653014e-05, - "epoch": 0.08772491515656015, - "step": 1370 - }, - { - "loss": 2.4556, - "grad_norm": 1.9638718366622925, - "learning_rate": 4.4014084507042256e-05, - "epoch": 0.08804507908048921, - "step": 1375 - }, - { - "loss": 2.4687, - "grad_norm": 2.0920019149780273, - "learning_rate": 4.41741357234315e-05, - "epoch": 0.08836524300441827, - "step": 1380 - }, - { - "loss": 2.4431, - "grad_norm": 2.1053900718688965, - "learning_rate": 4.4334186939820746e-05, - "epoch": 0.08868540692834731, - "step": 1385 - }, - { - "loss": 2.4907, - "grad_norm": 2.1533970832824707, - "learning_rate": 4.449423815620999e-05, - "epoch": 0.08900557085227637, - "step": 1390 - }, - { - "loss": 2.4612, - "grad_norm": 2.0936789512634277, - "learning_rate": 4.4654289372599236e-05, - "epoch": 0.08932573477620542, - "step": 1395 - }, - { - "loss": 2.4923, - "grad_norm": 2.1903157234191895, - "learning_rate": 4.481434058898848e-05, - "epoch": 0.08964589870013447, - "step": 1400 - }, - { - "eval_loss": 2.323389768600464, - "eval_runtime": 12.3833, - "eval_samples_per_second": 165.383, - "eval_steps_per_second": 20.673, - "epoch": 0.08964589870013447, - "step": 1400 - }, - { - "loss": 2.4615, - "grad_norm": 2.1091010570526123, - "learning_rate": 4.4974391805377727e-05, - "epoch": 0.08996606262406352, - "step": 1405 - }, - { - "loss": 2.4428, - "grad_norm": 2.227038621902466, - "learning_rate": 4.513444302176697e-05, - "epoch": 0.09028622654799258, - "step": 1410 - }, - { - "loss": 2.4297, - "grad_norm": 2.059403419494629, - "learning_rate": 4.529449423815621e-05, - "epoch": 0.09060639047192162, - "step": 1415 - }, - { - "loss": 2.4364, - "grad_norm": 2.005385398864746, - "learning_rate": 4.545454545454546e-05, - "epoch": 0.09092655439585068, - "step": 1420 - }, - { - "loss": 2.4764, - "grad_norm": 2.142878532409668, - "learning_rate": 4.56145966709347e-05, - "epoch": 0.09124671831977972, - "step": 1425 - }, - { - "loss": 2.4698, - "grad_norm": 2.005213499069214, - "learning_rate": 4.577464788732395e-05, - "epoch": 0.09156688224370878, - "step": 1430 - }, - { - "loss": 2.4566, - "grad_norm": 2.094695568084717, - "learning_rate": 4.593469910371319e-05, - "epoch": 0.09188704616763783, - "step": 1435 - }, - { - "loss": 2.4363, - "grad_norm": 2.0781939029693604, - "learning_rate": 4.609475032010244e-05, - "epoch": 0.09220721009156688, - "step": 1440 - }, - { - "loss": 2.4561, - "grad_norm": 2.1999306678771973, - "learning_rate": 4.625480153649168e-05, - "epoch": 0.09252737401549593, - "step": 1445 - }, - { - "loss": 2.456, - "grad_norm": 2.0999979972839355, - "learning_rate": 4.641485275288092e-05, - "epoch": 0.09284753793942499, - "step": 1450 - }, - { - "loss": 2.4783, - "grad_norm": 2.1072137355804443, - "learning_rate": 4.6574903969270164e-05, - "epoch": 0.09316770186335403, - "step": 1455 - }, - { - "loss": 2.4956, - "grad_norm": 1.932655692100525, - "learning_rate": 4.673495518565941e-05, - "epoch": 0.09348786578728309, - "step": 1460 - }, - { - "loss": 2.49, - "grad_norm": 2.29823637008667, - "learning_rate": 4.689500640204866e-05, - "epoch": 0.09380802971121215, - "step": 1465 - }, - { - "loss": 2.4669, - "grad_norm": 2.6139848232269287, - "learning_rate": 4.70550576184379e-05, - "epoch": 0.09412819363514119, - "step": 1470 - }, - { - "loss": 2.4432, - "grad_norm": 2.2109243869781494, - "learning_rate": 4.721510883482715e-05, - "epoch": 0.09444835755907025, - "step": 1475 - }, - { - "loss": 2.4578, - "grad_norm": 2.114405870437622, - "learning_rate": 4.737516005121639e-05, - "epoch": 0.0947685214829993, - "step": 1480 - }, - { - "loss": 2.4617, - "grad_norm": 2.0485222339630127, - "learning_rate": 4.7535211267605635e-05, - "epoch": 0.09508868540692834, - "step": 1485 - }, - { - "loss": 2.4296, - "grad_norm": 2.0749807357788086, - "learning_rate": 4.769526248399488e-05, - "epoch": 0.0954088493308574, - "step": 1490 - }, - { - "loss": 2.4569, - "grad_norm": 2.2009363174438477, - "learning_rate": 4.7855313700384125e-05, - "epoch": 0.09572901325478646, - "step": 1495 - }, - { - "loss": 2.4203, - "grad_norm": 2.4653289318084717, - "learning_rate": 4.801536491677337e-05, - "epoch": 0.0960491771787155, - "step": 1500 - }, - { - "loss": 2.445, - "grad_norm": 2.145634889602661, - "learning_rate": 4.8175416133162615e-05, - "epoch": 0.09636934110264456, - "step": 1505 - }, - { - "loss": 2.4232, - "grad_norm": 2.214996337890625, - "learning_rate": 4.8335467349551864e-05, - "epoch": 0.0966895050265736, - "step": 1510 - }, - { - "loss": 2.5058, - "grad_norm": 2.039727210998535, - "learning_rate": 4.8495518565941106e-05, - "epoch": 0.09700966895050266, - "step": 1515 - }, - { - "loss": 2.4503, - "grad_norm": 2.134812593460083, - "learning_rate": 4.865556978233035e-05, - "epoch": 0.09732983287443171, - "step": 1520 - }, - { - "loss": 2.4561, - "grad_norm": 2.0128939151763916, - "learning_rate": 4.881562099871959e-05, - "epoch": 0.09764999679836076, - "step": 1525 - }, - { - "loss": 2.4869, - "grad_norm": 1.9132862091064453, - "learning_rate": 4.897567221510884e-05, - "epoch": 0.09797016072228981, - "step": 1530 - }, - { - "loss": 2.4637, - "grad_norm": 2.2746827602386475, - "learning_rate": 4.913572343149808e-05, - "epoch": 0.09829032464621887, - "step": 1535 - }, - { - "loss": 2.4481, - "grad_norm": 2.09806489944458, - "learning_rate": 4.929577464788733e-05, - "epoch": 0.09861048857014791, - "step": 1540 - }, - { - "loss": 2.4761, - "grad_norm": 2.1433379650115967, - "learning_rate": 4.945582586427657e-05, - "epoch": 0.09893065249407697, - "step": 1545 - }, - { - "loss": 2.4404, - "grad_norm": 2.127873659133911, - "learning_rate": 4.961587708066582e-05, - "epoch": 0.09925081641800602, - "step": 1550 - }, - { - "loss": 2.4531, - "grad_norm": 2.067396879196167, - "learning_rate": 4.977592829705506e-05, - "epoch": 0.09957098034193507, - "step": 1555 - }, - { - "loss": 2.4531, - "grad_norm": 2.0335302352905273, - "learning_rate": 4.99359795134443e-05, - "epoch": 0.09989114426586412, - "step": 1560 - }, - { - "loss": 2.456, - "grad_norm": 2.0103564262390137, - "learning_rate": 5e-05, - "epoch": 0.10021130818979318, - "step": 1565 - }, - { - "loss": 2.4837, - "grad_norm": 1.9280204772949219, - "learning_rate": 5e-05, - "epoch": 0.10053147211372222, - "step": 1570 - }, - { - "loss": 2.4348, - "grad_norm": 2.0677709579467773, - "learning_rate": 5e-05, - "epoch": 0.10085163603765128, - "step": 1575 - }, - { - "loss": 2.4423, - "grad_norm": 2.088454484939575, - "learning_rate": 5e-05, - "epoch": 0.10117179996158034, - "step": 1580 - }, - { - "loss": 2.4472, - "grad_norm": 2.0001513957977295, - "learning_rate": 5e-05, - "epoch": 0.10149196388550938, - "step": 1585 - }, - { - "loss": 2.4643, - "grad_norm": 2.0158650875091553, - "learning_rate": 5e-05, - "epoch": 0.10181212780943844, - "step": 1590 - }, - { - "loss": 2.4517, - "grad_norm": 2.062638282775879, - "learning_rate": 5e-05, - "epoch": 0.10213229173336748, - "step": 1595 - }, - { - "loss": 2.4274, - "grad_norm": 2.11297345161438, - "learning_rate": 5e-05, - "epoch": 0.10245245565729653, - "step": 1600 - }, - { - "eval_loss": 2.3070931434631348, - "eval_runtime": 9.4351, - "eval_samples_per_second": 217.062, - "eval_steps_per_second": 27.133, - "epoch": 0.10245245565729653, - "step": 1600 - }, - { - "loss": 2.4551, - "grad_norm": 2.169626235961914, - "learning_rate": 5e-05, - "epoch": 0.10277261958122559, - "step": 1605 - }, - { - "loss": 2.4464, - "grad_norm": 2.102466344833374, - "learning_rate": 5e-05, - "epoch": 0.10309278350515463, - "step": 1610 - }, - { - "loss": 2.438, - "grad_norm": 1.9966940879821777, - "learning_rate": 5e-05, - "epoch": 0.10341294742908369, - "step": 1615 - }, - { - "loss": 2.4323, - "grad_norm": 2.103325605392456, - "learning_rate": 5e-05, - "epoch": 0.10373311135301275, - "step": 1620 - }, - { - "loss": 2.4293, - "grad_norm": 2.05993390083313, - "learning_rate": 5e-05, - "epoch": 0.10405327527694179, - "step": 1625 - }, - { - "loss": 2.428, - "grad_norm": 1.9764646291732788, - "learning_rate": 5e-05, - "epoch": 0.10437343920087085, - "step": 1630 - }, - { - "loss": 2.4515, - "grad_norm": 1.9260586500167847, - "learning_rate": 5e-05, - "epoch": 0.1046936031247999, - "step": 1635 - }, - { - "loss": 2.4378, - "grad_norm": 1.9698050022125244, - "learning_rate": 5e-05, - "epoch": 0.10501376704872895, - "step": 1640 - }, - { - "loss": 2.4413, - "grad_norm": 2.1451985836029053, - "learning_rate": 5e-05, - "epoch": 0.105333930972658, - "step": 1645 - }, - { - "loss": 2.4739, - "grad_norm": 2.0343995094299316, - "learning_rate": 5e-05, - "epoch": 0.10565409489658706, - "step": 1650 - }, - { - "loss": 2.4383, - "grad_norm": 2.035264253616333, - "learning_rate": 5e-05, - "epoch": 0.1059742588205161, - "step": 1655 - }, - { - "loss": 2.412, - "grad_norm": 1.9604747295379639, - "learning_rate": 5e-05, - "epoch": 0.10629442274444516, - "step": 1660 - }, - { - "loss": 2.4449, - "grad_norm": 2.0956430435180664, - "learning_rate": 5e-05, - "epoch": 0.10661458666837421, - "step": 1665 - }, - { - "loss": 2.4662, - "grad_norm": 2.05611252784729, - "learning_rate": 5e-05, - "epoch": 0.10693475059230326, - "step": 1670 - }, - { - "loss": 2.4423, - "grad_norm": 2.157836437225342, - "learning_rate": 5e-05, - "epoch": 0.10725491451623231, - "step": 1675 - }, - { - "loss": 2.4285, - "grad_norm": 1.9412627220153809, - "learning_rate": 5e-05, - "epoch": 0.10757507844016136, - "step": 1680 - }, - { - "loss": 2.4448, - "grad_norm": 2.1207661628723145, - "learning_rate": 5e-05, - "epoch": 0.10789524236409041, - "step": 1685 - }, - { - "loss": 2.4737, - "grad_norm": 2.0780351161956787, - "learning_rate": 5e-05, - "epoch": 0.10821540628801947, - "step": 1690 - }, - { - "loss": 2.4195, - "grad_norm": 1.9629524946212769, - "learning_rate": 5e-05, - "epoch": 0.10853557021194851, - "step": 1695 - }, - { - "loss": 2.4542, - "grad_norm": 2.141195774078369, - "learning_rate": 5e-05, - "epoch": 0.10885573413587757, - "step": 1700 - }, - { - "loss": 2.4617, - "grad_norm": 2.042581081390381, - "learning_rate": 5e-05, - "epoch": 0.10917589805980663, - "step": 1705 - }, - { - "loss": 2.4604, - "grad_norm": 2.0919344425201416, - "learning_rate": 5e-05, - "epoch": 0.10949606198373567, - "step": 1710 - }, - { - "loss": 2.438, - "grad_norm": 2.0356478691101074, - "learning_rate": 5e-05, - "epoch": 0.10981622590766472, - "step": 1715 - }, - { - "loss": 2.4202, - "grad_norm": 1.9953988790512085, - "learning_rate": 5e-05, - "epoch": 0.11013638983159378, - "step": 1720 - }, - { - "loss": 2.4372, - "grad_norm": 2.033454656600952, - "learning_rate": 5e-05, - "epoch": 0.11045655375552282, - "step": 1725 - }, - { - "loss": 2.4564, - "grad_norm": 1.898619532585144, - "learning_rate": 5e-05, - "epoch": 0.11077671767945188, - "step": 1730 - }, - { - "loss": 2.4104, - "grad_norm": 1.9848005771636963, - "learning_rate": 5e-05, - "epoch": 0.11109688160338094, - "step": 1735 - }, - { - "loss": 2.4339, - "grad_norm": 2.142657518386841, - "learning_rate": 5e-05, - "epoch": 0.11141704552730998, - "step": 1740 - }, - { - "loss": 2.4583, - "grad_norm": 1.9848843812942505, - "learning_rate": 5e-05, - "epoch": 0.11173720945123904, - "step": 1745 - }, - { - "loss": 2.4431, - "grad_norm": 1.9729015827178955, - "learning_rate": 5e-05, - "epoch": 0.11205737337516809, - "step": 1750 - }, - { - "loss": 2.4312, - "grad_norm": 8.698593139648438, - "learning_rate": 5e-05, - "epoch": 0.11237753729909714, - "step": 1755 - }, - { - "loss": 2.4592, - "grad_norm": 2.070530414581299, - "learning_rate": 5e-05, - "epoch": 0.11269770122302619, - "step": 1760 - }, - { - "loss": 2.4533, - "grad_norm": 2.034292697906494, - "learning_rate": 5e-05, - "epoch": 0.11301786514695523, - "step": 1765 - }, - { - "loss": 2.4622, - "grad_norm": 2.0932867527008057, - "learning_rate": 5e-05, - "epoch": 0.11333802907088429, - "step": 1770 - }, - { - "loss": 2.4165, - "grad_norm": 1.9923781156539917, - "learning_rate": 5e-05, - "epoch": 0.11365819299481335, - "step": 1775 - }, - { - "loss": 2.4384, - "grad_norm": 2.063328504562378, - "learning_rate": 5e-05, - "epoch": 0.11397835691874239, - "step": 1780 - }, - { - "loss": 2.4038, - "grad_norm": 2.021510124206543, - "learning_rate": 5e-05, - "epoch": 0.11429852084267145, - "step": 1785 - }, - { - "loss": 2.4164, - "grad_norm": 1.953681468963623, - "learning_rate": 5e-05, - "epoch": 0.1146186847666005, - "step": 1790 - }, - { - "loss": 2.4103, - "grad_norm": 1.976102590560913, - "learning_rate": 5e-05, - "epoch": 0.11493884869052955, - "step": 1795 - }, - { - "loss": 2.4182, - "grad_norm": 1.886995792388916, - "learning_rate": 5e-05, - "epoch": 0.1152590126144586, - "step": 1800 - }, - { - "eval_loss": 2.283639430999756, - "eval_runtime": 12.6341, - "eval_samples_per_second": 162.101, - "eval_steps_per_second": 20.263, - "epoch": 0.1152590126144586, - "step": 1800 - }, - { - "loss": 2.4338, - "grad_norm": 2.145838975906372, - "learning_rate": 5e-05, - "epoch": 0.11557917653838766, - "step": 1805 - }, - { - "loss": 2.4621, - "grad_norm": 2.145569324493408, - "learning_rate": 5e-05, - "epoch": 0.1158993404623167, - "step": 1810 - }, - { - "loss": 2.4088, - "grad_norm": 2.0806474685668945, - "learning_rate": 5e-05, - "epoch": 0.11621950438624576, - "step": 1815 - }, - { - "loss": 2.4203, - "grad_norm": 2.0347843170166016, - "learning_rate": 5e-05, - "epoch": 0.11653966831017482, - "step": 1820 - }, - { - "loss": 2.4236, - "grad_norm": 1.9410957098007202, - "learning_rate": 5e-05, - "epoch": 0.11685983223410386, - "step": 1825 - }, - { - "loss": 2.4548, - "grad_norm": 1.9339467287063599, - "learning_rate": 5e-05, - "epoch": 0.11717999615803291, - "step": 1830 - }, - { - "loss": 2.4241, - "grad_norm": 2.0596871376037598, - "learning_rate": 5e-05, - "epoch": 0.11750016008196197, - "step": 1835 - }, - { - "loss": 2.4443, - "grad_norm": 2.01784610748291, - "learning_rate": 5e-05, - "epoch": 0.11782032400589101, - "step": 1840 - }, - { - "loss": 2.4297, - "grad_norm": 1.915007472038269, - "learning_rate": 5e-05, - "epoch": 0.11814048792982007, - "step": 1845 - }, - { - "loss": 2.412, - "grad_norm": 2.025275945663452, - "learning_rate": 5e-05, - "epoch": 0.11846065185374911, - "step": 1850 - }, - { - "loss": 2.4118, - "grad_norm": 1.9844189882278442, - "learning_rate": 5e-05, - "epoch": 0.11878081577767817, - "step": 1855 - }, - { - "loss": 2.4055, - "grad_norm": 2.0724167823791504, - "learning_rate": 5e-05, - "epoch": 0.11910097970160723, - "step": 1860 - }, - { - "loss": 2.4241, - "grad_norm": 2.0442521572113037, - "learning_rate": 5e-05, - "epoch": 0.11942114362553627, - "step": 1865 - }, - { - "loss": 2.4462, - "grad_norm": 1.9685866832733154, - "learning_rate": 5e-05, - "epoch": 0.11974130754946533, - "step": 1870 - }, - { - "loss": 2.4232, - "grad_norm": 1.9572803974151611, - "learning_rate": 5e-05, - "epoch": 0.12006147147339438, - "step": 1875 - }, - { - "loss": 2.4565, - "grad_norm": 2.105123519897461, - "learning_rate": 5e-05, - "epoch": 0.12038163539732342, - "step": 1880 - }, - { - "loss": 2.4146, - "grad_norm": 2.017563581466675, - "learning_rate": 5e-05, - "epoch": 0.12070179932125248, - "step": 1885 - }, - { - "loss": 2.4295, - "grad_norm": 1.9783453941345215, - "learning_rate": 5e-05, - "epoch": 0.12102196324518154, - "step": 1890 - }, - { - "loss": 2.4148, - "grad_norm": 2.017634868621826, - "learning_rate": 5e-05, - "epoch": 0.12134212716911058, - "step": 1895 - }, - { - "loss": 2.4214, - "grad_norm": 2.16438627243042, - "learning_rate": 5e-05, - "epoch": 0.12166229109303964, - "step": 1900 - }, - { - "loss": 2.42, - "grad_norm": 1.980455994606018, - "learning_rate": 5e-05, - "epoch": 0.1219824550169687, - "step": 1905 - }, - { - "loss": 2.3945, - "grad_norm": 1.9849518537521362, - "learning_rate": 5e-05, - "epoch": 0.12230261894089774, - "step": 1910 - }, - { - "loss": 2.459, - "grad_norm": 1.9081141948699951, - "learning_rate": 5e-05, - "epoch": 0.1226227828648268, - "step": 1915 - }, - { - "loss": 2.4555, - "grad_norm": 1.9519824981689453, - "learning_rate": 5e-05, - "epoch": 0.12294294678875585, - "step": 1920 - }, - { - "loss": 2.4165, - "grad_norm": 1.8904905319213867, - "learning_rate": 5e-05, - "epoch": 0.12326311071268489, - "step": 1925 - }, - { - "loss": 2.3908, - "grad_norm": 2.1023762226104736, - "learning_rate": 5e-05, - "epoch": 0.12358327463661395, - "step": 1930 - }, - { - "loss": 2.4155, - "grad_norm": 1.937259554862976, - "learning_rate": 5e-05, - "epoch": 0.12390343856054299, - "step": 1935 - }, - { - "loss": 2.3992, - "grad_norm": 2.0456533432006836, - "learning_rate": 5e-05, - "epoch": 0.12422360248447205, - "step": 1940 - }, - { - "loss": 2.4287, - "grad_norm": 2.0166923999786377, - "learning_rate": 5e-05, - "epoch": 0.1245437664084011, - "step": 1945 - }, - { - "loss": 2.4264, - "grad_norm": 2.064141273498535, - "learning_rate": 5e-05, - "epoch": 0.12486393033233015, - "step": 1950 - }, - { - "loss": 2.4268, - "grad_norm": 1.9742683172225952, - "learning_rate": 5e-05, - "epoch": 0.1251840942562592, - "step": 1955 - }, - { - "loss": 2.4056, - "grad_norm": 1.9377827644348145, - "learning_rate": 5e-05, - "epoch": 0.12550425818018826, - "step": 1960 - }, - { - "loss": 2.4298, - "grad_norm": 1.92035710811615, - "learning_rate": 5e-05, - "epoch": 0.12582442210411732, - "step": 1965 - }, - { - "loss": 2.4565, - "grad_norm": 1.9152588844299316, - "learning_rate": 5e-05, - "epoch": 0.12614458602804635, - "step": 1970 - }, - { - "loss": 2.4223, - "grad_norm": 1.9562370777130127, - "learning_rate": 5e-05, - "epoch": 0.1264647499519754, - "step": 1975 - }, - { - "loss": 2.3804, - "grad_norm": 1.9528794288635254, - "learning_rate": 5e-05, - "epoch": 0.12678491387590446, - "step": 1980 - }, - { - "loss": 2.4475, - "grad_norm": 1.9754786491394043, - "learning_rate": 5e-05, - "epoch": 0.12710507779983352, - "step": 1985 - }, - { - "loss": 2.4395, - "grad_norm": 1.9041407108306885, - "learning_rate": 5e-05, - "epoch": 0.12742524172376257, - "step": 1990 - }, - { - "loss": 2.4235, - "grad_norm": 2.0152764320373535, - "learning_rate": 5e-05, - "epoch": 0.12774540564769163, - "step": 1995 - }, - { - "loss": 2.4227, - "grad_norm": 1.8605611324310303, - "learning_rate": 5e-05, - "epoch": 0.12806556957162066, - "step": 2000 - }, - { - "eval_loss": 2.2688512802124023, - "eval_runtime": 9.5336, - "eval_samples_per_second": 214.818, - "eval_steps_per_second": 26.852, - "epoch": 0.12806556957162066, - "step": 2000 - }, - { - "loss": 2.4202, - "grad_norm": 2.040646553039551, - "learning_rate": 5e-05, - "epoch": 0.12838573349554971, - "step": 2005 - }, - { - "loss": 2.4003, - "grad_norm": 2.212007522583008, - "learning_rate": 5e-05, - "epoch": 0.12870589741947877, - "step": 2010 - }, - { - "loss": 2.4121, - "grad_norm": 1.964044451713562, - "learning_rate": 5e-05, - "epoch": 0.12902606134340783, - "step": 2015 - }, - { - "loss": 2.3939, - "grad_norm": 1.9541124105453491, - "learning_rate": 5e-05, - "epoch": 0.12934622526733688, - "step": 2020 - }, - { - "loss": 2.4335, - "grad_norm": 1.8698004484176636, - "learning_rate": 5e-05, - "epoch": 0.12966638919126594, - "step": 2025 - }, - { - "loss": 2.4246, - "grad_norm": 1.956687092781067, - "learning_rate": 5e-05, - "epoch": 0.12998655311519497, - "step": 2030 - }, - { - "loss": 2.4097, - "grad_norm": 2.137901544570923, - "learning_rate": 5e-05, - "epoch": 0.13030671703912403, - "step": 2035 - }, - { - "loss": 2.4066, - "grad_norm": 1.9962667226791382, - "learning_rate": 5e-05, - "epoch": 0.13062688096305308, - "step": 2040 - }, - { - "loss": 2.404, - "grad_norm": 1.9223984479904175, - "learning_rate": 5e-05, - "epoch": 0.13094704488698214, - "step": 2045 - }, - { - "loss": 2.3836, - "grad_norm": 1.956534504890442, - "learning_rate": 5e-05, - "epoch": 0.1312672088109112, - "step": 2050 - }, - { - "loss": 2.4043, - "grad_norm": 1.9311598539352417, - "learning_rate": 5e-05, - "epoch": 0.13158737273484022, - "step": 2055 - }, - { - "loss": 2.4129, - "grad_norm": 2.080878496170044, - "learning_rate": 5e-05, - "epoch": 0.13190753665876928, - "step": 2060 - }, - { - "loss": 2.4115, - "grad_norm": 2.0229105949401855, - "learning_rate": 5e-05, - "epoch": 0.13222770058269834, - "step": 2065 - }, - { - "loss": 2.3804, - "grad_norm": 2.0059521198272705, - "learning_rate": 5e-05, - "epoch": 0.1325478645066274, - "step": 2070 - }, - { - "loss": 2.402, - "grad_norm": 2.0359392166137695, - "learning_rate": 5e-05, - "epoch": 0.13286802843055645, - "step": 2075 - }, - { - "loss": 2.3843, - "grad_norm": 1.9484703540802002, - "learning_rate": 5e-05, - "epoch": 0.1331881923544855, - "step": 2080 - }, - { - "loss": 2.4186, - "grad_norm": 2.008492946624756, - "learning_rate": 5e-05, - "epoch": 0.13350835627841454, - "step": 2085 - }, - { - "loss": 2.4124, - "grad_norm": 2.0623104572296143, - "learning_rate": 5e-05, - "epoch": 0.1338285202023436, - "step": 2090 - }, - { - "loss": 2.3599, - "grad_norm": 1.8425260782241821, - "learning_rate": 5e-05, - "epoch": 0.13414868412627265, - "step": 2095 - }, - { - "loss": 2.3984, - "grad_norm": 2.0146563053131104, - "learning_rate": 5e-05, - "epoch": 0.1344688480502017, - "step": 2100 - }, - { - "loss": 2.3858, - "grad_norm": 2.082679033279419, - "learning_rate": 5e-05, - "epoch": 0.13478901197413076, - "step": 2105 - }, - { - "loss": 2.4171, - "grad_norm": 2.029128074645996, - "learning_rate": 5e-05, - "epoch": 0.13510917589805982, - "step": 2110 - }, - { - "loss": 2.3959, - "grad_norm": 1.987762212753296, - "learning_rate": 5e-05, - "epoch": 0.13542933982198885, - "step": 2115 - }, - { - "loss": 2.3947, - "grad_norm": 2.0950815677642822, - "learning_rate": 5e-05, - "epoch": 0.1357495037459179, - "step": 2120 - }, - { - "loss": 2.4035, - "grad_norm": 2.036588668823242, - "learning_rate": 5e-05, - "epoch": 0.13606966766984696, - "step": 2125 - }, - { - "loss": 2.4071, - "grad_norm": 2.120378017425537, - "learning_rate": 5e-05, - "epoch": 0.13638983159377602, - "step": 2130 - }, - { - "loss": 2.404, - "grad_norm": 1.9909882545471191, - "learning_rate": 5e-05, - "epoch": 0.13670999551770507, - "step": 2135 - }, - { - "loss": 2.4018, - "grad_norm": 2.004340887069702, - "learning_rate": 5e-05, - "epoch": 0.1370301594416341, - "step": 2140 - }, - { - "loss": 2.4058, - "grad_norm": 1.8815783262252808, - "learning_rate": 5e-05, - "epoch": 0.13735032336556316, - "step": 2145 - }, - { - "loss": 2.3965, - "grad_norm": 1.9335230588912964, - "learning_rate": 5e-05, - "epoch": 0.13767048728949222, - "step": 2150 - }, - { - "loss": 2.4259, - "grad_norm": 2.0381062030792236, - "learning_rate": 5e-05, - "epoch": 0.13799065121342127, - "step": 2155 - }, - { - "loss": 2.406, - "grad_norm": 1.9996901750564575, - "learning_rate": 5e-05, - "epoch": 0.13831081513735033, - "step": 2160 - }, - { - "loss": 2.4039, - "grad_norm": 1.899214267730713, - "learning_rate": 5e-05, - "epoch": 0.13863097906127939, - "step": 2165 - }, - { - "loss": 2.4298, - "grad_norm": 1.9462037086486816, - "learning_rate": 5e-05, - "epoch": 0.13895114298520841, - "step": 2170 - }, - { - "loss": 2.4035, - "grad_norm": 1.935436487197876, - "learning_rate": 5e-05, - "epoch": 0.13927130690913747, - "step": 2175 - }, - { - "loss": 2.3991, - "grad_norm": 1.9040533304214478, - "learning_rate": 5e-05, - "epoch": 0.13959147083306653, - "step": 2180 - }, - { - "loss": 2.4263, - "grad_norm": 1.9670405387878418, - "learning_rate": 5e-05, - "epoch": 0.13991163475699558, - "step": 2185 - }, - { - "loss": 2.4282, - "grad_norm": 2.0874922275543213, - "learning_rate": 5e-05, - "epoch": 0.14023179868092464, - "step": 2190 - }, - { - "loss": 2.4144, - "grad_norm": 1.953114628791809, - "learning_rate": 5e-05, - "epoch": 0.1405519626048537, - "step": 2195 - }, - { - "loss": 2.3966, - "grad_norm": 2.0168471336364746, - "learning_rate": 5e-05, - "epoch": 0.14087212652878273, - "step": 2200 - }, - { - "eval_loss": 2.253744125366211, - "eval_runtime": 10.1134, - "eval_samples_per_second": 202.504, - "eval_steps_per_second": 25.313, - "epoch": 0.14087212652878273, - "step": 2200 - }, - { - "loss": 2.3911, - "grad_norm": 1.9229140281677246, - "learning_rate": 5e-05, - "epoch": 0.14119229045271178, - "step": 2205 - }, - { - "loss": 2.4168, - "grad_norm": 1.9505460262298584, - "learning_rate": 5e-05, - "epoch": 0.14151245437664084, - "step": 2210 - }, - { - "loss": 2.4122, - "grad_norm": 1.9959040880203247, - "learning_rate": 5e-05, - "epoch": 0.1418326183005699, - "step": 2215 - }, - { - "loss": 2.4155, - "grad_norm": 2.070401191711426, - "learning_rate": 5e-05, - "epoch": 0.14215278222449895, - "step": 2220 - }, - { - "loss": 2.3864, - "grad_norm": 1.980580449104309, - "learning_rate": 5e-05, - "epoch": 0.14247294614842798, - "step": 2225 - }, - { - "loss": 2.4002, - "grad_norm": 1.9218393564224243, - "learning_rate": 5e-05, - "epoch": 0.14279311007235704, - "step": 2230 - }, - { - "loss": 2.3819, - "grad_norm": 2.057966709136963, - "learning_rate": 5e-05, - "epoch": 0.1431132739962861, - "step": 2235 - }, - { - "loss": 2.4171, - "grad_norm": 2.0004384517669678, - "learning_rate": 5e-05, - "epoch": 0.14343343792021515, - "step": 2240 - }, - { - "loss": 2.3866, - "grad_norm": 2.0221750736236572, - "learning_rate": 5e-05, - "epoch": 0.1437536018441442, - "step": 2245 - }, - { - "loss": 2.4216, - "grad_norm": 2.0313234329223633, - "learning_rate": 5e-05, - "epoch": 0.14407376576807326, - "step": 2250 - }, - { - "loss": 2.3814, - "grad_norm": 2.1582136154174805, - "learning_rate": 5e-05, - "epoch": 0.1443939296920023, - "step": 2255 - }, - { - "loss": 2.4123, - "grad_norm": 2.0171945095062256, - "learning_rate": 5e-05, - "epoch": 0.14471409361593135, - "step": 2260 - }, - { - "loss": 2.3892, - "grad_norm": 1.9124054908752441, - "learning_rate": 5e-05, - "epoch": 0.1450342575398604, - "step": 2265 - }, - { - "loss": 2.4326, - "grad_norm": 1.9355947971343994, - "learning_rate": 5e-05, - "epoch": 0.14535442146378946, - "step": 2270 - }, - { - "loss": 2.4224, - "grad_norm": 2.030381679534912, - "learning_rate": 5e-05, - "epoch": 0.14567458538771852, - "step": 2275 - }, - { - "loss": 2.4016, - "grad_norm": 2.0123848915100098, - "learning_rate": 5e-05, - "epoch": 0.14599474931164758, - "step": 2280 - }, - { - "loss": 2.3838, - "grad_norm": 2.062603712081909, - "learning_rate": 5e-05, - "epoch": 0.1463149132355766, - "step": 2285 - }, - { - "loss": 2.3896, - "grad_norm": 2.0680532455444336, - "learning_rate": 5e-05, - "epoch": 0.14663507715950566, - "step": 2290 - }, - { - "loss": 2.4275, - "grad_norm": 1.846703290939331, - "learning_rate": 5e-05, - "epoch": 0.14695524108343472, - "step": 2295 - }, - { - "loss": 2.3873, - "grad_norm": 1.9122810363769531, - "learning_rate": 5e-05, - "epoch": 0.14727540500736377, - "step": 2300 - }, - { - "loss": 2.4004, - "grad_norm": 1.9584786891937256, - "learning_rate": 5e-05, - "epoch": 0.14759556893129283, - "step": 2305 - }, - { - "loss": 2.3838, - "grad_norm": 1.8874859809875488, - "learning_rate": 5e-05, - "epoch": 0.14791573285522186, - "step": 2310 - }, - { - "loss": 2.3581, - "grad_norm": 1.8960413932800293, - "learning_rate": 5e-05, - "epoch": 0.14823589677915092, - "step": 2315 - }, - { - "loss": 2.3889, - "grad_norm": 1.8831268548965454, - "learning_rate": 5e-05, - "epoch": 0.14855606070307997, - "step": 2320 - }, - { - "loss": 2.3608, - "grad_norm": 1.910288691520691, - "learning_rate": 5e-05, - "epoch": 0.14887622462700903, - "step": 2325 - }, - { - "loss": 2.4205, - "grad_norm": 1.9343372583389282, - "learning_rate": 5e-05, - "epoch": 0.14919638855093809, - "step": 2330 - }, - { - "loss": 2.3924, - "grad_norm": 1.953525424003601, - "learning_rate": 5e-05, - "epoch": 0.14951655247486714, - "step": 2335 - }, - { - "loss": 2.3593, - "grad_norm": 1.9582774639129639, - "learning_rate": 5e-05, - "epoch": 0.14983671639879617, - "step": 2340 - }, - { - "loss": 2.3897, - "grad_norm": 1.9290440082550049, - "learning_rate": 5e-05, - "epoch": 0.15015688032272523, - "step": 2345 - }, - { - "loss": 2.389, - "grad_norm": 1.846218228340149, - "learning_rate": 5e-05, - "epoch": 0.15047704424665428, - "step": 2350 - }, - { - "loss": 2.4196, - "grad_norm": 1.9014278650283813, - "learning_rate": 5e-05, - "epoch": 0.15079720817058334, - "step": 2355 - }, - { - "loss": 2.3938, - "grad_norm": 1.9431865215301514, - "learning_rate": 5e-05, - "epoch": 0.1511173720945124, - "step": 2360 - }, - { - "loss": 2.3931, - "grad_norm": 1.8423478603363037, - "learning_rate": 5e-05, - "epoch": 0.15143753601844145, - "step": 2365 - }, - { - "loss": 2.3864, - "grad_norm": 1.8788933753967285, - "learning_rate": 5e-05, - "epoch": 0.15175769994237048, - "step": 2370 - }, - { - "loss": 2.3934, - "grad_norm": 1.9234330654144287, - "learning_rate": 5e-05, - "epoch": 0.15207786386629954, - "step": 2375 - }, - { - "loss": 2.3706, - "grad_norm": 1.8926013708114624, - "learning_rate": 5e-05, - "epoch": 0.1523980277902286, - "step": 2380 - }, - { - "loss": 2.3912, - "grad_norm": 2.009702682495117, - "learning_rate": 5e-05, - "epoch": 0.15271819171415765, - "step": 2385 - }, - { - "loss": 2.4004, - "grad_norm": 2.00524640083313, - "learning_rate": 5e-05, - "epoch": 0.1530383556380867, - "step": 2390 - }, - { - "loss": 2.3845, - "grad_norm": 1.8410059213638306, - "learning_rate": 5e-05, - "epoch": 0.15335851956201574, - "step": 2395 - }, - { - "loss": 2.3933, - "grad_norm": 1.9169642925262451, - "learning_rate": 5e-05, - "epoch": 0.1536786834859448, - "step": 2400 - }, - { - "eval_loss": 2.2529916763305664, - "eval_runtime": 9.2707, - "eval_samples_per_second": 220.912, - "eval_steps_per_second": 27.614, - "epoch": 0.1536786834859448, - "step": 2400 - }, - { - "loss": 2.3935, - "grad_norm": 1.948587417602539, - "learning_rate": 5e-05, - "epoch": 0.15399884740987385, - "step": 2405 - }, - { - "loss": 2.4017, - "grad_norm": 2.016439914703369, - "learning_rate": 5e-05, - "epoch": 0.1543190113338029, - "step": 2410 - }, - { - "loss": 2.3605, - "grad_norm": 2.007875680923462, - "learning_rate": 5e-05, - "epoch": 0.15463917525773196, - "step": 2415 - }, - { - "loss": 2.3663, - "grad_norm": 1.9632021188735962, - "learning_rate": 5e-05, - "epoch": 0.15495933918166102, - "step": 2420 - }, - { - "loss": 2.3814, - "grad_norm": 1.9989489316940308, - "learning_rate": 5e-05, - "epoch": 0.15527950310559005, - "step": 2425 - }, - { - "loss": 2.3695, - "grad_norm": 2.0570878982543945, - "learning_rate": 5e-05, - "epoch": 0.1555996670295191, - "step": 2430 - }, - { - "loss": 2.3995, - "grad_norm": 2.0144171714782715, - "learning_rate": 5e-05, - "epoch": 0.15591983095344816, - "step": 2435 - }, - { - "loss": 2.4044, - "grad_norm": 1.8741555213928223, - "learning_rate": 5e-05, - "epoch": 0.15623999487737722, - "step": 2440 - }, - { - "loss": 2.3671, - "grad_norm": 1.8373762369155884, - "learning_rate": 5e-05, - "epoch": 0.15656015880130628, - "step": 2445 - }, - { - "loss": 2.3604, - "grad_norm": 1.9890358448028564, - "learning_rate": 5e-05, - "epoch": 0.15688032272523533, - "step": 2450 - }, - { - "loss": 2.3899, - "grad_norm": 2.008896827697754, - "learning_rate": 5e-05, - "epoch": 0.15720048664916436, - "step": 2455 - }, - { - "loss": 2.3925, - "grad_norm": 1.98505699634552, - "learning_rate": 5e-05, - "epoch": 0.15752065057309342, - "step": 2460 - }, - { - "loss": 2.3917, - "grad_norm": 1.958855152130127, - "learning_rate": 5e-05, - "epoch": 0.15784081449702247, - "step": 2465 - }, - { - "loss": 2.3918, - "grad_norm": 1.9323810338974, - "learning_rate": 5e-05, - "epoch": 0.15816097842095153, - "step": 2470 - }, - { - "loss": 2.398, - "grad_norm": 2.0385091304779053, - "learning_rate": 5e-05, - "epoch": 0.1584811423448806, - "step": 2475 - }, - { - "loss": 2.3735, - "grad_norm": 1.877455711364746, - "learning_rate": 5e-05, - "epoch": 0.15880130626880964, - "step": 2480 - }, - { - "loss": 2.4105, - "grad_norm": 1.8172228336334229, - "learning_rate": 5e-05, - "epoch": 0.15912147019273867, - "step": 2485 - }, - { - "loss": 2.3432, - "grad_norm": 1.9281107187271118, - "learning_rate": 5e-05, - "epoch": 0.15944163411666773, - "step": 2490 - }, - { - "loss": 2.4057, - "grad_norm": 2.0879902839660645, - "learning_rate": 5e-05, - "epoch": 0.1597617980405968, - "step": 2495 - }, - { - "loss": 2.3725, - "grad_norm": 1.9170490503311157, - "learning_rate": 5e-05, - "epoch": 0.16008196196452584, - "step": 2500 - }, - { - "loss": 2.3963, - "grad_norm": 1.973979115486145, - "learning_rate": 5e-05, - "epoch": 0.1604021258884549, - "step": 2505 - }, - { - "loss": 2.3596, - "grad_norm": 1.9528260231018066, - "learning_rate": 5e-05, - "epoch": 0.16072228981238393, - "step": 2510 - }, - { - "loss": 2.3587, - "grad_norm": 1.9395289421081543, - "learning_rate": 5e-05, - "epoch": 0.16104245373631298, - "step": 2515 - }, - { - "loss": 2.3813, - "grad_norm": 1.9283883571624756, - "learning_rate": 5e-05, - "epoch": 0.16136261766024204, - "step": 2520 - }, - { - "loss": 2.3981, - "grad_norm": 1.9027310609817505, - "learning_rate": 5e-05, - "epoch": 0.1616827815841711, - "step": 2525 - }, - { - "loss": 2.3892, - "grad_norm": 1.9189422130584717, - "learning_rate": 5e-05, - "epoch": 0.16200294550810015, - "step": 2530 - }, - { - "loss": 2.3914, - "grad_norm": 1.9815748929977417, - "learning_rate": 5e-05, - "epoch": 0.1623231094320292, - "step": 2535 - }, - { - "loss": 2.3935, - "grad_norm": 1.9846832752227783, - "learning_rate": 5e-05, - "epoch": 0.16264327335595824, - "step": 2540 - }, - { - "loss": 2.3638, - "grad_norm": 1.8738396167755127, - "learning_rate": 5e-05, - "epoch": 0.1629634372798873, - "step": 2545 - }, - { - "loss": 2.402, - "grad_norm": 1.9183320999145508, - "learning_rate": 5e-05, - "epoch": 0.16328360120381635, - "step": 2550 - }, - { - "loss": 2.3585, - "grad_norm": 1.9421886205673218, - "learning_rate": 5e-05, - "epoch": 0.1636037651277454, - "step": 2555 - }, - { - "loss": 2.372, - "grad_norm": 1.8870490789413452, - "learning_rate": 5e-05, - "epoch": 0.16392392905167447, - "step": 2560 - }, - { - "loss": 2.3954, - "grad_norm": 1.9618778228759766, - "learning_rate": 5e-05, - "epoch": 0.16424409297560352, - "step": 2565 - }, - { - "loss": 2.3422, - "grad_norm": 2.0231478214263916, - "learning_rate": 5e-05, - "epoch": 0.16456425689953255, - "step": 2570 - }, - { - "loss": 2.375, - "grad_norm": 2.0343830585479736, - "learning_rate": 5e-05, - "epoch": 0.1648844208234616, - "step": 2575 - }, - { - "loss": 2.3795, - "grad_norm": 1.8410435914993286, - "learning_rate": 5e-05, - "epoch": 0.16520458474739066, - "step": 2580 - }, - { - "loss": 2.3509, - "grad_norm": 1.8402718305587769, - "learning_rate": 5e-05, - "epoch": 0.16552474867131972, - "step": 2585 - }, - { - "loss": 2.3851, - "grad_norm": 1.9031217098236084, - "learning_rate": 5e-05, - "epoch": 0.16584491259524878, - "step": 2590 - }, - { - "loss": 2.3865, - "grad_norm": 1.8732967376708984, - "learning_rate": 5e-05, - "epoch": 0.1661650765191778, - "step": 2595 - }, - { - "loss": 2.3787, - "grad_norm": 1.8325210809707642, - "learning_rate": 5e-05, - "epoch": 0.16648524044310686, - "step": 2600 - }, - { - "eval_loss": 2.2488222122192383, - "eval_runtime": 9.2735, - "eval_samples_per_second": 220.844, - "eval_steps_per_second": 27.606, - "epoch": 0.16648524044310686, - "step": 2600 - }, - { - "loss": 2.3569, - "grad_norm": 1.9319522380828857, - "learning_rate": 5e-05, - "epoch": 0.16680540436703592, - "step": 2605 - }, - { - "loss": 2.3725, - "grad_norm": 1.997213363647461, - "learning_rate": 5e-05, - "epoch": 0.16712556829096498, - "step": 2610 - }, - { - "loss": 2.4022, - "grad_norm": 2.0918712615966797, - "learning_rate": 5e-05, - "epoch": 0.16744573221489403, - "step": 2615 - }, - { - "loss": 2.369, - "grad_norm": 2.015212297439575, - "learning_rate": 5e-05, - "epoch": 0.1677658961388231, - "step": 2620 - }, - { - "loss": 2.3813, - "grad_norm": 1.9604344367980957, - "learning_rate": 5e-05, - "epoch": 0.16808606006275212, - "step": 2625 - }, - { - "loss": 2.3668, - "grad_norm": 1.8466747999191284, - "learning_rate": 5e-05, - "epoch": 0.16840622398668117, - "step": 2630 - }, - { - "loss": 2.3645, - "grad_norm": 1.8700547218322754, - "learning_rate": 5e-05, - "epoch": 0.16872638791061023, - "step": 2635 - }, - { - "loss": 2.3847, - "grad_norm": 1.9464409351348877, - "learning_rate": 5e-05, - "epoch": 0.1690465518345393, - "step": 2640 - }, - { - "loss": 2.3664, - "grad_norm": 1.9160685539245605, - "learning_rate": 5e-05, - "epoch": 0.16936671575846834, - "step": 2645 - }, - { - "loss": 2.3949, - "grad_norm": 1.993518590927124, - "learning_rate": 5e-05, - "epoch": 0.1696868796823974, - "step": 2650 - }, - { - "loss": 2.3986, - "grad_norm": 1.8965723514556885, - "learning_rate": 5e-05, - "epoch": 0.17000704360632643, - "step": 2655 - }, - { - "loss": 2.3931, - "grad_norm": 1.8633148670196533, - "learning_rate": 5e-05, - "epoch": 0.1703272075302555, - "step": 2660 - }, - { - "loss": 2.3934, - "grad_norm": 1.8948819637298584, - "learning_rate": 5e-05, - "epoch": 0.17064737145418454, - "step": 2665 - }, - { - "loss": 2.3787, - "grad_norm": 1.9217243194580078, - "learning_rate": 5e-05, - "epoch": 0.1709675353781136, - "step": 2670 - }, - { - "loss": 2.3821, - "grad_norm": 1.774686574935913, - "learning_rate": 5e-05, - "epoch": 0.17128769930204266, - "step": 2675 - }, - { - "loss": 2.364, - "grad_norm": 1.9000036716461182, - "learning_rate": 5e-05, - "epoch": 0.17160786322597169, - "step": 2680 - }, - { - "loss": 2.3718, - "grad_norm": 1.8330143690109253, - "learning_rate": 5e-05, - "epoch": 0.17192802714990074, - "step": 2685 - }, - { - "loss": 2.4015, - "grad_norm": 1.9639065265655518, - "learning_rate": 5e-05, - "epoch": 0.1722481910738298, - "step": 2690 - }, - { - "loss": 2.403, - "grad_norm": 1.8496508598327637, - "learning_rate": 5e-05, - "epoch": 0.17256835499775885, - "step": 2695 - }, - { - "loss": 2.3548, - "grad_norm": 1.8958038091659546, - "learning_rate": 5e-05, - "epoch": 0.1728885189216879, - "step": 2700 - }, - { - "loss": 2.3648, - "grad_norm": 2.150702476501465, - "learning_rate": 5e-05, - "epoch": 0.17320868284561697, - "step": 2705 - }, - { - "loss": 2.3544, - "grad_norm": 2.0381345748901367, - "learning_rate": 5e-05, - "epoch": 0.173528846769546, - "step": 2710 - }, - { - "loss": 2.3726, - "grad_norm": 1.9142519235610962, - "learning_rate": 5e-05, - "epoch": 0.17384901069347505, - "step": 2715 - }, - { - "loss": 2.3721, - "grad_norm": 1.9022127389907837, - "learning_rate": 5e-05, - "epoch": 0.1741691746174041, - "step": 2720 - }, - { - "loss": 2.3595, - "grad_norm": 2.145447254180908, - "learning_rate": 5e-05, - "epoch": 0.17448933854133317, - "step": 2725 - }, - { - "loss": 2.3423, - "grad_norm": 1.8730753660202026, - "learning_rate": 5e-05, - "epoch": 0.17480950246526222, - "step": 2730 - }, - { - "loss": 2.3756, - "grad_norm": 1.9949947595596313, - "learning_rate": 5e-05, - "epoch": 0.17512966638919128, - "step": 2735 - }, - { - "loss": 2.3446, - "grad_norm": 1.9902111291885376, - "learning_rate": 5e-05, - "epoch": 0.1754498303131203, - "step": 2740 - }, - { - "loss": 2.3616, - "grad_norm": 1.9357950687408447, - "learning_rate": 5e-05, - "epoch": 0.17576999423704937, - "step": 2745 - }, - { - "loss": 2.3666, - "grad_norm": 1.9442518949508667, - "learning_rate": 5e-05, - "epoch": 0.17609015816097842, - "step": 2750 - }, - { - "loss": 2.3623, - "grad_norm": 1.8661247491836548, - "learning_rate": 5e-05, - "epoch": 0.17641032208490748, - "step": 2755 - }, - { - "loss": 2.368, - "grad_norm": 1.8535679578781128, - "learning_rate": 5e-05, - "epoch": 0.17673048600883653, - "step": 2760 - }, - { - "loss": 2.3872, - "grad_norm": 1.8927170038223267, - "learning_rate": 5e-05, - "epoch": 0.17705064993276556, - "step": 2765 - }, - { - "loss": 2.3464, - "grad_norm": 1.9512662887573242, - "learning_rate": 5e-05, - "epoch": 0.17737081385669462, - "step": 2770 - }, - { - "loss": 2.3607, - "grad_norm": 1.8841359615325928, - "learning_rate": 5e-05, - "epoch": 0.17769097778062368, - "step": 2775 - }, - { - "loss": 2.3779, - "grad_norm": 1.9043680429458618, - "learning_rate": 5e-05, - "epoch": 0.17801114170455273, - "step": 2780 - }, - { - "loss": 2.3836, - "grad_norm": 1.8490782976150513, - "learning_rate": 5e-05, - "epoch": 0.1783313056284818, - "step": 2785 - }, - { - "loss": 2.3739, - "grad_norm": 1.9332350492477417, - "learning_rate": 5e-05, - "epoch": 0.17865146955241085, - "step": 2790 - }, - { - "loss": 2.3407, - "grad_norm": 1.8520585298538208, - "learning_rate": 5e-05, - "epoch": 0.17897163347633988, - "step": 2795 - }, - { - "loss": 2.3733, - "grad_norm": 1.8878172636032104, - "learning_rate": 5e-05, - "epoch": 0.17929179740026893, - "step": 2800 - }, - { - "eval_loss": 2.2236533164978027, - "eval_runtime": 9.4962, - "eval_samples_per_second": 215.665, - "eval_steps_per_second": 26.958, - "epoch": 0.17929179740026893, - "step": 2800 - }, - { - "loss": 2.3377, - "grad_norm": 1.994707465171814, - "learning_rate": 5e-05, - "epoch": 0.179611961324198, - "step": 2805 - }, - { - "loss": 2.3754, - "grad_norm": 1.7752844095230103, - "learning_rate": 5e-05, - "epoch": 0.17993212524812705, - "step": 2810 - }, - { - "loss": 2.376, - "grad_norm": 1.9438122510910034, - "learning_rate": 5e-05, - "epoch": 0.1802522891720561, - "step": 2815 - }, - { - "loss": 2.3698, - "grad_norm": 1.9461045265197754, - "learning_rate": 5e-05, - "epoch": 0.18057245309598516, - "step": 2820 - }, - { - "loss": 2.3587, - "grad_norm": 1.92300283908844, - "learning_rate": 5e-05, - "epoch": 0.1808926170199142, - "step": 2825 - }, - { - "loss": 2.395, - "grad_norm": 2.004666328430176, - "learning_rate": 5e-05, - "epoch": 0.18121278094384324, - "step": 2830 - }, - { - "loss": 2.3882, - "grad_norm": 1.8876590728759766, - "learning_rate": 5e-05, - "epoch": 0.1815329448677723, - "step": 2835 - }, - { - "loss": 2.337, - "grad_norm": 1.9001890420913696, - "learning_rate": 5e-05, - "epoch": 0.18185310879170136, - "step": 2840 - }, - { - "loss": 2.3796, - "grad_norm": 1.876528024673462, - "learning_rate": 5e-05, - "epoch": 0.1821732727156304, - "step": 2845 - }, - { - "loss": 2.366, - "grad_norm": 1.977066159248352, - "learning_rate": 5e-05, - "epoch": 0.18249343663955944, - "step": 2850 - }, - { - "loss": 2.3681, - "grad_norm": 1.8940303325653076, - "learning_rate": 5e-05, - "epoch": 0.1828136005634885, - "step": 2855 - }, - { - "loss": 2.3721, - "grad_norm": 1.9290567636489868, - "learning_rate": 5e-05, - "epoch": 0.18313376448741756, - "step": 2860 - }, - { - "loss": 2.3555, - "grad_norm": 1.8955270051956177, - "learning_rate": 5e-05, - "epoch": 0.1834539284113466, - "step": 2865 - }, - { - "loss": 2.3675, - "grad_norm": 1.916295051574707, - "learning_rate": 5e-05, - "epoch": 0.18377409233527567, - "step": 2870 - }, - { - "loss": 2.3718, - "grad_norm": 2.090623617172241, - "learning_rate": 5e-05, - "epoch": 0.18409425625920472, - "step": 2875 - }, - { - "loss": 2.3617, - "grad_norm": 1.9669326543807983, - "learning_rate": 5e-05, - "epoch": 0.18441442018313375, - "step": 2880 - }, - { - "loss": 2.3693, - "grad_norm": 1.856131911277771, - "learning_rate": 5e-05, - "epoch": 0.1847345841070628, - "step": 2885 - }, - { - "loss": 2.3714, - "grad_norm": 1.9637651443481445, - "learning_rate": 5e-05, - "epoch": 0.18505474803099187, - "step": 2890 - }, - { - "loss": 2.3669, - "grad_norm": 2.030195951461792, - "learning_rate": 5e-05, - "epoch": 0.18537491195492092, - "step": 2895 - }, - { - "loss": 2.364, - "grad_norm": 1.9500765800476074, - "learning_rate": 5e-05, - "epoch": 0.18569507587884998, - "step": 2900 - }, - { - "loss": 2.3647, - "grad_norm": 1.9658929109573364, - "learning_rate": 5e-05, - "epoch": 0.18601523980277904, - "step": 2905 - }, - { - "loss": 2.3541, - "grad_norm": 1.9205372333526611, - "learning_rate": 5e-05, - "epoch": 0.18633540372670807, - "step": 2910 - }, - { - "loss": 2.3631, - "grad_norm": 1.9161553382873535, - "learning_rate": 5e-05, - "epoch": 0.18665556765063712, - "step": 2915 - }, - { - "loss": 2.3526, - "grad_norm": 1.9861873388290405, - "learning_rate": 5e-05, - "epoch": 0.18697573157456618, - "step": 2920 - }, - { - "loss": 2.3579, - "grad_norm": 1.9132648706436157, - "learning_rate": 5e-05, - "epoch": 0.18729589549849524, - "step": 2925 - }, - { - "loss": 2.358, - "grad_norm": 1.9785979986190796, - "learning_rate": 5e-05, - "epoch": 0.1876160594224243, - "step": 2930 - }, - { - "loss": 2.3736, - "grad_norm": 1.9504092931747437, - "learning_rate": 5e-05, - "epoch": 0.18793622334635332, - "step": 2935 - }, - { - "loss": 2.3665, - "grad_norm": 1.8407647609710693, - "learning_rate": 5e-05, - "epoch": 0.18825638727028238, - "step": 2940 - }, - { - "loss": 2.3351, - "grad_norm": 2.004072666168213, - "learning_rate": 5e-05, - "epoch": 0.18857655119421143, - "step": 2945 - }, - { - "loss": 2.3649, - "grad_norm": 1.8799525499343872, - "learning_rate": 5e-05, - "epoch": 0.1888967151181405, - "step": 2950 - }, - { - "loss": 2.351, - "grad_norm": 2.0074257850646973, - "learning_rate": 5e-05, - "epoch": 0.18921687904206955, - "step": 2955 - }, - { - "loss": 2.3236, - "grad_norm": 1.8301315307617188, - "learning_rate": 5e-05, - "epoch": 0.1895370429659986, - "step": 2960 - }, - { - "loss": 2.3545, - "grad_norm": 1.9088454246520996, - "learning_rate": 5e-05, - "epoch": 0.18985720688992763, - "step": 2965 - }, - { - "loss": 2.3666, - "grad_norm": 1.9445098638534546, - "learning_rate": 5e-05, - "epoch": 0.1901773708138567, - "step": 2970 - }, - { - "loss": 2.3593, - "grad_norm": 1.8038558959960938, - "learning_rate": 5e-05, - "epoch": 0.19049753473778575, - "step": 2975 - }, - { - "loss": 2.3387, - "grad_norm": 1.8952257633209229, - "learning_rate": 5e-05, - "epoch": 0.1908176986617148, - "step": 2980 - }, - { - "loss": 2.3742, - "grad_norm": 1.8767812252044678, - "learning_rate": 5e-05, - "epoch": 0.19113786258564386, - "step": 2985 - }, - { - "loss": 2.3786, - "grad_norm": 1.771638035774231, - "learning_rate": 5e-05, - "epoch": 0.19145802650957292, - "step": 2990 - }, - { - "loss": 2.3528, - "grad_norm": 1.9153273105621338, - "learning_rate": 5e-05, - "epoch": 0.19177819043350194, - "step": 2995 - }, - { - "loss": 2.3781, - "grad_norm": 1.9172663688659668, - "learning_rate": 5e-05, - "epoch": 0.192098354357431, - "step": 3000 - }, - { - "eval_loss": 2.209613800048828, - "eval_runtime": 12.6666, - "eval_samples_per_second": 161.685, - "eval_steps_per_second": 20.211, - "epoch": 0.192098354357431, - "step": 3000 - }, - { - "loss": 2.3762, - "grad_norm": 1.8722891807556152, - "learning_rate": 5e-05, - "epoch": 0.19241851828136006, - "step": 3005 - }, - { - "loss": 2.3861, - "grad_norm": 1.907089352607727, - "learning_rate": 5e-05, - "epoch": 0.1927386822052891, - "step": 3010 - }, - { - "loss": 2.3477, - "grad_norm": 1.982266902923584, - "learning_rate": 5e-05, - "epoch": 0.19305884612921817, - "step": 3015 - }, - { - "loss": 2.3555, - "grad_norm": 1.9882186651229858, - "learning_rate": 5e-05, - "epoch": 0.1933790100531472, - "step": 3020 - }, - { - "loss": 2.327, - "grad_norm": 1.8406589031219482, - "learning_rate": 5e-05, - "epoch": 0.19369917397707626, - "step": 3025 - }, - { - "loss": 2.3411, - "grad_norm": 1.9217675924301147, - "learning_rate": 5e-05, - "epoch": 0.1940193379010053, - "step": 3030 - }, - { - "loss": 2.3554, - "grad_norm": 1.9655086994171143, - "learning_rate": 5e-05, - "epoch": 0.19433950182493437, - "step": 3035 - }, - { - "loss": 2.373, - "grad_norm": 2.0582704544067383, - "learning_rate": 5e-05, - "epoch": 0.19465966574886343, - "step": 3040 - }, - { - "loss": 2.3424, - "grad_norm": 1.8587092161178589, - "learning_rate": 5e-05, - "epoch": 0.19497982967279248, - "step": 3045 - }, - { - "loss": 2.3319, - "grad_norm": 1.8777543306350708, - "learning_rate": 5e-05, - "epoch": 0.1952999935967215, - "step": 3050 - }, - { - "loss": 2.3365, - "grad_norm": 1.9621491432189941, - "learning_rate": 5e-05, - "epoch": 0.19562015752065057, - "step": 3055 - }, - { - "loss": 2.3459, - "grad_norm": 1.8719767332077026, - "learning_rate": 5e-05, - "epoch": 0.19594032144457962, - "step": 3060 - }, - { - "loss": 2.3487, - "grad_norm": 1.970376968383789, - "learning_rate": 5e-05, - "epoch": 0.19626048536850868, - "step": 3065 - }, - { - "loss": 2.3692, - "grad_norm": 1.8878765106201172, - "learning_rate": 5e-05, - "epoch": 0.19658064929243774, - "step": 3070 - }, - { - "loss": 2.3335, - "grad_norm": 1.9047449827194214, - "learning_rate": 5e-05, - "epoch": 0.1969008132163668, - "step": 3075 - }, - { - "loss": 2.3679, - "grad_norm": 1.8994492292404175, - "learning_rate": 5e-05, - "epoch": 0.19722097714029582, - "step": 3080 - }, - { - "loss": 2.3786, - "grad_norm": 1.8764537572860718, - "learning_rate": 5e-05, - "epoch": 0.19754114106422488, - "step": 3085 - }, - { - "loss": 2.3656, - "grad_norm": 1.937950849533081, - "learning_rate": 5e-05, - "epoch": 0.19786130498815394, - "step": 3090 - }, - { - "loss": 2.3501, - "grad_norm": 1.9067658185958862, - "learning_rate": 5e-05, - "epoch": 0.198181468912083, - "step": 3095 - }, - { - "loss": 2.3382, - "grad_norm": 1.9059247970581055, - "learning_rate": 5e-05, - "epoch": 0.19850163283601205, - "step": 3100 - }, - { - "loss": 2.3374, - "grad_norm": 1.9067003726959229, - "learning_rate": 5e-05, - "epoch": 0.19882179675994108, - "step": 3105 - }, - { - "loss": 2.3787, - "grad_norm": 1.860835075378418, - "learning_rate": 5e-05, - "epoch": 0.19914196068387013, - "step": 3110 - }, - { - "loss": 2.3622, - "grad_norm": 1.9090162515640259, - "learning_rate": 5e-05, - "epoch": 0.1994621246077992, - "step": 3115 - }, - { - "loss": 2.3465, - "grad_norm": 2.014218330383301, - "learning_rate": 5e-05, - "epoch": 0.19978228853172825, - "step": 3120 - }, - { - "loss": 2.3767, - "grad_norm": 1.9965459108352661, - "learning_rate": 5e-05, - "epoch": 0.2001024524556573, - "step": 3125 - }, - { - "loss": 2.3674, - "grad_norm": 1.9212764501571655, - "learning_rate": 5e-05, - "epoch": 0.20042261637958636, - "step": 3130 - }, - { - "loss": 2.3373, - "grad_norm": 1.9215936660766602, - "learning_rate": 5e-05, - "epoch": 0.2007427803035154, - "step": 3135 - }, - { - "loss": 2.3155, - "grad_norm": 1.9446462392807007, - "learning_rate": 5e-05, - "epoch": 0.20106294422744445, - "step": 3140 - }, - { - "loss": 2.3644, - "grad_norm": 1.8948733806610107, - "learning_rate": 5e-05, - "epoch": 0.2013831081513735, - "step": 3145 - }, - { - "loss": 2.3357, - "grad_norm": 1.919724702835083, - "learning_rate": 5e-05, - "epoch": 0.20170327207530256, - "step": 3150 - }, - { - "loss": 2.3539, - "grad_norm": 1.8508602380752563, - "learning_rate": 5e-05, - "epoch": 0.20202343599923162, - "step": 3155 - }, - { - "loss": 2.3601, - "grad_norm": 2.0129966735839844, - "learning_rate": 5e-05, - "epoch": 0.20234359992316067, - "step": 3160 - }, - { - "loss": 2.3601, - "grad_norm": 1.693015456199646, - "learning_rate": 5e-05, - "epoch": 0.2026637638470897, - "step": 3165 - }, - { - "loss": 2.3558, - "grad_norm": 1.956392526626587, - "learning_rate": 5e-05, - "epoch": 0.20298392777101876, - "step": 3170 - }, - { - "loss": 2.338, - "grad_norm": 1.818519949913025, - "learning_rate": 5e-05, - "epoch": 0.2033040916949478, - "step": 3175 - }, - { - "loss": 2.3717, - "grad_norm": 1.8295650482177734, - "learning_rate": 5e-05, - "epoch": 0.20362425561887687, - "step": 3180 - }, - { - "loss": 2.3375, - "grad_norm": 2.044961929321289, - "learning_rate": 5e-05, - "epoch": 0.20394441954280593, - "step": 3185 - }, - { - "loss": 2.3358, - "grad_norm": 2.061805248260498, - "learning_rate": 5e-05, - "epoch": 0.20426458346673496, - "step": 3190 - }, - { - "loss": 2.3462, - "grad_norm": 1.870125651359558, - "learning_rate": 5e-05, - "epoch": 0.204584747390664, - "step": 3195 - }, - { - "loss": 2.3482, - "grad_norm": 1.9724948406219482, - "learning_rate": 5e-05, - "epoch": 0.20490491131459307, - "step": 3200 - }, - { - "eval_loss": 2.2023611068725586, - "eval_runtime": 12.9284, - "eval_samples_per_second": 158.411, - "eval_steps_per_second": 19.801, - "epoch": 0.20490491131459307, - "step": 3200 - }, - { - "loss": 2.3575, - "grad_norm": 1.840522289276123, - "learning_rate": 5e-05, - "epoch": 0.20522507523852213, - "step": 3205 - }, - { - "loss": 2.35, - "grad_norm": 1.815750241279602, - "learning_rate": 5e-05, - "epoch": 0.20554523916245118, - "step": 3210 - }, - { - "loss": 2.3852, - "grad_norm": 1.8655439615249634, - "learning_rate": 5e-05, - "epoch": 0.20586540308638024, - "step": 3215 - }, - { - "loss": 2.3345, - "grad_norm": 1.8097730875015259, - "learning_rate": 5e-05, - "epoch": 0.20618556701030927, - "step": 3220 - }, - { - "loss": 2.3516, - "grad_norm": 1.7540837526321411, - "learning_rate": 5e-05, - "epoch": 0.20650573093423832, - "step": 3225 - }, - { - "loss": 2.3371, - "grad_norm": 1.8675291538238525, - "learning_rate": 5e-05, - "epoch": 0.20682589485816738, - "step": 3230 - }, - { - "loss": 2.352, - "grad_norm": 2.067999839782715, - "learning_rate": 5e-05, - "epoch": 0.20714605878209644, - "step": 3235 - }, - { - "loss": 2.3439, - "grad_norm": 1.8403364419937134, - "learning_rate": 5e-05, - "epoch": 0.2074662227060255, - "step": 3240 - }, - { - "loss": 2.3299, - "grad_norm": 1.8896580934524536, - "learning_rate": 5e-05, - "epoch": 0.20778638662995455, - "step": 3245 - }, - { - "loss": 2.3492, - "grad_norm": 1.941513180732727, - "learning_rate": 5e-05, - "epoch": 0.20810655055388358, - "step": 3250 - }, - { - "loss": 2.3744, - "grad_norm": 1.916695237159729, - "learning_rate": 5e-05, - "epoch": 0.20842671447781264, - "step": 3255 - }, - { - "loss": 2.3438, - "grad_norm": 1.907235026359558, - "learning_rate": 5e-05, - "epoch": 0.2087468784017417, - "step": 3260 - }, - { - "loss": 2.3503, - "grad_norm": 1.864814281463623, - "learning_rate": 5e-05, - "epoch": 0.20906704232567075, - "step": 3265 - }, - { - "loss": 2.3345, - "grad_norm": 1.8227028846740723, - "learning_rate": 5e-05, - "epoch": 0.2093872062495998, - "step": 3270 - }, - { - "loss": 2.3648, - "grad_norm": 1.9340879917144775, - "learning_rate": 5e-05, - "epoch": 0.20970737017352883, - "step": 3275 - }, - { - "loss": 2.3289, - "grad_norm": 1.8352042436599731, - "learning_rate": 5e-05, - "epoch": 0.2100275340974579, - "step": 3280 - }, - { - "loss": 2.3582, - "grad_norm": 1.7997504472732544, - "learning_rate": 5e-05, - "epoch": 0.21034769802138695, - "step": 3285 - }, - { - "loss": 2.3383, - "grad_norm": 1.8354027271270752, - "learning_rate": 5e-05, - "epoch": 0.210667861945316, - "step": 3290 - }, - { - "loss": 2.3495, - "grad_norm": 1.9071873426437378, - "learning_rate": 5e-05, - "epoch": 0.21098802586924506, - "step": 3295 - }, - { - "loss": 2.3667, - "grad_norm": 1.8804770708084106, - "learning_rate": 5e-05, - "epoch": 0.21130818979317412, - "step": 3300 - }, - { - "loss": 2.348, - "grad_norm": 1.8979647159576416, - "learning_rate": 5e-05, - "epoch": 0.21162835371710315, - "step": 3305 - }, - { - "loss": 2.3424, - "grad_norm": 1.9298757314682007, - "learning_rate": 5e-05, - "epoch": 0.2119485176410322, - "step": 3310 - }, - { - "loss": 2.3626, - "grad_norm": 2.027535915374756, - "learning_rate": 5e-05, - "epoch": 0.21226868156496126, - "step": 3315 - }, - { - "loss": 2.3596, - "grad_norm": 1.896079421043396, - "learning_rate": 5e-05, - "epoch": 0.21258884548889032, - "step": 3320 - }, - { - "loss": 2.3383, - "grad_norm": 1.798487901687622, - "learning_rate": 5e-05, - "epoch": 0.21290900941281937, - "step": 3325 - }, - { - "loss": 2.3482, - "grad_norm": 1.9177759885787964, - "learning_rate": 5e-05, - "epoch": 0.21322917333674843, - "step": 3330 - }, - { - "loss": 2.3124, - "grad_norm": 1.9621219635009766, - "learning_rate": 5e-05, - "epoch": 0.21354933726067746, - "step": 3335 - }, - { - "loss": 2.3662, - "grad_norm": 1.9927774667739868, - "learning_rate": 5e-05, - "epoch": 0.21386950118460651, - "step": 3340 - }, - { - "loss": 2.3203, - "grad_norm": 1.8306477069854736, - "learning_rate": 5e-05, - "epoch": 0.21418966510853557, - "step": 3345 - }, - { - "loss": 2.3709, - "grad_norm": 1.922379732131958, - "learning_rate": 5e-05, - "epoch": 0.21450982903246463, - "step": 3350 - }, - { - "loss": 2.3456, - "grad_norm": 1.8414316177368164, - "learning_rate": 5e-05, - "epoch": 0.21482999295639368, - "step": 3355 - }, - { - "loss": 2.331, - "grad_norm": 1.847821831703186, - "learning_rate": 5e-05, - "epoch": 0.2151501568803227, - "step": 3360 - }, - { - "loss": 2.3428, - "grad_norm": 1.8807631731033325, - "learning_rate": 5e-05, - "epoch": 0.21547032080425177, - "step": 3365 - }, - { - "loss": 2.335, - "grad_norm": 1.9684065580368042, - "learning_rate": 5e-05, - "epoch": 0.21579048472818083, - "step": 3370 - }, - { - "loss": 2.3302, - "grad_norm": 1.9757294654846191, - "learning_rate": 5e-05, - "epoch": 0.21611064865210988, - "step": 3375 - }, - { - "loss": 2.3542, - "grad_norm": 2.087277412414551, - "learning_rate": 5e-05, - "epoch": 0.21643081257603894, - "step": 3380 - }, - { - "loss": 2.3651, - "grad_norm": 2.1057560443878174, - "learning_rate": 5e-05, - "epoch": 0.216750976499968, - "step": 3385 - }, - { - "loss": 2.3234, - "grad_norm": 1.805690884590149, - "learning_rate": 5e-05, - "epoch": 0.21707114042389702, - "step": 3390 - }, - { - "loss": 2.3306, - "grad_norm": 1.82710862159729, - "learning_rate": 5e-05, - "epoch": 0.21739130434782608, - "step": 3395 - }, - { - "loss": 2.3316, - "grad_norm": 1.8435773849487305, - "learning_rate": 5e-05, - "epoch": 0.21771146827175514, - "step": 3400 - }, - { - "eval_loss": 2.198260545730591, - "eval_runtime": 12.4233, - "eval_samples_per_second": 164.852, - "eval_steps_per_second": 20.606, - "epoch": 0.21771146827175514, - "step": 3400 - }, - { - "loss": 2.3341, - "grad_norm": 1.901654839515686, - "learning_rate": 5e-05, - "epoch": 0.2180316321956842, - "step": 3405 - }, - { - "loss": 2.3537, - "grad_norm": 1.9241116046905518, - "learning_rate": 5e-05, - "epoch": 0.21835179611961325, - "step": 3410 - }, - { - "loss": 2.3616, - "grad_norm": 1.9076368808746338, - "learning_rate": 5e-05, - "epoch": 0.2186719600435423, - "step": 3415 - }, - { - "loss": 2.3516, - "grad_norm": 1.940434217453003, - "learning_rate": 5e-05, - "epoch": 0.21899212396747134, - "step": 3420 - }, - { - "loss": 2.3462, - "grad_norm": 1.9202423095703125, - "learning_rate": 5e-05, - "epoch": 0.2193122878914004, - "step": 3425 - }, - { - "loss": 2.3302, - "grad_norm": 1.8307172060012817, - "learning_rate": 5e-05, - "epoch": 0.21963245181532945, - "step": 3430 - }, - { - "loss": 2.3675, - "grad_norm": 1.825799584388733, - "learning_rate": 5e-05, - "epoch": 0.2199526157392585, - "step": 3435 - }, - { - "loss": 2.3283, - "grad_norm": 1.83090341091156, - "learning_rate": 5e-05, - "epoch": 0.22027277966318756, - "step": 3440 - }, - { - "loss": 2.3698, - "grad_norm": 1.8049718141555786, - "learning_rate": 5e-05, - "epoch": 0.2205929435871166, - "step": 3445 - }, - { - "loss": 2.3352, - "grad_norm": 2.047487735748291, - "learning_rate": 5e-05, - "epoch": 0.22091310751104565, - "step": 3450 - }, - { - "loss": 2.3209, - "grad_norm": 1.8851560354232788, - "learning_rate": 5e-05, - "epoch": 0.2212332714349747, - "step": 3455 - }, - { - "loss": 2.3278, - "grad_norm": 1.781719446182251, - "learning_rate": 5e-05, - "epoch": 0.22155343535890376, - "step": 3460 - }, - { - "loss": 2.3379, - "grad_norm": 1.822160243988037, - "learning_rate": 5e-05, - "epoch": 0.22187359928283282, - "step": 3465 - }, - { - "loss": 2.3341, - "grad_norm": 1.7682366371154785, - "learning_rate": 5e-05, - "epoch": 0.22219376320676187, - "step": 3470 - }, - { - "loss": 2.3617, - "grad_norm": 2.0239579677581787, - "learning_rate": 5e-05, - "epoch": 0.2225139271306909, - "step": 3475 - }, - { - "loss": 2.3376, - "grad_norm": 1.9589248895645142, - "learning_rate": 5e-05, - "epoch": 0.22283409105461996, - "step": 3480 - }, - { - "loss": 2.325, - "grad_norm": 1.8296499252319336, - "learning_rate": 5e-05, - "epoch": 0.22315425497854902, - "step": 3485 - }, - { - "loss": 2.3318, - "grad_norm": 1.7005376815795898, - "learning_rate": 5e-05, - "epoch": 0.22347441890247807, - "step": 3490 - }, - { - "loss": 2.3311, - "grad_norm": 1.8033257722854614, - "learning_rate": 5e-05, - "epoch": 0.22379458282640713, - "step": 3495 - }, - { - "loss": 2.3697, - "grad_norm": 1.813000202178955, - "learning_rate": 5e-05, - "epoch": 0.22411474675033619, - "step": 3500 - }, - { - "loss": 2.3146, - "grad_norm": 1.8140153884887695, - "learning_rate": 5e-05, - "epoch": 0.22443491067426521, - "step": 3505 - }, - { - "loss": 2.3511, - "grad_norm": 1.8404661417007446, - "learning_rate": 5e-05, - "epoch": 0.22475507459819427, - "step": 3510 - }, - { - "loss": 2.3413, - "grad_norm": 1.7392330169677734, - "learning_rate": 5e-05, - "epoch": 0.22507523852212333, - "step": 3515 - }, - { - "loss": 2.382, - "grad_norm": 1.833692193031311, - "learning_rate": 5e-05, - "epoch": 0.22539540244605238, - "step": 3520 - }, - { - "loss": 2.3425, - "grad_norm": 1.841347575187683, - "learning_rate": 5e-05, - "epoch": 0.22571556636998144, - "step": 3525 - }, - { - "loss": 2.3298, - "grad_norm": 1.789963960647583, - "learning_rate": 5e-05, - "epoch": 0.22603573029391047, - "step": 3530 - }, - { - "loss": 2.3307, - "grad_norm": 1.7983555793762207, - "learning_rate": 5e-05, - "epoch": 0.22635589421783953, - "step": 3535 - }, - { - "loss": 2.3397, - "grad_norm": 2.009568452835083, - "learning_rate": 5e-05, - "epoch": 0.22667605814176858, - "step": 3540 - }, - { - "loss": 2.314, - "grad_norm": 1.8560316562652588, - "learning_rate": 5e-05, - "epoch": 0.22699622206569764, - "step": 3545 - }, - { - "loss": 2.3296, - "grad_norm": 1.8345584869384766, - "learning_rate": 5e-05, - "epoch": 0.2273163859896267, - "step": 3550 - }, - { - "loss": 2.3488, - "grad_norm": 1.8176889419555664, - "learning_rate": 5e-05, - "epoch": 0.22763654991355575, - "step": 3555 - }, - { - "loss": 2.3174, - "grad_norm": 1.8083289861679077, - "learning_rate": 5e-05, - "epoch": 0.22795671383748478, - "step": 3560 - }, - { - "loss": 2.3544, - "grad_norm": 1.8767235279083252, - "learning_rate": 5e-05, - "epoch": 0.22827687776141384, - "step": 3565 - }, - { - "loss": 2.3371, - "grad_norm": 1.7118626832962036, - "learning_rate": 5e-05, - "epoch": 0.2285970416853429, - "step": 3570 - }, - { - "loss": 2.3282, - "grad_norm": 1.8422014713287354, - "learning_rate": 5e-05, - "epoch": 0.22891720560927195, - "step": 3575 - }, - { - "loss": 2.3305, - "grad_norm": 1.787657380104065, - "learning_rate": 5e-05, - "epoch": 0.229237369533201, - "step": 3580 - }, - { - "loss": 2.3791, - "grad_norm": 1.8549391031265259, - "learning_rate": 5e-05, - "epoch": 0.22955753345713006, - "step": 3585 - }, - { - "loss": 2.3312, - "grad_norm": 1.923996090888977, - "learning_rate": 5e-05, - "epoch": 0.2298776973810591, - "step": 3590 - }, - { - "loss": 2.3529, - "grad_norm": 1.9095416069030762, - "learning_rate": 5e-05, - "epoch": 0.23019786130498815, - "step": 3595 - }, - { - "loss": 2.3178, - "grad_norm": 1.8591622114181519, - "learning_rate": 5e-05, - "epoch": 0.2305180252289172, - "step": 3600 - }, - { - "eval_loss": 2.183711528778076, - "eval_runtime": 9.2433, - "eval_samples_per_second": 221.566, - "eval_steps_per_second": 27.696, - "epoch": 0.2305180252289172, - "step": 3600 - }, - { - "loss": 2.3165, - "grad_norm": 1.8757052421569824, - "learning_rate": 5e-05, - "epoch": 0.23083818915284626, - "step": 3605 - }, - { - "loss": 2.3444, - "grad_norm": 1.7773499488830566, - "learning_rate": 5e-05, - "epoch": 0.23115835307677532, - "step": 3610 - }, - { - "loss": 2.3176, - "grad_norm": 1.9299156665802002, - "learning_rate": 5e-05, - "epoch": 0.23147851700070435, - "step": 3615 - }, - { - "loss": 2.3268, - "grad_norm": 1.8555759191513062, - "learning_rate": 5e-05, - "epoch": 0.2317986809246334, - "step": 3620 - }, - { - "loss": 2.3094, - "grad_norm": 1.8463343381881714, - "learning_rate": 5e-05, - "epoch": 0.23211884484856246, - "step": 3625 - }, - { - "loss": 2.3227, - "grad_norm": 1.9021217823028564, - "learning_rate": 5e-05, - "epoch": 0.23243900877249152, - "step": 3630 - }, - { - "loss": 2.33, - "grad_norm": 1.8267807960510254, - "learning_rate": 5e-05, - "epoch": 0.23275917269642057, - "step": 3635 - }, - { - "loss": 2.3597, - "grad_norm": 1.7418160438537598, - "learning_rate": 5e-05, - "epoch": 0.23307933662034963, - "step": 3640 - }, - { - "loss": 2.3096, - "grad_norm": 1.9065451622009277, - "learning_rate": 5e-05, - "epoch": 0.23339950054427866, - "step": 3645 - }, - { - "loss": 2.3189, - "grad_norm": 1.8539282083511353, - "learning_rate": 5e-05, - "epoch": 0.23371966446820772, - "step": 3650 - }, - { - "loss": 2.3683, - "grad_norm": 1.8925061225891113, - "learning_rate": 5e-05, - "epoch": 0.23403982839213677, - "step": 3655 - }, - { - "loss": 2.3056, - "grad_norm": 1.8763203620910645, - "learning_rate": 5e-05, - "epoch": 0.23435999231606583, - "step": 3660 - }, - { - "loss": 2.3628, - "grad_norm": 1.9830697774887085, - "learning_rate": 5e-05, - "epoch": 0.23468015623999489, - "step": 3665 - }, - { - "loss": 2.3377, - "grad_norm": 1.957559585571289, - "learning_rate": 5e-05, - "epoch": 0.23500032016392394, - "step": 3670 - }, - { - "loss": 2.3105, - "grad_norm": 1.8156100511550903, - "learning_rate": 5e-05, - "epoch": 0.23532048408785297, - "step": 3675 - }, - { - "loss": 2.3339, - "grad_norm": 1.8064128160476685, - "learning_rate": 5e-05, - "epoch": 0.23564064801178203, - "step": 3680 - }, - { - "loss": 2.3292, - "grad_norm": 1.7643941640853882, - "learning_rate": 5e-05, - "epoch": 0.23596081193571108, - "step": 3685 - }, - { - "loss": 2.3173, - "grad_norm": 1.9001303911209106, - "learning_rate": 5e-05, - "epoch": 0.23628097585964014, - "step": 3690 - }, - { - "loss": 2.319, - "grad_norm": 1.7739763259887695, - "learning_rate": 5e-05, - "epoch": 0.2366011397835692, - "step": 3695 - }, - { - "loss": 2.3218, - "grad_norm": 1.7484267950057983, - "learning_rate": 5e-05, - "epoch": 0.23692130370749823, - "step": 3700 - }, - { - "loss": 2.3221, - "grad_norm": 1.8267314434051514, - "learning_rate": 5e-05, - "epoch": 0.23724146763142728, - "step": 3705 - }, - { - "loss": 2.3376, - "grad_norm": 1.9269976615905762, - "learning_rate": 5e-05, - "epoch": 0.23756163155535634, - "step": 3710 - }, - { - "loss": 2.3206, - "grad_norm": 1.820557951927185, - "learning_rate": 5e-05, - "epoch": 0.2378817954792854, - "step": 3715 - }, - { - "loss": 2.3379, - "grad_norm": 1.788172960281372, - "learning_rate": 5e-05, - "epoch": 0.23820195940321445, - "step": 3720 - }, - { - "loss": 2.339, - "grad_norm": 1.866925835609436, - "learning_rate": 5e-05, - "epoch": 0.2385221233271435, - "step": 3725 - }, - { - "loss": 2.3227, - "grad_norm": 1.9489960670471191, - "learning_rate": 5e-05, - "epoch": 0.23884228725107254, - "step": 3730 - }, - { - "loss": 2.3115, - "grad_norm": 1.8640096187591553, - "learning_rate": 5e-05, - "epoch": 0.2391624511750016, - "step": 3735 - }, - { - "loss": 2.3011, - "grad_norm": 1.8240951299667358, - "learning_rate": 5e-05, - "epoch": 0.23948261509893065, - "step": 3740 - }, - { - "loss": 2.3257, - "grad_norm": 1.8693873882293701, - "learning_rate": 5e-05, - "epoch": 0.2398027790228597, - "step": 3745 - }, - { - "loss": 2.3027, - "grad_norm": 1.879884958267212, - "learning_rate": 5e-05, - "epoch": 0.24012294294678876, - "step": 3750 - }, - { - "loss": 2.3408, - "grad_norm": 1.8470027446746826, - "learning_rate": 5e-05, - "epoch": 0.24044310687071782, - "step": 3755 - }, - { - "loss": 2.2935, - "grad_norm": 1.8507801294326782, - "learning_rate": 5e-05, - "epoch": 0.24076327079464685, - "step": 3760 - }, - { - "loss": 2.3283, - "grad_norm": 1.8579989671707153, - "learning_rate": 5e-05, - "epoch": 0.2410834347185759, - "step": 3765 - }, - { - "loss": 2.3095, - "grad_norm": 1.8650803565979004, - "learning_rate": 5e-05, - "epoch": 0.24140359864250496, - "step": 3770 - }, - { - "loss": 2.3032, - "grad_norm": 1.8193062543869019, - "learning_rate": 5e-05, - "epoch": 0.24172376256643402, - "step": 3775 - }, - { - "loss": 2.3259, - "grad_norm": 2.0027434825897217, - "learning_rate": 5e-05, - "epoch": 0.24204392649036308, - "step": 3780 - }, - { - "loss": 2.3105, - "grad_norm": 1.9422210454940796, - "learning_rate": 5e-05, - "epoch": 0.2423640904142921, - "step": 3785 - }, - { - "loss": 2.3438, - "grad_norm": 1.9346174001693726, - "learning_rate": 5e-05, - "epoch": 0.24268425433822116, - "step": 3790 - }, - { - "loss": 2.317, - "grad_norm": 1.8896723985671997, - "learning_rate": 5e-05, - "epoch": 0.24300441826215022, - "step": 3795 - }, - { - "loss": 2.3141, - "grad_norm": 1.8078259229660034, - "learning_rate": 5e-05, - "epoch": 0.24332458218607927, - "step": 3800 - }, - { - "eval_loss": 2.184438705444336, - "eval_runtime": 10.614, - "eval_samples_per_second": 192.952, - "eval_steps_per_second": 24.119, - "epoch": 0.24332458218607927, - "step": 3800 - }, - { - "loss": 2.3237, - "grad_norm": 1.9326097965240479, - "learning_rate": 5e-05, - "epoch": 0.24364474611000833, - "step": 3805 - }, - { - "loss": 2.3287, - "grad_norm": 1.8264923095703125, - "learning_rate": 5e-05, - "epoch": 0.2439649100339374, - "step": 3810 - }, - { - "loss": 2.3117, - "grad_norm": 1.9702720642089844, - "learning_rate": 5e-05, - "epoch": 0.24428507395786642, - "step": 3815 - }, - { - "loss": 2.306, - "grad_norm": 1.810170292854309, - "learning_rate": 5e-05, - "epoch": 0.24460523788179547, - "step": 3820 - }, - { - "loss": 2.3408, - "grad_norm": 1.8189213275909424, - "learning_rate": 5e-05, - "epoch": 0.24492540180572453, - "step": 3825 - }, - { - "loss": 2.3131, - "grad_norm": 1.8908005952835083, - "learning_rate": 5e-05, - "epoch": 0.2452455657296536, - "step": 3830 - }, - { - "loss": 2.339, - "grad_norm": 1.8011490106582642, - "learning_rate": 5e-05, - "epoch": 0.24556572965358264, - "step": 3835 - }, - { - "loss": 2.3175, - "grad_norm": 1.7648205757141113, - "learning_rate": 5e-05, - "epoch": 0.2458858935775117, - "step": 3840 - }, - { - "loss": 2.325, - "grad_norm": 1.8377348184585571, - "learning_rate": 5e-05, - "epoch": 0.24620605750144073, - "step": 3845 - }, - { - "loss": 2.2859, - "grad_norm": 1.8196038007736206, - "learning_rate": 5e-05, - "epoch": 0.24652622142536978, - "step": 3850 - }, - { - "loss": 2.331, - "grad_norm": 1.7730222940444946, - "learning_rate": 5e-05, - "epoch": 0.24684638534929884, - "step": 3855 - }, - { - "loss": 2.3507, - "grad_norm": 1.8816814422607422, - "learning_rate": 5e-05, - "epoch": 0.2471665492732279, - "step": 3860 - }, - { - "loss": 2.3348, - "grad_norm": 1.842856526374817, - "learning_rate": 5e-05, - "epoch": 0.24748671319715695, - "step": 3865 - }, - { - "loss": 2.3169, - "grad_norm": 1.792005181312561, - "learning_rate": 5e-05, - "epoch": 0.24780687712108598, - "step": 3870 - }, - { - "loss": 2.3106, - "grad_norm": 1.7900562286376953, - "learning_rate": 5e-05, - "epoch": 0.24812704104501504, - "step": 3875 - }, - { - "loss": 2.3323, - "grad_norm": 1.7834545373916626, - "learning_rate": 5e-05, - "epoch": 0.2484472049689441, - "step": 3880 - }, - { - "loss": 2.3434, - "grad_norm": 1.8184958696365356, - "learning_rate": 5e-05, - "epoch": 0.24876736889287315, - "step": 3885 - }, - { - "loss": 2.3054, - "grad_norm": 1.8056672811508179, - "learning_rate": 5e-05, - "epoch": 0.2490875328168022, - "step": 3890 - }, - { - "loss": 2.3238, - "grad_norm": 1.8434021472930908, - "learning_rate": 5e-05, - "epoch": 0.24940769674073127, - "step": 3895 - }, - { - "loss": 2.3059, - "grad_norm": 1.7343634366989136, - "learning_rate": 5e-05, - "epoch": 0.2497278606646603, - "step": 3900 - }, - { - "loss": 2.325, - "grad_norm": 1.7279424667358398, - "learning_rate": 5e-05, - "epoch": 0.2500480245885894, - "step": 3905 - }, - { - "loss": 2.323, - "grad_norm": 1.7715774774551392, - "learning_rate": 5e-05, - "epoch": 0.2503681885125184, - "step": 3910 - }, - { - "loss": 2.2872, - "grad_norm": 1.7786765098571777, - "learning_rate": 5e-05, - "epoch": 0.25068835243644744, - "step": 3915 - }, - { - "loss": 2.3408, - "grad_norm": 1.8894507884979248, - "learning_rate": 5e-05, - "epoch": 0.2510085163603765, - "step": 3920 - }, - { - "loss": 2.3193, - "grad_norm": 1.8058632612228394, - "learning_rate": 5e-05, - "epoch": 0.25132868028430555, - "step": 3925 - }, - { - "loss": 2.3168, - "grad_norm": 1.7818254232406616, - "learning_rate": 5e-05, - "epoch": 0.25164884420823463, - "step": 3930 - }, - { - "loss": 2.3393, - "grad_norm": 1.7840033769607544, - "learning_rate": 5e-05, - "epoch": 0.25196900813216366, - "step": 3935 - }, - { - "loss": 2.3405, - "grad_norm": 1.8498218059539795, - "learning_rate": 5e-05, - "epoch": 0.2522891720560927, - "step": 3940 - }, - { - "loss": 2.3186, - "grad_norm": 1.827964425086975, - "learning_rate": 5e-05, - "epoch": 0.2526093359800218, - "step": 3945 - }, - { - "loss": 2.3069, - "grad_norm": 1.8498241901397705, - "learning_rate": 5e-05, - "epoch": 0.2529294999039508, - "step": 3950 - }, - { - "loss": 2.3419, - "grad_norm": 1.7726775407791138, - "learning_rate": 5e-05, - "epoch": 0.2532496638278799, - "step": 3955 - }, - { - "loss": 2.3047, - "grad_norm": 1.9088823795318604, - "learning_rate": 5e-05, - "epoch": 0.2535698277518089, - "step": 3960 - }, - { - "loss": 2.334, - "grad_norm": 1.8803976774215698, - "learning_rate": 5e-05, - "epoch": 0.253889991675738, - "step": 3965 - }, - { - "loss": 2.2902, - "grad_norm": 1.914686679840088, - "learning_rate": 5e-05, - "epoch": 0.25421015559966703, - "step": 3970 - }, - { - "loss": 2.3233, - "grad_norm": 1.8192038536071777, - "learning_rate": 5e-05, - "epoch": 0.25453031952359606, - "step": 3975 - }, - { - "loss": 2.3367, - "grad_norm": 1.8976246118545532, - "learning_rate": 5e-05, - "epoch": 0.25485048344752514, - "step": 3980 - }, - { - "loss": 2.2671, - "grad_norm": 1.7621504068374634, - "learning_rate": 5e-05, - "epoch": 0.2551706473714542, - "step": 3985 - }, - { - "loss": 2.3326, - "grad_norm": 1.912398338317871, - "learning_rate": 5e-05, - "epoch": 0.25549081129538326, - "step": 3990 - }, - { - "loss": 2.3008, - "grad_norm": 2.027517557144165, - "learning_rate": 5e-05, - "epoch": 0.2558109752193123, - "step": 3995 - }, - { - "loss": 2.3146, - "grad_norm": 1.934006690979004, - "learning_rate": 5e-05, - "epoch": 0.2561311391432413, - "step": 4000 - }, - { - "eval_loss": 2.177450180053711, - "eval_runtime": 9.6248, - "eval_samples_per_second": 212.784, - "eval_steps_per_second": 26.598, - "epoch": 0.2561311391432413, - "step": 4000 - }, - { - "loss": 2.2979, - "grad_norm": 1.8951339721679688, - "learning_rate": 5e-05, - "epoch": 0.2564513030671704, - "step": 4005 - }, - { - "loss": 2.3171, - "grad_norm": 1.7967989444732666, - "learning_rate": 5e-05, - "epoch": 0.25677146699109943, - "step": 4010 - }, - { - "loss": 2.3077, - "grad_norm": 1.878688097000122, - "learning_rate": 5e-05, - "epoch": 0.2570916309150285, - "step": 4015 - }, - { - "loss": 2.2935, - "grad_norm": 1.855326771736145, - "learning_rate": 5e-05, - "epoch": 0.25741179483895754, - "step": 4020 - }, - { - "loss": 2.3404, - "grad_norm": 1.797782063484192, - "learning_rate": 5e-05, - "epoch": 0.25773195876288657, - "step": 4025 - }, - { - "loss": 2.2913, - "grad_norm": 1.9316190481185913, - "learning_rate": 5e-05, - "epoch": 0.25805212268681565, - "step": 4030 - }, - { - "loss": 2.306, - "grad_norm": 1.7924227714538574, - "learning_rate": 5e-05, - "epoch": 0.2583722866107447, - "step": 4035 - }, - { - "loss": 2.3185, - "grad_norm": 1.8619980812072754, - "learning_rate": 5e-05, - "epoch": 0.25869245053467377, - "step": 4040 - }, - { - "loss": 2.3279, - "grad_norm": 1.931126594543457, - "learning_rate": 5e-05, - "epoch": 0.2590126144586028, - "step": 4045 - }, - { - "loss": 2.3288, - "grad_norm": 1.8355220556259155, - "learning_rate": 5e-05, - "epoch": 0.2593327783825319, - "step": 4050 - }, - { - "loss": 2.3015, - "grad_norm": 1.8821378946304321, - "learning_rate": 5e-05, - "epoch": 0.2596529423064609, - "step": 4055 - }, - { - "loss": 2.3181, - "grad_norm": 1.929376482963562, - "learning_rate": 5e-05, - "epoch": 0.25997310623038994, - "step": 4060 - }, - { - "loss": 2.3188, - "grad_norm": 1.8176177740097046, - "learning_rate": 5e-05, - "epoch": 0.260293270154319, - "step": 4065 - }, - { - "loss": 2.3164, - "grad_norm": 1.7493705749511719, - "learning_rate": 5e-05, - "epoch": 0.26061343407824805, - "step": 4070 - }, - { - "loss": 2.2848, - "grad_norm": 1.8640022277832031, - "learning_rate": 5e-05, - "epoch": 0.26093359800217714, - "step": 4075 - }, - { - "loss": 2.3192, - "grad_norm": 1.8012224435806274, - "learning_rate": 5e-05, - "epoch": 0.26125376192610616, - "step": 4080 - }, - { - "loss": 2.3191, - "grad_norm": 1.9000436067581177, - "learning_rate": 5e-05, - "epoch": 0.2615739258500352, - "step": 4085 - }, - { - "loss": 2.3032, - "grad_norm": 1.8532963991165161, - "learning_rate": 5e-05, - "epoch": 0.2618940897739643, - "step": 4090 - }, - { - "loss": 2.326, - "grad_norm": 1.8395898342132568, - "learning_rate": 5e-05, - "epoch": 0.2622142536978933, - "step": 4095 - }, - { - "loss": 2.3354, - "grad_norm": 1.7998299598693848, - "learning_rate": 5e-05, - "epoch": 0.2625344176218224, - "step": 4100 - }, - { - "loss": 2.3093, - "grad_norm": 1.8509643077850342, - "learning_rate": 5e-05, - "epoch": 0.2628545815457514, - "step": 4105 - }, - { - "loss": 2.3287, - "grad_norm": 1.9016660451889038, - "learning_rate": 5e-05, - "epoch": 0.26317474546968045, - "step": 4110 - }, - { - "loss": 2.2871, - "grad_norm": 1.8604185581207275, - "learning_rate": 5e-05, - "epoch": 0.26349490939360953, - "step": 4115 - }, - { - "loss": 2.3072, - "grad_norm": 1.842264175415039, - "learning_rate": 5e-05, - "epoch": 0.26381507331753856, - "step": 4120 - }, - { - "loss": 2.3312, - "grad_norm": 1.7420934438705444, - "learning_rate": 5e-05, - "epoch": 0.26413523724146765, - "step": 4125 - }, - { - "loss": 2.3131, - "grad_norm": 1.7676818370819092, - "learning_rate": 5e-05, - "epoch": 0.2644554011653967, - "step": 4130 - }, - { - "loss": 2.3026, - "grad_norm": 1.7885444164276123, - "learning_rate": 5e-05, - "epoch": 0.26477556508932576, - "step": 4135 - }, - { - "loss": 2.3201, - "grad_norm": 1.6721593141555786, - "learning_rate": 5e-05, - "epoch": 0.2650957290132548, - "step": 4140 - }, - { - "loss": 2.3047, - "grad_norm": 1.8331459760665894, - "learning_rate": 5e-05, - "epoch": 0.2654158929371838, - "step": 4145 - }, - { - "loss": 2.3259, - "grad_norm": 1.735121250152588, - "learning_rate": 5e-05, - "epoch": 0.2657360568611129, - "step": 4150 - }, - { - "loss": 2.2825, - "grad_norm": 1.8103950023651123, - "learning_rate": 5e-05, - "epoch": 0.26605622078504193, - "step": 4155 - }, - { - "loss": 2.3094, - "grad_norm": 1.833533525466919, - "learning_rate": 5e-05, - "epoch": 0.266376384708971, - "step": 4160 - }, - { - "loss": 2.3258, - "grad_norm": 1.7850996255874634, - "learning_rate": 5e-05, - "epoch": 0.26669654863290004, - "step": 4165 - }, - { - "loss": 2.314, - "grad_norm": 1.8073853254318237, - "learning_rate": 5e-05, - "epoch": 0.2670167125568291, - "step": 4170 - }, - { - "loss": 2.33, - "grad_norm": 1.7849806547164917, - "learning_rate": 5e-05, - "epoch": 0.26733687648075816, - "step": 4175 - }, - { - "loss": 2.3151, - "grad_norm": 1.7261276245117188, - "learning_rate": 5e-05, - "epoch": 0.2676570404046872, - "step": 4180 - }, - { - "loss": 2.3036, - "grad_norm": 1.763243556022644, - "learning_rate": 5e-05, - "epoch": 0.26797720432861627, - "step": 4185 - }, - { - "loss": 2.3306, - "grad_norm": 1.8771343231201172, - "learning_rate": 5e-05, - "epoch": 0.2682973682525453, - "step": 4190 - }, - { - "loss": 2.3093, - "grad_norm": 1.8824447393417358, - "learning_rate": 5e-05, - "epoch": 0.2686175321764743, - "step": 4195 - }, - { - "loss": 2.3243, - "grad_norm": 1.757983684539795, - "learning_rate": 5e-05, - "epoch": 0.2689376961004034, - "step": 4200 - }, - { - "eval_loss": 2.162459135055542, - "eval_runtime": 12.641, - "eval_samples_per_second": 162.012, - "eval_steps_per_second": 20.252, - "epoch": 0.2689376961004034, - "step": 4200 - }, - { - "loss": 2.3164, - "grad_norm": 1.7963429689407349, - "learning_rate": 5e-05, - "epoch": 0.26925786002433244, - "step": 4205 - }, - { - "loss": 2.3378, - "grad_norm": 1.9342796802520752, - "learning_rate": 5e-05, - "epoch": 0.2695780239482615, - "step": 4210 - }, - { - "loss": 2.3121, - "grad_norm": 1.863183856010437, - "learning_rate": 5e-05, - "epoch": 0.26989818787219055, - "step": 4215 - }, - { - "loss": 2.2946, - "grad_norm": 1.7715400457382202, - "learning_rate": 5e-05, - "epoch": 0.27021835179611964, - "step": 4220 - }, - { - "loss": 2.2986, - "grad_norm": 1.806707739830017, - "learning_rate": 5e-05, - "epoch": 0.27053851572004867, - "step": 4225 - }, - { - "loss": 2.3095, - "grad_norm": 1.8026810884475708, - "learning_rate": 5e-05, - "epoch": 0.2708586796439777, - "step": 4230 - }, - { - "loss": 2.3165, - "grad_norm": 1.7845863103866577, - "learning_rate": 5e-05, - "epoch": 0.2711788435679068, - "step": 4235 - }, - { - "loss": 2.3014, - "grad_norm": 1.864893913269043, - "learning_rate": 5e-05, - "epoch": 0.2714990074918358, - "step": 4240 - }, - { - "loss": 2.3238, - "grad_norm": 1.7305742502212524, - "learning_rate": 5e-05, - "epoch": 0.2718191714157649, - "step": 4245 - }, - { - "loss": 2.291, - "grad_norm": 1.7948355674743652, - "learning_rate": 5e-05, - "epoch": 0.2721393353396939, - "step": 4250 - }, - { - "loss": 2.3144, - "grad_norm": 1.7819257974624634, - "learning_rate": 5e-05, - "epoch": 0.27245949926362295, - "step": 4255 - }, - { - "loss": 2.327, - "grad_norm": 1.8523086309432983, - "learning_rate": 5e-05, - "epoch": 0.27277966318755204, - "step": 4260 - }, - { - "loss": 2.3293, - "grad_norm": 1.7722643613815308, - "learning_rate": 5e-05, - "epoch": 0.27309982711148106, - "step": 4265 - }, - { - "loss": 2.303, - "grad_norm": 1.8291378021240234, - "learning_rate": 5e-05, - "epoch": 0.27341999103541015, - "step": 4270 - }, - { - "loss": 2.2893, - "grad_norm": 1.8277583122253418, - "learning_rate": 5e-05, - "epoch": 0.2737401549593392, - "step": 4275 - }, - { - "loss": 2.3348, - "grad_norm": 1.8024441003799438, - "learning_rate": 5e-05, - "epoch": 0.2740603188832682, - "step": 4280 - }, - { - "loss": 2.3024, - "grad_norm": 1.8651007413864136, - "learning_rate": 5e-05, - "epoch": 0.2743804828071973, - "step": 4285 - }, - { - "loss": 2.327, - "grad_norm": 1.8744381666183472, - "learning_rate": 5e-05, - "epoch": 0.2747006467311263, - "step": 4290 - }, - { - "loss": 2.3225, - "grad_norm": 1.8396573066711426, - "learning_rate": 5e-05, - "epoch": 0.2750208106550554, - "step": 4295 - }, - { - "loss": 2.2918, - "grad_norm": 1.7585549354553223, - "learning_rate": 5e-05, - "epoch": 0.27534097457898443, - "step": 4300 - }, - { - "loss": 2.3135, - "grad_norm": 1.8332717418670654, - "learning_rate": 5e-05, - "epoch": 0.2756611385029135, - "step": 4305 - }, - { - "loss": 2.3176, - "grad_norm": 1.8986752033233643, - "learning_rate": 5e-05, - "epoch": 0.27598130242684255, - "step": 4310 - }, - { - "loss": 2.2758, - "grad_norm": 1.7375805377960205, - "learning_rate": 5e-05, - "epoch": 0.2763014663507716, - "step": 4315 - }, - { - "loss": 2.2786, - "grad_norm": 1.838408350944519, - "learning_rate": 5e-05, - "epoch": 0.27662163027470066, - "step": 4320 - }, - { - "loss": 2.3119, - "grad_norm": 1.9200383424758911, - "learning_rate": 5e-05, - "epoch": 0.2769417941986297, - "step": 4325 - }, - { - "loss": 2.3082, - "grad_norm": 1.8573769330978394, - "learning_rate": 5e-05, - "epoch": 0.27726195812255877, - "step": 4330 - }, - { - "loss": 2.2946, - "grad_norm": 1.819273829460144, - "learning_rate": 5e-05, - "epoch": 0.2775821220464878, - "step": 4335 - }, - { - "loss": 2.28, - "grad_norm": 1.8978952169418335, - "learning_rate": 5e-05, - "epoch": 0.27790228597041683, - "step": 4340 - }, - { - "loss": 2.3058, - "grad_norm": 1.7782193422317505, - "learning_rate": 5e-05, - "epoch": 0.2782224498943459, - "step": 4345 - }, - { - "loss": 2.3189, - "grad_norm": 1.831231951713562, - "learning_rate": 5e-05, - "epoch": 0.27854261381827494, - "step": 4350 - }, - { - "loss": 2.314, - "grad_norm": 1.830064296722412, - "learning_rate": 5e-05, - "epoch": 0.278862777742204, - "step": 4355 - }, - { - "loss": 2.2983, - "grad_norm": 1.8492834568023682, - "learning_rate": 5e-05, - "epoch": 0.27918294166613306, - "step": 4360 - }, - { - "loss": 2.3379, - "grad_norm": 1.841322898864746, - "learning_rate": 5e-05, - "epoch": 0.2795031055900621, - "step": 4365 - }, - { - "loss": 2.3309, - "grad_norm": 1.8109886646270752, - "learning_rate": 5e-05, - "epoch": 0.27982326951399117, - "step": 4370 - }, - { - "loss": 2.2967, - "grad_norm": 1.9388337135314941, - "learning_rate": 5e-05, - "epoch": 0.2801434334379202, - "step": 4375 - }, - { - "loss": 2.3163, - "grad_norm": 1.8596948385238647, - "learning_rate": 5e-05, - "epoch": 0.2804635973618493, - "step": 4380 - }, - { - "loss": 2.2899, - "grad_norm": 1.8299187421798706, - "learning_rate": 5e-05, - "epoch": 0.2807837612857783, - "step": 4385 - }, - { - "loss": 2.3007, - "grad_norm": 1.819220781326294, - "learning_rate": 5e-05, - "epoch": 0.2811039252097074, - "step": 4390 - }, - { - "loss": 2.312, - "grad_norm": 1.8518681526184082, - "learning_rate": 5e-05, - "epoch": 0.2814240891336364, - "step": 4395 - }, - { - "loss": 2.3215, - "grad_norm": 1.8841506242752075, - "learning_rate": 5e-05, - "epoch": 0.28174425305756545, - "step": 4400 - }, - { - "eval_loss": 2.1684622764587402, - "eval_runtime": 9.6418, - "eval_samples_per_second": 212.408, - "eval_steps_per_second": 26.551, - "epoch": 0.28174425305756545, - "step": 4400 - }, - { - "loss": 2.2986, - "grad_norm": 1.71231210231781, - "learning_rate": 5e-05, - "epoch": 0.28206441698149454, - "step": 4405 - }, - { - "loss": 2.2829, - "grad_norm": 1.7305104732513428, - "learning_rate": 5e-05, - "epoch": 0.28238458090542357, - "step": 4410 - }, - { - "loss": 2.2749, - "grad_norm": 1.8442025184631348, - "learning_rate": 5e-05, - "epoch": 0.28270474482935265, - "step": 4415 - }, - { - "loss": 2.3002, - "grad_norm": 1.8370575904846191, - "learning_rate": 5e-05, - "epoch": 0.2830249087532817, - "step": 4420 - }, - { - "loss": 2.2997, - "grad_norm": 1.8042954206466675, - "learning_rate": 5e-05, - "epoch": 0.2833450726772107, - "step": 4425 - }, - { - "loss": 2.3224, - "grad_norm": 1.7841765880584717, - "learning_rate": 5e-05, - "epoch": 0.2836652366011398, - "step": 4430 - }, - { - "loss": 2.3205, - "grad_norm": 1.6575603485107422, - "learning_rate": 5e-05, - "epoch": 0.2839854005250688, - "step": 4435 - }, - { - "loss": 2.301, - "grad_norm": 1.8698128461837769, - "learning_rate": 5e-05, - "epoch": 0.2843055644489979, - "step": 4440 - }, - { - "loss": 2.3192, - "grad_norm": 1.8466641902923584, - "learning_rate": 5e-05, - "epoch": 0.28462572837292693, - "step": 4445 - }, - { - "loss": 2.2704, - "grad_norm": 1.7934186458587646, - "learning_rate": 5e-05, - "epoch": 0.28494589229685596, - "step": 4450 - }, - { - "loss": 2.3225, - "grad_norm": 1.770643949508667, - "learning_rate": 5e-05, - "epoch": 0.28526605622078505, - "step": 4455 - }, - { - "loss": 2.3238, - "grad_norm": 1.7914665937423706, - "learning_rate": 5e-05, - "epoch": 0.2855862201447141, - "step": 4460 - }, - { - "loss": 2.3193, - "grad_norm": 1.7819799184799194, - "learning_rate": 5e-05, - "epoch": 0.28590638406864316, - "step": 4465 - }, - { - "loss": 2.2642, - "grad_norm": 1.7854515314102173, - "learning_rate": 5e-05, - "epoch": 0.2862265479925722, - "step": 4470 - }, - { - "loss": 2.3097, - "grad_norm": 1.703332543373108, - "learning_rate": 5e-05, - "epoch": 0.2865467119165013, - "step": 4475 - }, - { - "loss": 2.3122, - "grad_norm": 1.7654129266738892, - "learning_rate": 5e-05, - "epoch": 0.2868668758404303, - "step": 4480 - }, - { - "loss": 2.3142, - "grad_norm": 1.8920791149139404, - "learning_rate": 5e-05, - "epoch": 0.28718703976435933, - "step": 4485 - }, - { - "loss": 2.3208, - "grad_norm": 1.824573278427124, - "learning_rate": 5e-05, - "epoch": 0.2875072036882884, - "step": 4490 - }, - { - "loss": 2.3027, - "grad_norm": 1.7249481678009033, - "learning_rate": 5e-05, - "epoch": 0.28782736761221744, - "step": 4495 - }, - { - "loss": 2.2931, - "grad_norm": 1.8988478183746338, - "learning_rate": 5e-05, - "epoch": 0.28814753153614653, - "step": 4500 - }, - { - "loss": 2.3057, - "grad_norm": 1.8929831981658936, - "learning_rate": 5e-05, - "epoch": 0.28846769546007556, - "step": 4505 - }, - { - "loss": 2.313, - "grad_norm": 1.8110840320587158, - "learning_rate": 5e-05, - "epoch": 0.2887878593840046, - "step": 4510 - }, - { - "loss": 2.2986, - "grad_norm": 1.7756503820419312, - "learning_rate": 5e-05, - "epoch": 0.28910802330793367, - "step": 4515 - }, - { - "loss": 2.2993, - "grad_norm": 1.8048218488693237, - "learning_rate": 5e-05, - "epoch": 0.2894281872318627, - "step": 4520 - }, - { - "loss": 2.283, - "grad_norm": 1.7407152652740479, - "learning_rate": 5e-05, - "epoch": 0.2897483511557918, - "step": 4525 - }, - { - "loss": 2.3091, - "grad_norm": 1.909650206565857, - "learning_rate": 5e-05, - "epoch": 0.2900685150797208, - "step": 4530 - }, - { - "loss": 2.3102, - "grad_norm": 1.803214430809021, - "learning_rate": 5e-05, - "epoch": 0.29038867900364984, - "step": 4535 - }, - { - "loss": 2.2942, - "grad_norm": 1.9347703456878662, - "learning_rate": 5e-05, - "epoch": 0.2907088429275789, - "step": 4540 - }, - { - "loss": 2.2967, - "grad_norm": 1.8654407262802124, - "learning_rate": 5e-05, - "epoch": 0.29102900685150795, - "step": 4545 - }, - { - "loss": 2.3208, - "grad_norm": 1.7373706102371216, - "learning_rate": 5e-05, - "epoch": 0.29134917077543704, - "step": 4550 - }, - { - "loss": 2.3188, - "grad_norm": 1.7621229887008667, - "learning_rate": 5e-05, - "epoch": 0.29166933469936607, - "step": 4555 - }, - { - "loss": 2.2971, - "grad_norm": 1.7987569570541382, - "learning_rate": 5e-05, - "epoch": 0.29198949862329515, - "step": 4560 - }, - { - "loss": 2.2926, - "grad_norm": 1.8752938508987427, - "learning_rate": 5e-05, - "epoch": 0.2923096625472242, - "step": 4565 - }, - { - "loss": 2.294, - "grad_norm": 1.809169888496399, - "learning_rate": 5e-05, - "epoch": 0.2926298264711532, - "step": 4570 - }, - { - "loss": 2.3059, - "grad_norm": 1.8496021032333374, - "learning_rate": 5e-05, - "epoch": 0.2929499903950823, - "step": 4575 - }, - { - "loss": 2.2714, - "grad_norm": 1.8275306224822998, - "learning_rate": 5e-05, - "epoch": 0.2932701543190113, - "step": 4580 - }, - { - "loss": 2.2797, - "grad_norm": 1.8231137990951538, - "learning_rate": 5e-05, - "epoch": 0.2935903182429404, - "step": 4585 - }, - { - "loss": 2.2901, - "grad_norm": 1.7881653308868408, - "learning_rate": 5e-05, - "epoch": 0.29391048216686944, - "step": 4590 - }, - { - "loss": 2.3043, - "grad_norm": 1.9115880727767944, - "learning_rate": 5e-05, - "epoch": 0.29423064609079846, - "step": 4595 - }, - { - "loss": 2.3292, - "grad_norm": 1.8070696592330933, - "learning_rate": 5e-05, - "epoch": 0.29455081001472755, - "step": 4600 - }, - { - "eval_loss": 2.1569859981536865, - "eval_runtime": 9.5207, - "eval_samples_per_second": 215.111, - "eval_steps_per_second": 26.889, - "epoch": 0.29455081001472755, - "step": 4600 - }, - { - "loss": 2.32, - "grad_norm": 1.7979247570037842, - "learning_rate": 5e-05, - "epoch": 0.2948709739386566, - "step": 4605 - }, - { - "loss": 2.2949, - "grad_norm": 1.7743096351623535, - "learning_rate": 5e-05, - "epoch": 0.29519113786258566, - "step": 4610 - }, - { - "loss": 2.2916, - "grad_norm": 1.7690064907073975, - "learning_rate": 5e-05, - "epoch": 0.2955113017865147, - "step": 4615 - }, - { - "loss": 2.3084, - "grad_norm": 1.9324722290039062, - "learning_rate": 5e-05, - "epoch": 0.2958314657104437, - "step": 4620 - }, - { - "loss": 2.2975, - "grad_norm": 1.7818751335144043, - "learning_rate": 5e-05, - "epoch": 0.2961516296343728, - "step": 4625 - }, - { - "loss": 2.2689, - "grad_norm": 1.7577718496322632, - "learning_rate": 5e-05, - "epoch": 0.29647179355830183, - "step": 4630 - }, - { - "loss": 2.2863, - "grad_norm": 1.7863922119140625, - "learning_rate": 5e-05, - "epoch": 0.2967919574822309, - "step": 4635 - }, - { - "loss": 2.2954, - "grad_norm": 1.8004027605056763, - "learning_rate": 5e-05, - "epoch": 0.29711212140615995, - "step": 4640 - }, - { - "loss": 2.3267, - "grad_norm": 1.7635235786437988, - "learning_rate": 5e-05, - "epoch": 0.29743228533008903, - "step": 4645 - }, - { - "loss": 2.3047, - "grad_norm": 1.814304232597351, - "learning_rate": 5e-05, - "epoch": 0.29775244925401806, - "step": 4650 - }, - { - "loss": 2.2948, - "grad_norm": 1.8464570045471191, - "learning_rate": 5e-05, - "epoch": 0.2980726131779471, - "step": 4655 - }, - { - "loss": 2.2944, - "grad_norm": 1.7262645959854126, - "learning_rate": 5e-05, - "epoch": 0.29839277710187617, - "step": 4660 - }, - { - "loss": 2.3012, - "grad_norm": 1.7750794887542725, - "learning_rate": 5e-05, - "epoch": 0.2987129410258052, - "step": 4665 - }, - { - "loss": 2.2965, - "grad_norm": 1.7386796474456787, - "learning_rate": 5e-05, - "epoch": 0.2990331049497343, - "step": 4670 - }, - { - "loss": 2.2986, - "grad_norm": 1.8167015314102173, - "learning_rate": 5e-05, - "epoch": 0.2993532688736633, - "step": 4675 - }, - { - "loss": 2.2928, - "grad_norm": 1.7372899055480957, - "learning_rate": 5e-05, - "epoch": 0.29967343279759234, - "step": 4680 - }, - { - "loss": 2.2767, - "grad_norm": 1.793278694152832, - "learning_rate": 5e-05, - "epoch": 0.2999935967215214, - "step": 4685 - }, - { - "loss": 2.2887, - "grad_norm": 1.93364679813385, - "learning_rate": 5e-05, - "epoch": 0.30031376064545046, - "step": 4690 - }, - { - "loss": 2.3015, - "grad_norm": 1.7167513370513916, - "learning_rate": 5e-05, - "epoch": 0.30063392456937954, - "step": 4695 - }, - { - "loss": 2.2635, - "grad_norm": 1.7310161590576172, - "learning_rate": 5e-05, - "epoch": 0.30095408849330857, - "step": 4700 - }, - { - "loss": 2.2942, - "grad_norm": 1.7478691339492798, - "learning_rate": 5e-05, - "epoch": 0.3012742524172376, - "step": 4705 - }, - { - "loss": 2.2552, - "grad_norm": 1.8167970180511475, - "learning_rate": 5e-05, - "epoch": 0.3015944163411667, - "step": 4710 - }, - { - "loss": 2.2812, - "grad_norm": 1.6952241659164429, - "learning_rate": 5e-05, - "epoch": 0.3019145802650957, - "step": 4715 - }, - { - "loss": 2.2908, - "grad_norm": 1.7196714878082275, - "learning_rate": 5e-05, - "epoch": 0.3022347441890248, - "step": 4720 - }, - { - "loss": 2.2867, - "grad_norm": 1.7747132778167725, - "learning_rate": 5e-05, - "epoch": 0.3025549081129538, - "step": 4725 - }, - { - "loss": 2.2695, - "grad_norm": 1.8552742004394531, - "learning_rate": 5e-05, - "epoch": 0.3028750720368829, - "step": 4730 - }, - { - "loss": 2.264, - "grad_norm": 1.719187617301941, - "learning_rate": 5e-05, - "epoch": 0.30319523596081194, - "step": 4735 - }, - { - "loss": 2.2794, - "grad_norm": 1.8311821222305298, - "learning_rate": 5e-05, - "epoch": 0.30351539988474097, - "step": 4740 - }, - { - "loss": 2.2649, - "grad_norm": 1.7115150690078735, - "learning_rate": 5e-05, - "epoch": 0.30383556380867005, - "step": 4745 - }, - { - "loss": 2.299, - "grad_norm": 1.7218992710113525, - "learning_rate": 5e-05, - "epoch": 0.3041557277325991, - "step": 4750 - }, - { - "loss": 2.3031, - "grad_norm": 1.7692986726760864, - "learning_rate": 5e-05, - "epoch": 0.30447589165652816, - "step": 4755 - }, - { - "loss": 2.3133, - "grad_norm": 1.7613261938095093, - "learning_rate": 5e-05, - "epoch": 0.3047960555804572, - "step": 4760 - }, - { - "loss": 2.3096, - "grad_norm": 1.762600064277649, - "learning_rate": 5e-05, - "epoch": 0.3051162195043862, - "step": 4765 - }, - { - "loss": 2.2789, - "grad_norm": 1.7768152952194214, - "learning_rate": 5e-05, - "epoch": 0.3054363834283153, - "step": 4770 - }, - { - "loss": 2.3094, - "grad_norm": 1.8207039833068848, - "learning_rate": 5e-05, - "epoch": 0.30575654735224433, - "step": 4775 - }, - { - "loss": 2.2842, - "grad_norm": 1.7720569372177124, - "learning_rate": 5e-05, - "epoch": 0.3060767112761734, - "step": 4780 - }, - { - "loss": 2.3167, - "grad_norm": 1.7781318426132202, - "learning_rate": 5e-05, - "epoch": 0.30639687520010245, - "step": 4785 - }, - { - "loss": 2.2924, - "grad_norm": 1.797167181968689, - "learning_rate": 5e-05, - "epoch": 0.3067170391240315, - "step": 4790 - }, - { - "loss": 2.2818, - "grad_norm": 1.7862792015075684, - "learning_rate": 5e-05, - "epoch": 0.30703720304796056, - "step": 4795 - }, - { - "loss": 2.2911, - "grad_norm": 1.913051724433899, - "learning_rate": 5e-05, - "epoch": 0.3073573669718896, - "step": 4800 - }, - { - "eval_loss": 2.1567511558532715, - "eval_runtime": 9.5969, - "eval_samples_per_second": 213.403, - "eval_steps_per_second": 26.675, - "epoch": 0.3073573669718896, - "step": 4800 - }, - { - "loss": 2.3157, - "grad_norm": 1.9030219316482544, - "learning_rate": 5e-05, - "epoch": 0.3076775308958187, - "step": 4805 - }, - { - "loss": 2.2967, - "grad_norm": 1.981708288192749, - "learning_rate": 5e-05, - "epoch": 0.3079976948197477, - "step": 4810 - }, - { - "loss": 2.2553, - "grad_norm": 1.7999526262283325, - "learning_rate": 5e-05, - "epoch": 0.3083178587436768, - "step": 4815 - }, - { - "loss": 2.2993, - "grad_norm": 1.7089029550552368, - "learning_rate": 5e-05, - "epoch": 0.3086380226676058, - "step": 4820 - }, - { - "loss": 2.2782, - "grad_norm": 1.7940775156021118, - "learning_rate": 5e-05, - "epoch": 0.30895818659153484, - "step": 4825 - }, - { - "loss": 2.2651, - "grad_norm": 1.896036148071289, - "learning_rate": 5e-05, - "epoch": 0.30927835051546393, - "step": 4830 - }, - { - "loss": 2.2781, - "grad_norm": 1.7181426286697388, - "learning_rate": 5e-05, - "epoch": 0.30959851443939296, - "step": 4835 - }, - { - "loss": 2.3079, - "grad_norm": 1.7568175792694092, - "learning_rate": 5e-05, - "epoch": 0.30991867836332204, - "step": 4840 - }, - { - "loss": 2.3256, - "grad_norm": 1.7117818593978882, - "learning_rate": 5e-05, - "epoch": 0.31023884228725107, - "step": 4845 - }, - { - "loss": 2.2652, - "grad_norm": 1.8535692691802979, - "learning_rate": 5e-05, - "epoch": 0.3105590062111801, - "step": 4850 - }, - { - "loss": 2.3082, - "grad_norm": 1.8518201112747192, - "learning_rate": 5e-05, - "epoch": 0.3108791701351092, - "step": 4855 - }, - { - "loss": 2.2893, - "grad_norm": 1.875934362411499, - "learning_rate": 5e-05, - "epoch": 0.3111993340590382, - "step": 4860 - }, - { - "loss": 2.2672, - "grad_norm": 1.743920087814331, - "learning_rate": 5e-05, - "epoch": 0.3115194979829673, - "step": 4865 - }, - { - "loss": 2.2895, - "grad_norm": 1.7549186944961548, - "learning_rate": 5e-05, - "epoch": 0.3118396619068963, - "step": 4870 - }, - { - "loss": 2.282, - "grad_norm": 1.7128772735595703, - "learning_rate": 5e-05, - "epoch": 0.31215982583082535, - "step": 4875 - }, - { - "loss": 2.3079, - "grad_norm": 1.7349681854248047, - "learning_rate": 5e-05, - "epoch": 0.31247998975475444, - "step": 4880 - }, - { - "loss": 2.2719, - "grad_norm": 1.8261305093765259, - "learning_rate": 5e-05, - "epoch": 0.31280015367868347, - "step": 4885 - }, - { - "loss": 2.2801, - "grad_norm": 1.7922636270523071, - "learning_rate": 5e-05, - "epoch": 0.31312031760261255, - "step": 4890 - }, - { - "loss": 2.2963, - "grad_norm": 1.8621406555175781, - "learning_rate": 5e-05, - "epoch": 0.3134404815265416, - "step": 4895 - }, - { - "loss": 2.2842, - "grad_norm": 1.7960196733474731, - "learning_rate": 5e-05, - "epoch": 0.31376064545047067, - "step": 4900 - }, - { - "loss": 2.2535, - "grad_norm": 1.7535030841827393, - "learning_rate": 5e-05, - "epoch": 0.3140808093743997, - "step": 4905 - }, - { - "loss": 2.2706, - "grad_norm": 1.7128777503967285, - "learning_rate": 5e-05, - "epoch": 0.3144009732983287, - "step": 4910 - }, - { - "loss": 2.2963, - "grad_norm": 1.7324950695037842, - "learning_rate": 5e-05, - "epoch": 0.3147211372222578, - "step": 4915 - }, - { - "loss": 2.2996, - "grad_norm": 1.759783387184143, - "learning_rate": 5e-05, - "epoch": 0.31504130114618684, - "step": 4920 - }, - { - "loss": 2.2934, - "grad_norm": 1.799742579460144, - "learning_rate": 5e-05, - "epoch": 0.3153614650701159, - "step": 4925 - }, - { - "loss": 2.2862, - "grad_norm": 1.726730465888977, - "learning_rate": 5e-05, - "epoch": 0.31568162899404495, - "step": 4930 - }, - { - "loss": 2.2912, - "grad_norm": 1.8253145217895508, - "learning_rate": 5e-05, - "epoch": 0.316001792917974, - "step": 4935 - }, - { - "loss": 2.3154, - "grad_norm": 1.7888239622116089, - "learning_rate": 5e-05, - "epoch": 0.31632195684190306, - "step": 4940 - }, - { - "loss": 2.291, - "grad_norm": 1.818763256072998, - "learning_rate": 5e-05, - "epoch": 0.3166421207658321, - "step": 4945 - }, - { - "loss": 2.2879, - "grad_norm": 1.679724097251892, - "learning_rate": 5e-05, - "epoch": 0.3169622846897612, - "step": 4950 - }, - { - "loss": 2.276, - "grad_norm": 1.7187193632125854, - "learning_rate": 5e-05, - "epoch": 0.3172824486136902, - "step": 4955 - }, - { - "loss": 2.3043, - "grad_norm": 1.8776874542236328, - "learning_rate": 5e-05, - "epoch": 0.3176026125376193, - "step": 4960 - }, - { - "loss": 2.299, - "grad_norm": 1.8387751579284668, - "learning_rate": 5e-05, - "epoch": 0.3179227764615483, - "step": 4965 - }, - { - "loss": 2.2942, - "grad_norm": 1.8348480463027954, - "learning_rate": 5e-05, - "epoch": 0.31824294038547735, - "step": 4970 - }, - { - "loss": 2.2789, - "grad_norm": 1.76790189743042, - "learning_rate": 5e-05, - "epoch": 0.31856310430940643, - "step": 4975 - }, - { - "loss": 2.2821, - "grad_norm": 1.7413114309310913, - "learning_rate": 5e-05, - "epoch": 0.31888326823333546, - "step": 4980 - }, - { - "loss": 2.2802, - "grad_norm": 1.720826268196106, - "learning_rate": 5e-05, - "epoch": 0.31920343215726454, - "step": 4985 - }, - { - "loss": 2.3128, - "grad_norm": 1.6995984315872192, - "learning_rate": 5e-05, - "epoch": 0.3195235960811936, - "step": 4990 - }, - { - "loss": 2.3075, - "grad_norm": 1.8378366231918335, - "learning_rate": 5e-05, - "epoch": 0.3198437600051226, - "step": 4995 - }, - { - "loss": 2.3002, - "grad_norm": 1.812118411064148, - "learning_rate": 5e-05, - "epoch": 0.3201639239290517, - "step": 5000 - }, - { - "eval_loss": 2.1544718742370605, - "eval_runtime": 9.2883, - "eval_samples_per_second": 220.493, - "eval_steps_per_second": 27.562, - "epoch": 0.3201639239290517, - "step": 5000 - }, - { - "loss": 2.2662, - "grad_norm": 1.87315833568573, - "learning_rate": 5e-05, - "epoch": 0.3204840878529807, - "step": 5005 - }, - { - "loss": 2.3016, - "grad_norm": 1.7631300687789917, - "learning_rate": 5e-05, - "epoch": 0.3208042517769098, - "step": 5010 - }, - { - "loss": 2.3032, - "grad_norm": 1.8889778852462769, - "learning_rate": 5e-05, - "epoch": 0.32112441570083883, - "step": 5015 - }, - { - "loss": 2.3028, - "grad_norm": 1.7224068641662598, - "learning_rate": 5e-05, - "epoch": 0.32144457962476786, - "step": 5020 - }, - { - "loss": 2.2573, - "grad_norm": 1.7411279678344727, - "learning_rate": 5e-05, - "epoch": 0.32176474354869694, - "step": 5025 - }, - { - "loss": 2.2797, - "grad_norm": 2.0215229988098145, - "learning_rate": 5e-05, - "epoch": 0.32208490747262597, - "step": 5030 - }, - { - "loss": 2.2683, - "grad_norm": 1.7024788856506348, - "learning_rate": 5e-05, - "epoch": 0.32240507139655505, - "step": 5035 - }, - { - "loss": 2.2709, - "grad_norm": 1.871773600578308, - "learning_rate": 5e-05, - "epoch": 0.3227252353204841, - "step": 5040 - }, - { - "loss": 2.28, - "grad_norm": 1.672973394393921, - "learning_rate": 5e-05, - "epoch": 0.32304539924441317, - "step": 5045 - }, - { - "loss": 2.2829, - "grad_norm": 1.955171823501587, - "learning_rate": 5e-05, - "epoch": 0.3233655631683422, - "step": 5050 - }, - { - "loss": 2.2957, - "grad_norm": 1.820365071296692, - "learning_rate": 5e-05, - "epoch": 0.3236857270922712, - "step": 5055 - }, - { - "loss": 2.2904, - "grad_norm": 1.8486545085906982, - "learning_rate": 5e-05, - "epoch": 0.3240058910162003, - "step": 5060 - }, - { - "loss": 2.297, - "grad_norm": 1.7132619619369507, - "learning_rate": 5e-05, - "epoch": 0.32432605494012934, - "step": 5065 - }, - { - "loss": 2.2782, - "grad_norm": 1.8040876388549805, - "learning_rate": 5e-05, - "epoch": 0.3246462188640584, - "step": 5070 - }, - { - "loss": 2.2734, - "grad_norm": 1.7798943519592285, - "learning_rate": 5e-05, - "epoch": 0.32496638278798745, - "step": 5075 - }, - { - "loss": 2.2636, - "grad_norm": 1.7690693140029907, - "learning_rate": 5e-05, - "epoch": 0.3252865467119165, - "step": 5080 - }, - { - "loss": 2.2689, - "grad_norm": 1.793588638305664, - "learning_rate": 5e-05, - "epoch": 0.32560671063584556, - "step": 5085 - }, - { - "loss": 2.3057, - "grad_norm": 1.7516108751296997, - "learning_rate": 5e-05, - "epoch": 0.3259268745597746, - "step": 5090 - }, - { - "loss": 2.2766, - "grad_norm": 1.7585774660110474, - "learning_rate": 5e-05, - "epoch": 0.3262470384837037, - "step": 5095 - }, - { - "loss": 2.3102, - "grad_norm": 1.7661858797073364, - "learning_rate": 5e-05, - "epoch": 0.3265672024076327, - "step": 5100 - }, - { - "loss": 2.3072, - "grad_norm": 1.7506427764892578, - "learning_rate": 5e-05, - "epoch": 0.32688736633156174, - "step": 5105 - }, - { - "loss": 2.2912, - "grad_norm": 1.7459840774536133, - "learning_rate": 5e-05, - "epoch": 0.3272075302554908, - "step": 5110 - }, - { - "loss": 2.2875, - "grad_norm": 1.7619469165802002, - "learning_rate": 5e-05, - "epoch": 0.32752769417941985, - "step": 5115 - }, - { - "loss": 2.275, - "grad_norm": 1.7539411783218384, - "learning_rate": 5e-05, - "epoch": 0.32784785810334893, - "step": 5120 - }, - { - "loss": 2.2907, - "grad_norm": 1.726323127746582, - "learning_rate": 5e-05, - "epoch": 0.32816802202727796, - "step": 5125 - }, - { - "loss": 2.2491, - "grad_norm": 1.6898913383483887, - "learning_rate": 5e-05, - "epoch": 0.32848818595120705, - "step": 5130 - }, - { - "loss": 2.3008, - "grad_norm": 1.7721654176712036, - "learning_rate": 5e-05, - "epoch": 0.3288083498751361, - "step": 5135 - }, - { - "loss": 2.2871, - "grad_norm": 1.859742522239685, - "learning_rate": 5e-05, - "epoch": 0.3291285137990651, - "step": 5140 - }, - { - "loss": 2.2713, - "grad_norm": 1.8325496912002563, - "learning_rate": 5e-05, - "epoch": 0.3294486777229942, - "step": 5145 - }, - { - "loss": 2.2931, - "grad_norm": 1.845747709274292, - "learning_rate": 5e-05, - "epoch": 0.3297688416469232, - "step": 5150 - }, - { - "loss": 2.2746, - "grad_norm": 1.8495094776153564, - "learning_rate": 5e-05, - "epoch": 0.3300890055708523, - "step": 5155 - }, - { - "loss": 2.2889, - "grad_norm": 1.7559887170791626, - "learning_rate": 5e-05, - "epoch": 0.33040916949478133, - "step": 5160 - }, - { - "loss": 2.3083, - "grad_norm": 1.7825977802276611, - "learning_rate": 5e-05, - "epoch": 0.33072933341871036, - "step": 5165 - }, - { - "loss": 2.2705, - "grad_norm": 1.788145899772644, - "learning_rate": 5e-05, - "epoch": 0.33104949734263944, - "step": 5170 - }, - { - "loss": 2.2735, - "grad_norm": 1.755177617073059, - "learning_rate": 5e-05, - "epoch": 0.33136966126656847, - "step": 5175 - }, - { - "loss": 2.2677, - "grad_norm": 1.7675113677978516, - "learning_rate": 5e-05, - "epoch": 0.33168982519049756, - "step": 5180 - }, - { - "loss": 2.3125, - "grad_norm": 1.7566906213760376, - "learning_rate": 5e-05, - "epoch": 0.3320099891144266, - "step": 5185 - }, - { - "loss": 2.2741, - "grad_norm": 1.8251054286956787, - "learning_rate": 5e-05, - "epoch": 0.3323301530383556, - "step": 5190 - }, - { - "loss": 2.255, - "grad_norm": 1.815388560295105, - "learning_rate": 5e-05, - "epoch": 0.3326503169622847, - "step": 5195 - }, - { - "loss": 2.3014, - "grad_norm": 1.8509902954101562, - "learning_rate": 5e-05, - "epoch": 0.3329704808862137, - "step": 5200 - }, - { - "eval_loss": 2.1533737182617188, - "eval_runtime": 13.3544, - "eval_samples_per_second": 153.358, - "eval_steps_per_second": 19.17, - "epoch": 0.3329704808862137, - "step": 5200 - }, - { - "loss": 2.2773, - "grad_norm": 1.8529142141342163, - "learning_rate": 5e-05, - "epoch": 0.3332906448101428, - "step": 5205 - }, - { - "loss": 2.2882, - "grad_norm": 1.8580735921859741, - "learning_rate": 5e-05, - "epoch": 0.33361080873407184, - "step": 5210 - }, - { - "loss": 2.2652, - "grad_norm": 1.8027414083480835, - "learning_rate": 5e-05, - "epoch": 0.3339309726580009, - "step": 5215 - }, - { - "loss": 2.2714, - "grad_norm": 1.7679603099822998, - "learning_rate": 5e-05, - "epoch": 0.33425113658192995, - "step": 5220 - }, - { - "loss": 2.2793, - "grad_norm": 1.730897068977356, - "learning_rate": 5e-05, - "epoch": 0.334571300505859, - "step": 5225 - }, - { - "loss": 2.275, - "grad_norm": 1.7817909717559814, - "learning_rate": 5e-05, - "epoch": 0.33489146442978807, - "step": 5230 - }, - { - "loss": 2.27, - "grad_norm": 1.763421893119812, - "learning_rate": 5e-05, - "epoch": 0.3352116283537171, - "step": 5235 - }, - { - "loss": 2.2856, - "grad_norm": 1.7344940900802612, - "learning_rate": 5e-05, - "epoch": 0.3355317922776462, - "step": 5240 - }, - { - "loss": 2.2903, - "grad_norm": 1.8662790060043335, - "learning_rate": 5e-05, - "epoch": 0.3358519562015752, - "step": 5245 - }, - { - "loss": 2.2768, - "grad_norm": 1.7917147874832153, - "learning_rate": 5e-05, - "epoch": 0.33617212012550424, - "step": 5250 - }, - { - "loss": 2.2903, - "grad_norm": 1.734060525894165, - "learning_rate": 5e-05, - "epoch": 0.3364922840494333, - "step": 5255 - }, - { - "loss": 2.268, - "grad_norm": 1.7482142448425293, - "learning_rate": 5e-05, - "epoch": 0.33681244797336235, - "step": 5260 - }, - { - "loss": 2.2548, - "grad_norm": 1.819955825805664, - "learning_rate": 5e-05, - "epoch": 0.33713261189729143, - "step": 5265 - }, - { - "loss": 2.2723, - "grad_norm": 1.7646571397781372, - "learning_rate": 5e-05, - "epoch": 0.33745277582122046, - "step": 5270 - }, - { - "loss": 2.2785, - "grad_norm": 1.7525885105133057, - "learning_rate": 5e-05, - "epoch": 0.3377729397451495, - "step": 5275 - }, - { - "loss": 2.2877, - "grad_norm": 1.8543511629104614, - "learning_rate": 5e-05, - "epoch": 0.3380931036690786, - "step": 5280 - }, - { - "loss": 2.2604, - "grad_norm": 1.848857045173645, - "learning_rate": 5e-05, - "epoch": 0.3384132675930076, - "step": 5285 - }, - { - "loss": 2.2763, - "grad_norm": 1.8210421800613403, - "learning_rate": 5e-05, - "epoch": 0.3387334315169367, - "step": 5290 - }, - { - "loss": 2.2962, - "grad_norm": 1.717044472694397, - "learning_rate": 5e-05, - "epoch": 0.3390535954408657, - "step": 5295 - }, - { - "loss": 2.2777, - "grad_norm": 1.7110569477081299, - "learning_rate": 5e-05, - "epoch": 0.3393737593647948, - "step": 5300 - }, - { - "loss": 2.2395, - "grad_norm": 1.7508400678634644, - "learning_rate": 5e-05, - "epoch": 0.33969392328872383, - "step": 5305 - }, - { - "loss": 2.2731, - "grad_norm": 1.8925203084945679, - "learning_rate": 5e-05, - "epoch": 0.34001408721265286, - "step": 5310 - }, - { - "loss": 2.3024, - "grad_norm": 1.7870714664459229, - "learning_rate": 5e-05, - "epoch": 0.34033425113658194, - "step": 5315 - }, - { - "loss": 2.2829, - "grad_norm": 1.744795799255371, - "learning_rate": 5e-05, - "epoch": 0.340654415060511, - "step": 5320 - }, - { - "loss": 2.2815, - "grad_norm": 1.7675684690475464, - "learning_rate": 5e-05, - "epoch": 0.34097457898444006, - "step": 5325 - }, - { - "loss": 2.2803, - "grad_norm": 1.8785274028778076, - "learning_rate": 5e-05, - "epoch": 0.3412947429083691, - "step": 5330 - }, - { - "loss": 2.2804, - "grad_norm": 1.818994402885437, - "learning_rate": 5e-05, - "epoch": 0.3416149068322981, - "step": 5335 - }, - { - "loss": 2.3124, - "grad_norm": 1.9585684537887573, - "learning_rate": 5e-05, - "epoch": 0.3419350707562272, - "step": 5340 - }, - { - "loss": 2.2672, - "grad_norm": 1.770952820777893, - "learning_rate": 5e-05, - "epoch": 0.34225523468015623, - "step": 5345 - }, - { - "loss": 2.2677, - "grad_norm": 1.6577550172805786, - "learning_rate": 5e-05, - "epoch": 0.3425753986040853, - "step": 5350 - }, - { - "loss": 2.2676, - "grad_norm": 1.9661815166473389, - "learning_rate": 5e-05, - "epoch": 0.34289556252801434, - "step": 5355 - }, - { - "loss": 2.2868, - "grad_norm": 1.7929357290267944, - "learning_rate": 5e-05, - "epoch": 0.34321572645194337, - "step": 5360 - }, - { - "loss": 2.2842, - "grad_norm": 1.8188828229904175, - "learning_rate": 5e-05, - "epoch": 0.34353589037587245, - "step": 5365 - }, - { - "loss": 2.2513, - "grad_norm": 1.781710147857666, - "learning_rate": 5e-05, - "epoch": 0.3438560542998015, - "step": 5370 - }, - { - "loss": 2.2664, - "grad_norm": 1.8323997259140015, - "learning_rate": 5e-05, - "epoch": 0.34417621822373057, - "step": 5375 - }, - { - "loss": 2.2698, - "grad_norm": 1.8200249671936035, - "learning_rate": 5e-05, - "epoch": 0.3444963821476596, - "step": 5380 - }, - { - "loss": 2.2822, - "grad_norm": 1.749212622642517, - "learning_rate": 5e-05, - "epoch": 0.3448165460715887, - "step": 5385 - }, - { - "loss": 2.2588, - "grad_norm": 2.007263422012329, - "learning_rate": 5e-05, - "epoch": 0.3451367099955177, - "step": 5390 - }, - { - "loss": 2.2757, - "grad_norm": 1.8585344552993774, - "learning_rate": 5e-05, - "epoch": 0.34545687391944674, - "step": 5395 - }, - { - "loss": 2.2768, - "grad_norm": 1.923609733581543, - "learning_rate": 5e-05, - "epoch": 0.3457770378433758, - "step": 5400 - }, - { - "eval_loss": 2.1260647773742676, - "eval_runtime": 9.2705, - "eval_samples_per_second": 220.916, - "eval_steps_per_second": 27.615, - "epoch": 0.3457770378433758, - "step": 5400 - }, - { - "loss": 2.291, - "grad_norm": 1.8628069162368774, - "learning_rate": 5e-05, - "epoch": 0.34609720176730485, - "step": 5405 - }, - { - "loss": 2.276, - "grad_norm": 1.712012529373169, - "learning_rate": 5e-05, - "epoch": 0.34641736569123394, - "step": 5410 - }, - { - "loss": 2.2774, - "grad_norm": 1.8341697454452515, - "learning_rate": 5e-05, - "epoch": 0.34673752961516296, - "step": 5415 - }, - { - "loss": 2.2809, - "grad_norm": 1.7650182247161865, - "learning_rate": 5e-05, - "epoch": 0.347057693539092, - "step": 5420 - }, - { - "loss": 2.2893, - "grad_norm": 1.7278627157211304, - "learning_rate": 5e-05, - "epoch": 0.3473778574630211, - "step": 5425 - }, - { - "loss": 2.2507, - "grad_norm": 1.6427825689315796, - "learning_rate": 5e-05, - "epoch": 0.3476980213869501, - "step": 5430 - }, - { - "loss": 2.2821, - "grad_norm": 1.7547065019607544, - "learning_rate": 5e-05, - "epoch": 0.3480181853108792, - "step": 5435 - }, - { - "loss": 2.2893, - "grad_norm": 1.7182886600494385, - "learning_rate": 5e-05, - "epoch": 0.3483383492348082, - "step": 5440 - }, - { - "loss": 2.2876, - "grad_norm": 1.84799063205719, - "learning_rate": 5e-05, - "epoch": 0.34865851315873725, - "step": 5445 - }, - { - "loss": 2.2712, - "grad_norm": 1.6814558506011963, - "learning_rate": 5e-05, - "epoch": 0.34897867708266633, - "step": 5450 - }, - { - "loss": 2.29, - "grad_norm": 1.881182074546814, - "learning_rate": 5e-05, - "epoch": 0.34929884100659536, - "step": 5455 - }, - { - "loss": 2.2206, - "grad_norm": 1.8951886892318726, - "learning_rate": 5e-05, - "epoch": 0.34961900493052445, - "step": 5460 - }, - { - "loss": 2.2433, - "grad_norm": 1.8026858568191528, - "learning_rate": 5e-05, - "epoch": 0.3499391688544535, - "step": 5465 - }, - { - "loss": 2.2556, - "grad_norm": 1.7248677015304565, - "learning_rate": 5e-05, - "epoch": 0.35025933277838256, - "step": 5470 - }, - { - "loss": 2.2637, - "grad_norm": 1.7217531204223633, - "learning_rate": 5e-05, - "epoch": 0.3505794967023116, - "step": 5475 - }, - { - "loss": 2.2854, - "grad_norm": 1.7552876472473145, - "learning_rate": 5e-05, - "epoch": 0.3508996606262406, - "step": 5480 - }, - { - "loss": 2.262, - "grad_norm": 1.7518340349197388, - "learning_rate": 5e-05, - "epoch": 0.3512198245501697, - "step": 5485 - }, - { - "loss": 2.2613, - "grad_norm": 1.8185194730758667, - "learning_rate": 5e-05, - "epoch": 0.35153998847409873, - "step": 5490 - }, - { - "loss": 2.2867, - "grad_norm": 1.7345361709594727, - "learning_rate": 5e-05, - "epoch": 0.3518601523980278, - "step": 5495 - }, - { - "loss": 2.2498, - "grad_norm": 1.7367279529571533, - "learning_rate": 5e-05, - "epoch": 0.35218031632195684, - "step": 5500 - }, - { - "loss": 2.3032, - "grad_norm": 1.7460354566574097, - "learning_rate": 5e-05, - "epoch": 0.35250048024588587, - "step": 5505 - }, - { - "loss": 2.279, - "grad_norm": 1.6836531162261963, - "learning_rate": 5e-05, - "epoch": 0.35282064416981496, - "step": 5510 - }, - { - "loss": 2.2791, - "grad_norm": 1.7619463205337524, - "learning_rate": 5e-05, - "epoch": 0.353140808093744, - "step": 5515 - }, - { - "loss": 2.2974, - "grad_norm": 1.8177152872085571, - "learning_rate": 5e-05, - "epoch": 0.35346097201767307, - "step": 5520 - }, - { - "loss": 2.3094, - "grad_norm": 1.7507604360580444, - "learning_rate": 5e-05, - "epoch": 0.3537811359416021, - "step": 5525 - }, - { - "loss": 2.2745, - "grad_norm": 1.7359153032302856, - "learning_rate": 5e-05, - "epoch": 0.3541012998655311, - "step": 5530 - }, - { - "loss": 2.2713, - "grad_norm": 1.7324638366699219, - "learning_rate": 5e-05, - "epoch": 0.3544214637894602, - "step": 5535 - }, - { - "loss": 2.3063, - "grad_norm": 1.7245142459869385, - "learning_rate": 5e-05, - "epoch": 0.35474162771338924, - "step": 5540 - }, - { - "loss": 2.2258, - "grad_norm": 1.699273943901062, - "learning_rate": 5e-05, - "epoch": 0.3550617916373183, - "step": 5545 - }, - { - "loss": 2.2459, - "grad_norm": 1.653936505317688, - "learning_rate": 5e-05, - "epoch": 0.35538195556124735, - "step": 5550 - }, - { - "loss": 2.2984, - "grad_norm": 1.7689787149429321, - "learning_rate": 5e-05, - "epoch": 0.35570211948517644, - "step": 5555 - }, - { - "loss": 2.2585, - "grad_norm": 1.693535327911377, - "learning_rate": 5e-05, - "epoch": 0.35602228340910547, - "step": 5560 - }, - { - "loss": 2.2832, - "grad_norm": 1.801584243774414, - "learning_rate": 5e-05, - "epoch": 0.3563424473330345, - "step": 5565 - }, - { - "loss": 2.3029, - "grad_norm": 1.8008770942687988, - "learning_rate": 5e-05, - "epoch": 0.3566626112569636, - "step": 5570 - }, - { - "loss": 2.3078, - "grad_norm": 1.7314320802688599, - "learning_rate": 5e-05, - "epoch": 0.3569827751808926, - "step": 5575 - }, - { - "loss": 2.2484, - "grad_norm": 1.7883455753326416, - "learning_rate": 5e-05, - "epoch": 0.3573029391048217, - "step": 5580 - }, - { - "loss": 2.2453, - "grad_norm": 1.7167059183120728, - "learning_rate": 5e-05, - "epoch": 0.3576231030287507, - "step": 5585 - }, - { - "loss": 2.2577, - "grad_norm": 1.7459754943847656, - "learning_rate": 5e-05, - "epoch": 0.35794326695267975, - "step": 5590 - }, - { - "loss": 2.2555, - "grad_norm": 1.783430576324463, - "learning_rate": 5e-05, - "epoch": 0.35826343087660883, - "step": 5595 - }, - { - "loss": 2.2625, - "grad_norm": 1.8094017505645752, - "learning_rate": 5e-05, - "epoch": 0.35858359480053786, - "step": 5600 - }, - { - "eval_loss": 2.1376986503601074, - "eval_runtime": 9.2886, - "eval_samples_per_second": 220.484, - "eval_steps_per_second": 27.561, - "epoch": 0.35858359480053786, - "step": 5600 - }, - { - "loss": 2.2669, - "grad_norm": 1.796750783920288, - "learning_rate": 5e-05, - "epoch": 0.35890375872446695, - "step": 5605 - }, - { - "loss": 2.2834, - "grad_norm": 1.7852609157562256, - "learning_rate": 5e-05, - "epoch": 0.359223922648396, - "step": 5610 - }, - { - "loss": 2.2566, - "grad_norm": 1.7483196258544922, - "learning_rate": 5e-05, - "epoch": 0.359544086572325, - "step": 5615 - }, - { - "loss": 2.2819, - "grad_norm": 1.7469184398651123, - "learning_rate": 5e-05, - "epoch": 0.3598642504962541, - "step": 5620 - }, - { - "loss": 2.253, - "grad_norm": 1.787428855895996, - "learning_rate": 5e-05, - "epoch": 0.3601844144201831, - "step": 5625 - }, - { - "loss": 2.2558, - "grad_norm": 1.7420175075531006, - "learning_rate": 5e-05, - "epoch": 0.3605045783441122, - "step": 5630 - }, - { - "loss": 2.2465, - "grad_norm": 1.676102638244629, - "learning_rate": 5e-05, - "epoch": 0.36082474226804123, - "step": 5635 - }, - { - "loss": 2.2594, - "grad_norm": 1.754003882408142, - "learning_rate": 5e-05, - "epoch": 0.3611449061919703, - "step": 5640 - }, - { - "loss": 2.2558, - "grad_norm": 1.7780991792678833, - "learning_rate": 5e-05, - "epoch": 0.36146507011589935, - "step": 5645 - }, - { - "loss": 2.2613, - "grad_norm": 1.7494131326675415, - "learning_rate": 5e-05, - "epoch": 0.3617852340398284, - "step": 5650 - }, - { - "loss": 2.2511, - "grad_norm": 1.8119771480560303, - "learning_rate": 5e-05, - "epoch": 0.36210539796375746, - "step": 5655 - }, - { - "loss": 2.2891, - "grad_norm": 1.8024489879608154, - "learning_rate": 5e-05, - "epoch": 0.3624255618876865, - "step": 5660 - }, - { - "loss": 2.2737, - "grad_norm": 1.7026606798171997, - "learning_rate": 5e-05, - "epoch": 0.36274572581161557, - "step": 5665 - }, - { - "loss": 2.3008, - "grad_norm": 1.7064659595489502, - "learning_rate": 5e-05, - "epoch": 0.3630658897355446, - "step": 5670 - }, - { - "loss": 2.2796, - "grad_norm": 1.7445411682128906, - "learning_rate": 5e-05, - "epoch": 0.36338605365947363, - "step": 5675 - }, - { - "loss": 2.2406, - "grad_norm": 1.7404433488845825, - "learning_rate": 5e-05, - "epoch": 0.3637062175834027, - "step": 5680 - }, - { - "loss": 2.2742, - "grad_norm": 1.6843476295471191, - "learning_rate": 5e-05, - "epoch": 0.36402638150733174, - "step": 5685 - }, - { - "loss": 2.2763, - "grad_norm": 1.8461291790008545, - "learning_rate": 5e-05, - "epoch": 0.3643465454312608, - "step": 5690 - }, - { - "loss": 2.2594, - "grad_norm": 1.7500439882278442, - "learning_rate": 5e-05, - "epoch": 0.36466670935518986, - "step": 5695 - }, - { - "loss": 2.2495, - "grad_norm": 1.7546688318252563, - "learning_rate": 5e-05, - "epoch": 0.3649868732791189, - "step": 5700 - }, - { - "loss": 2.268, - "grad_norm": 1.7128827571868896, - "learning_rate": 5e-05, - "epoch": 0.36530703720304797, - "step": 5705 - }, - { - "loss": 2.2716, - "grad_norm": 1.8002029657363892, - "learning_rate": 5e-05, - "epoch": 0.365627201126977, - "step": 5710 - }, - { - "loss": 2.2689, - "grad_norm": 1.7871887683868408, - "learning_rate": 5e-05, - "epoch": 0.3659473650509061, - "step": 5715 - }, - { - "loss": 2.2444, - "grad_norm": 1.801291584968567, - "learning_rate": 5e-05, - "epoch": 0.3662675289748351, - "step": 5720 - }, - { - "loss": 2.2788, - "grad_norm": 1.8185909986495972, - "learning_rate": 5e-05, - "epoch": 0.3665876928987642, - "step": 5725 - }, - { - "loss": 2.2746, - "grad_norm": 1.7295774221420288, - "learning_rate": 5e-05, - "epoch": 0.3669078568226932, - "step": 5730 - }, - { - "loss": 2.2953, - "grad_norm": 1.7250750064849854, - "learning_rate": 5e-05, - "epoch": 0.36722802074662225, - "step": 5735 - }, - { - "loss": 2.2689, - "grad_norm": 1.7358938455581665, - "learning_rate": 5e-05, - "epoch": 0.36754818467055134, - "step": 5740 - }, - { - "loss": 2.2594, - "grad_norm": 1.7297829389572144, - "learning_rate": 5e-05, - "epoch": 0.36786834859448037, - "step": 5745 - }, - { - "loss": 2.2729, - "grad_norm": 1.788424015045166, - "learning_rate": 5e-05, - "epoch": 0.36818851251840945, - "step": 5750 - }, - { - "loss": 2.2442, - "grad_norm": 1.873340368270874, - "learning_rate": 5e-05, - "epoch": 0.3685086764423385, - "step": 5755 - }, - { - "loss": 2.249, - "grad_norm": 1.7489144802093506, - "learning_rate": 5e-05, - "epoch": 0.3688288403662675, - "step": 5760 - }, - { - "loss": 2.2845, - "grad_norm": 1.7094459533691406, - "learning_rate": 5e-05, - "epoch": 0.3691490042901966, - "step": 5765 - }, - { - "loss": 2.2626, - "grad_norm": 1.7959952354431152, - "learning_rate": 5e-05, - "epoch": 0.3694691682141256, - "step": 5770 - }, - { - "loss": 2.261, - "grad_norm": 1.7271146774291992, - "learning_rate": 5e-05, - "epoch": 0.3697893321380547, - "step": 5775 - }, - { - "loss": 2.2694, - "grad_norm": 1.7205613851547241, - "learning_rate": 5e-05, - "epoch": 0.37010949606198373, - "step": 5780 - }, - { - "loss": 2.2703, - "grad_norm": 1.6520004272460938, - "learning_rate": 5e-05, - "epoch": 0.37042965998591276, - "step": 5785 - }, - { - "loss": 2.2531, - "grad_norm": 1.6878688335418701, - "learning_rate": 5e-05, - "epoch": 0.37074982390984185, - "step": 5790 - }, - { - "loss": 2.2859, - "grad_norm": 1.7531139850616455, - "learning_rate": 5e-05, - "epoch": 0.3710699878337709, - "step": 5795 - }, - { - "loss": 2.2609, - "grad_norm": 1.7352375984191895, - "learning_rate": 5e-05, - "epoch": 0.37139015175769996, - "step": 5800 - }, - { - "eval_loss": 2.1418533325195312, - "eval_runtime": 12.9494, - "eval_samples_per_second": 158.154, - "eval_steps_per_second": 19.769, - "epoch": 0.37139015175769996, - "step": 5800 - }, - { - "loss": 2.278, - "grad_norm": 1.7465990781784058, - "learning_rate": 5e-05, - "epoch": 0.371710315681629, - "step": 5805 - }, - { - "loss": 2.2576, - "grad_norm": 1.6427454948425293, - "learning_rate": 5e-05, - "epoch": 0.3720304796055581, - "step": 5810 - }, - { - "loss": 2.2879, - "grad_norm": 1.6827012300491333, - "learning_rate": 5e-05, - "epoch": 0.3723506435294871, - "step": 5815 - }, - { - "loss": 2.2817, - "grad_norm": 1.6802785396575928, - "learning_rate": 5e-05, - "epoch": 0.37267080745341613, - "step": 5820 - }, - { - "loss": 2.2624, - "grad_norm": 1.764146089553833, - "learning_rate": 5e-05, - "epoch": 0.3729909713773452, - "step": 5825 - }, - { - "loss": 2.2748, - "grad_norm": 1.7563925981521606, - "learning_rate": 5e-05, - "epoch": 0.37331113530127424, - "step": 5830 - }, - { - "loss": 2.2679, - "grad_norm": 1.7826206684112549, - "learning_rate": 5e-05, - "epoch": 0.37363129922520333, - "step": 5835 - }, - { - "loss": 2.2631, - "grad_norm": 1.7394565343856812, - "learning_rate": 5e-05, - "epoch": 0.37395146314913236, - "step": 5840 - }, - { - "loss": 2.2668, - "grad_norm": 1.702976942062378, - "learning_rate": 5e-05, - "epoch": 0.3742716270730614, - "step": 5845 - }, - { - "loss": 2.2615, - "grad_norm": 1.8101780414581299, - "learning_rate": 5e-05, - "epoch": 0.37459179099699047, - "step": 5850 - }, - { - "loss": 2.2914, - "grad_norm": 1.7696033716201782, - "learning_rate": 5e-05, - "epoch": 0.3749119549209195, - "step": 5855 - }, - { - "loss": 2.2536, - "grad_norm": 1.7243146896362305, - "learning_rate": 5e-05, - "epoch": 0.3752321188448486, - "step": 5860 - }, - { - "loss": 2.2839, - "grad_norm": 1.707695722579956, - "learning_rate": 5e-05, - "epoch": 0.3755522827687776, - "step": 5865 - }, - { - "loss": 2.2604, - "grad_norm": 1.650211215019226, - "learning_rate": 5e-05, - "epoch": 0.37587244669270664, - "step": 5870 - }, - { - "loss": 2.258, - "grad_norm": 1.6194339990615845, - "learning_rate": 5e-05, - "epoch": 0.3761926106166357, - "step": 5875 - }, - { - "loss": 2.2542, - "grad_norm": 1.7091882228851318, - "learning_rate": 5e-05, - "epoch": 0.37651277454056475, - "step": 5880 - }, - { - "loss": 2.2661, - "grad_norm": 1.733975887298584, - "learning_rate": 5e-05, - "epoch": 0.37683293846449384, - "step": 5885 - }, - { - "loss": 2.2536, - "grad_norm": 1.7769482135772705, - "learning_rate": 5e-05, - "epoch": 0.37715310238842287, - "step": 5890 - }, - { - "loss": 2.2515, - "grad_norm": 1.7663599252700806, - "learning_rate": 5e-05, - "epoch": 0.37747326631235195, - "step": 5895 - }, - { - "loss": 2.2979, - "grad_norm": 1.6576106548309326, - "learning_rate": 5e-05, - "epoch": 0.377793430236281, - "step": 5900 - }, - { - "loss": 2.2524, - "grad_norm": 1.838011384010315, - "learning_rate": 5e-05, - "epoch": 0.37811359416021, - "step": 5905 - }, - { - "loss": 2.2539, - "grad_norm": 1.7713699340820312, - "learning_rate": 5e-05, - "epoch": 0.3784337580841391, - "step": 5910 - }, - { - "loss": 2.2644, - "grad_norm": 1.765184760093689, - "learning_rate": 5e-05, - "epoch": 0.3787539220080681, - "step": 5915 - }, - { - "loss": 2.2128, - "grad_norm": 1.70463228225708, - "learning_rate": 5e-05, - "epoch": 0.3790740859319972, - "step": 5920 - }, - { - "loss": 2.2651, - "grad_norm": 1.689228892326355, - "learning_rate": 5e-05, - "epoch": 0.37939424985592624, - "step": 5925 - }, - { - "loss": 2.2368, - "grad_norm": 1.7535511255264282, - "learning_rate": 5e-05, - "epoch": 0.37971441377985526, - "step": 5930 - }, - { - "loss": 2.2507, - "grad_norm": 1.768235445022583, - "learning_rate": 5e-05, - "epoch": 0.38003457770378435, - "step": 5935 - }, - { - "loss": 2.2313, - "grad_norm": 1.7316277027130127, - "learning_rate": 5e-05, - "epoch": 0.3803547416277134, - "step": 5940 - }, - { - "loss": 2.2901, - "grad_norm": 1.7372463941574097, - "learning_rate": 5e-05, - "epoch": 0.38067490555164246, - "step": 5945 - }, - { - "loss": 2.2767, - "grad_norm": 1.8195472955703735, - "learning_rate": 5e-05, - "epoch": 0.3809950694755715, - "step": 5950 - }, - { - "loss": 2.2819, - "grad_norm": 1.7599300146102905, - "learning_rate": 5e-05, - "epoch": 0.3813152333995005, - "step": 5955 - }, - { - "loss": 2.2495, - "grad_norm": 1.763772964477539, - "learning_rate": 5e-05, - "epoch": 0.3816353973234296, - "step": 5960 - }, - { - "loss": 2.2689, - "grad_norm": 1.6997100114822388, - "learning_rate": 5e-05, - "epoch": 0.38195556124735863, - "step": 5965 - }, - { - "loss": 2.2784, - "grad_norm": 1.6891993284225464, - "learning_rate": 5e-05, - "epoch": 0.3822757251712877, - "step": 5970 - }, - { - "loss": 2.289, - "grad_norm": 1.7187187671661377, - "learning_rate": 5e-05, - "epoch": 0.38259588909521675, - "step": 5975 - }, - { - "loss": 2.2425, - "grad_norm": 1.6827722787857056, - "learning_rate": 5e-05, - "epoch": 0.38291605301914583, - "step": 5980 - }, - { - "loss": 2.2499, - "grad_norm": 1.6614289283752441, - "learning_rate": 5e-05, - "epoch": 0.38323621694307486, - "step": 5985 - }, - { - "loss": 2.2559, - "grad_norm": 1.7182048559188843, - "learning_rate": 5e-05, - "epoch": 0.3835563808670039, - "step": 5990 - }, - { - "loss": 2.2781, - "grad_norm": 1.7572559118270874, - "learning_rate": 5e-05, - "epoch": 0.38387654479093297, - "step": 5995 - }, - { - "loss": 2.26, - "grad_norm": 1.8257755041122437, - "learning_rate": 5e-05, - "epoch": 0.384196708714862, - "step": 6000 - }, - { - "eval_loss": 2.11156964302063, - "eval_runtime": 9.0951, - "eval_samples_per_second": 225.177, - "eval_steps_per_second": 28.147, - "epoch": 0.384196708714862, - "step": 6000 - }, - { - "loss": 2.235, - "grad_norm": 1.7780883312225342, - "learning_rate": 5e-05, - "epoch": 0.3845168726387911, - "step": 6005 - }, - { - "loss": 2.2434, - "grad_norm": 1.7068132162094116, - "learning_rate": 5e-05, - "epoch": 0.3848370365627201, - "step": 6010 - }, - { - "loss": 2.2748, - "grad_norm": 1.6712257862091064, - "learning_rate": 5e-05, - "epoch": 0.38515720048664914, - "step": 6015 - }, - { - "loss": 2.2457, - "grad_norm": 1.7343010902404785, - "learning_rate": 5e-05, - "epoch": 0.3854773644105782, - "step": 6020 - }, - { - "loss": 2.2694, - "grad_norm": 1.8106725215911865, - "learning_rate": 5e-05, - "epoch": 0.38579752833450726, - "step": 6025 - }, - { - "loss": 2.2828, - "grad_norm": 1.7711716890335083, - "learning_rate": 5e-05, - "epoch": 0.38611769225843634, - "step": 6030 - }, - { - "loss": 2.259, - "grad_norm": 1.7112571001052856, - "learning_rate": 5e-05, - "epoch": 0.38643785618236537, - "step": 6035 - }, - { - "loss": 2.2748, - "grad_norm": 1.7668615579605103, - "learning_rate": 5e-05, - "epoch": 0.3867580201062944, - "step": 6040 - }, - { - "loss": 2.2853, - "grad_norm": 1.67672598361969, - "learning_rate": 5e-05, - "epoch": 0.3870781840302235, - "step": 6045 - }, - { - "loss": 2.2871, - "grad_norm": 1.7809470891952515, - "learning_rate": 5e-05, - "epoch": 0.3873983479541525, - "step": 6050 - }, - { - "loss": 2.2709, - "grad_norm": 1.785502552986145, - "learning_rate": 5e-05, - "epoch": 0.3877185118780816, - "step": 6055 - }, - { - "loss": 2.2826, - "grad_norm": 1.725252389907837, - "learning_rate": 5e-05, - "epoch": 0.3880386758020106, - "step": 6060 - }, - { - "loss": 2.2652, - "grad_norm": 1.7655749320983887, - "learning_rate": 5e-05, - "epoch": 0.3883588397259397, - "step": 6065 - }, - { - "loss": 2.2745, - "grad_norm": 1.845263123512268, - "learning_rate": 5e-05, - "epoch": 0.38867900364986874, - "step": 6070 - }, - { - "loss": 2.2355, - "grad_norm": 1.7035220861434937, - "learning_rate": 5e-05, - "epoch": 0.38899916757379777, - "step": 6075 - }, - { - "loss": 2.2324, - "grad_norm": 1.7222847938537598, - "learning_rate": 5e-05, - "epoch": 0.38931933149772685, - "step": 6080 - }, - { - "loss": 2.2685, - "grad_norm": 1.8165398836135864, - "learning_rate": 5e-05, - "epoch": 0.3896394954216559, - "step": 6085 - }, - { - "loss": 2.2682, - "grad_norm": 1.6270705461502075, - "learning_rate": 5e-05, - "epoch": 0.38995965934558496, - "step": 6090 - }, - { - "loss": 2.2508, - "grad_norm": 1.7037124633789062, - "learning_rate": 5e-05, - "epoch": 0.390279823269514, - "step": 6095 - }, - { - "loss": 2.2543, - "grad_norm": 1.648330569267273, - "learning_rate": 5e-05, - "epoch": 0.390599987193443, - "step": 6100 - }, - { - "loss": 2.2578, - "grad_norm": 1.7594226598739624, - "learning_rate": 5e-05, - "epoch": 0.3909201511173721, - "step": 6105 - }, - { - "loss": 2.223, - "grad_norm": 1.8519033193588257, - "learning_rate": 5e-05, - "epoch": 0.39124031504130113, - "step": 6110 - }, - { - "loss": 2.239, - "grad_norm": 1.7203348875045776, - "learning_rate": 5e-05, - "epoch": 0.3915604789652302, - "step": 6115 - }, - { - "loss": 2.2772, - "grad_norm": 1.6320827007293701, - "learning_rate": 5e-05, - "epoch": 0.39188064288915925, - "step": 6120 - }, - { - "loss": 2.2555, - "grad_norm": 1.7894231081008911, - "learning_rate": 5e-05, - "epoch": 0.3922008068130883, - "step": 6125 - }, - { - "loss": 2.2534, - "grad_norm": 1.8454432487487793, - "learning_rate": 5e-05, - "epoch": 0.39252097073701736, - "step": 6130 - }, - { - "loss": 2.2645, - "grad_norm": 1.7246161699295044, - "learning_rate": 5e-05, - "epoch": 0.3928411346609464, - "step": 6135 - }, - { - "loss": 2.2586, - "grad_norm": 1.765830159187317, - "learning_rate": 5e-05, - "epoch": 0.3931612985848755, - "step": 6140 - }, - { - "loss": 2.2669, - "grad_norm": 1.6727474927902222, - "learning_rate": 5e-05, - "epoch": 0.3934814625088045, - "step": 6145 - }, - { - "loss": 2.2517, - "grad_norm": 1.69596529006958, - "learning_rate": 5e-05, - "epoch": 0.3938016264327336, - "step": 6150 - }, - { - "loss": 2.2718, - "grad_norm": 1.6889044046401978, - "learning_rate": 5e-05, - "epoch": 0.3941217903566626, - "step": 6155 - }, - { - "loss": 2.2556, - "grad_norm": 1.627752661705017, - "learning_rate": 5e-05, - "epoch": 0.39444195428059164, - "step": 6160 - }, - { - "loss": 2.2455, - "grad_norm": 1.7724320888519287, - "learning_rate": 5e-05, - "epoch": 0.39476211820452073, - "step": 6165 - }, - { - "loss": 2.2798, - "grad_norm": 1.7310431003570557, - "learning_rate": 5e-05, - "epoch": 0.39508228212844976, - "step": 6170 - }, - { - "loss": 2.24, - "grad_norm": 1.7662495374679565, - "learning_rate": 5e-05, - "epoch": 0.39540244605237884, - "step": 6175 - }, - { - "loss": 2.2278, - "grad_norm": 1.6548069715499878, - "learning_rate": 5e-05, - "epoch": 0.39572260997630787, - "step": 6180 - }, - { - "loss": 2.2691, - "grad_norm": 1.7314773797988892, - "learning_rate": 5e-05, - "epoch": 0.3960427739002369, - "step": 6185 - }, - { - "loss": 2.2485, - "grad_norm": 1.737084984779358, - "learning_rate": 5e-05, - "epoch": 0.396362937824166, - "step": 6190 - }, - { - "loss": 2.2649, - "grad_norm": 1.7657032012939453, - "learning_rate": 5e-05, - "epoch": 0.396683101748095, - "step": 6195 - }, - { - "loss": 2.2166, - "grad_norm": 1.7243016958236694, - "learning_rate": 5e-05, - "epoch": 0.3970032656720241, - "step": 6200 - }, - { - "eval_loss": 2.1302952766418457, - "eval_runtime": 13.3134, - "eval_samples_per_second": 153.83, - "eval_steps_per_second": 19.229, - "epoch": 0.3970032656720241, - "step": 6200 - }, - { - "loss": 2.2638, - "grad_norm": 1.847778081893921, - "learning_rate": 5e-05, - "epoch": 0.3973234295959531, - "step": 6205 - }, - { - "loss": 2.2624, - "grad_norm": 1.7355600595474243, - "learning_rate": 5e-05, - "epoch": 0.39764359351988215, - "step": 6210 - }, - { - "loss": 2.2611, - "grad_norm": 1.7112786769866943, - "learning_rate": 5e-05, - "epoch": 0.39796375744381124, - "step": 6215 - }, - { - "loss": 2.2695, - "grad_norm": 1.7076542377471924, - "learning_rate": 5e-05, - "epoch": 0.39828392136774027, - "step": 6220 - }, - { - "loss": 2.2448, - "grad_norm": 1.7070058584213257, - "learning_rate": 5e-05, - "epoch": 0.39860408529166935, - "step": 6225 - }, - { - "loss": 2.2445, - "grad_norm": 1.7254059314727783, - "learning_rate": 5e-05, - "epoch": 0.3989242492155984, - "step": 6230 - }, - { - "loss": 2.2435, - "grad_norm": 1.7010166645050049, - "learning_rate": 5e-05, - "epoch": 0.39924441313952747, - "step": 6235 - }, - { - "loss": 2.2417, - "grad_norm": 1.7349189519882202, - "learning_rate": 5e-05, - "epoch": 0.3995645770634565, - "step": 6240 - }, - { - "loss": 2.2544, - "grad_norm": 1.812296748161316, - "learning_rate": 5e-05, - "epoch": 0.3998847409873855, - "step": 6245 - }, - { - "loss": 2.2425, - "grad_norm": 1.7517497539520264, - "learning_rate": 5e-05, - "epoch": 0.4002049049113146, - "step": 6250 - }, - { - "loss": 2.2405, - "grad_norm": 1.7381399869918823, - "learning_rate": 5e-05, - "epoch": 0.40052506883524364, - "step": 6255 - }, - { - "loss": 2.2862, - "grad_norm": 1.7130184173583984, - "learning_rate": 5e-05, - "epoch": 0.4008452327591727, - "step": 6260 - }, - { - "loss": 2.2525, - "grad_norm": 1.766489028930664, - "learning_rate": 5e-05, - "epoch": 0.40116539668310175, - "step": 6265 - }, - { - "loss": 2.2372, - "grad_norm": 1.6739976406097412, - "learning_rate": 5e-05, - "epoch": 0.4014855606070308, - "step": 6270 - }, - { - "loss": 2.2145, - "grad_norm": 1.6726338863372803, - "learning_rate": 5e-05, - "epoch": 0.40180572453095986, - "step": 6275 - }, - { - "loss": 2.2622, - "grad_norm": 1.8587597608566284, - "learning_rate": 5e-05, - "epoch": 0.4021258884548889, - "step": 6280 - }, - { - "loss": 2.252, - "grad_norm": 1.7975720167160034, - "learning_rate": 5e-05, - "epoch": 0.402446052378818, - "step": 6285 - }, - { - "loss": 2.2645, - "grad_norm": 1.6847093105316162, - "learning_rate": 5e-05, - "epoch": 0.402766216302747, - "step": 6290 - }, - { - "loss": 2.213, - "grad_norm": 1.6540334224700928, - "learning_rate": 5e-05, - "epoch": 0.40308638022667603, - "step": 6295 - }, - { - "loss": 2.2244, - "grad_norm": 1.7274028062820435, - "learning_rate": 5e-05, - "epoch": 0.4034065441506051, - "step": 6300 - }, - { - "loss": 2.2298, - "grad_norm": 1.738688588142395, - "learning_rate": 5e-05, - "epoch": 0.40372670807453415, - "step": 6305 - }, - { - "loss": 2.2655, - "grad_norm": 1.7383339405059814, - "learning_rate": 5e-05, - "epoch": 0.40404687199846323, - "step": 6310 - }, - { - "loss": 2.2595, - "grad_norm": 1.6844770908355713, - "learning_rate": 5e-05, - "epoch": 0.40436703592239226, - "step": 6315 - }, - { - "loss": 2.2474, - "grad_norm": 1.6770930290222168, - "learning_rate": 5e-05, - "epoch": 0.40468719984632134, - "step": 6320 - }, - { - "loss": 2.2467, - "grad_norm": 1.775538682937622, - "learning_rate": 5e-05, - "epoch": 0.4050073637702504, - "step": 6325 - }, - { - "loss": 2.2238, - "grad_norm": 1.7302964925765991, - "learning_rate": 5e-05, - "epoch": 0.4053275276941794, - "step": 6330 - }, - { - "loss": 2.2545, - "grad_norm": 1.7631667852401733, - "learning_rate": 5e-05, - "epoch": 0.4056476916181085, - "step": 6335 - }, - { - "loss": 2.2462, - "grad_norm": 1.704640507698059, - "learning_rate": 5e-05, - "epoch": 0.4059678555420375, - "step": 6340 - }, - { - "loss": 2.2356, - "grad_norm": 1.849134087562561, - "learning_rate": 5e-05, - "epoch": 0.4062880194659666, - "step": 6345 - }, - { - "loss": 2.2713, - "grad_norm": 1.764115810394287, - "learning_rate": 5e-05, - "epoch": 0.4066081833898956, - "step": 6350 - }, - { - "loss": 2.268, - "grad_norm": 1.653106451034546, - "learning_rate": 5e-05, - "epoch": 0.40692834731382466, - "step": 6355 - }, - { - "loss": 2.2242, - "grad_norm": 1.816038727760315, - "learning_rate": 5e-05, - "epoch": 0.40724851123775374, - "step": 6360 - }, - { - "loss": 2.2568, - "grad_norm": 1.6303045749664307, - "learning_rate": 5e-05, - "epoch": 0.40756867516168277, - "step": 6365 - }, - { - "loss": 2.2756, - "grad_norm": 1.6868770122528076, - "learning_rate": 5e-05, - "epoch": 0.40788883908561185, - "step": 6370 - }, - { - "loss": 2.2556, - "grad_norm": 1.711913824081421, - "learning_rate": 5e-05, - "epoch": 0.4082090030095409, - "step": 6375 - }, - { - "loss": 2.2355, - "grad_norm": 1.6835819482803345, - "learning_rate": 5e-05, - "epoch": 0.4085291669334699, - "step": 6380 - }, - { - "loss": 2.2559, - "grad_norm": 1.7608588933944702, - "learning_rate": 5e-05, - "epoch": 0.408849330857399, - "step": 6385 - }, - { - "loss": 2.252, - "grad_norm": 1.6766396760940552, - "learning_rate": 5e-05, - "epoch": 0.409169494781328, - "step": 6390 - }, - { - "loss": 2.2475, - "grad_norm": 1.6831003427505493, - "learning_rate": 5e-05, - "epoch": 0.4094896587052571, - "step": 6395 - }, - { - "loss": 2.2639, - "grad_norm": 1.7193617820739746, - "learning_rate": 5e-05, - "epoch": 0.40980982262918614, - "step": 6400 - }, - { - "eval_loss": 2.11531400680542, - "eval_runtime": 9.5033, - "eval_samples_per_second": 215.505, - "eval_steps_per_second": 26.938, - "epoch": 0.40980982262918614, - "step": 6400 - }, - { - "loss": 2.2536, - "grad_norm": 1.6074965000152588, - "learning_rate": 5e-05, - "epoch": 0.4101299865531152, - "step": 6405 - }, - { - "loss": 2.2417, - "grad_norm": 1.6199990510940552, - "learning_rate": 5e-05, - "epoch": 0.41045015047704425, - "step": 6410 - }, - { - "loss": 2.2629, - "grad_norm": 1.6224853992462158, - "learning_rate": 5e-05, - "epoch": 0.4107703144009733, - "step": 6415 - }, - { - "loss": 2.2491, - "grad_norm": 1.779128909111023, - "learning_rate": 5e-05, - "epoch": 0.41109047832490236, - "step": 6420 - }, - { - "loss": 2.2301, - "grad_norm": 1.7013006210327148, - "learning_rate": 5e-05, - "epoch": 0.4114106422488314, - "step": 6425 - }, - { - "loss": 2.2517, - "grad_norm": 1.745300531387329, - "learning_rate": 5e-05, - "epoch": 0.4117308061727605, - "step": 6430 - }, - { - "loss": 2.2743, - "grad_norm": 1.670337438583374, - "learning_rate": 5e-05, - "epoch": 0.4120509700966895, - "step": 6435 - }, - { - "loss": 2.2511, - "grad_norm": 1.7760534286499023, - "learning_rate": 5e-05, - "epoch": 0.41237113402061853, - "step": 6440 - }, - { - "loss": 2.2577, - "grad_norm": 1.7097136974334717, - "learning_rate": 5e-05, - "epoch": 0.4126912979445476, - "step": 6445 - }, - { - "loss": 2.2357, - "grad_norm": 1.738032341003418, - "learning_rate": 5e-05, - "epoch": 0.41301146186847665, - "step": 6450 - }, - { - "loss": 2.2212, - "grad_norm": 1.6849381923675537, - "learning_rate": 5e-05, - "epoch": 0.41333162579240573, - "step": 6455 - }, - { - "loss": 2.233, - "grad_norm": 1.8453466892242432, - "learning_rate": 5e-05, - "epoch": 0.41365178971633476, - "step": 6460 - }, - { - "loss": 2.2687, - "grad_norm": 1.7124505043029785, - "learning_rate": 5e-05, - "epoch": 0.4139719536402638, - "step": 6465 - }, - { - "loss": 2.2342, - "grad_norm": 1.730618953704834, - "learning_rate": 5e-05, - "epoch": 0.4142921175641929, - "step": 6470 - }, - { - "loss": 2.2618, - "grad_norm": 1.7143526077270508, - "learning_rate": 5e-05, - "epoch": 0.4146122814881219, - "step": 6475 - }, - { - "loss": 2.267, - "grad_norm": 1.7568591833114624, - "learning_rate": 5e-05, - "epoch": 0.414932445412051, - "step": 6480 - }, - { - "loss": 2.2626, - "grad_norm": 1.663020372390747, - "learning_rate": 5e-05, - "epoch": 0.41525260933598, - "step": 6485 - }, - { - "loss": 2.2408, - "grad_norm": 1.6989688873291016, - "learning_rate": 5e-05, - "epoch": 0.4155727732599091, - "step": 6490 - }, - { - "loss": 2.2769, - "grad_norm": 1.648116946220398, - "learning_rate": 5e-05, - "epoch": 0.41589293718383813, - "step": 6495 - }, - { - "loss": 2.2234, - "grad_norm": 1.7310383319854736, - "learning_rate": 5e-05, - "epoch": 0.41621310110776716, - "step": 6500 - }, - { - "loss": 2.2395, - "grad_norm": 1.7397419214248657, - "learning_rate": 5e-05, - "epoch": 0.41653326503169624, - "step": 6505 - }, - { - "loss": 2.237, - "grad_norm": 1.7233692407608032, - "learning_rate": 5e-05, - "epoch": 0.41685342895562527, - "step": 6510 - }, - { - "loss": 2.2458, - "grad_norm": 1.7460954189300537, - "learning_rate": 5e-05, - "epoch": 0.41717359287955436, - "step": 6515 - }, - { - "loss": 2.2398, - "grad_norm": 1.770958423614502, - "learning_rate": 5e-05, - "epoch": 0.4174937568034834, - "step": 6520 - }, - { - "loss": 2.2217, - "grad_norm": 1.7674636840820312, - "learning_rate": 5e-05, - "epoch": 0.4178139207274124, - "step": 6525 - }, - { - "loss": 2.2629, - "grad_norm": 1.7418832778930664, - "learning_rate": 5e-05, - "epoch": 0.4181340846513415, - "step": 6530 - }, - { - "loss": 2.2547, - "grad_norm": 1.6848324537277222, - "learning_rate": 5e-05, - "epoch": 0.4184542485752705, - "step": 6535 - }, - { - "loss": 2.2029, - "grad_norm": 1.698730707168579, - "learning_rate": 5e-05, - "epoch": 0.4187744124991996, - "step": 6540 - }, - { - "loss": 2.2331, - "grad_norm": 1.7850102186203003, - "learning_rate": 5e-05, - "epoch": 0.41909457642312864, - "step": 6545 - }, - { - "loss": 2.2741, - "grad_norm": 1.8512533903121948, - "learning_rate": 5e-05, - "epoch": 0.41941474034705767, - "step": 6550 - }, - { - "loss": 2.2612, - "grad_norm": 1.7491039037704468, - "learning_rate": 5e-05, - "epoch": 0.41973490427098675, - "step": 6555 - }, - { - "loss": 2.2617, - "grad_norm": 1.7620813846588135, - "learning_rate": 5e-05, - "epoch": 0.4200550681949158, - "step": 6560 - }, - { - "loss": 2.2338, - "grad_norm": 1.7340549230575562, - "learning_rate": 5e-05, - "epoch": 0.42037523211884487, - "step": 6565 - }, - { - "loss": 2.2702, - "grad_norm": 1.6962077617645264, - "learning_rate": 5e-05, - "epoch": 0.4206953960427739, - "step": 6570 - }, - { - "loss": 2.2338, - "grad_norm": 1.6991527080535889, - "learning_rate": 5e-05, - "epoch": 0.421015559966703, - "step": 6575 - }, - { - "loss": 2.2522, - "grad_norm": 1.74476158618927, - "learning_rate": 5e-05, - "epoch": 0.421335723890632, - "step": 6580 - }, - { - "loss": 2.2406, - "grad_norm": 1.763519287109375, - "learning_rate": 5e-05, - "epoch": 0.42165588781456104, - "step": 6585 - }, - { - "loss": 2.2464, - "grad_norm": 1.675957202911377, - "learning_rate": 5e-05, - "epoch": 0.4219760517384901, - "step": 6590 - }, - { - "loss": 2.2329, - "grad_norm": 1.7178364992141724, - "learning_rate": 5e-05, - "epoch": 0.42229621566241915, - "step": 6595 - }, - { - "loss": 2.25, - "grad_norm": 1.843867301940918, - "learning_rate": 5e-05, - "epoch": 0.42261637958634823, - "step": 6600 - }, - { - "eval_loss": 2.110640048980713, - "eval_runtime": 9.6155, - "eval_samples_per_second": 212.989, - "eval_steps_per_second": 26.624, - "epoch": 0.42261637958634823, - "step": 6600 - }, - { - "loss": 2.2297, - "grad_norm": 1.850877046585083, - "learning_rate": 5e-05, - "epoch": 0.42293654351027726, - "step": 6605 - }, - { - "loss": 2.2479, - "grad_norm": 1.7398591041564941, - "learning_rate": 5e-05, - "epoch": 0.4232567074342063, - "step": 6610 - }, - { - "loss": 2.2746, - "grad_norm": 1.7509093284606934, - "learning_rate": 5e-05, - "epoch": 0.4235768713581354, - "step": 6615 - }, - { - "loss": 2.2588, - "grad_norm": 1.7495396137237549, - "learning_rate": 5e-05, - "epoch": 0.4238970352820644, - "step": 6620 - }, - { - "loss": 2.2274, - "grad_norm": 1.6394826173782349, - "learning_rate": 5e-05, - "epoch": 0.4242171992059935, - "step": 6625 - }, - { - "loss": 2.2229, - "grad_norm": 1.7712039947509766, - "learning_rate": 5e-05, - "epoch": 0.4245373631299225, - "step": 6630 - }, - { - "loss": 2.2761, - "grad_norm": 1.6803395748138428, - "learning_rate": 5e-05, - "epoch": 0.42485752705385155, - "step": 6635 - }, - { - "loss": 2.2622, - "grad_norm": 1.7649556398391724, - "learning_rate": 5e-05, - "epoch": 0.42517769097778063, - "step": 6640 - }, - { - "loss": 2.246, - "grad_norm": 1.8420475721359253, - "learning_rate": 5e-05, - "epoch": 0.42549785490170966, - "step": 6645 - }, - { - "loss": 2.2621, - "grad_norm": 1.8346716165542603, - "learning_rate": 5e-05, - "epoch": 0.42581801882563874, - "step": 6650 - }, - { - "loss": 2.2548, - "grad_norm": 1.6930170059204102, - "learning_rate": 5e-05, - "epoch": 0.4261381827495678, - "step": 6655 - }, - { - "loss": 2.2308, - "grad_norm": 1.7325392961502075, - "learning_rate": 5e-05, - "epoch": 0.42645834667349686, - "step": 6660 - }, - { - "loss": 2.2261, - "grad_norm": 1.6914280652999878, - "learning_rate": 5e-05, - "epoch": 0.4267785105974259, - "step": 6665 - }, - { - "loss": 2.2445, - "grad_norm": 1.7235634326934814, - "learning_rate": 5e-05, - "epoch": 0.4270986745213549, - "step": 6670 - }, - { - "loss": 2.2277, - "grad_norm": 1.6718071699142456, - "learning_rate": 5e-05, - "epoch": 0.427418838445284, - "step": 6675 - }, - { - "loss": 2.245, - "grad_norm": 1.6824864149093628, - "learning_rate": 5e-05, - "epoch": 0.42773900236921303, - "step": 6680 - }, - { - "loss": 2.2075, - "grad_norm": 1.6548774242401123, - "learning_rate": 5e-05, - "epoch": 0.4280591662931421, - "step": 6685 - }, - { - "loss": 2.2419, - "grad_norm": 1.6627106666564941, - "learning_rate": 5e-05, - "epoch": 0.42837933021707114, - "step": 6690 - }, - { - "loss": 2.2613, - "grad_norm": 1.6999781131744385, - "learning_rate": 5e-05, - "epoch": 0.42869949414100017, - "step": 6695 - }, - { - "loss": 2.2318, - "grad_norm": 1.8439428806304932, - "learning_rate": 5e-05, - "epoch": 0.42901965806492925, - "step": 6700 - }, - { - "loss": 2.2611, - "grad_norm": 1.7762128114700317, - "learning_rate": 5e-05, - "epoch": 0.4293398219888583, - "step": 6705 - }, - { - "loss": 2.2322, - "grad_norm": 1.7934534549713135, - "learning_rate": 5e-05, - "epoch": 0.42965998591278737, - "step": 6710 - }, - { - "loss": 2.2503, - "grad_norm": 1.6476908922195435, - "learning_rate": 5e-05, - "epoch": 0.4299801498367164, - "step": 6715 - }, - { - "loss": 2.2723, - "grad_norm": 1.757597804069519, - "learning_rate": 5e-05, - "epoch": 0.4303003137606454, - "step": 6720 - }, - { - "loss": 2.2294, - "grad_norm": 1.6976431608200073, - "learning_rate": 5e-05, - "epoch": 0.4306204776845745, - "step": 6725 - }, - { - "loss": 2.2551, - "grad_norm": 1.81328284740448, - "learning_rate": 5e-05, - "epoch": 0.43094064160850354, - "step": 6730 - }, - { - "loss": 2.2393, - "grad_norm": 1.7719358205795288, - "learning_rate": 5e-05, - "epoch": 0.4312608055324326, - "step": 6735 - }, - { - "loss": 2.2426, - "grad_norm": 1.8044530153274536, - "learning_rate": 5e-05, - "epoch": 0.43158096945636165, - "step": 6740 - }, - { - "loss": 2.2435, - "grad_norm": 1.760985255241394, - "learning_rate": 5e-05, - "epoch": 0.43190113338029074, - "step": 6745 - }, - { - "loss": 2.2433, - "grad_norm": 1.7239030599594116, - "learning_rate": 5e-05, - "epoch": 0.43222129730421976, - "step": 6750 - }, - { - "loss": 2.2013, - "grad_norm": 1.7211287021636963, - "learning_rate": 5e-05, - "epoch": 0.4325414612281488, - "step": 6755 - }, - { - "loss": 2.2658, - "grad_norm": 1.7456564903259277, - "learning_rate": 5e-05, - "epoch": 0.4328616251520779, - "step": 6760 - }, - { - "loss": 2.2225, - "grad_norm": 1.7644805908203125, - "learning_rate": 5e-05, - "epoch": 0.4331817890760069, - "step": 6765 - }, - { - "loss": 2.2455, - "grad_norm": 1.6574170589447021, - "learning_rate": 5e-05, - "epoch": 0.433501952999936, - "step": 6770 - }, - { - "loss": 2.2565, - "grad_norm": 1.673085331916809, - "learning_rate": 5e-05, - "epoch": 0.433822116923865, - "step": 6775 - }, - { - "loss": 2.2561, - "grad_norm": 1.7815220355987549, - "learning_rate": 5e-05, - "epoch": 0.43414228084779405, - "step": 6780 - }, - { - "loss": 2.2375, - "grad_norm": 1.7830764055252075, - "learning_rate": 5e-05, - "epoch": 0.43446244477172313, - "step": 6785 - }, - { - "loss": 2.2547, - "grad_norm": 1.7634798288345337, - "learning_rate": 5e-05, - "epoch": 0.43478260869565216, - "step": 6790 - }, - { - "loss": 2.2365, - "grad_norm": 1.771299123764038, - "learning_rate": 5e-05, - "epoch": 0.43510277261958125, - "step": 6795 - }, - { - "loss": 2.2699, - "grad_norm": 1.708333134651184, - "learning_rate": 5e-05, - "epoch": 0.4354229365435103, - "step": 6800 - }, - { - "eval_loss": 2.0987234115600586, - "eval_runtime": 9.3179, - "eval_samples_per_second": 219.792, - "eval_steps_per_second": 27.474, - "epoch": 0.4354229365435103, - "step": 6800 - }, - { - "loss": 2.2188, - "grad_norm": 1.8107043504714966, - "learning_rate": 5e-05, - "epoch": 0.4357431004674393, - "step": 6805 - }, - { - "loss": 2.2213, - "grad_norm": 1.707737922668457, - "learning_rate": 5e-05, - "epoch": 0.4360632643913684, - "step": 6810 - }, - { - "loss": 2.261, - "grad_norm": 1.8159151077270508, - "learning_rate": 5e-05, - "epoch": 0.4363834283152974, - "step": 6815 - }, - { - "loss": 2.2857, - "grad_norm": 1.6932034492492676, - "learning_rate": 5e-05, - "epoch": 0.4367035922392265, - "step": 6820 - }, - { - "loss": 2.222, - "grad_norm": 1.8024814128875732, - "learning_rate": 5e-05, - "epoch": 0.43702375616315553, - "step": 6825 - }, - { - "loss": 2.2387, - "grad_norm": 1.7472243309020996, - "learning_rate": 5e-05, - "epoch": 0.4373439200870846, - "step": 6830 - }, - { - "loss": 2.2135, - "grad_norm": 1.7393558025360107, - "learning_rate": 5e-05, - "epoch": 0.43766408401101364, - "step": 6835 - }, - { - "loss": 2.2406, - "grad_norm": 1.8635519742965698, - "learning_rate": 5e-05, - "epoch": 0.43798424793494267, - "step": 6840 - }, - { - "loss": 2.2176, - "grad_norm": 1.757818579673767, - "learning_rate": 5e-05, - "epoch": 0.43830441185887176, - "step": 6845 - }, - { - "loss": 2.2769, - "grad_norm": 1.670522928237915, - "learning_rate": 5e-05, - "epoch": 0.4386245757828008, - "step": 6850 - }, - { - "loss": 2.2687, - "grad_norm": 1.708299160003662, - "learning_rate": 5e-05, - "epoch": 0.43894473970672987, - "step": 6855 - }, - { - "loss": 2.2174, - "grad_norm": 1.6819125413894653, - "learning_rate": 5e-05, - "epoch": 0.4392649036306589, - "step": 6860 - }, - { - "loss": 2.2336, - "grad_norm": 1.7067598104476929, - "learning_rate": 5e-05, - "epoch": 0.4395850675545879, - "step": 6865 - }, - { - "loss": 2.2244, - "grad_norm": 1.6839826107025146, - "learning_rate": 5e-05, - "epoch": 0.439905231478517, - "step": 6870 - }, - { - "loss": 2.2274, - "grad_norm": 1.8001630306243896, - "learning_rate": 5e-05, - "epoch": 0.44022539540244604, - "step": 6875 - }, - { - "loss": 2.2402, - "grad_norm": 1.7565686702728271, - "learning_rate": 5e-05, - "epoch": 0.4405455593263751, - "step": 6880 - }, - { - "loss": 2.2296, - "grad_norm": 1.6886423826217651, - "learning_rate": 5e-05, - "epoch": 0.44086572325030415, - "step": 6885 - }, - { - "loss": 2.2304, - "grad_norm": 1.713010311126709, - "learning_rate": 5e-05, - "epoch": 0.4411858871742332, - "step": 6890 - }, - { - "loss": 2.2443, - "grad_norm": 1.6640557050704956, - "learning_rate": 5e-05, - "epoch": 0.44150605109816227, - "step": 6895 - }, - { - "loss": 2.223, - "grad_norm": 1.6528607606887817, - "learning_rate": 5e-05, - "epoch": 0.4418262150220913, - "step": 6900 - }, - { - "loss": 2.2264, - "grad_norm": 1.6679795980453491, - "learning_rate": 5e-05, - "epoch": 0.4421463789460204, - "step": 6905 - }, - { - "loss": 2.2393, - "grad_norm": 1.7212443351745605, - "learning_rate": 5e-05, - "epoch": 0.4424665428699494, - "step": 6910 - }, - { - "loss": 2.236, - "grad_norm": 1.671025037765503, - "learning_rate": 5e-05, - "epoch": 0.4427867067938785, - "step": 6915 - }, - { - "loss": 2.2512, - "grad_norm": 1.6580772399902344, - "learning_rate": 5e-05, - "epoch": 0.4431068707178075, - "step": 6920 - }, - { - "loss": 2.2237, - "grad_norm": 1.8094478845596313, - "learning_rate": 5e-05, - "epoch": 0.44342703464173655, - "step": 6925 - }, - { - "loss": 2.2487, - "grad_norm": 1.6925034523010254, - "learning_rate": 5e-05, - "epoch": 0.44374719856566563, - "step": 6930 - }, - { - "loss": 2.2323, - "grad_norm": 1.6939678192138672, - "learning_rate": 5e-05, - "epoch": 0.44406736248959466, - "step": 6935 - }, - { - "loss": 2.1992, - "grad_norm": 1.750412940979004, - "learning_rate": 5e-05, - "epoch": 0.44438752641352375, - "step": 6940 - }, - { - "loss": 2.2182, - "grad_norm": 1.6810964345932007, - "learning_rate": 5e-05, - "epoch": 0.4447076903374528, - "step": 6945 - }, - { - "loss": 2.26, - "grad_norm": 1.6222447156906128, - "learning_rate": 5e-05, - "epoch": 0.4450278542613818, - "step": 6950 - }, - { - "loss": 2.237, - "grad_norm": 1.71504545211792, - "learning_rate": 5e-05, - "epoch": 0.4453480181853109, - "step": 6955 - }, - { - "loss": 2.2384, - "grad_norm": 1.7647539377212524, - "learning_rate": 5e-05, - "epoch": 0.4456681821092399, - "step": 6960 - }, - { - "loss": 2.2321, - "grad_norm": 1.815050482749939, - "learning_rate": 5e-05, - "epoch": 0.445988346033169, - "step": 6965 - }, - { - "loss": 2.2339, - "grad_norm": 1.8233994245529175, - "learning_rate": 5e-05, - "epoch": 0.44630850995709803, - "step": 6970 - }, - { - "loss": 2.2443, - "grad_norm": 1.7368268966674805, - "learning_rate": 5e-05, - "epoch": 0.44662867388102706, - "step": 6975 - }, - { - "loss": 2.2406, - "grad_norm": 1.6867866516113281, - "learning_rate": 5e-05, - "epoch": 0.44694883780495615, - "step": 6980 - }, - { - "loss": 2.2212, - "grad_norm": 1.633429765701294, - "learning_rate": 5e-05, - "epoch": 0.4472690017288852, - "step": 6985 - }, - { - "loss": 2.2543, - "grad_norm": 1.6579304933547974, - "learning_rate": 5e-05, - "epoch": 0.44758916565281426, - "step": 6990 - }, - { - "loss": 2.2339, - "grad_norm": 1.6452136039733887, - "learning_rate": 5e-05, - "epoch": 0.4479093295767433, - "step": 6995 - }, - { - "loss": 2.23, - "grad_norm": 1.670894980430603, - "learning_rate": 5e-05, - "epoch": 0.44822949350067237, - "step": 7000 - }, - { - "eval_loss": 2.09128475189209, - "eval_runtime": 9.5706, - "eval_samples_per_second": 213.988, - "eval_steps_per_second": 26.749, - "epoch": 0.44822949350067237, - "step": 7000 - }, - { - "loss": 2.2297, - "grad_norm": 1.6882041692733765, - "learning_rate": 5e-05, - "epoch": 0.4485496574246014, - "step": 7005 - }, - { - "loss": 2.217, - "grad_norm": 1.721706748008728, - "learning_rate": 5e-05, - "epoch": 0.44886982134853043, - "step": 7010 - }, - { - "loss": 2.2618, - "grad_norm": 1.7233175039291382, - "learning_rate": 5e-05, - "epoch": 0.4491899852724595, - "step": 7015 - }, - { - "loss": 2.2284, - "grad_norm": 1.7620608806610107, - "learning_rate": 5e-05, - "epoch": 0.44951014919638854, - "step": 7020 - }, - { - "loss": 2.2098, - "grad_norm": 1.692572832107544, - "learning_rate": 5e-05, - "epoch": 0.4498303131203176, - "step": 7025 - }, - { - "loss": 2.2384, - "grad_norm": 1.8099894523620605, - "learning_rate": 5e-05, - "epoch": 0.45015047704424666, - "step": 7030 - }, - { - "loss": 2.2435, - "grad_norm": 1.7223204374313354, - "learning_rate": 5e-05, - "epoch": 0.4504706409681757, - "step": 7035 - }, - { - "loss": 2.2161, - "grad_norm": 1.692357063293457, - "learning_rate": 5e-05, - "epoch": 0.45079080489210477, - "step": 7040 - }, - { - "loss": 2.2449, - "grad_norm": 1.6669970750808716, - "learning_rate": 5e-05, - "epoch": 0.4511109688160338, - "step": 7045 - }, - { - "loss": 2.2364, - "grad_norm": 1.719010353088379, - "learning_rate": 5e-05, - "epoch": 0.4514311327399629, - "step": 7050 - }, - { - "loss": 2.2415, - "grad_norm": 1.6553248167037964, - "learning_rate": 5e-05, - "epoch": 0.4517512966638919, - "step": 7055 - }, - { - "loss": 2.2294, - "grad_norm": 1.6735188961029053, - "learning_rate": 5e-05, - "epoch": 0.45207146058782094, - "step": 7060 - }, - { - "loss": 2.2371, - "grad_norm": 1.6721162796020508, - "learning_rate": 5e-05, - "epoch": 0.45239162451175, - "step": 7065 - }, - { - "loss": 2.2304, - "grad_norm": 1.7263718843460083, - "learning_rate": 5e-05, - "epoch": 0.45271178843567905, - "step": 7070 - }, - { - "loss": 2.2442, - "grad_norm": 1.7064590454101562, - "learning_rate": 5e-05, - "epoch": 0.45303195235960814, - "step": 7075 - }, - { - "loss": 2.2804, - "grad_norm": 1.7297579050064087, - "learning_rate": 5e-05, - "epoch": 0.45335211628353717, - "step": 7080 - }, - { - "loss": 2.213, - "grad_norm": 1.6997263431549072, - "learning_rate": 5e-05, - "epoch": 0.45367228020746625, - "step": 7085 - }, - { - "loss": 2.2262, - "grad_norm": 1.6889290809631348, - "learning_rate": 5e-05, - "epoch": 0.4539924441313953, - "step": 7090 - }, - { - "loss": 2.2396, - "grad_norm": 1.6912401914596558, - "learning_rate": 5e-05, - "epoch": 0.4543126080553243, - "step": 7095 - }, - { - "loss": 2.2471, - "grad_norm": 1.6478922367095947, - "learning_rate": 5e-05, - "epoch": 0.4546327719792534, - "step": 7100 - }, - { - "loss": 2.1881, - "grad_norm": 1.6519252061843872, - "learning_rate": 5e-05, - "epoch": 0.4549529359031824, - "step": 7105 - }, - { - "loss": 2.2228, - "grad_norm": 1.7075591087341309, - "learning_rate": 5e-05, - "epoch": 0.4552730998271115, - "step": 7110 - }, - { - "loss": 2.2266, - "grad_norm": 1.64700448513031, - "learning_rate": 5e-05, - "epoch": 0.45559326375104053, - "step": 7115 - }, - { - "loss": 2.222, - "grad_norm": 1.6712502241134644, - "learning_rate": 5e-05, - "epoch": 0.45591342767496956, - "step": 7120 - }, - { - "loss": 2.2108, - "grad_norm": 1.7444020509719849, - "learning_rate": 5e-05, - "epoch": 0.45623359159889865, - "step": 7125 - }, - { - "loss": 2.2235, - "grad_norm": 1.739864468574524, - "learning_rate": 5e-05, - "epoch": 0.4565537555228277, - "step": 7130 - }, - { - "loss": 2.2242, - "grad_norm": 1.6936887502670288, - "learning_rate": 5e-05, - "epoch": 0.45687391944675676, - "step": 7135 - }, - { - "loss": 2.2379, - "grad_norm": 1.6458464860916138, - "learning_rate": 5e-05, - "epoch": 0.4571940833706858, - "step": 7140 - }, - { - "loss": 2.2267, - "grad_norm": 1.6865026950836182, - "learning_rate": 5e-05, - "epoch": 0.4575142472946148, - "step": 7145 - }, - { - "loss": 2.2165, - "grad_norm": 1.769681453704834, - "learning_rate": 5e-05, - "epoch": 0.4578344112185439, - "step": 7150 - }, - { - "loss": 2.2495, - "grad_norm": 1.7259198427200317, - "learning_rate": 5e-05, - "epoch": 0.45815457514247293, - "step": 7155 - }, - { - "loss": 2.2434, - "grad_norm": 1.6952046155929565, - "learning_rate": 5e-05, - "epoch": 0.458474739066402, - "step": 7160 - }, - { - "loss": 2.2313, - "grad_norm": 1.7676249742507935, - "learning_rate": 5e-05, - "epoch": 0.45879490299033104, - "step": 7165 - }, - { - "loss": 2.2408, - "grad_norm": 1.6167271137237549, - "learning_rate": 5e-05, - "epoch": 0.45911506691426013, - "step": 7170 - }, - { - "loss": 2.2076, - "grad_norm": 1.7571128606796265, - "learning_rate": 5e-05, - "epoch": 0.45943523083818916, - "step": 7175 - }, - { - "loss": 2.2307, - "grad_norm": 1.619318962097168, - "learning_rate": 5e-05, - "epoch": 0.4597553947621182, - "step": 7180 - }, - { - "loss": 2.2399, - "grad_norm": 1.6977887153625488, - "learning_rate": 5e-05, - "epoch": 0.46007555868604727, - "step": 7185 - }, - { - "loss": 2.2279, - "grad_norm": 1.7176746129989624, - "learning_rate": 5e-05, - "epoch": 0.4603957226099763, - "step": 7190 - }, - { - "loss": 2.2263, - "grad_norm": 1.7717937231063843, - "learning_rate": 5e-05, - "epoch": 0.4607158865339054, - "step": 7195 - }, - { - "loss": 2.2539, - "grad_norm": 1.7532376050949097, - "learning_rate": 5e-05, - "epoch": 0.4610360504578344, - "step": 7200 - }, - { - "eval_loss": 2.105297565460205, - "eval_runtime": 9.2265, - "eval_samples_per_second": 221.969, - "eval_steps_per_second": 27.746, - "epoch": 0.4610360504578344, - "step": 7200 - }, - { - "loss": 2.2402, - "grad_norm": 1.6974916458129883, - "learning_rate": 5e-05, - "epoch": 0.46135621438176344, - "step": 7205 - }, - { - "loss": 2.205, - "grad_norm": 1.703689455986023, - "learning_rate": 5e-05, - "epoch": 0.4616763783056925, - "step": 7210 - }, - { - "loss": 2.2214, - "grad_norm": 1.6790088415145874, - "learning_rate": 5e-05, - "epoch": 0.46199654222962155, - "step": 7215 - }, - { - "loss": 2.2013, - "grad_norm": 1.7056132555007935, - "learning_rate": 5e-05, - "epoch": 0.46231670615355064, - "step": 7220 - }, - { - "loss": 2.2226, - "grad_norm": 1.7162805795669556, - "learning_rate": 5e-05, - "epoch": 0.46263687007747967, - "step": 7225 - }, - { - "loss": 2.2465, - "grad_norm": 1.6653958559036255, - "learning_rate": 5e-05, - "epoch": 0.4629570340014087, - "step": 7230 - }, - { - "loss": 2.2545, - "grad_norm": 1.7309342622756958, - "learning_rate": 5e-05, - "epoch": 0.4632771979253378, - "step": 7235 - }, - { - "loss": 2.2474, - "grad_norm": 1.6637336015701294, - "learning_rate": 5e-05, - "epoch": 0.4635973618492668, - "step": 7240 - }, - { - "loss": 2.227, - "grad_norm": 1.6410720348358154, - "learning_rate": 5e-05, - "epoch": 0.4639175257731959, - "step": 7245 - }, - { - "loss": 2.2268, - "grad_norm": 1.7074912786483765, - "learning_rate": 5e-05, - "epoch": 0.4642376896971249, - "step": 7250 - }, - { - "loss": 2.212, - "grad_norm": 1.7265816926956177, - "learning_rate": 5e-05, - "epoch": 0.464557853621054, - "step": 7255 - }, - { - "loss": 2.2419, - "grad_norm": 1.7796136140823364, - "learning_rate": 5e-05, - "epoch": 0.46487801754498304, - "step": 7260 - }, - { - "loss": 2.2388, - "grad_norm": 1.748008370399475, - "learning_rate": 5e-05, - "epoch": 0.46519818146891206, - "step": 7265 - }, - { - "loss": 2.2236, - "grad_norm": 1.7407780885696411, - "learning_rate": 5e-05, - "epoch": 0.46551834539284115, - "step": 7270 - }, - { - "loss": 2.2306, - "grad_norm": 1.7564735412597656, - "learning_rate": 5e-05, - "epoch": 0.4658385093167702, - "step": 7275 - }, - { - "loss": 2.246, - "grad_norm": 1.8281434774398804, - "learning_rate": 5e-05, - "epoch": 0.46615867324069926, - "step": 7280 - }, - { - "loss": 2.2127, - "grad_norm": 1.7006824016571045, - "learning_rate": 5e-05, - "epoch": 0.4664788371646283, - "step": 7285 - }, - { - "loss": 2.2081, - "grad_norm": 1.7190055847167969, - "learning_rate": 5e-05, - "epoch": 0.4667990010885573, - "step": 7290 - }, - { - "loss": 2.2096, - "grad_norm": 1.6515896320343018, - "learning_rate": 5e-05, - "epoch": 0.4671191650124864, - "step": 7295 - }, - { - "loss": 2.2523, - "grad_norm": 1.7050222158432007, - "learning_rate": 5e-05, - "epoch": 0.46743932893641543, - "step": 7300 - }, - { - "loss": 2.257, - "grad_norm": 1.8344449996948242, - "learning_rate": 5e-05, - "epoch": 0.4677594928603445, - "step": 7305 - }, - { - "loss": 2.2389, - "grad_norm": 1.762683391571045, - "learning_rate": 5e-05, - "epoch": 0.46807965678427355, - "step": 7310 - }, - { - "loss": 2.2384, - "grad_norm": 1.7812387943267822, - "learning_rate": 5e-05, - "epoch": 0.4683998207082026, - "step": 7315 - }, - { - "loss": 2.2468, - "grad_norm": 1.6442910432815552, - "learning_rate": 5e-05, - "epoch": 0.46871998463213166, - "step": 7320 - }, - { - "loss": 2.1976, - "grad_norm": 1.7651413679122925, - "learning_rate": 5e-05, - "epoch": 0.4690401485560607, - "step": 7325 - }, - { - "loss": 2.2286, - "grad_norm": 1.6484975814819336, - "learning_rate": 5e-05, - "epoch": 0.46936031247998977, - "step": 7330 - }, - { - "loss": 2.2328, - "grad_norm": 1.7871202230453491, - "learning_rate": 5e-05, - "epoch": 0.4696804764039188, - "step": 7335 - }, - { - "loss": 2.2273, - "grad_norm": 1.7313917875289917, - "learning_rate": 5e-05, - "epoch": 0.4700006403278479, - "step": 7340 - }, - { - "loss": 2.2665, - "grad_norm": 1.7805308103561401, - "learning_rate": 5e-05, - "epoch": 0.4703208042517769, - "step": 7345 - }, - { - "loss": 2.2815, - "grad_norm": 1.734679937362671, - "learning_rate": 5e-05, - "epoch": 0.47064096817570594, - "step": 7350 - }, - { - "loss": 2.2212, - "grad_norm": 1.696459412574768, - "learning_rate": 5e-05, - "epoch": 0.470961132099635, - "step": 7355 - }, - { - "loss": 2.2609, - "grad_norm": 1.767733097076416, - "learning_rate": 5e-05, - "epoch": 0.47128129602356406, - "step": 7360 - }, - { - "loss": 2.2074, - "grad_norm": 1.703615665435791, - "learning_rate": 5e-05, - "epoch": 0.47160145994749314, - "step": 7365 - }, - { - "loss": 2.2239, - "grad_norm": 1.6856378316879272, - "learning_rate": 5e-05, - "epoch": 0.47192162387142217, - "step": 7370 - }, - { - "loss": 2.2218, - "grad_norm": 1.75584876537323, - "learning_rate": 5e-05, - "epoch": 0.4722417877953512, - "step": 7375 - }, - { - "loss": 2.2304, - "grad_norm": 1.733469843864441, - "learning_rate": 5e-05, - "epoch": 0.4725619517192803, - "step": 7380 - }, - { - "loss": 2.242, - "grad_norm": 1.6238700151443481, - "learning_rate": 5e-05, - "epoch": 0.4728821156432093, - "step": 7385 - }, - { - "loss": 2.2392, - "grad_norm": 1.7466235160827637, - "learning_rate": 5e-05, - "epoch": 0.4732022795671384, - "step": 7390 - }, - { - "loss": 2.2455, - "grad_norm": 1.8809025287628174, - "learning_rate": 5e-05, - "epoch": 0.4735224434910674, - "step": 7395 - }, - { - "loss": 2.2431, - "grad_norm": 1.705507516860962, - "learning_rate": 5e-05, - "epoch": 0.47384260741499645, - "step": 7400 - }, - { - "eval_loss": 2.098146915435791, - "eval_runtime": 9.2837, - "eval_samples_per_second": 220.601, - "eval_steps_per_second": 27.575, - "epoch": 0.47384260741499645, - "step": 7400 - }, - { - "loss": 2.1957, - "grad_norm": 1.7113065719604492, - "learning_rate": 5e-05, - "epoch": 0.47416277133892554, - "step": 7405 - }, - { - "loss": 2.2073, - "grad_norm": 1.6644835472106934, - "learning_rate": 5e-05, - "epoch": 0.47448293526285457, - "step": 7410 - }, - { - "loss": 2.2312, - "grad_norm": 1.6765412092208862, - "learning_rate": 5e-05, - "epoch": 0.47480309918678365, - "step": 7415 - }, - { - "loss": 2.2219, - "grad_norm": 1.6999036073684692, - "learning_rate": 5e-05, - "epoch": 0.4751232631107127, - "step": 7420 - }, - { - "loss": 2.2325, - "grad_norm": 1.6527864933013916, - "learning_rate": 5e-05, - "epoch": 0.47544342703464176, - "step": 7425 - }, - { - "loss": 2.2194, - "grad_norm": 1.6665046215057373, - "learning_rate": 5e-05, - "epoch": 0.4757635909585708, - "step": 7430 - }, - { - "loss": 2.2436, - "grad_norm": 1.6418559551239014, - "learning_rate": 5e-05, - "epoch": 0.4760837548824998, - "step": 7435 - }, - { - "loss": 2.2442, - "grad_norm": 1.6663761138916016, - "learning_rate": 5e-05, - "epoch": 0.4764039188064289, - "step": 7440 - }, - { - "loss": 2.2229, - "grad_norm": 1.7055283784866333, - "learning_rate": 5e-05, - "epoch": 0.47672408273035793, - "step": 7445 - }, - { - "loss": 2.233, - "grad_norm": 1.8025060892105103, - "learning_rate": 5e-05, - "epoch": 0.477044246654287, - "step": 7450 - }, - { - "loss": 2.2237, - "grad_norm": 1.688681721687317, - "learning_rate": 5e-05, - "epoch": 0.47736441057821605, - "step": 7455 - }, - { - "loss": 2.2231, - "grad_norm": 1.7080329656600952, - "learning_rate": 5e-05, - "epoch": 0.4776845745021451, - "step": 7460 - }, - { - "loss": 2.2249, - "grad_norm": 1.6620713472366333, - "learning_rate": 5e-05, - "epoch": 0.47800473842607416, - "step": 7465 - }, - { - "loss": 2.2358, - "grad_norm": 1.7365187406539917, - "learning_rate": 5e-05, - "epoch": 0.4783249023500032, - "step": 7470 - }, - { - "loss": 2.2135, - "grad_norm": 1.6564573049545288, - "learning_rate": 5e-05, - "epoch": 0.4786450662739323, - "step": 7475 - }, - { - "loss": 2.2008, - "grad_norm": 1.6541259288787842, - "learning_rate": 5e-05, - "epoch": 0.4789652301978613, - "step": 7480 - }, - { - "loss": 2.1807, - "grad_norm": 1.7617090940475464, - "learning_rate": 5e-05, - "epoch": 0.47928539412179033, - "step": 7485 - }, - { - "loss": 2.2205, - "grad_norm": 1.6854571104049683, - "learning_rate": 5e-05, - "epoch": 0.4796055580457194, - "step": 7490 - }, - { - "loss": 2.2363, - "grad_norm": 1.7207971811294556, - "learning_rate": 5e-05, - "epoch": 0.47992572196964844, - "step": 7495 - }, - { - "loss": 2.2369, - "grad_norm": 1.7152527570724487, - "learning_rate": 5e-05, - "epoch": 0.48024588589357753, - "step": 7500 - }, - { - "loss": 2.2348, - "grad_norm": 1.6942616701126099, - "learning_rate": 5e-05, - "epoch": 0.48056604981750656, - "step": 7505 - }, - { - "loss": 2.2159, - "grad_norm": 1.7268720865249634, - "learning_rate": 5e-05, - "epoch": 0.48088621374143564, - "step": 7510 - }, - { - "loss": 2.2252, - "grad_norm": 1.7245421409606934, - "learning_rate": 5e-05, - "epoch": 0.48120637766536467, - "step": 7515 - }, - { - "loss": 2.235, - "grad_norm": 1.8024479150772095, - "learning_rate": 5e-05, - "epoch": 0.4815265415892937, - "step": 7520 - }, - { - "loss": 2.2198, - "grad_norm": 1.7535227537155151, - "learning_rate": 5e-05, - "epoch": 0.4818467055132228, - "step": 7525 - }, - { - "loss": 2.2312, - "grad_norm": 1.6243641376495361, - "learning_rate": 5e-05, - "epoch": 0.4821668694371518, - "step": 7530 - }, - { - "loss": 2.2252, - "grad_norm": 1.7475508451461792, - "learning_rate": 5e-05, - "epoch": 0.4824870333610809, - "step": 7535 - }, - { - "loss": 2.2565, - "grad_norm": 1.6990846395492554, - "learning_rate": 5e-05, - "epoch": 0.4828071972850099, - "step": 7540 - }, - { - "loss": 2.2315, - "grad_norm": 1.758632779121399, - "learning_rate": 5e-05, - "epoch": 0.48312736120893895, - "step": 7545 - }, - { - "loss": 2.2011, - "grad_norm": 1.6367809772491455, - "learning_rate": 5e-05, - "epoch": 0.48344752513286804, - "step": 7550 - }, - { - "loss": 2.2523, - "grad_norm": 1.667632818222046, - "learning_rate": 5e-05, - "epoch": 0.48376768905679707, - "step": 7555 - }, - { - "loss": 2.2309, - "grad_norm": 1.674497365951538, - "learning_rate": 5e-05, - "epoch": 0.48408785298072615, - "step": 7560 - }, - { - "loss": 2.2254, - "grad_norm": 1.6481330394744873, - "learning_rate": 5e-05, - "epoch": 0.4844080169046552, - "step": 7565 - }, - { - "loss": 2.2089, - "grad_norm": 1.6713910102844238, - "learning_rate": 5e-05, - "epoch": 0.4847281808285842, - "step": 7570 - }, - { - "loss": 2.2481, - "grad_norm": 1.807297945022583, - "learning_rate": 5e-05, - "epoch": 0.4850483447525133, - "step": 7575 - }, - { - "loss": 2.227, - "grad_norm": 1.6840758323669434, - "learning_rate": 5e-05, - "epoch": 0.4853685086764423, - "step": 7580 - }, - { - "loss": 2.2443, - "grad_norm": 1.7010893821716309, - "learning_rate": 5e-05, - "epoch": 0.4856886726003714, - "step": 7585 - }, - { - "loss": 2.2416, - "grad_norm": 1.6932690143585205, - "learning_rate": 5e-05, - "epoch": 0.48600883652430044, - "step": 7590 - }, - { - "loss": 2.2301, - "grad_norm": 1.7321279048919678, - "learning_rate": 5e-05, - "epoch": 0.4863290004482295, - "step": 7595 - }, - { - "loss": 2.2394, - "grad_norm": 1.7051780223846436, - "learning_rate": 5e-05, - "epoch": 0.48664916437215855, - "step": 7600 - }, - { - "eval_loss": 2.1021814346313477, - "eval_runtime": 9.0753, - "eval_samples_per_second": 225.668, - "eval_steps_per_second": 28.208, - "epoch": 0.48664916437215855, - "step": 7600 - }, - { - "loss": 2.2093, - "grad_norm": 1.7261319160461426, - "learning_rate": 5e-05, - "epoch": 0.4869693282960876, - "step": 7605 - }, - { - "loss": 2.2148, - "grad_norm": 1.7780207395553589, - "learning_rate": 5e-05, - "epoch": 0.48728949222001666, - "step": 7610 - }, - { - "loss": 2.217, - "grad_norm": 1.7456703186035156, - "learning_rate": 5e-05, - "epoch": 0.4876096561439457, - "step": 7615 - }, - { - "loss": 2.1884, - "grad_norm": 1.8097208738327026, - "learning_rate": 5e-05, - "epoch": 0.4879298200678748, - "step": 7620 - }, - { - "loss": 2.2064, - "grad_norm": 1.7063500881195068, - "learning_rate": 5e-05, - "epoch": 0.4882499839918038, - "step": 7625 - }, - { - "loss": 2.2059, - "grad_norm": 1.7562440633773804, - "learning_rate": 5e-05, - "epoch": 0.48857014791573283, - "step": 7630 - }, - { - "loss": 2.2136, - "grad_norm": 1.7514058351516724, - "learning_rate": 5e-05, - "epoch": 0.4888903118396619, - "step": 7635 - }, - { - "loss": 2.2369, - "grad_norm": 1.7494462728500366, - "learning_rate": 5e-05, - "epoch": 0.48921047576359095, - "step": 7640 - }, - { - "loss": 2.2129, - "grad_norm": 1.7088241577148438, - "learning_rate": 5e-05, - "epoch": 0.48953063968752003, - "step": 7645 - }, - { - "loss": 2.2257, - "grad_norm": 1.6516956090927124, - "learning_rate": 5e-05, - "epoch": 0.48985080361144906, - "step": 7650 - }, - { - "loss": 2.246, - "grad_norm": 1.7086745500564575, - "learning_rate": 5e-05, - "epoch": 0.4901709675353781, - "step": 7655 - }, - { - "loss": 2.2448, - "grad_norm": 1.748744010925293, - "learning_rate": 5e-05, - "epoch": 0.4904911314593072, - "step": 7660 - }, - { - "loss": 2.2262, - "grad_norm": 1.612053632736206, - "learning_rate": 5e-05, - "epoch": 0.4908112953832362, - "step": 7665 - }, - { - "loss": 2.2449, - "grad_norm": 1.7028324604034424, - "learning_rate": 5e-05, - "epoch": 0.4911314593071653, - "step": 7670 - }, - { - "loss": 2.2583, - "grad_norm": 1.735455870628357, - "learning_rate": 5e-05, - "epoch": 0.4914516232310943, - "step": 7675 - }, - { - "loss": 2.209, - "grad_norm": 1.806516170501709, - "learning_rate": 5e-05, - "epoch": 0.4917717871550234, - "step": 7680 - }, - { - "loss": 2.226, - "grad_norm": 1.6990715265274048, - "learning_rate": 5e-05, - "epoch": 0.4920919510789524, - "step": 7685 - }, - { - "loss": 2.2201, - "grad_norm": 1.6955472230911255, - "learning_rate": 5e-05, - "epoch": 0.49241211500288146, - "step": 7690 - }, - { - "loss": 2.2677, - "grad_norm": 1.6222796440124512, - "learning_rate": 5e-05, - "epoch": 0.49273227892681054, - "step": 7695 - }, - { - "loss": 2.2376, - "grad_norm": 1.7312852144241333, - "learning_rate": 5e-05, - "epoch": 0.49305244285073957, - "step": 7700 - }, - { - "loss": 2.2389, - "grad_norm": 1.6851000785827637, - "learning_rate": 5e-05, - "epoch": 0.49337260677466865, - "step": 7705 - }, - { - "loss": 2.2027, - "grad_norm": 1.6476107835769653, - "learning_rate": 5e-05, - "epoch": 0.4936927706985977, - "step": 7710 - }, - { - "loss": 2.2448, - "grad_norm": 1.6625196933746338, - "learning_rate": 5e-05, - "epoch": 0.4940129346225267, - "step": 7715 - }, - { - "loss": 2.1801, - "grad_norm": 1.7839092016220093, - "learning_rate": 5e-05, - "epoch": 0.4943330985464558, - "step": 7720 - }, - { - "loss": 2.2286, - "grad_norm": 1.6703824996948242, - "learning_rate": 5e-05, - "epoch": 0.4946532624703848, - "step": 7725 - }, - { - "loss": 2.1893, - "grad_norm": 1.825608253479004, - "learning_rate": 5e-05, - "epoch": 0.4949734263943139, - "step": 7730 - }, - { - "loss": 2.2228, - "grad_norm": 1.6634159088134766, - "learning_rate": 5e-05, - "epoch": 0.49529359031824294, - "step": 7735 - }, - { - "loss": 2.203, - "grad_norm": 1.6320128440856934, - "learning_rate": 5e-05, - "epoch": 0.49561375424217197, - "step": 7740 - }, - { - "loss": 2.2567, - "grad_norm": 1.6982113122940063, - "learning_rate": 5e-05, - "epoch": 0.49593391816610105, - "step": 7745 - }, - { - "loss": 2.2155, - "grad_norm": 1.5984007120132446, - "learning_rate": 5e-05, - "epoch": 0.4962540820900301, - "step": 7750 - }, - { - "loss": 2.2294, - "grad_norm": 1.7036561965942383, - "learning_rate": 5e-05, - "epoch": 0.49657424601395916, - "step": 7755 - }, - { - "loss": 2.2139, - "grad_norm": 1.7015743255615234, - "learning_rate": 5e-05, - "epoch": 0.4968944099378882, - "step": 7760 - }, - { - "loss": 2.2231, - "grad_norm": 1.7515677213668823, - "learning_rate": 5e-05, - "epoch": 0.4972145738618173, - "step": 7765 - }, - { - "loss": 2.2211, - "grad_norm": 1.7809141874313354, - "learning_rate": 5e-05, - "epoch": 0.4975347377857463, - "step": 7770 - }, - { - "loss": 2.2251, - "grad_norm": 1.6579275131225586, - "learning_rate": 5e-05, - "epoch": 0.49785490170967533, - "step": 7775 - }, - { - "loss": 2.2045, - "grad_norm": 1.6126208305358887, - "learning_rate": 5e-05, - "epoch": 0.4981750656336044, - "step": 7780 - }, - { - "loss": 2.1937, - "grad_norm": 1.7193243503570557, - "learning_rate": 5e-05, - "epoch": 0.49849522955753345, - "step": 7785 - }, - { - "loss": 2.2378, - "grad_norm": 1.7689208984375, - "learning_rate": 5e-05, - "epoch": 0.49881539348146253, - "step": 7790 - }, - { - "loss": 2.2021, - "grad_norm": 1.7081634998321533, - "learning_rate": 5e-05, - "epoch": 0.49913555740539156, - "step": 7795 - }, - { - "loss": 2.2286, - "grad_norm": 1.682572841644287, - "learning_rate": 5e-05, - "epoch": 0.4994557213293206, - "step": 7800 - }, - { - "eval_loss": 2.093745231628418, - "eval_runtime": 9.0651, - "eval_samples_per_second": 225.921, - "eval_steps_per_second": 28.24, - "epoch": 0.4994557213293206, - "step": 7800 - }, - { - "loss": 2.2273, - "grad_norm": 1.7163746356964111, - "learning_rate": 5e-05, - "epoch": 0.4997758852532497, - "step": 7805 - }, - { - "loss": 2.2204, - "grad_norm": 1.7658613920211792, - "learning_rate": 5e-05, - "epoch": 0.5000960491771788, - "step": 7810 - }, - { - "loss": 2.2302, - "grad_norm": 1.7190687656402588, - "learning_rate": 5e-05, - "epoch": 0.5004162131011077, - "step": 7815 - }, - { - "loss": 2.2075, - "grad_norm": 1.7589188814163208, - "learning_rate": 5e-05, - "epoch": 0.5007363770250368, - "step": 7820 - }, - { - "loss": 2.2345, - "grad_norm": 1.6196938753128052, - "learning_rate": 5e-05, - "epoch": 0.5010565409489659, - "step": 7825 - }, - { - "loss": 2.2037, - "grad_norm": 1.5979619026184082, - "learning_rate": 5e-05, - "epoch": 0.5013767048728949, - "step": 7830 - }, - { - "loss": 2.2133, - "grad_norm": 1.748252511024475, - "learning_rate": 5e-05, - "epoch": 0.501696868796824, - "step": 7835 - }, - { - "loss": 2.1874, - "grad_norm": 1.6827281713485718, - "learning_rate": 5e-05, - "epoch": 0.502017032720753, - "step": 7840 - }, - { - "loss": 2.2298, - "grad_norm": 1.7303760051727295, - "learning_rate": 5e-05, - "epoch": 0.5023371966446821, - "step": 7845 - }, - { - "loss": 2.2331, - "grad_norm": 1.724907398223877, - "learning_rate": 5e-05, - "epoch": 0.5026573605686111, - "step": 7850 - }, - { - "loss": 2.2103, - "grad_norm": 1.6769534349441528, - "learning_rate": 5e-05, - "epoch": 0.5029775244925402, - "step": 7855 - }, - { - "loss": 2.2251, - "grad_norm": 1.7141268253326416, - "learning_rate": 5e-05, - "epoch": 0.5032976884164693, - "step": 7860 - }, - { - "loss": 2.2383, - "grad_norm": 1.691082239151001, - "learning_rate": 5e-05, - "epoch": 0.5036178523403982, - "step": 7865 - }, - { - "loss": 2.2198, - "grad_norm": 1.7353357076644897, - "learning_rate": 5e-05, - "epoch": 0.5039380162643273, - "step": 7870 - }, - { - "loss": 2.2259, - "grad_norm": 1.6525827646255493, - "learning_rate": 5e-05, - "epoch": 0.5042581801882564, - "step": 7875 - }, - { - "loss": 2.1898, - "grad_norm": 1.592120885848999, - "learning_rate": 5e-05, - "epoch": 0.5045783441121854, - "step": 7880 - }, - { - "loss": 2.2348, - "grad_norm": 1.6837745904922485, - "learning_rate": 5e-05, - "epoch": 0.5048985080361145, - "step": 7885 - }, - { - "loss": 2.1958, - "grad_norm": 1.5835824012756348, - "learning_rate": 5e-05, - "epoch": 0.5052186719600436, - "step": 7890 - }, - { - "loss": 2.218, - "grad_norm": 1.701811671257019, - "learning_rate": 5e-05, - "epoch": 0.5055388358839726, - "step": 7895 - }, - { - "loss": 2.236, - "grad_norm": 1.6456730365753174, - "learning_rate": 5e-05, - "epoch": 0.5058589998079016, - "step": 7900 - }, - { - "loss": 2.2219, - "grad_norm": 1.8244099617004395, - "learning_rate": 5e-05, - "epoch": 0.5061791637318307, - "step": 7905 - }, - { - "loss": 2.2081, - "grad_norm": 1.7030236721038818, - "learning_rate": 5e-05, - "epoch": 0.5064993276557598, - "step": 7910 - }, - { - "loss": 2.2244, - "grad_norm": 1.7620062828063965, - "learning_rate": 5e-05, - "epoch": 0.5068194915796888, - "step": 7915 - }, - { - "loss": 2.2201, - "grad_norm": 1.6482844352722168, - "learning_rate": 5e-05, - "epoch": 0.5071396555036178, - "step": 7920 - }, - { - "loss": 2.2033, - "grad_norm": 1.6990326642990112, - "learning_rate": 5e-05, - "epoch": 0.5074598194275469, - "step": 7925 - }, - { - "loss": 2.2165, - "grad_norm": 1.8352545499801636, - "learning_rate": 5e-05, - "epoch": 0.507779983351476, - "step": 7930 - }, - { - "loss": 2.2163, - "grad_norm": 1.757162094116211, - "learning_rate": 5e-05, - "epoch": 0.508100147275405, - "step": 7935 - }, - { - "loss": 2.2313, - "grad_norm": 1.762620210647583, - "learning_rate": 5e-05, - "epoch": 0.5084203111993341, - "step": 7940 - }, - { - "loss": 2.2162, - "grad_norm": 1.82100510597229, - "learning_rate": 5e-05, - "epoch": 0.5087404751232631, - "step": 7945 - }, - { - "loss": 2.2278, - "grad_norm": 1.6846513748168945, - "learning_rate": 5e-05, - "epoch": 0.5090606390471921, - "step": 7950 - }, - { - "loss": 2.1958, - "grad_norm": 1.769995093345642, - "learning_rate": 5e-05, - "epoch": 0.5093808029711212, - "step": 7955 - }, - { - "loss": 2.1942, - "grad_norm": 1.7381685972213745, - "learning_rate": 5e-05, - "epoch": 0.5097009668950503, - "step": 7960 - }, - { - "loss": 2.2312, - "grad_norm": 1.723332166671753, - "learning_rate": 5e-05, - "epoch": 0.5100211308189793, - "step": 7965 - }, - { - "loss": 2.2185, - "grad_norm": 1.7073355913162231, - "learning_rate": 5e-05, - "epoch": 0.5103412947429083, - "step": 7970 - }, - { - "loss": 2.2389, - "grad_norm": 1.625958800315857, - "learning_rate": 5e-05, - "epoch": 0.5106614586668374, - "step": 7975 - }, - { - "loss": 2.2267, - "grad_norm": 1.673528790473938, - "learning_rate": 5e-05, - "epoch": 0.5109816225907665, - "step": 7980 - }, - { - "loss": 2.224, - "grad_norm": 1.6753426790237427, - "learning_rate": 5e-05, - "epoch": 0.5113017865146955, - "step": 7985 - }, - { - "loss": 2.2299, - "grad_norm": 1.6783485412597656, - "learning_rate": 5e-05, - "epoch": 0.5116219504386246, - "step": 7990 - }, - { - "loss": 2.2078, - "grad_norm": 1.6852103471755981, - "learning_rate": 5e-05, - "epoch": 0.5119421143625537, - "step": 7995 - }, - { - "loss": 2.2307, - "grad_norm": 1.7385365962982178, - "learning_rate": 5e-05, - "epoch": 0.5122622782864826, - "step": 8000 - }, - { - "eval_loss": 2.0892796516418457, - "eval_runtime": 9.339, - "eval_samples_per_second": 219.294, - "eval_steps_per_second": 27.412, - "epoch": 0.5122622782864826, - "step": 8000 - }, - { - "loss": 2.2291, - "grad_norm": 1.5970470905303955, - "learning_rate": 5e-05, - "epoch": 0.5125824422104117, - "step": 8005 - }, - { - "loss": 2.2039, - "grad_norm": 1.7087442874908447, - "learning_rate": 5e-05, - "epoch": 0.5129026061343408, - "step": 8010 - }, - { - "loss": 2.204, - "grad_norm": 1.634369969367981, - "learning_rate": 5e-05, - "epoch": 0.5132227700582699, - "step": 8015 - }, - { - "loss": 2.2052, - "grad_norm": 1.6249363422393799, - "learning_rate": 5e-05, - "epoch": 0.5135429339821989, - "step": 8020 - }, - { - "loss": 2.227, - "grad_norm": 1.75498366355896, - "learning_rate": 5e-05, - "epoch": 0.5138630979061279, - "step": 8025 - }, - { - "loss": 2.229, - "grad_norm": 1.7273918390274048, - "learning_rate": 5e-05, - "epoch": 0.514183261830057, - "step": 8030 - }, - { - "loss": 2.2165, - "grad_norm": 1.7545193433761597, - "learning_rate": 5e-05, - "epoch": 0.514503425753986, - "step": 8035 - }, - { - "loss": 2.224, - "grad_norm": 1.73219895362854, - "learning_rate": 5e-05, - "epoch": 0.5148235896779151, - "step": 8040 - }, - { - "loss": 2.2239, - "grad_norm": 1.7364838123321533, - "learning_rate": 5e-05, - "epoch": 0.5151437536018442, - "step": 8045 - }, - { - "loss": 2.2069, - "grad_norm": 1.626757025718689, - "learning_rate": 5e-05, - "epoch": 0.5154639175257731, - "step": 8050 - }, - { - "loss": 2.1969, - "grad_norm": 1.7197450399398804, - "learning_rate": 5e-05, - "epoch": 0.5157840814497022, - "step": 8055 - }, - { - "loss": 2.1923, - "grad_norm": 1.7638471126556396, - "learning_rate": 5e-05, - "epoch": 0.5161042453736313, - "step": 8060 - }, - { - "loss": 2.218, - "grad_norm": 1.6772651672363281, - "learning_rate": 5e-05, - "epoch": 0.5164244092975604, - "step": 8065 - }, - { - "loss": 2.2105, - "grad_norm": 1.707062005996704, - "learning_rate": 5e-05, - "epoch": 0.5167445732214894, - "step": 8070 - }, - { - "loss": 2.235, - "grad_norm": 1.679762601852417, - "learning_rate": 5e-05, - "epoch": 0.5170647371454185, - "step": 8075 - }, - { - "loss": 2.2301, - "grad_norm": 1.6003955602645874, - "learning_rate": 5e-05, - "epoch": 0.5173849010693475, - "step": 8080 - }, - { - "loss": 2.2387, - "grad_norm": 1.7114651203155518, - "learning_rate": 5e-05, - "epoch": 0.5177050649932765, - "step": 8085 - }, - { - "loss": 2.1936, - "grad_norm": 1.6603319644927979, - "learning_rate": 5e-05, - "epoch": 0.5180252289172056, - "step": 8090 - }, - { - "loss": 2.2455, - "grad_norm": 1.806725263595581, - "learning_rate": 5e-05, - "epoch": 0.5183453928411347, - "step": 8095 - }, - { - "loss": 2.2437, - "grad_norm": 1.642220377922058, - "learning_rate": 5e-05, - "epoch": 0.5186655567650638, - "step": 8100 - }, - { - "loss": 2.2011, - "grad_norm": 1.641445279121399, - "learning_rate": 5e-05, - "epoch": 0.5189857206889927, - "step": 8105 - }, - { - "loss": 2.2033, - "grad_norm": 1.741835594177246, - "learning_rate": 5e-05, - "epoch": 0.5193058846129218, - "step": 8110 - }, - { - "loss": 2.2164, - "grad_norm": 1.6442292928695679, - "learning_rate": 5e-05, - "epoch": 0.5196260485368509, - "step": 8115 - }, - { - "loss": 2.1894, - "grad_norm": 1.6900848150253296, - "learning_rate": 5e-05, - "epoch": 0.5199462124607799, - "step": 8120 - }, - { - "loss": 2.1935, - "grad_norm": 1.69651198387146, - "learning_rate": 5e-05, - "epoch": 0.520266376384709, - "step": 8125 - }, - { - "loss": 2.2244, - "grad_norm": 1.6706749200820923, - "learning_rate": 5e-05, - "epoch": 0.520586540308638, - "step": 8130 - }, - { - "loss": 2.2044, - "grad_norm": 1.7208715677261353, - "learning_rate": 5e-05, - "epoch": 0.520906704232567, - "step": 8135 - }, - { - "loss": 2.2312, - "grad_norm": 1.554226040840149, - "learning_rate": 5e-05, - "epoch": 0.5212268681564961, - "step": 8140 - }, - { - "loss": 2.2347, - "grad_norm": 1.7291901111602783, - "learning_rate": 5e-05, - "epoch": 0.5215470320804252, - "step": 8145 - }, - { - "loss": 2.2104, - "grad_norm": 1.6915407180786133, - "learning_rate": 5e-05, - "epoch": 0.5218671960043543, - "step": 8150 - }, - { - "loss": 2.1875, - "grad_norm": 1.6418696641921997, - "learning_rate": 5e-05, - "epoch": 0.5221873599282832, - "step": 8155 - }, - { - "loss": 2.2015, - "grad_norm": 1.656497597694397, - "learning_rate": 5e-05, - "epoch": 0.5225075238522123, - "step": 8160 - }, - { - "loss": 2.2165, - "grad_norm": 1.78023362159729, - "learning_rate": 5e-05, - "epoch": 0.5228276877761414, - "step": 8165 - }, - { - "loss": 2.2254, - "grad_norm": 1.690218448638916, - "learning_rate": 5e-05, - "epoch": 0.5231478517000704, - "step": 8170 - }, - { - "loss": 2.2104, - "grad_norm": 1.6391229629516602, - "learning_rate": 5e-05, - "epoch": 0.5234680156239995, - "step": 8175 - }, - { - "loss": 2.194, - "grad_norm": 1.6620936393737793, - "learning_rate": 5e-05, - "epoch": 0.5237881795479286, - "step": 8180 - }, - { - "loss": 2.1981, - "grad_norm": 1.6374820470809937, - "learning_rate": 5e-05, - "epoch": 0.5241083434718576, - "step": 8185 - }, - { - "loss": 2.2037, - "grad_norm": 1.6494983434677124, - "learning_rate": 5e-05, - "epoch": 0.5244285073957866, - "step": 8190 - }, - { - "loss": 2.2126, - "grad_norm": 1.6629283428192139, - "learning_rate": 5e-05, - "epoch": 0.5247486713197157, - "step": 8195 - }, - { - "loss": 2.1914, - "grad_norm": 1.7156426906585693, - "learning_rate": 5e-05, - "epoch": 0.5250688352436448, - "step": 8200 - }, - { - "eval_loss": 2.0739922523498535, - "eval_runtime": 9.5483, - "eval_samples_per_second": 214.489, - "eval_steps_per_second": 26.811, - "epoch": 0.5250688352436448, - "step": 8200 - }, - { - "loss": 2.2416, - "grad_norm": 1.7021827697753906, - "learning_rate": 5e-05, - "epoch": 0.5253889991675738, - "step": 8205 - }, - { - "loss": 2.1897, - "grad_norm": 1.7273354530334473, - "learning_rate": 5e-05, - "epoch": 0.5257091630915028, - "step": 8210 - }, - { - "loss": 2.2093, - "grad_norm": 1.6649446487426758, - "learning_rate": 5e-05, - "epoch": 0.5260293270154319, - "step": 8215 - }, - { - "loss": 2.2231, - "grad_norm": 1.7139078378677368, - "learning_rate": 5e-05, - "epoch": 0.5263494909393609, - "step": 8220 - }, - { - "loss": 2.212, - "grad_norm": 1.7619116306304932, - "learning_rate": 5e-05, - "epoch": 0.52666965486329, - "step": 8225 - }, - { - "loss": 2.2043, - "grad_norm": 1.537295937538147, - "learning_rate": 5e-05, - "epoch": 0.5269898187872191, - "step": 8230 - }, - { - "loss": 2.1928, - "grad_norm": 1.6285382509231567, - "learning_rate": 5e-05, - "epoch": 0.5273099827111482, - "step": 8235 - }, - { - "loss": 2.217, - "grad_norm": 1.673884630203247, - "learning_rate": 5e-05, - "epoch": 0.5276301466350771, - "step": 8240 - }, - { - "loss": 2.2091, - "grad_norm": 1.7069264650344849, - "learning_rate": 5e-05, - "epoch": 0.5279503105590062, - "step": 8245 - }, - { - "loss": 2.2122, - "grad_norm": 1.6686755418777466, - "learning_rate": 5e-05, - "epoch": 0.5282704744829353, - "step": 8250 - }, - { - "loss": 2.2098, - "grad_norm": 1.7338638305664062, - "learning_rate": 5e-05, - "epoch": 0.5285906384068643, - "step": 8255 - }, - { - "loss": 2.1981, - "grad_norm": 1.7347779273986816, - "learning_rate": 5e-05, - "epoch": 0.5289108023307934, - "step": 8260 - }, - { - "loss": 2.2123, - "grad_norm": 1.6788371801376343, - "learning_rate": 5e-05, - "epoch": 0.5292309662547224, - "step": 8265 - }, - { - "loss": 2.2125, - "grad_norm": 1.7118918895721436, - "learning_rate": 5e-05, - "epoch": 0.5295511301786515, - "step": 8270 - }, - { - "loss": 2.2498, - "grad_norm": 1.6705480813980103, - "learning_rate": 5e-05, - "epoch": 0.5298712941025805, - "step": 8275 - }, - { - "loss": 2.2, - "grad_norm": 1.6503225564956665, - "learning_rate": 5e-05, - "epoch": 0.5301914580265096, - "step": 8280 - }, - { - "loss": 2.2383, - "grad_norm": 1.7550666332244873, - "learning_rate": 5e-05, - "epoch": 0.5305116219504387, - "step": 8285 - }, - { - "loss": 2.1962, - "grad_norm": 1.9004778861999512, - "learning_rate": 5e-05, - "epoch": 0.5308317858743676, - "step": 8290 - }, - { - "loss": 2.19, - "grad_norm": 1.6682841777801514, - "learning_rate": 5e-05, - "epoch": 0.5311519497982967, - "step": 8295 - }, - { - "loss": 2.2168, - "grad_norm": 1.801561951637268, - "learning_rate": 5e-05, - "epoch": 0.5314721137222258, - "step": 8300 - }, - { - "loss": 2.1998, - "grad_norm": 1.621793270111084, - "learning_rate": 5e-05, - "epoch": 0.5317922776461548, - "step": 8305 - }, - { - "loss": 2.2359, - "grad_norm": 1.597138524055481, - "learning_rate": 5e-05, - "epoch": 0.5321124415700839, - "step": 8310 - }, - { - "loss": 2.2064, - "grad_norm": 1.5918503999710083, - "learning_rate": 5e-05, - "epoch": 0.532432605494013, - "step": 8315 - }, - { - "loss": 2.2203, - "grad_norm": 1.6550569534301758, - "learning_rate": 5e-05, - "epoch": 0.532752769417942, - "step": 8320 - }, - { - "loss": 2.2105, - "grad_norm": 1.5995575189590454, - "learning_rate": 5e-05, - "epoch": 0.533072933341871, - "step": 8325 - }, - { - "loss": 2.1901, - "grad_norm": 1.7386360168457031, - "learning_rate": 5e-05, - "epoch": 0.5333930972658001, - "step": 8330 - }, - { - "loss": 2.2097, - "grad_norm": 1.672532320022583, - "learning_rate": 5e-05, - "epoch": 0.5337132611897292, - "step": 8335 - }, - { - "loss": 2.2103, - "grad_norm": 1.6559290885925293, - "learning_rate": 5e-05, - "epoch": 0.5340334251136581, - "step": 8340 - }, - { - "loss": 2.1924, - "grad_norm": 1.7685126066207886, - "learning_rate": 5e-05, - "epoch": 0.5343535890375872, - "step": 8345 - }, - { - "loss": 2.2295, - "grad_norm": 1.6063467264175415, - "learning_rate": 5e-05, - "epoch": 0.5346737529615163, - "step": 8350 - }, - { - "loss": 2.21, - "grad_norm": 1.6099108457565308, - "learning_rate": 5e-05, - "epoch": 0.5349939168854454, - "step": 8355 - }, - { - "loss": 2.2266, - "grad_norm": 1.751085877418518, - "learning_rate": 5e-05, - "epoch": 0.5353140808093744, - "step": 8360 - }, - { - "loss": 2.229, - "grad_norm": 1.7449450492858887, - "learning_rate": 5e-05, - "epoch": 0.5356342447333035, - "step": 8365 - }, - { - "loss": 2.2268, - "grad_norm": 1.6309188604354858, - "learning_rate": 5e-05, - "epoch": 0.5359544086572325, - "step": 8370 - }, - { - "loss": 2.1899, - "grad_norm": 1.632546305656433, - "learning_rate": 5e-05, - "epoch": 0.5362745725811615, - "step": 8375 - }, - { - "loss": 2.2096, - "grad_norm": 1.6440316438674927, - "learning_rate": 5e-05, - "epoch": 0.5365947365050906, - "step": 8380 - }, - { - "loss": 2.2389, - "grad_norm": 1.7283806800842285, - "learning_rate": 5e-05, - "epoch": 0.5369149004290197, - "step": 8385 - }, - { - "loss": 2.2445, - "grad_norm": 1.6345056295394897, - "learning_rate": 5e-05, - "epoch": 0.5372350643529487, - "step": 8390 - }, - { - "loss": 2.1955, - "grad_norm": 1.7141886949539185, - "learning_rate": 5e-05, - "epoch": 0.5375552282768777, - "step": 8395 - }, - { - "loss": 2.2052, - "grad_norm": 1.652571678161621, - "learning_rate": 5e-05, - "epoch": 0.5378753922008068, - "step": 8400 - }, - { - "eval_loss": 2.0771679878234863, - "eval_runtime": 9.1156, - "eval_samples_per_second": 224.67, - "eval_steps_per_second": 28.084, - "epoch": 0.5378753922008068, - "step": 8400 - }, - { - "loss": 2.2219, - "grad_norm": 1.7584346532821655, - "learning_rate": 5e-05, - "epoch": 0.5381955561247359, - "step": 8405 - }, - { - "loss": 2.2288, - "grad_norm": 1.6449029445648193, - "learning_rate": 5e-05, - "epoch": 0.5385157200486649, - "step": 8410 - }, - { - "loss": 2.2211, - "grad_norm": 1.7171015739440918, - "learning_rate": 5e-05, - "epoch": 0.538835883972594, - "step": 8415 - }, - { - "loss": 2.2383, - "grad_norm": 1.8249092102050781, - "learning_rate": 5e-05, - "epoch": 0.539156047896523, - "step": 8420 - }, - { - "loss": 2.2288, - "grad_norm": 1.7422677278518677, - "learning_rate": 5e-05, - "epoch": 0.539476211820452, - "step": 8425 - }, - { - "loss": 2.2267, - "grad_norm": 1.7028205394744873, - "learning_rate": 5e-05, - "epoch": 0.5397963757443811, - "step": 8430 - }, - { - "loss": 2.2037, - "grad_norm": 1.6526613235473633, - "learning_rate": 5e-05, - "epoch": 0.5401165396683102, - "step": 8435 - }, - { - "loss": 2.2046, - "grad_norm": 1.6816647052764893, - "learning_rate": 5e-05, - "epoch": 0.5404367035922393, - "step": 8440 - }, - { - "loss": 2.2139, - "grad_norm": 1.7449307441711426, - "learning_rate": 5e-05, - "epoch": 0.5407568675161682, - "step": 8445 - }, - { - "loss": 2.2185, - "grad_norm": 1.6552678346633911, - "learning_rate": 5e-05, - "epoch": 0.5410770314400973, - "step": 8450 - }, - { - "loss": 2.2082, - "grad_norm": 1.6898199319839478, - "learning_rate": 5e-05, - "epoch": 0.5413971953640264, - "step": 8455 - }, - { - "loss": 2.2229, - "grad_norm": 1.6607279777526855, - "learning_rate": 5e-05, - "epoch": 0.5417173592879554, - "step": 8460 - }, - { - "loss": 2.2008, - "grad_norm": 1.650549054145813, - "learning_rate": 5e-05, - "epoch": 0.5420375232118845, - "step": 8465 - }, - { - "loss": 2.1981, - "grad_norm": 1.6187115907669067, - "learning_rate": 5e-05, - "epoch": 0.5423576871358136, - "step": 8470 - }, - { - "loss": 2.1946, - "grad_norm": 1.6210685968399048, - "learning_rate": 5e-05, - "epoch": 0.5426778510597425, - "step": 8475 - }, - { - "loss": 2.1881, - "grad_norm": 1.6523209810256958, - "learning_rate": 5e-05, - "epoch": 0.5429980149836716, - "step": 8480 - }, - { - "loss": 2.1864, - "grad_norm": 1.6581562757492065, - "learning_rate": 5e-05, - "epoch": 0.5433181789076007, - "step": 8485 - }, - { - "loss": 2.223, - "grad_norm": 1.628360629081726, - "learning_rate": 5e-05, - "epoch": 0.5436383428315298, - "step": 8490 - }, - { - "loss": 2.1894, - "grad_norm": 1.696500301361084, - "learning_rate": 5e-05, - "epoch": 0.5439585067554588, - "step": 8495 - }, - { - "loss": 2.2349, - "grad_norm": 1.7601076364517212, - "learning_rate": 5e-05, - "epoch": 0.5442786706793878, - "step": 8500 - }, - { - "loss": 2.2428, - "grad_norm": 1.718100666999817, - "learning_rate": 5e-05, - "epoch": 0.5445988346033169, - "step": 8505 - }, - { - "loss": 2.2134, - "grad_norm": 1.6576154232025146, - "learning_rate": 5e-05, - "epoch": 0.5449189985272459, - "step": 8510 - }, - { - "loss": 2.1908, - "grad_norm": 1.724191665649414, - "learning_rate": 5e-05, - "epoch": 0.545239162451175, - "step": 8515 - }, - { - "loss": 2.2296, - "grad_norm": 1.724888801574707, - "learning_rate": 5e-05, - "epoch": 0.5455593263751041, - "step": 8520 - }, - { - "loss": 2.211, - "grad_norm": 1.753718614578247, - "learning_rate": 5e-05, - "epoch": 0.5458794902990332, - "step": 8525 - }, - { - "loss": 2.1835, - "grad_norm": 1.7066841125488281, - "learning_rate": 5e-05, - "epoch": 0.5461996542229621, - "step": 8530 - }, - { - "loss": 2.2209, - "grad_norm": 1.7294259071350098, - "learning_rate": 5e-05, - "epoch": 0.5465198181468912, - "step": 8535 - }, - { - "loss": 2.1839, - "grad_norm": 1.827268123626709, - "learning_rate": 5e-05, - "epoch": 0.5468399820708203, - "step": 8540 - }, - { - "loss": 2.2227, - "grad_norm": 1.6551259756088257, - "learning_rate": 5e-05, - "epoch": 0.5471601459947493, - "step": 8545 - }, - { - "loss": 2.2131, - "grad_norm": 1.7133010625839233, - "learning_rate": 5e-05, - "epoch": 0.5474803099186784, - "step": 8550 - }, - { - "loss": 2.2248, - "grad_norm": 1.6199853420257568, - "learning_rate": 5e-05, - "epoch": 0.5478004738426074, - "step": 8555 - }, - { - "loss": 2.1953, - "grad_norm": 1.6595536470413208, - "learning_rate": 5e-05, - "epoch": 0.5481206377665364, - "step": 8560 - }, - { - "loss": 2.2208, - "grad_norm": 1.6121689081192017, - "learning_rate": 5e-05, - "epoch": 0.5484408016904655, - "step": 8565 - }, - { - "loss": 2.2003, - "grad_norm": 1.670493721961975, - "learning_rate": 5e-05, - "epoch": 0.5487609656143946, - "step": 8570 - }, - { - "loss": 2.2055, - "grad_norm": 1.6377112865447998, - "learning_rate": 5e-05, - "epoch": 0.5490811295383237, - "step": 8575 - }, - { - "loss": 2.1795, - "grad_norm": 1.755138635635376, - "learning_rate": 5e-05, - "epoch": 0.5494012934622526, - "step": 8580 - }, - { - "loss": 2.2184, - "grad_norm": 1.746940016746521, - "learning_rate": 5e-05, - "epoch": 0.5497214573861817, - "step": 8585 - }, - { - "loss": 2.1888, - "grad_norm": 1.7026208639144897, - "learning_rate": 5e-05, - "epoch": 0.5500416213101108, - "step": 8590 - }, - { - "loss": 2.1954, - "grad_norm": 1.622263789176941, - "learning_rate": 5e-05, - "epoch": 0.5503617852340398, - "step": 8595 - }, - { - "loss": 2.1978, - "grad_norm": 1.7794569730758667, - "learning_rate": 5e-05, - "epoch": 0.5506819491579689, - "step": 8600 - }, - { - "eval_loss": 2.068195343017578, - "eval_runtime": 8.9579, - "eval_samples_per_second": 228.625, - "eval_steps_per_second": 28.578, - "epoch": 0.5506819491579689, - "step": 8600 - }, - { - "loss": 2.1963, - "grad_norm": 1.67849862575531, - "learning_rate": 5e-05, - "epoch": 0.551002113081898, - "step": 8605 - }, - { - "loss": 2.2099, - "grad_norm": 1.6759015321731567, - "learning_rate": 5e-05, - "epoch": 0.551322277005827, - "step": 8610 - }, - { - "loss": 2.2053, - "grad_norm": 1.6613909006118774, - "learning_rate": 5e-05, - "epoch": 0.551642440929756, - "step": 8615 - }, - { - "loss": 2.2083, - "grad_norm": 1.719401478767395, - "learning_rate": 5e-05, - "epoch": 0.5519626048536851, - "step": 8620 - }, - { - "loss": 2.1983, - "grad_norm": 1.6846554279327393, - "learning_rate": 5e-05, - "epoch": 0.5522827687776142, - "step": 8625 - }, - { - "loss": 2.1777, - "grad_norm": 1.585060715675354, - "learning_rate": 5e-05, - "epoch": 0.5526029327015431, - "step": 8630 - }, - { - "loss": 2.2142, - "grad_norm": 1.5889817476272583, - "learning_rate": 5e-05, - "epoch": 0.5529230966254722, - "step": 8635 - }, - { - "loss": 2.2204, - "grad_norm": 1.8110606670379639, - "learning_rate": 5e-05, - "epoch": 0.5532432605494013, - "step": 8640 - }, - { - "loss": 2.212, - "grad_norm": 1.6412723064422607, - "learning_rate": 5e-05, - "epoch": 0.5535634244733303, - "step": 8645 - }, - { - "loss": 2.2182, - "grad_norm": 1.7718569040298462, - "learning_rate": 5e-05, - "epoch": 0.5538835883972594, - "step": 8650 - }, - { - "loss": 2.193, - "grad_norm": 1.6897401809692383, - "learning_rate": 5e-05, - "epoch": 0.5542037523211885, - "step": 8655 - }, - { - "loss": 2.1971, - "grad_norm": 1.5787633657455444, - "learning_rate": 5e-05, - "epoch": 0.5545239162451175, - "step": 8660 - }, - { - "loss": 2.1995, - "grad_norm": 1.791604995727539, - "learning_rate": 5e-05, - "epoch": 0.5548440801690465, - "step": 8665 - }, - { - "loss": 2.242, - "grad_norm": 1.6345185041427612, - "learning_rate": 5e-05, - "epoch": 0.5551642440929756, - "step": 8670 - }, - { - "loss": 2.2313, - "grad_norm": 1.6359039545059204, - "learning_rate": 5e-05, - "epoch": 0.5554844080169047, - "step": 8675 - }, - { - "loss": 2.2167, - "grad_norm": 1.5969913005828857, - "learning_rate": 5e-05, - "epoch": 0.5558045719408337, - "step": 8680 - }, - { - "loss": 2.1846, - "grad_norm": 1.6926162242889404, - "learning_rate": 5e-05, - "epoch": 0.5561247358647627, - "step": 8685 - }, - { - "loss": 2.2072, - "grad_norm": 1.5990930795669556, - "learning_rate": 5e-05, - "epoch": 0.5564448997886918, - "step": 8690 - }, - { - "loss": 2.2057, - "grad_norm": 1.6216379404067993, - "learning_rate": 5e-05, - "epoch": 0.5567650637126209, - "step": 8695 - }, - { - "loss": 2.1902, - "grad_norm": 1.7529386281967163, - "learning_rate": 5e-05, - "epoch": 0.5570852276365499, - "step": 8700 - }, - { - "loss": 2.2107, - "grad_norm": 1.7439981698989868, - "learning_rate": 5e-05, - "epoch": 0.557405391560479, - "step": 8705 - }, - { - "loss": 2.2051, - "grad_norm": 1.6986141204833984, - "learning_rate": 5e-05, - "epoch": 0.557725555484408, - "step": 8710 - }, - { - "loss": 2.2122, - "grad_norm": 1.678336501121521, - "learning_rate": 5e-05, - "epoch": 0.558045719408337, - "step": 8715 - }, - { - "loss": 2.2034, - "grad_norm": 1.6236997842788696, - "learning_rate": 5e-05, - "epoch": 0.5583658833322661, - "step": 8720 - }, - { - "loss": 2.2108, - "grad_norm": 1.568988561630249, - "learning_rate": 5e-05, - "epoch": 0.5586860472561952, - "step": 8725 - }, - { - "loss": 2.1891, - "grad_norm": 1.6444505453109741, - "learning_rate": 5e-05, - "epoch": 0.5590062111801242, - "step": 8730 - }, - { - "loss": 2.2248, - "grad_norm": 1.6724077463150024, - "learning_rate": 5e-05, - "epoch": 0.5593263751040533, - "step": 8735 - }, - { - "loss": 2.214, - "grad_norm": 1.6417819261550903, - "learning_rate": 5e-05, - "epoch": 0.5596465390279823, - "step": 8740 - }, - { - "loss": 2.1809, - "grad_norm": 1.6727244853973389, - "learning_rate": 5e-05, - "epoch": 0.5599667029519114, - "step": 8745 - }, - { - "loss": 2.2277, - "grad_norm": 1.6233677864074707, - "learning_rate": 5e-05, - "epoch": 0.5602868668758404, - "step": 8750 - }, - { - "loss": 2.2074, - "grad_norm": 1.6876188516616821, - "learning_rate": 5e-05, - "epoch": 0.5606070307997695, - "step": 8755 - }, - { - "loss": 2.2446, - "grad_norm": 1.6457571983337402, - "learning_rate": 5e-05, - "epoch": 0.5609271947236986, - "step": 8760 - }, - { - "loss": 2.2219, - "grad_norm": 1.6713467836380005, - "learning_rate": 5e-05, - "epoch": 0.5612473586476275, - "step": 8765 - }, - { - "loss": 2.1961, - "grad_norm": 1.6506388187408447, - "learning_rate": 5e-05, - "epoch": 0.5615675225715566, - "step": 8770 - }, - { - "loss": 2.2156, - "grad_norm": 1.7466049194335938, - "learning_rate": 5e-05, - "epoch": 0.5618876864954857, - "step": 8775 - }, - { - "loss": 2.2071, - "grad_norm": 1.702660322189331, - "learning_rate": 5e-05, - "epoch": 0.5622078504194148, - "step": 8780 - }, - { - "loss": 2.1891, - "grad_norm": 1.733842134475708, - "learning_rate": 5e-05, - "epoch": 0.5625280143433438, - "step": 8785 - }, - { - "loss": 2.1982, - "grad_norm": 1.6536738872528076, - "learning_rate": 5e-05, - "epoch": 0.5628481782672728, - "step": 8790 - }, - { - "loss": 2.2137, - "grad_norm": 1.6220111846923828, - "learning_rate": 5e-05, - "epoch": 0.5631683421912019, - "step": 8795 - }, - { - "loss": 2.2096, - "grad_norm": 1.7264735698699951, - "learning_rate": 5e-05, - "epoch": 0.5634885061151309, - "step": 8800 - }, - { - "eval_loss": 2.054811477661133, - "eval_runtime": 9.6124, - "eval_samples_per_second": 213.059, - "eval_steps_per_second": 26.632, - "epoch": 0.5634885061151309, - "step": 8800 - }, - { - "loss": 2.207, - "grad_norm": 1.7159490585327148, - "learning_rate": 5e-05, - "epoch": 0.56380867003906, - "step": 8805 - }, - { - "loss": 2.2107, - "grad_norm": 1.6418389081954956, - "learning_rate": 5e-05, - "epoch": 0.5641288339629891, - "step": 8810 - }, - { - "loss": 2.2117, - "grad_norm": 1.6357439756393433, - "learning_rate": 5e-05, - "epoch": 0.564448997886918, - "step": 8815 - }, - { - "loss": 2.2005, - "grad_norm": 1.6727110147476196, - "learning_rate": 5e-05, - "epoch": 0.5647691618108471, - "step": 8820 - }, - { - "loss": 2.209, - "grad_norm": 1.7824372053146362, - "learning_rate": 5e-05, - "epoch": 0.5650893257347762, - "step": 8825 - }, - { - "loss": 2.2069, - "grad_norm": 1.6073055267333984, - "learning_rate": 5e-05, - "epoch": 0.5654094896587053, - "step": 8830 - }, - { - "loss": 2.2006, - "grad_norm": 1.6197354793548584, - "learning_rate": 5e-05, - "epoch": 0.5657296535826343, - "step": 8835 - }, - { - "loss": 2.187, - "grad_norm": 1.6128679513931274, - "learning_rate": 5e-05, - "epoch": 0.5660498175065634, - "step": 8840 - }, - { - "loss": 2.186, - "grad_norm": 1.6642301082611084, - "learning_rate": 5e-05, - "epoch": 0.5663699814304924, - "step": 8845 - }, - { - "loss": 2.2474, - "grad_norm": 1.6685996055603027, - "learning_rate": 5e-05, - "epoch": 0.5666901453544214, - "step": 8850 - }, - { - "loss": 2.2148, - "grad_norm": 1.7215569019317627, - "learning_rate": 5e-05, - "epoch": 0.5670103092783505, - "step": 8855 - }, - { - "loss": 2.2341, - "grad_norm": 1.8265643119812012, - "learning_rate": 5e-05, - "epoch": 0.5673304732022796, - "step": 8860 - }, - { - "loss": 2.1853, - "grad_norm": 1.7127398252487183, - "learning_rate": 5e-05, - "epoch": 0.5676506371262087, - "step": 8865 - }, - { - "loss": 2.1994, - "grad_norm": 1.6666933298110962, - "learning_rate": 5e-05, - "epoch": 0.5679708010501376, - "step": 8870 - }, - { - "loss": 2.1971, - "grad_norm": 1.6345758438110352, - "learning_rate": 5e-05, - "epoch": 0.5682909649740667, - "step": 8875 - }, - { - "loss": 2.1836, - "grad_norm": 1.6167995929718018, - "learning_rate": 5e-05, - "epoch": 0.5686111288979958, - "step": 8880 - }, - { - "loss": 2.1954, - "grad_norm": 1.57766854763031, - "learning_rate": 5e-05, - "epoch": 0.5689312928219248, - "step": 8885 - }, - { - "loss": 2.2106, - "grad_norm": 1.6215426921844482, - "learning_rate": 5e-05, - "epoch": 0.5692514567458539, - "step": 8890 - }, - { - "loss": 2.1891, - "grad_norm": 1.6707900762557983, - "learning_rate": 5e-05, - "epoch": 0.569571620669783, - "step": 8895 - }, - { - "loss": 2.1936, - "grad_norm": 1.6593118906021118, - "learning_rate": 5e-05, - "epoch": 0.5698917845937119, - "step": 8900 - }, - { - "loss": 2.2129, - "grad_norm": 1.666658639907837, - "learning_rate": 5e-05, - "epoch": 0.570211948517641, - "step": 8905 - }, - { - "loss": 2.193, - "grad_norm": 1.680935025215149, - "learning_rate": 5e-05, - "epoch": 0.5705321124415701, - "step": 8910 - }, - { - "loss": 2.2007, - "grad_norm": 1.6215225458145142, - "learning_rate": 5e-05, - "epoch": 0.5708522763654992, - "step": 8915 - }, - { - "loss": 2.2161, - "grad_norm": 1.716860294342041, - "learning_rate": 5e-05, - "epoch": 0.5711724402894282, - "step": 8920 - }, - { - "loss": 2.211, - "grad_norm": 1.6454654932022095, - "learning_rate": 5e-05, - "epoch": 0.5714926042133572, - "step": 8925 - }, - { - "loss": 2.1803, - "grad_norm": 1.6381767988204956, - "learning_rate": 5e-05, - "epoch": 0.5718127681372863, - "step": 8930 - }, - { - "loss": 2.2293, - "grad_norm": 1.6502015590667725, - "learning_rate": 5e-05, - "epoch": 0.5721329320612153, - "step": 8935 - }, - { - "loss": 2.2001, - "grad_norm": 1.691019892692566, - "learning_rate": 5e-05, - "epoch": 0.5724530959851444, - "step": 8940 - }, - { - "loss": 2.1875, - "grad_norm": 1.6835181713104248, - "learning_rate": 5e-05, - "epoch": 0.5727732599090735, - "step": 8945 - }, - { - "loss": 2.2026, - "grad_norm": 1.6180915832519531, - "learning_rate": 5e-05, - "epoch": 0.5730934238330025, - "step": 8950 - }, - { - "loss": 2.2328, - "grad_norm": 1.57301926612854, - "learning_rate": 5e-05, - "epoch": 0.5734135877569315, - "step": 8955 - }, - { - "loss": 2.2343, - "grad_norm": 1.712069034576416, - "learning_rate": 5e-05, - "epoch": 0.5737337516808606, - "step": 8960 - }, - { - "loss": 2.189, - "grad_norm": 1.712024211883545, - "learning_rate": 5e-05, - "epoch": 0.5740539156047897, - "step": 8965 - }, - { - "loss": 2.1816, - "grad_norm": 1.7028721570968628, - "learning_rate": 5e-05, - "epoch": 0.5743740795287187, - "step": 8970 - }, - { - "loss": 2.1809, - "grad_norm": 1.7358742952346802, - "learning_rate": 5e-05, - "epoch": 0.5746942434526477, - "step": 8975 - }, - { - "loss": 2.2039, - "grad_norm": 1.7282836437225342, - "learning_rate": 5e-05, - "epoch": 0.5750144073765768, - "step": 8980 - }, - { - "loss": 2.2086, - "grad_norm": 1.6507220268249512, - "learning_rate": 5e-05, - "epoch": 0.5753345713005058, - "step": 8985 - }, - { - "loss": 2.1958, - "grad_norm": 1.7284539937973022, - "learning_rate": 5e-05, - "epoch": 0.5756547352244349, - "step": 8990 - }, - { - "loss": 2.1792, - "grad_norm": 1.5841361284255981, - "learning_rate": 5e-05, - "epoch": 0.575974899148364, - "step": 8995 - }, - { - "loss": 2.1905, - "grad_norm": 1.7276769876480103, - "learning_rate": 5e-05, - "epoch": 0.5762950630722931, - "step": 9000 - }, - { - "eval_loss": 2.0714640617370605, - "eval_runtime": 9.0714, - "eval_samples_per_second": 225.763, - "eval_steps_per_second": 28.22, - "epoch": 0.5762950630722931, - "step": 9000 - }, - { - "loss": 2.217, - "grad_norm": 1.657918095588684, - "learning_rate": 5e-05, - "epoch": 0.576615226996222, - "step": 9005 - }, - { - "loss": 2.2101, - "grad_norm": 1.6392052173614502, - "learning_rate": 5e-05, - "epoch": 0.5769353909201511, - "step": 9010 - }, - { - "loss": 2.1809, - "grad_norm": 1.600339412689209, - "learning_rate": 5e-05, - "epoch": 0.5772555548440802, - "step": 9015 - }, - { - "loss": 2.1968, - "grad_norm": 1.6771150827407837, - "learning_rate": 5e-05, - "epoch": 0.5775757187680092, - "step": 9020 - }, - { - "loss": 2.2139, - "grad_norm": 1.615399718284607, - "learning_rate": 5e-05, - "epoch": 0.5778958826919383, - "step": 9025 - }, - { - "loss": 2.1459, - "grad_norm": 1.6415457725524902, - "learning_rate": 5e-05, - "epoch": 0.5782160466158673, - "step": 9030 - }, - { - "loss": 2.2141, - "grad_norm": 1.626848816871643, - "learning_rate": 5e-05, - "epoch": 0.5785362105397964, - "step": 9035 - }, - { - "loss": 2.1816, - "grad_norm": 1.6501692533493042, - "learning_rate": 5e-05, - "epoch": 0.5788563744637254, - "step": 9040 - }, - { - "loss": 2.2044, - "grad_norm": 1.6761474609375, - "learning_rate": 5e-05, - "epoch": 0.5791765383876545, - "step": 9045 - }, - { - "loss": 2.2251, - "grad_norm": 1.6290276050567627, - "learning_rate": 5e-05, - "epoch": 0.5794967023115836, - "step": 9050 - }, - { - "loss": 2.1981, - "grad_norm": 1.680826187133789, - "learning_rate": 5e-05, - "epoch": 0.5798168662355125, - "step": 9055 - }, - { - "loss": 2.2105, - "grad_norm": 1.649733066558838, - "learning_rate": 5e-05, - "epoch": 0.5801370301594416, - "step": 9060 - }, - { - "loss": 2.1951, - "grad_norm": 1.550475001335144, - "learning_rate": 5e-05, - "epoch": 0.5804571940833707, - "step": 9065 - }, - { - "loss": 2.1964, - "grad_norm": 1.6704422235488892, - "learning_rate": 5e-05, - "epoch": 0.5807773580072997, - "step": 9070 - }, - { - "loss": 2.1977, - "grad_norm": 1.5833699703216553, - "learning_rate": 5e-05, - "epoch": 0.5810975219312288, - "step": 9075 - }, - { - "loss": 2.2158, - "grad_norm": 1.6299668550491333, - "learning_rate": 5e-05, - "epoch": 0.5814176858551579, - "step": 9080 - }, - { - "loss": 2.1806, - "grad_norm": 1.6842360496520996, - "learning_rate": 5e-05, - "epoch": 0.5817378497790869, - "step": 9085 - }, - { - "loss": 2.2131, - "grad_norm": 1.6969107389450073, - "learning_rate": 5e-05, - "epoch": 0.5820580137030159, - "step": 9090 - }, - { - "loss": 2.2133, - "grad_norm": 1.7344791889190674, - "learning_rate": 5e-05, - "epoch": 0.582378177626945, - "step": 9095 - }, - { - "loss": 2.2099, - "grad_norm": 1.6542928218841553, - "learning_rate": 5e-05, - "epoch": 0.5826983415508741, - "step": 9100 - }, - { - "loss": 2.1778, - "grad_norm": 1.6566765308380127, - "learning_rate": 5e-05, - "epoch": 0.583018505474803, - "step": 9105 - }, - { - "loss": 2.1857, - "grad_norm": 1.6014552116394043, - "learning_rate": 5e-05, - "epoch": 0.5833386693987321, - "step": 9110 - }, - { - "loss": 2.2152, - "grad_norm": 1.6418790817260742, - "learning_rate": 5e-05, - "epoch": 0.5836588333226612, - "step": 9115 - }, - { - "loss": 2.1682, - "grad_norm": 1.601062536239624, - "learning_rate": 5e-05, - "epoch": 0.5839789972465903, - "step": 9120 - }, - { - "loss": 2.179, - "grad_norm": 1.721508502960205, - "learning_rate": 5e-05, - "epoch": 0.5842991611705193, - "step": 9125 - }, - { - "loss": 2.1789, - "grad_norm": 1.6199660301208496, - "learning_rate": 5e-05, - "epoch": 0.5846193250944484, - "step": 9130 - }, - { - "loss": 2.2258, - "grad_norm": 1.7283803224563599, - "learning_rate": 5e-05, - "epoch": 0.5849394890183774, - "step": 9135 - }, - { - "loss": 2.1966, - "grad_norm": 1.761349081993103, - "learning_rate": 5e-05, - "epoch": 0.5852596529423064, - "step": 9140 - }, - { - "loss": 2.1639, - "grad_norm": 1.6975593566894531, - "learning_rate": 5e-05, - "epoch": 0.5855798168662355, - "step": 9145 - }, - { - "loss": 2.1935, - "grad_norm": 1.672235369682312, - "learning_rate": 5e-05, - "epoch": 0.5858999807901646, - "step": 9150 - }, - { - "loss": 2.2, - "grad_norm": 1.6695904731750488, - "learning_rate": 5e-05, - "epoch": 0.5862201447140936, - "step": 9155 - }, - { - "loss": 2.2062, - "grad_norm": 1.6303057670593262, - "learning_rate": 5e-05, - "epoch": 0.5865403086380226, - "step": 9160 - }, - { - "loss": 2.2062, - "grad_norm": 1.704704761505127, - "learning_rate": 5e-05, - "epoch": 0.5868604725619517, - "step": 9165 - }, - { - "loss": 2.183, - "grad_norm": 1.5838806629180908, - "learning_rate": 5e-05, - "epoch": 0.5871806364858808, - "step": 9170 - }, - { - "loss": 2.2028, - "grad_norm": 1.6641960144042969, - "learning_rate": 5e-05, - "epoch": 0.5875008004098098, - "step": 9175 - }, - { - "loss": 2.2251, - "grad_norm": 1.6648904085159302, - "learning_rate": 5e-05, - "epoch": 0.5878209643337389, - "step": 9180 - }, - { - "loss": 2.2126, - "grad_norm": 1.7670952081680298, - "learning_rate": 5e-05, - "epoch": 0.588141128257668, - "step": 9185 - }, - { - "loss": 2.2139, - "grad_norm": 1.7805712223052979, - "learning_rate": 5e-05, - "epoch": 0.5884612921815969, - "step": 9190 - }, - { - "loss": 2.2055, - "grad_norm": 1.7651017904281616, - "learning_rate": 5e-05, - "epoch": 0.588781456105526, - "step": 9195 - }, - { - "loss": 2.1971, - "grad_norm": 1.8590534925460815, - "learning_rate": 5e-05, - "epoch": 0.5891016200294551, - "step": 9200 - }, - { - "eval_loss": 2.0600976943969727, - "eval_runtime": 9.0846, - "eval_samples_per_second": 225.436, - "eval_steps_per_second": 28.18, - "epoch": 0.5891016200294551, - "step": 9200 - }, - { - "loss": 2.1996, - "grad_norm": 1.6831485033035278, - "learning_rate": 5e-05, - "epoch": 0.5894217839533842, - "step": 9205 - }, - { - "loss": 2.1846, - "grad_norm": 1.6748493909835815, - "learning_rate": 5e-05, - "epoch": 0.5897419478773132, - "step": 9210 - }, - { - "loss": 2.1819, - "grad_norm": 1.6625423431396484, - "learning_rate": 5e-05, - "epoch": 0.5900621118012422, - "step": 9215 - }, - { - "loss": 2.2139, - "grad_norm": 1.6799538135528564, - "learning_rate": 5e-05, - "epoch": 0.5903822757251713, - "step": 9220 - }, - { - "loss": 2.2051, - "grad_norm": 1.723824143409729, - "learning_rate": 5e-05, - "epoch": 0.5907024396491003, - "step": 9225 - }, - { - "loss": 2.1924, - "grad_norm": 1.634695291519165, - "learning_rate": 5e-05, - "epoch": 0.5910226035730294, - "step": 9230 - }, - { - "loss": 2.1907, - "grad_norm": 1.6800014972686768, - "learning_rate": 5e-05, - "epoch": 0.5913427674969585, - "step": 9235 - }, - { - "loss": 2.1846, - "grad_norm": 1.5842480659484863, - "learning_rate": 5e-05, - "epoch": 0.5916629314208874, - "step": 9240 - }, - { - "loss": 2.1945, - "grad_norm": 1.6501984596252441, - "learning_rate": 5e-05, - "epoch": 0.5919830953448165, - "step": 9245 - }, - { - "loss": 2.2027, - "grad_norm": 1.6772314310073853, - "learning_rate": 5e-05, - "epoch": 0.5923032592687456, - "step": 9250 - }, - { - "loss": 2.1834, - "grad_norm": 1.6418979167938232, - "learning_rate": 5e-05, - "epoch": 0.5926234231926747, - "step": 9255 - }, - { - "loss": 2.1975, - "grad_norm": 1.6140997409820557, - "learning_rate": 5e-05, - "epoch": 0.5929435871166037, - "step": 9260 - }, - { - "loss": 2.2261, - "grad_norm": 1.6440484523773193, - "learning_rate": 5e-05, - "epoch": 0.5932637510405327, - "step": 9265 - }, - { - "loss": 2.1802, - "grad_norm": 1.642822027206421, - "learning_rate": 5e-05, - "epoch": 0.5935839149644618, - "step": 9270 - }, - { - "loss": 2.1736, - "grad_norm": 1.622124433517456, - "learning_rate": 5e-05, - "epoch": 0.5939040788883908, - "step": 9275 - }, - { - "loss": 2.1894, - "grad_norm": 1.6651124954223633, - "learning_rate": 5e-05, - "epoch": 0.5942242428123199, - "step": 9280 - }, - { - "loss": 2.1898, - "grad_norm": 1.6245893239974976, - "learning_rate": 5e-05, - "epoch": 0.594544406736249, - "step": 9285 - }, - { - "loss": 2.1863, - "grad_norm": 1.5905382633209229, - "learning_rate": 5e-05, - "epoch": 0.5948645706601781, - "step": 9290 - }, - { - "loss": 2.1795, - "grad_norm": 1.688947319984436, - "learning_rate": 5e-05, - "epoch": 0.595184734584107, - "step": 9295 - }, - { - "loss": 2.2024, - "grad_norm": 1.6358672380447388, - "learning_rate": 5e-05, - "epoch": 0.5955048985080361, - "step": 9300 - }, - { - "loss": 2.1899, - "grad_norm": 1.6548181772232056, - "learning_rate": 5e-05, - "epoch": 0.5958250624319652, - "step": 9305 - }, - { - "loss": 2.1889, - "grad_norm": 1.612070918083191, - "learning_rate": 5e-05, - "epoch": 0.5961452263558942, - "step": 9310 - }, - { - "loss": 2.2026, - "grad_norm": 1.6708488464355469, - "learning_rate": 5e-05, - "epoch": 0.5964653902798233, - "step": 9315 - }, - { - "loss": 2.1895, - "grad_norm": 1.5530942678451538, - "learning_rate": 5e-05, - "epoch": 0.5967855542037523, - "step": 9320 - }, - { - "loss": 2.2076, - "grad_norm": 1.713191270828247, - "learning_rate": 5e-05, - "epoch": 0.5971057181276813, - "step": 9325 - }, - { - "loss": 2.2008, - "grad_norm": 1.6353609561920166, - "learning_rate": 5e-05, - "epoch": 0.5974258820516104, - "step": 9330 - }, - { - "loss": 2.1978, - "grad_norm": 1.6699703931808472, - "learning_rate": 5e-05, - "epoch": 0.5977460459755395, - "step": 9335 - }, - { - "loss": 2.2398, - "grad_norm": 1.6570907831192017, - "learning_rate": 5e-05, - "epoch": 0.5980662098994686, - "step": 9340 - }, - { - "loss": 2.1825, - "grad_norm": 1.5736383199691772, - "learning_rate": 5e-05, - "epoch": 0.5983863738233975, - "step": 9345 - }, - { - "loss": 2.1911, - "grad_norm": 1.6274065971374512, - "learning_rate": 5e-05, - "epoch": 0.5987065377473266, - "step": 9350 - }, - { - "loss": 2.2171, - "grad_norm": 1.6303297281265259, - "learning_rate": 5e-05, - "epoch": 0.5990267016712557, - "step": 9355 - }, - { - "loss": 2.2006, - "grad_norm": 1.5498477220535278, - "learning_rate": 5e-05, - "epoch": 0.5993468655951847, - "step": 9360 - }, - { - "loss": 2.2076, - "grad_norm": 1.6340336799621582, - "learning_rate": 5e-05, - "epoch": 0.5996670295191138, - "step": 9365 - }, - { - "loss": 2.197, - "grad_norm": 1.693532943725586, - "learning_rate": 5e-05, - "epoch": 0.5999871934430429, - "step": 9370 - }, - { - "loss": 2.1982, - "grad_norm": 1.7246029376983643, - "learning_rate": 5e-05, - "epoch": 0.6003073573669719, - "step": 9375 - }, - { - "loss": 2.1855, - "grad_norm": 1.6022217273712158, - "learning_rate": 5e-05, - "epoch": 0.6006275212909009, - "step": 9380 - }, - { - "loss": 2.1501, - "grad_norm": 1.6127444505691528, - "learning_rate": 5e-05, - "epoch": 0.60094768521483, - "step": 9385 - }, - { - "loss": 2.1776, - "grad_norm": 1.6102303266525269, - "learning_rate": 5e-05, - "epoch": 0.6012678491387591, - "step": 9390 - }, - { - "loss": 2.1867, - "grad_norm": 1.6318905353546143, - "learning_rate": 5e-05, - "epoch": 0.601588013062688, - "step": 9395 - }, - { - "loss": 2.1573, - "grad_norm": 1.6750482320785522, - "learning_rate": 5e-05, - "epoch": 0.6019081769866171, - "step": 9400 - }, - { - "eval_loss": 2.0679101943969727, - "eval_runtime": 9.4704, - "eval_samples_per_second": 216.253, - "eval_steps_per_second": 27.032, - "epoch": 0.6019081769866171, - "step": 9400 - }, - { - "loss": 2.1869, - "grad_norm": 1.6901905536651611, - "learning_rate": 5e-05, - "epoch": 0.6022283409105462, - "step": 9405 - }, - { - "loss": 2.1978, - "grad_norm": 1.6613404750823975, - "learning_rate": 5e-05, - "epoch": 0.6025485048344752, - "step": 9410 - }, - { - "loss": 2.2079, - "grad_norm": 1.7926342487335205, - "learning_rate": 5e-05, - "epoch": 0.6028686687584043, - "step": 9415 - }, - { - "loss": 2.2059, - "grad_norm": 1.624189019203186, - "learning_rate": 5e-05, - "epoch": 0.6031888326823334, - "step": 9420 - }, - { - "loss": 2.1925, - "grad_norm": 1.628775715827942, - "learning_rate": 5e-05, - "epoch": 0.6035089966062624, - "step": 9425 - }, - { - "loss": 2.199, - "grad_norm": 1.6382794380187988, - "learning_rate": 5e-05, - "epoch": 0.6038291605301914, - "step": 9430 - }, - { - "loss": 2.174, - "grad_norm": 1.6540191173553467, - "learning_rate": 5e-05, - "epoch": 0.6041493244541205, - "step": 9435 - }, - { - "loss": 2.2111, - "grad_norm": 1.689916729927063, - "learning_rate": 5e-05, - "epoch": 0.6044694883780496, - "step": 9440 - }, - { - "loss": 2.163, - "grad_norm": 1.7421029806137085, - "learning_rate": 5e-05, - "epoch": 0.6047896523019786, - "step": 9445 - }, - { - "loss": 2.1926, - "grad_norm": 1.6341474056243896, - "learning_rate": 5e-05, - "epoch": 0.6051098162259076, - "step": 9450 - }, - { - "loss": 2.1649, - "grad_norm": 1.5881348848342896, - "learning_rate": 5e-05, - "epoch": 0.6054299801498367, - "step": 9455 - }, - { - "loss": 2.199, - "grad_norm": 1.727407693862915, - "learning_rate": 5e-05, - "epoch": 0.6057501440737658, - "step": 9460 - }, - { - "loss": 2.1982, - "grad_norm": 1.7314342260360718, - "learning_rate": 5e-05, - "epoch": 0.6060703079976948, - "step": 9465 - }, - { - "loss": 2.1971, - "grad_norm": 1.656754732131958, - "learning_rate": 5e-05, - "epoch": 0.6063904719216239, - "step": 9470 - }, - { - "loss": 2.1756, - "grad_norm": 1.6005347967147827, - "learning_rate": 5e-05, - "epoch": 0.606710635845553, - "step": 9475 - }, - { - "loss": 2.1881, - "grad_norm": 1.5883443355560303, - "learning_rate": 5e-05, - "epoch": 0.6070307997694819, - "step": 9480 - }, - { - "loss": 2.1901, - "grad_norm": 1.8181809186935425, - "learning_rate": 5e-05, - "epoch": 0.607350963693411, - "step": 9485 - }, - { - "loss": 2.1893, - "grad_norm": 1.8270716667175293, - "learning_rate": 5e-05, - "epoch": 0.6076711276173401, - "step": 9490 - }, - { - "loss": 2.2014, - "grad_norm": 1.7661374807357788, - "learning_rate": 5e-05, - "epoch": 0.6079912915412691, - "step": 9495 - }, - { - "loss": 2.2077, - "grad_norm": 1.6404733657836914, - "learning_rate": 5e-05, - "epoch": 0.6083114554651982, - "step": 9500 - }, - { - "loss": 2.1991, - "grad_norm": 1.6874150037765503, - "learning_rate": 5e-05, - "epoch": 0.6086316193891272, - "step": 9505 - }, - { - "loss": 2.212, - "grad_norm": 1.7771927118301392, - "learning_rate": 5e-05, - "epoch": 0.6089517833130563, - "step": 9510 - }, - { - "loss": 2.1921, - "grad_norm": 1.7601332664489746, - "learning_rate": 5e-05, - "epoch": 0.6092719472369853, - "step": 9515 - }, - { - "loss": 2.1887, - "grad_norm": 1.7171956300735474, - "learning_rate": 5e-05, - "epoch": 0.6095921111609144, - "step": 9520 - }, - { - "loss": 2.2107, - "grad_norm": 1.5932697057724, - "learning_rate": 5e-05, - "epoch": 0.6099122750848435, - "step": 9525 - }, - { - "loss": 2.1952, - "grad_norm": 1.6156558990478516, - "learning_rate": 5e-05, - "epoch": 0.6102324390087724, - "step": 9530 - }, - { - "loss": 2.2223, - "grad_norm": 1.643193244934082, - "learning_rate": 5e-05, - "epoch": 0.6105526029327015, - "step": 9535 - }, - { - "loss": 2.1795, - "grad_norm": 1.7439320087432861, - "learning_rate": 5e-05, - "epoch": 0.6108727668566306, - "step": 9540 - }, - { - "loss": 2.2021, - "grad_norm": 1.6661970615386963, - "learning_rate": 5e-05, - "epoch": 0.6111929307805597, - "step": 9545 - }, - { - "loss": 2.1948, - "grad_norm": 1.6624956130981445, - "learning_rate": 5e-05, - "epoch": 0.6115130947044887, - "step": 9550 - }, - { - "loss": 2.191, - "grad_norm": 1.6625850200653076, - "learning_rate": 5e-05, - "epoch": 0.6118332586284178, - "step": 9555 - }, - { - "loss": 2.1988, - "grad_norm": 1.6776957511901855, - "learning_rate": 5e-05, - "epoch": 0.6121534225523468, - "step": 9560 - }, - { - "loss": 2.2195, - "grad_norm": 1.6892213821411133, - "learning_rate": 5e-05, - "epoch": 0.6124735864762758, - "step": 9565 - }, - { - "loss": 2.2082, - "grad_norm": 1.7207622528076172, - "learning_rate": 5e-05, - "epoch": 0.6127937504002049, - "step": 9570 - }, - { - "loss": 2.1844, - "grad_norm": 1.6284791231155396, - "learning_rate": 5e-05, - "epoch": 0.613113914324134, - "step": 9575 - }, - { - "loss": 2.199, - "grad_norm": 1.6319007873535156, - "learning_rate": 5e-05, - "epoch": 0.613434078248063, - "step": 9580 - }, - { - "loss": 2.1955, - "grad_norm": 1.634732723236084, - "learning_rate": 5e-05, - "epoch": 0.613754242171992, - "step": 9585 - }, - { - "loss": 2.1924, - "grad_norm": 1.7350897789001465, - "learning_rate": 5e-05, - "epoch": 0.6140744060959211, - "step": 9590 - }, - { - "loss": 2.1722, - "grad_norm": 1.6487643718719482, - "learning_rate": 5e-05, - "epoch": 0.6143945700198502, - "step": 9595 - }, - { - "loss": 2.1732, - "grad_norm": 1.678441047668457, - "learning_rate": 5e-05, - "epoch": 0.6147147339437792, - "step": 9600 - }, - { - "eval_loss": 2.0491390228271484, - "eval_runtime": 10.1854, - "eval_samples_per_second": 201.071, - "eval_steps_per_second": 25.134, - "epoch": 0.6147147339437792, - "step": 9600 - }, - { - "loss": 2.2301, - "grad_norm": 1.7390731573104858, - "learning_rate": 5e-05, - "epoch": 0.6150348978677083, - "step": 9605 - }, - { - "loss": 2.2033, - "grad_norm": 1.70026433467865, - "learning_rate": 5e-05, - "epoch": 0.6153550617916373, - "step": 9610 - }, - { - "loss": 2.2143, - "grad_norm": 1.6489602327346802, - "learning_rate": 5e-05, - "epoch": 0.6156752257155663, - "step": 9615 - }, - { - "loss": 2.1629, - "grad_norm": 1.707454800605774, - "learning_rate": 5e-05, - "epoch": 0.6159953896394954, - "step": 9620 - }, - { - "loss": 2.1867, - "grad_norm": 1.7108656167984009, - "learning_rate": 5e-05, - "epoch": 0.6163155535634245, - "step": 9625 - }, - { - "loss": 2.1792, - "grad_norm": 1.6427521705627441, - "learning_rate": 5e-05, - "epoch": 0.6166357174873536, - "step": 9630 - }, - { - "loss": 2.193, - "grad_norm": 1.6153441667556763, - "learning_rate": 5e-05, - "epoch": 0.6169558814112825, - "step": 9635 - }, - { - "loss": 2.1779, - "grad_norm": 1.5934603214263916, - "learning_rate": 5e-05, - "epoch": 0.6172760453352116, - "step": 9640 - }, - { - "loss": 2.1825, - "grad_norm": 1.7031588554382324, - "learning_rate": 5e-05, - "epoch": 0.6175962092591407, - "step": 9645 - }, - { - "loss": 2.1919, - "grad_norm": 1.7012118101119995, - "learning_rate": 5e-05, - "epoch": 0.6179163731830697, - "step": 9650 - }, - { - "loss": 2.1638, - "grad_norm": 1.695015549659729, - "learning_rate": 5e-05, - "epoch": 0.6182365371069988, - "step": 9655 - }, - { - "loss": 2.191, - "grad_norm": 1.6796445846557617, - "learning_rate": 5e-05, - "epoch": 0.6185567010309279, - "step": 9660 - }, - { - "loss": 2.2073, - "grad_norm": 1.71908700466156, - "learning_rate": 5e-05, - "epoch": 0.6188768649548568, - "step": 9665 - }, - { - "loss": 2.222, - "grad_norm": 1.6226277351379395, - "learning_rate": 5e-05, - "epoch": 0.6191970288787859, - "step": 9670 - }, - { - "loss": 2.1851, - "grad_norm": 1.6331515312194824, - "learning_rate": 5e-05, - "epoch": 0.619517192802715, - "step": 9675 - }, - { - "loss": 2.2073, - "grad_norm": 1.7072153091430664, - "learning_rate": 5e-05, - "epoch": 0.6198373567266441, - "step": 9680 - }, - { - "loss": 2.1987, - "grad_norm": 1.7724852561950684, - "learning_rate": 5e-05, - "epoch": 0.6201575206505731, - "step": 9685 - }, - { - "loss": 2.2316, - "grad_norm": 1.5883903503417969, - "learning_rate": 5e-05, - "epoch": 0.6204776845745021, - "step": 9690 - }, - { - "loss": 2.1984, - "grad_norm": 1.6855353116989136, - "learning_rate": 5e-05, - "epoch": 0.6207978484984312, - "step": 9695 - }, - { - "loss": 2.1934, - "grad_norm": 1.6740260124206543, - "learning_rate": 5e-05, - "epoch": 0.6211180124223602, - "step": 9700 - }, - { - "loss": 2.1905, - "grad_norm": 1.5806589126586914, - "learning_rate": 5e-05, - "epoch": 0.6214381763462893, - "step": 9705 - }, - { - "loss": 2.1811, - "grad_norm": 1.7320712804794312, - "learning_rate": 5e-05, - "epoch": 0.6217583402702184, - "step": 9710 - }, - { - "loss": 2.1561, - "grad_norm": 1.6316450834274292, - "learning_rate": 5e-05, - "epoch": 0.6220785041941475, - "step": 9715 - }, - { - "loss": 2.1887, - "grad_norm": 1.693595290184021, - "learning_rate": 5e-05, - "epoch": 0.6223986681180764, - "step": 9720 - }, - { - "loss": 2.1818, - "grad_norm": 1.6201201677322388, - "learning_rate": 5e-05, - "epoch": 0.6227188320420055, - "step": 9725 - }, - { - "loss": 2.1803, - "grad_norm": 1.6515777111053467, - "learning_rate": 5e-05, - "epoch": 0.6230389959659346, - "step": 9730 - }, - { - "loss": 2.2073, - "grad_norm": 1.6333870887756348, - "learning_rate": 5e-05, - "epoch": 0.6233591598898636, - "step": 9735 - }, - { - "loss": 2.2036, - "grad_norm": 1.6375046968460083, - "learning_rate": 5e-05, - "epoch": 0.6236793238137927, - "step": 9740 - }, - { - "loss": 2.1786, - "grad_norm": 1.6964610815048218, - "learning_rate": 5e-05, - "epoch": 0.6239994877377217, - "step": 9745 - }, - { - "loss": 2.1623, - "grad_norm": 1.571964144706726, - "learning_rate": 5e-05, - "epoch": 0.6243196516616507, - "step": 9750 - }, - { - "loss": 2.1862, - "grad_norm": 1.5701056718826294, - "learning_rate": 5e-05, - "epoch": 0.6246398155855798, - "step": 9755 - }, - { - "loss": 2.2058, - "grad_norm": 1.6287676095962524, - "learning_rate": 5e-05, - "epoch": 0.6249599795095089, - "step": 9760 - }, - { - "loss": 2.1764, - "grad_norm": 1.6932698488235474, - "learning_rate": 5e-05, - "epoch": 0.625280143433438, - "step": 9765 - }, - { - "loss": 2.1822, - "grad_norm": 1.6898211240768433, - "learning_rate": 5e-05, - "epoch": 0.6256003073573669, - "step": 9770 - }, - { - "loss": 2.196, - "grad_norm": 1.8175290822982788, - "learning_rate": 5e-05, - "epoch": 0.625920471281296, - "step": 9775 - }, - { - "loss": 2.1976, - "grad_norm": 1.6941791772842407, - "learning_rate": 5e-05, - "epoch": 0.6262406352052251, - "step": 9780 - }, - { - "loss": 2.202, - "grad_norm": 1.6704522371292114, - "learning_rate": 5e-05, - "epoch": 0.6265607991291541, - "step": 9785 - }, - { - "loss": 2.1804, - "grad_norm": 1.6961215734481812, - "learning_rate": 5e-05, - "epoch": 0.6268809630530832, - "step": 9790 - }, - { - "loss": 2.1622, - "grad_norm": 1.6616603136062622, - "learning_rate": 5e-05, - "epoch": 0.6272011269770122, - "step": 9795 - }, - { - "loss": 2.2176, - "grad_norm": 1.7092373371124268, - "learning_rate": 5e-05, - "epoch": 0.6275212909009413, - "step": 9800 - }, - { - "eval_loss": 2.066584587097168, - "eval_runtime": 9.576, - "eval_samples_per_second": 213.869, - "eval_steps_per_second": 26.734, - "epoch": 0.6275212909009413, - "step": 9800 - }, - { - "loss": 2.1975, - "grad_norm": 1.7031360864639282, - "learning_rate": 5e-05, - "epoch": 0.6278414548248703, - "step": 9805 - }, - { - "loss": 2.1963, - "grad_norm": 1.5917502641677856, - "learning_rate": 5e-05, - "epoch": 0.6281616187487994, - "step": 9810 - }, - { - "loss": 2.1941, - "grad_norm": 1.6682829856872559, - "learning_rate": 5e-05, - "epoch": 0.6284817826727285, - "step": 9815 - }, - { - "loss": 2.1917, - "grad_norm": 1.5641679763793945, - "learning_rate": 5e-05, - "epoch": 0.6288019465966574, - "step": 9820 - }, - { - "loss": 2.1773, - "grad_norm": 1.645300269126892, - "learning_rate": 5e-05, - "epoch": 0.6291221105205865, - "step": 9825 - }, - { - "loss": 2.1631, - "grad_norm": 1.5623157024383545, - "learning_rate": 5e-05, - "epoch": 0.6294422744445156, - "step": 9830 - }, - { - "loss": 2.1714, - "grad_norm": 1.5632286071777344, - "learning_rate": 5e-05, - "epoch": 0.6297624383684447, - "step": 9835 - }, - { - "loss": 2.2008, - "grad_norm": 1.5870970487594604, - "learning_rate": 5e-05, - "epoch": 0.6300826022923737, - "step": 9840 - }, - { - "loss": 2.177, - "grad_norm": 1.6274374723434448, - "learning_rate": 5e-05, - "epoch": 0.6304027662163028, - "step": 9845 - }, - { - "loss": 2.1598, - "grad_norm": 1.6094468832015991, - "learning_rate": 5e-05, - "epoch": 0.6307229301402318, - "step": 9850 - }, - { - "loss": 2.2126, - "grad_norm": 1.6894714832305908, - "learning_rate": 5e-05, - "epoch": 0.6310430940641608, - "step": 9855 - }, - { - "loss": 2.182, - "grad_norm": 1.6404249668121338, - "learning_rate": 5e-05, - "epoch": 0.6313632579880899, - "step": 9860 - }, - { - "loss": 2.1884, - "grad_norm": 1.5737414360046387, - "learning_rate": 5e-05, - "epoch": 0.631683421912019, - "step": 9865 - }, - { - "loss": 2.1913, - "grad_norm": 1.6798261404037476, - "learning_rate": 5e-05, - "epoch": 0.632003585835948, - "step": 9870 - }, - { - "loss": 2.1629, - "grad_norm": 1.6156235933303833, - "learning_rate": 5e-05, - "epoch": 0.632323749759877, - "step": 9875 - }, - { - "loss": 2.1785, - "grad_norm": 1.7109487056732178, - "learning_rate": 5e-05, - "epoch": 0.6326439136838061, - "step": 9880 - }, - { - "loss": 2.211, - "grad_norm": 1.6673591136932373, - "learning_rate": 5e-05, - "epoch": 0.6329640776077352, - "step": 9885 - }, - { - "loss": 2.2038, - "grad_norm": 1.6591060161590576, - "learning_rate": 5e-05, - "epoch": 0.6332842415316642, - "step": 9890 - }, - { - "loss": 2.1607, - "grad_norm": 1.6050101518630981, - "learning_rate": 5e-05, - "epoch": 0.6336044054555933, - "step": 9895 - }, - { - "loss": 2.1672, - "grad_norm": 1.6460251808166504, - "learning_rate": 5e-05, - "epoch": 0.6339245693795224, - "step": 9900 - }, - { - "loss": 2.1672, - "grad_norm": 1.7413328886032104, - "learning_rate": 5e-05, - "epoch": 0.6342447333034513, - "step": 9905 - }, - { - "loss": 2.2128, - "grad_norm": 1.684525966644287, - "learning_rate": 5e-05, - "epoch": 0.6345648972273804, - "step": 9910 - }, - { - "loss": 2.1987, - "grad_norm": 1.6904735565185547, - "learning_rate": 5e-05, - "epoch": 0.6348850611513095, - "step": 9915 - }, - { - "loss": 2.1979, - "grad_norm": 1.6782430410385132, - "learning_rate": 5e-05, - "epoch": 0.6352052250752386, - "step": 9920 - }, - { - "loss": 2.1839, - "grad_norm": 1.7190313339233398, - "learning_rate": 5e-05, - "epoch": 0.6355253889991676, - "step": 9925 - }, - { - "loss": 2.1885, - "grad_norm": 1.745063304901123, - "learning_rate": 5e-05, - "epoch": 0.6358455529230966, - "step": 9930 - }, - { - "loss": 2.174, - "grad_norm": 1.6339811086654663, - "learning_rate": 5e-05, - "epoch": 0.6361657168470257, - "step": 9935 - }, - { - "loss": 2.197, - "grad_norm": 1.6524280309677124, - "learning_rate": 5e-05, - "epoch": 0.6364858807709547, - "step": 9940 - }, - { - "loss": 2.1844, - "grad_norm": 1.7359994649887085, - "learning_rate": 5e-05, - "epoch": 0.6368060446948838, - "step": 9945 - }, - { - "loss": 2.1894, - "grad_norm": 1.6910420656204224, - "learning_rate": 5e-05, - "epoch": 0.6371262086188129, - "step": 9950 - }, - { - "loss": 2.1981, - "grad_norm": 1.6106345653533936, - "learning_rate": 5e-05, - "epoch": 0.6374463725427418, - "step": 9955 - }, - { - "loss": 2.2047, - "grad_norm": 1.6369112730026245, - "learning_rate": 5e-05, - "epoch": 0.6377665364666709, - "step": 9960 - }, - { - "loss": 2.2073, - "grad_norm": 1.6089766025543213, - "learning_rate": 5e-05, - "epoch": 0.6380867003906, - "step": 9965 - }, - { - "loss": 2.184, - "grad_norm": 1.7142517566680908, - "learning_rate": 5e-05, - "epoch": 0.6384068643145291, - "step": 9970 - }, - { - "loss": 2.1588, - "grad_norm": 1.6717356443405151, - "learning_rate": 5e-05, - "epoch": 0.6387270282384581, - "step": 9975 - }, - { - "loss": 2.1723, - "grad_norm": 1.7235606908798218, - "learning_rate": 5e-05, - "epoch": 0.6390471921623871, - "step": 9980 - }, - { - "loss": 2.201, - "grad_norm": 1.6770853996276855, - "learning_rate": 5e-05, - "epoch": 0.6393673560863162, - "step": 9985 - }, - { - "loss": 2.1764, - "grad_norm": 1.6714833974838257, - "learning_rate": 5e-05, - "epoch": 0.6396875200102452, - "step": 9990 - }, - { - "loss": 2.1779, - "grad_norm": 1.587377905845642, - "learning_rate": 5e-05, - "epoch": 0.6400076839341743, - "step": 9995 - }, - { - "loss": 2.1861, - "grad_norm": 1.6190916299819946, - "learning_rate": 5e-05, - "epoch": 0.6403278478581034, - "step": 10000 - }, - { - "eval_loss": 2.057950258255005, - "eval_runtime": 9.0347, - "eval_samples_per_second": 226.682, - "eval_steps_per_second": 28.335, - "epoch": 0.6403278478581034, - "step": 10000 - }, - { - "loss": 2.2194, - "grad_norm": 1.6693397760391235, - "learning_rate": 5e-05, - "epoch": 0.6406480117820325, - "step": 10005 - }, - { - "loss": 2.1777, - "grad_norm": 1.6318798065185547, - "learning_rate": 5e-05, - "epoch": 0.6409681757059614, - "step": 10010 - }, - { - "loss": 2.1711, - "grad_norm": 1.5991014242172241, - "learning_rate": 5e-05, - "epoch": 0.6412883396298905, - "step": 10015 - }, - { - "loss": 2.1958, - "grad_norm": 1.601650595664978, - "learning_rate": 5e-05, - "epoch": 0.6416085035538196, - "step": 10020 - }, - { - "loss": 2.1848, - "grad_norm": 1.6435421705245972, - "learning_rate": 5e-05, - "epoch": 0.6419286674777486, - "step": 10025 - }, - { - "loss": 2.1893, - "grad_norm": 1.6513253450393677, - "learning_rate": 5e-05, - "epoch": 0.6422488314016777, - "step": 10030 - }, - { - "loss": 2.192, - "grad_norm": 1.7683993577957153, - "learning_rate": 5e-05, - "epoch": 0.6425689953256067, - "step": 10035 - }, - { - "loss": 2.1817, - "grad_norm": 1.6842045783996582, - "learning_rate": 5e-05, - "epoch": 0.6428891592495357, - "step": 10040 - }, - { - "loss": 2.1972, - "grad_norm": 1.6233887672424316, - "learning_rate": 5e-05, - "epoch": 0.6432093231734648, - "step": 10045 - }, - { - "loss": 2.179, - "grad_norm": 1.659524917602539, - "learning_rate": 5e-05, - "epoch": 0.6435294870973939, - "step": 10050 - }, - { - "loss": 2.1826, - "grad_norm": 1.620766282081604, - "learning_rate": 5e-05, - "epoch": 0.643849651021323, - "step": 10055 - }, - { - "loss": 2.1796, - "grad_norm": 1.7828584909439087, - "learning_rate": 5e-05, - "epoch": 0.6441698149452519, - "step": 10060 + "loss": 440.6308, + "grad_norm": 98.61355590820312, + "learning_rate": 0.0004999990058793643, + "epoch": 0.0008969184444871549, + "step": 1 }, { - "loss": 2.1733, - "grad_norm": 1.6462182998657227, - "learning_rate": 5e-05, - "epoch": 0.644489978869181, - "step": 10065 + "loss": 515.3978, + "grad_norm": 1112.0234375, + "learning_rate": 0.0004999960235253631, + "epoch": 0.0017938368889743098, + "step": 2 }, { - "loss": 2.1725, - "grad_norm": 1.6294734477996826, - "learning_rate": 5e-05, - "epoch": 0.6448101427931101, - "step": 10070 + "loss": 477.4767, + "grad_norm": 392.8102722167969, + "learning_rate": 0.0004999910529617153, + "epoch": 0.0026907553334614646, + "step": 3 }, { - "loss": 2.1705, - "grad_norm": 1.6464056968688965, - "learning_rate": 5e-05, - "epoch": 0.6451303067170391, - "step": 10075 + "loss": 457.2771, + "grad_norm": 292.9400939941406, + "learning_rate": 0.0004999840942279514, + "epoch": 0.0035876737779486196, + "step": 4 }, { - "loss": 2.1945, - "grad_norm": 1.6422755718231201, - "learning_rate": 5e-05, - "epoch": 0.6454504706409682, - "step": 10080 - }, - { - "loss": 2.1755, - "grad_norm": 1.6333072185516357, - "learning_rate": 5e-05, - "epoch": 0.6457706345648973, - "step": 10085 - }, - { - "loss": 2.1642, - "grad_norm": 1.6198492050170898, - "learning_rate": 5e-05, - "epoch": 0.6460907984888263, - "step": 10090 - }, - { - "loss": 2.1547, - "grad_norm": 1.7265499830245972, - "learning_rate": 5e-05, - "epoch": 0.6464109624127553, - "step": 10095 - }, - { - "loss": 2.1862, - "grad_norm": 1.6740344762802124, - "learning_rate": 5e-05, - "epoch": 0.6467311263366844, - "step": 10100 - }, - { - "loss": 2.1726, - "grad_norm": 1.6273069381713867, - "learning_rate": 5e-05, - "epoch": 0.6470512902606135, - "step": 10105 - }, - { - "loss": 2.1927, - "grad_norm": 1.6094673871994019, - "learning_rate": 5e-05, - "epoch": 0.6473714541845424, - "step": 10110 + "loss": 444.411, + "grad_norm": 166.66598510742188, + "learning_rate": 0.000499975147379414, + "epoch": 0.004484592222435775, + "step": 5 }, { - "loss": 2.1617, - "grad_norm": 1.621466040611267, - "learning_rate": 5e-05, - "epoch": 0.6476916181084715, - "step": 10115 + "loss": 438.7729, + "grad_norm": 132.8984375, + "learning_rate": 0.000499964212487257, + "epoch": 0.005381510666922929, + "step": 6 }, { - "loss": 2.1788, - "grad_norm": 1.7757426500320435, - "learning_rate": 5e-05, - "epoch": 0.6480117820324006, - "step": 10120 + "loss": 434.4058, + "grad_norm": 102.88407135009766, + "learning_rate": 0.0004999512896384454, + "epoch": 0.006278429111410084, + "step": 7 }, { - "loss": 2.2063, - "grad_norm": 1.6837197542190552, - "learning_rate": 5e-05, - "epoch": 0.6483319459563296, - "step": 10125 + "loss": 431.3428, + "grad_norm": 109.61495971679688, + "learning_rate": 0.0004999363789357541, + "epoch": 0.007175347555897239, + "step": 8 }, { - "loss": 2.2046, - "grad_norm": 1.810106635093689, - "learning_rate": 5e-05, - "epoch": 0.6486521098802587, - "step": 10130 + "loss": 430.6904, + "grad_norm": 94.442626953125, + "learning_rate": 0.0004999194804977674, + "epoch": 0.008072266000384394, + "step": 9 }, { - "loss": 2.1667, - "grad_norm": 1.6960062980651855, - "learning_rate": 5e-05, - "epoch": 0.6489722738041878, - "step": 10135 + "loss": 427.7128, + "grad_norm": 79.10123443603516, + "learning_rate": 0.0004999005944588778, + "epoch": 0.00896918444487155, + "step": 10 }, { - "loss": 2.1799, - "grad_norm": 1.574356198310852, - "learning_rate": 5e-05, - "epoch": 0.6492924377281168, - "step": 10140 + "loss": 430.6295, + "grad_norm": 81.77398681640625, + "learning_rate": 0.0004998797209692856, + "epoch": 0.009866102889358703, + "step": 11 }, { - "loss": 2.1849, - "grad_norm": 1.5934480428695679, - "learning_rate": 5e-05, - "epoch": 0.6496126016520458, - "step": 10145 + "loss": 422.6068, + "grad_norm": 67.85909271240234, + "learning_rate": 0.0004998568601949967, + "epoch": 0.010763021333845858, + "step": 12 }, { - "loss": 2.1843, - "grad_norm": 1.5735338926315308, - "learning_rate": 5e-05, - "epoch": 0.6499327655759749, - "step": 10150 + "loss": 422.3798, + "grad_norm": 81.51007843017578, + "learning_rate": 0.0004998320123178223, + "epoch": 0.011659939778333014, + "step": 13 }, { - "loss": 2.1852, - "grad_norm": 1.611663818359375, - "learning_rate": 5e-05, - "epoch": 0.650252929499904, - "step": 10155 + "loss": 423.3609, + "grad_norm": 70.7045669555664, + "learning_rate": 0.0004998051775353763, + "epoch": 0.012556858222820167, + "step": 14 }, { - "loss": 2.2181, - "grad_norm": 1.699562430381775, - "learning_rate": 5e-05, - "epoch": 0.650573093423833, - "step": 10160 + "loss": 423.232, + "grad_norm": 75.2995834350586, + "learning_rate": 0.0004997763560610752, + "epoch": 0.013453776667307323, + "step": 15 }, { - "loss": 2.1745, - "grad_norm": 1.612734079360962, - "learning_rate": 5e-05, - "epoch": 0.650893257347762, - "step": 10165 + "loss": 414.7621, + "grad_norm": 63.627197265625, + "learning_rate": 0.000499745548124135, + "epoch": 0.014350695111794478, + "step": 16 }, { - "loss": 2.1816, - "grad_norm": 1.620051622390747, - "learning_rate": 5e-05, - "epoch": 0.6512134212716911, - "step": 10170 + "loss": 419.0351, + "grad_norm": 73.96087646484375, + "learning_rate": 0.0004997127539695701, + "epoch": 0.015247613556281632, + "step": 17 }, { - "loss": 2.193, - "grad_norm": 1.772910475730896, - "learning_rate": 5e-05, - "epoch": 0.6515335851956202, - "step": 10175 + "loss": 418.1977, + "grad_norm": 70.3633804321289, + "learning_rate": 0.0004996779738581913, + "epoch": 0.016144532000768787, + "step": 18 }, { - "loss": 2.2045, - "grad_norm": 1.6278610229492188, - "learning_rate": 5e-05, - "epoch": 0.6518537491195492, - "step": 10180 + "loss": 416.1606, + "grad_norm": 74.2279052734375, + "learning_rate": 0.0004996412080666037, + "epoch": 0.017041450445255943, + "step": 19 }, { - "loss": 2.1702, - "grad_norm": 1.6800163984298706, - "learning_rate": 5e-05, - "epoch": 0.6521739130434783, - "step": 10185 + "loss": 417.2284, + "grad_norm": 63.311676025390625, + "learning_rate": 0.0004996024568872042, + "epoch": 0.0179383688897431, + "step": 20 }, { - "loss": 2.1728, - "grad_norm": 1.5924550294876099, - "learning_rate": 5e-05, - "epoch": 0.6524940769674074, - "step": 10190 + "loss": 409.5278, + "grad_norm": 63.21588897705078, + "learning_rate": 0.0004995617206281797, + "epoch": 0.01883528733423025, + "step": 21 }, { - "loss": 2.1876, - "grad_norm": 1.7043887376785278, - "learning_rate": 5e-05, - "epoch": 0.6528142408913363, - "step": 10195 + "loss": 414.1958, + "grad_norm": 61.4863395690918, + "learning_rate": 0.0004995189996135042, + "epoch": 0.019732205778717406, + "step": 22 }, { - "loss": 2.197, - "grad_norm": 1.7267884016036987, - "learning_rate": 5e-05, - "epoch": 0.6531344048152654, - "step": 10200 + "loss": 419.7891, + "grad_norm": 61.297481536865234, + "learning_rate": 0.0004994742941829364, + "epoch": 0.02062912422320456, + "step": 23 }, { - "eval_loss": 2.0496010780334473, - "eval_runtime": 9.1194, - "eval_samples_per_second": 224.576, - "eval_steps_per_second": 28.072, - "epoch": 0.6531344048152654, - "step": 10200 + "loss": 414.3831, + "grad_norm": 68.20845031738281, + "learning_rate": 0.0004994276046920171, + "epoch": 0.021526042667691717, + "step": 24 }, { - "loss": 2.1663, - "grad_norm": 1.721843957901001, - "learning_rate": 5e-05, - "epoch": 0.6534545687391945, - "step": 10205 + "loss": 415.8848, + "grad_norm": 59.016239166259766, + "learning_rate": 0.0004993789315120662, + "epoch": 0.022422961112178872, + "step": 25 }, { - "loss": 2.194, - "grad_norm": 1.666116714477539, - "learning_rate": 5e-05, - "epoch": 0.6537747326631235, - "step": 10210 + "loss": 417.4357, + "grad_norm": 55.90328598022461, + "learning_rate": 0.0004993282750301799, + "epoch": 0.023319879556666027, + "step": 26 }, { - "loss": 2.1648, - "grad_norm": 1.6751972436904907, - "learning_rate": 5e-05, - "epoch": 0.6540948965870526, - "step": 10215 + "loss": 411.6564, + "grad_norm": 59.52859115600586, + "learning_rate": 0.000499275635649227, + "epoch": 0.02421679800115318, + "step": 27 }, { - "loss": 2.1695, - "grad_norm": 1.6365602016448975, - "learning_rate": 5e-05, - "epoch": 0.6544150605109816, - "step": 10220 + "loss": 412.2451, + "grad_norm": 59.61384963989258, + "learning_rate": 0.0004992210137878472, + "epoch": 0.025113716445640335, + "step": 28 }, { - "loss": 2.1997, - "grad_norm": 1.6664323806762695, - "learning_rate": 5e-05, - "epoch": 0.6547352244349107, - "step": 10225 + "loss": 416.412, + "grad_norm": 60.00177001953125, + "learning_rate": 0.000499164409880446, + "epoch": 0.02601063489012749, + "step": 29 }, { - "loss": 2.1405, - "grad_norm": 1.6744410991668701, - "learning_rate": 5e-05, - "epoch": 0.6550553883588397, - "step": 10230 + "loss": 405.7923, + "grad_norm": 59.08831024169922, + "learning_rate": 0.0004991058243771922, + "epoch": 0.026907553334614646, + "step": 30 }, { - "loss": 2.1786, - "grad_norm": 1.5833547115325928, - "learning_rate": 5e-05, - "epoch": 0.6553755522827688, - "step": 10235 + "loss": 411.6278, + "grad_norm": 58.00886154174805, + "learning_rate": 0.0004990452577440143, + "epoch": 0.0278044717791018, + "step": 31 }, { - "loss": 2.1724, - "grad_norm": 1.6934590339660645, - "learning_rate": 5e-05, - "epoch": 0.6556957162066979, - "step": 10240 + "loss": 406.3222, + "grad_norm": 57.3386116027832, + "learning_rate": 0.0004989827104625969, + "epoch": 0.028701390223588957, + "step": 32 }, { - "loss": 2.2072, - "grad_norm": 1.6612778902053833, - "learning_rate": 5e-05, - "epoch": 0.6560158801306268, - "step": 10245 + "loss": 404.9872, + "grad_norm": 56.013816833496094, + "learning_rate": 0.000498918183030376, + "epoch": 0.02959830866807611, + "step": 33 }, { - "loss": 2.2002, - "grad_norm": 1.6952781677246094, - "learning_rate": 5e-05, - "epoch": 0.6563360440545559, - "step": 10250 + "loss": 406.4626, + "grad_norm": 57.787132263183594, + "learning_rate": 0.0004988516759605363, + "epoch": 0.030495227112563264, + "step": 34 }, { - "loss": 2.2005, - "grad_norm": 1.5911494493484497, - "learning_rate": 5e-05, - "epoch": 0.656656207978485, - "step": 10255 + "loss": 405.2309, + "grad_norm": 54.9903678894043, + "learning_rate": 0.0004987831897820059, + "epoch": 0.03139214555705042, + "step": 35 }, { - "loss": 2.1954, - "grad_norm": 1.6101614236831665, - "learning_rate": 5e-05, - "epoch": 0.6569763719024141, - "step": 10260 + "loss": 415.0021, + "grad_norm": 55.86436462402344, + "learning_rate": 0.0004987127250394532, + "epoch": 0.032289064001537575, + "step": 36 }, { - "loss": 2.2006, - "grad_norm": 1.5616638660430908, - "learning_rate": 5e-05, - "epoch": 0.6572965358263431, - "step": 10265 + "loss": 402.1766, + "grad_norm": 53.72284698486328, + "learning_rate": 0.0004986402822932818, + "epoch": 0.03318598244602473, + "step": 37 }, { - "loss": 2.1962, - "grad_norm": 1.6221803426742554, - "learning_rate": 5e-05, - "epoch": 0.6576166997502721, - "step": 10270 + "loss": 409.7162, + "grad_norm": 56.52421569824219, + "learning_rate": 0.0004985658621196263, + "epoch": 0.034082900890511886, + "step": 38 }, { - "loss": 2.1947, - "grad_norm": 1.5761173963546753, - "learning_rate": 5e-05, - "epoch": 0.6579368636742012, - "step": 10275 + "loss": 406.8592, + "grad_norm": 63.26171875, + "learning_rate": 0.0004984894651103478, + "epoch": 0.03497981933499904, + "step": 39 }, { - "loss": 2.171, - "grad_norm": 1.6054048538208008, - "learning_rate": 5e-05, - "epoch": 0.6582570275981302, - "step": 10280 + "loss": 401.9672, + "grad_norm": 52.98197937011719, + "learning_rate": 0.0004984110918730289, + "epoch": 0.0358767377794862, + "step": 40 }, { - "loss": 2.1872, - "grad_norm": 1.6371126174926758, - "learning_rate": 5e-05, - "epoch": 0.6585771915220593, - "step": 10285 + "loss": 402.0731, + "grad_norm": 61.255733489990234, + "learning_rate": 0.0004983307430309695, + "epoch": 0.03677365622397335, + "step": 41 }, { - "loss": 2.1471, - "grad_norm": 1.6947263479232788, - "learning_rate": 5e-05, - "epoch": 0.6588973554459884, - "step": 10290 + "loss": 405.9777, + "grad_norm": 62.212188720703125, + "learning_rate": 0.0004982484192231808, + "epoch": 0.0376705746684605, + "step": 42 }, { - "loss": 2.1586, - "grad_norm": 1.6005196571350098, - "learning_rate": 5e-05, - "epoch": 0.6592175193699173, - "step": 10295 + "loss": 409.4884, + "grad_norm": 60.04124450683594, + "learning_rate": 0.0004981641211043813, + "epoch": 0.03856749311294766, + "step": 43 }, { - "loss": 2.1868, - "grad_norm": 1.6588430404663086, - "learning_rate": 5e-05, - "epoch": 0.6595376832938464, - "step": 10300 + "loss": 402.7691, + "grad_norm": 58.80691909790039, + "learning_rate": 0.0004980778493449912, + "epoch": 0.03946441155743481, + "step": 44 }, { - "loss": 2.1795, - "grad_norm": 1.719780683517456, - "learning_rate": 5e-05, - "epoch": 0.6598578472177755, - "step": 10305 + "loss": 406.07, + "grad_norm": 58.074493408203125, + "learning_rate": 0.0004979896046311265, + "epoch": 0.04036133000192197, + "step": 45 }, { - "loss": 2.2096, - "grad_norm": 1.7076449394226074, - "learning_rate": 5e-05, - "epoch": 0.6601780111417046, - "step": 10310 + "loss": 406.7423, + "grad_norm": 62.749534606933594, + "learning_rate": 0.0004978993876645944, + "epoch": 0.04125824844640912, + "step": 46 }, { - "loss": 2.1607, - "grad_norm": 1.708767294883728, - "learning_rate": 5e-05, - "epoch": 0.6604981750656336, - "step": 10315 + "loss": 403.2931, + "grad_norm": 58.47712707519531, + "learning_rate": 0.0004978071991628874, + "epoch": 0.04215516689089628, + "step": 47 }, { - "loss": 2.1654, - "grad_norm": 1.5694926977157593, - "learning_rate": 5e-05, - "epoch": 0.6608183389895627, - "step": 10320 + "loss": 402.5574, + "grad_norm": 64.82901000976562, + "learning_rate": 0.0004977130398591775, + "epoch": 0.04305208533538343, + "step": 48 }, { - "loss": 2.1733, - "grad_norm": 1.6309330463409424, - "learning_rate": 5e-05, - "epoch": 0.6611385029134917, - "step": 10325 + "loss": 405.5097, + "grad_norm": 56.95109939575195, + "learning_rate": 0.00049761691050231, + "epoch": 0.043949003779870585, + "step": 49 }, { - "loss": 2.1714, - "grad_norm": 1.545966386795044, - "learning_rate": 5e-05, - "epoch": 0.6614586668374207, - "step": 10330 + "loss": 408.4274, + "grad_norm": 60.67522048950195, + "learning_rate": 0.0004975188118567987, + "epoch": 0.044845922224357744, + "step": 50 }, { - "loss": 2.1862, - "grad_norm": 1.6850484609603882, - "learning_rate": 5e-05, - "epoch": 0.6617788307613498, - "step": 10335 + "eval_loss": 1.7932980060577393, + "eval_runtime": 41.7475, + "eval_samples_per_second": 49.057, + "eval_steps_per_second": 3.066, + "epoch": 0.044845922224357744, + "step": 50 }, { - "loss": 2.2057, - "grad_norm": 1.7686723470687866, - "learning_rate": 5e-05, - "epoch": 0.6620989946852789, - "step": 10340 + "loss": 405.2191, + "grad_norm": 61.441951751708984, + "learning_rate": 0.0004974187447028184, + "epoch": 0.045742840668844896, + "step": 51 }, { - "loss": 2.2103, - "grad_norm": 1.6861554384231567, - "learning_rate": 5e-05, - "epoch": 0.662419158609208, - "step": 10345 + "loss": 402.9874, + "grad_norm": 56.64131546020508, + "learning_rate": 0.0004973167098361999, + "epoch": 0.046639759113332055, + "step": 52 }, { - "loss": 2.1704, - "grad_norm": 1.6404895782470703, - "learning_rate": 5e-05, - "epoch": 0.6627393225331369, - "step": 10350 + "loss": 403.7462, + "grad_norm": 58.905479431152344, + "learning_rate": 0.0004972127080684228, + "epoch": 0.04753667755781921, + "step": 53 }, { - "loss": 2.1692, - "grad_norm": 1.6215230226516724, - "learning_rate": 5e-05, - "epoch": 0.663059486457066, - "step": 10355 + "loss": 402.2606, + "grad_norm": 60.9106559753418, + "learning_rate": 0.0004971067402266096, + "epoch": 0.04843359600230636, + "step": 54 }, { - "loss": 2.1937, - "grad_norm": 1.6805126667022705, - "learning_rate": 5e-05, - "epoch": 0.6633796503809951, - "step": 10360 + "loss": 397.4493, + "grad_norm": 55.347869873046875, + "learning_rate": 0.0004969988071535188, + "epoch": 0.04933051444679352, + "step": 55 }, { - "loss": 2.1821, - "grad_norm": 1.6423799991607666, - "learning_rate": 5e-05, - "epoch": 0.6636998143049241, - "step": 10365 + "loss": 398.7716, + "grad_norm": 56.816104888916016, + "learning_rate": 0.0004968889097075385, + "epoch": 0.05022743289128067, + "step": 56 }, { - "loss": 2.1924, - "grad_norm": 1.6313560009002686, - "learning_rate": 5e-05, - "epoch": 0.6640199782288532, - "step": 10370 + "loss": 399.2036, + "grad_norm": 63.388851165771484, + "learning_rate": 0.0004967770487626791, + "epoch": 0.05112435133576783, + "step": 57 }, { - "loss": 2.1803, - "grad_norm": 1.6506428718566895, - "learning_rate": 5e-05, - "epoch": 0.6643401421527823, - "step": 10375 + "loss": 402.6399, + "grad_norm": 58.803466796875, + "learning_rate": 0.0004966632252085668, + "epoch": 0.05202126978025498, + "step": 58 }, { - "loss": 2.1903, - "grad_norm": 1.6371464729309082, - "learning_rate": 5e-05, - "epoch": 0.6646603060767112, - "step": 10380 + "loss": 401.2329, + "grad_norm": 61.42218780517578, + "learning_rate": 0.0004965474399504364, + "epoch": 0.05291818822474213, + "step": 59 }, { - "loss": 2.1877, - "grad_norm": 1.6620516777038574, - "learning_rate": 5e-05, - "epoch": 0.6649804700006403, - "step": 10385 + "loss": 394.491, + "grad_norm": 54.581748962402344, + "learning_rate": 0.000496429693909124, + "epoch": 0.05381510666922929, + "step": 60 }, { - "loss": 2.1649, - "grad_norm": 1.6384953260421753, - "learning_rate": 5e-05, - "epoch": 0.6653006339245694, - "step": 10390 + "loss": 402.2176, + "grad_norm": 60.348812103271484, + "learning_rate": 0.0004963099880210597, + "epoch": 0.05471202511371644, + "step": 61 }, { - "loss": 2.2066, - "grad_norm": 1.5989880561828613, - "learning_rate": 5e-05, - "epoch": 0.6656207978484985, - "step": 10395 + "loss": 401.5288, + "grad_norm": 58.51568603515625, + "learning_rate": 0.0004961883232382603, + "epoch": 0.0556089435582036, + "step": 62 }, { - "loss": 2.1917, - "grad_norm": 1.6223093271255493, - "learning_rate": 5e-05, - "epoch": 0.6659409617724275, - "step": 10400 + "loss": 402.1975, + "grad_norm": 53.891822814941406, + "learning_rate": 0.0004960647005283217, + "epoch": 0.056505862002690754, + "step": 63 }, { - "eval_loss": 2.0479464530944824, - "eval_runtime": 9.5359, - "eval_samples_per_second": 214.766, - "eval_steps_per_second": 26.846, - "epoch": 0.6659409617724275, - "step": 10400 + "loss": 402.8554, + "grad_norm": 54.66781234741211, + "learning_rate": 0.0004959391208744108, + "epoch": 0.05740278044717791, + "step": 64 }, { - "loss": 2.1724, - "grad_norm": 1.6815154552459717, - "learning_rate": 5e-05, - "epoch": 0.6662611256963565, - "step": 10405 + "loss": 397.2245, + "grad_norm": 57.83986282348633, + "learning_rate": 0.0004958115852752582, + "epoch": 0.058299698891665065, + "step": 65 }, { - "loss": 2.1865, - "grad_norm": 1.6925560235977173, - "learning_rate": 5e-05, - "epoch": 0.6665812896202856, - "step": 10410 + "loss": 398.295, + "grad_norm": 56.6056022644043, + "learning_rate": 0.0004956820947451502, + "epoch": 0.05919661733615222, + "step": 66 }, { - "loss": 2.1976, - "grad_norm": 1.6829434633255005, - "learning_rate": 5e-05, - "epoch": 0.6669014535442146, - "step": 10415 + "loss": 398.1401, + "grad_norm": 58.830711364746094, + "learning_rate": 0.0004955506503139204, + "epoch": 0.060093535780639376, + "step": 67 }, { - "loss": 2.1846, - "grad_norm": 1.6039493083953857, - "learning_rate": 5e-05, - "epoch": 0.6672216174681437, - "step": 10420 + "loss": 401.4149, + "grad_norm": 54.770755767822266, + "learning_rate": 0.0004954172530269418, + "epoch": 0.06099045422512653, + "step": 68 }, { - "loss": 2.1652, - "grad_norm": 1.5750089883804321, - "learning_rate": 5e-05, - "epoch": 0.6675417813920728, - "step": 10425 + "loss": 399.5218, + "grad_norm": 59.45661926269531, + "learning_rate": 0.0004952819039451183, + "epoch": 0.06188737266961369, + "step": 69 }, { - "loss": 2.1989, - "grad_norm": 1.6804012060165405, - "learning_rate": 5e-05, - "epoch": 0.6678619453160018, - "step": 10430 + "loss": 396.4537, + "grad_norm": 53.4246826171875, + "learning_rate": 0.0004951446041448765, + "epoch": 0.06278429111410085, + "step": 70 }, { - "loss": 2.1673, - "grad_norm": 1.6494613885879517, - "learning_rate": 5e-05, - "epoch": 0.6681821092399308, - "step": 10435 + "loss": 401.2764, + "grad_norm": 55.125919342041016, + "learning_rate": 0.0004950053547181568, + "epoch": 0.063681209558588, + "step": 71 }, { - "loss": 2.1598, - "grad_norm": 1.5852676630020142, - "learning_rate": 5e-05, - "epoch": 0.6685022731638599, - "step": 10440 + "loss": 400.9092, + "grad_norm": 63.59549331665039, + "learning_rate": 0.0004948641567724053, + "epoch": 0.06457812800307515, + "step": 72 }, { - "loss": 2.188, - "grad_norm": 1.6259963512420654, - "learning_rate": 5e-05, - "epoch": 0.668822437087789, - "step": 10445 + "loss": 397.1968, + "grad_norm": 58.40228271484375, + "learning_rate": 0.0004947210114305639, + "epoch": 0.0654750464475623, + "step": 73 }, { - "loss": 2.211, - "grad_norm": 1.6978424787521362, - "learning_rate": 5e-05, - "epoch": 0.669142601011718, - "step": 10450 + "loss": 398.0598, + "grad_norm": 62.7151985168457, + "learning_rate": 0.0004945759198310629, + "epoch": 0.06637196489204945, + "step": 74 }, { - "loss": 2.1663, - "grad_norm": 1.6471407413482666, - "learning_rate": 5e-05, - "epoch": 0.669462764935647, - "step": 10455 + "loss": 398.7396, + "grad_norm": 59.287742614746094, + "learning_rate": 0.0004944288831278106, + "epoch": 0.06726888333653662, + "step": 75 }, { - "loss": 2.1957, - "grad_norm": 1.5860986709594727, - "learning_rate": 5e-05, - "epoch": 0.6697829288595761, - "step": 10460 + "loss": 391.3397, + "grad_norm": 59.052059173583984, + "learning_rate": 0.0004942799024901846, + "epoch": 0.06816580178102377, + "step": 76 }, { - "loss": 2.1557, - "grad_norm": 1.65492582321167, - "learning_rate": 5e-05, - "epoch": 0.6701030927835051, - "step": 10465 + "loss": 394.1899, + "grad_norm": 54.65058135986328, + "learning_rate": 0.0004941289791030229, + "epoch": 0.06906272022551092, + "step": 77 }, { - "loss": 2.18, - "grad_norm": 1.6820275783538818, - "learning_rate": 5e-05, - "epoch": 0.6704232567074342, - "step": 10470 + "loss": 393.8536, + "grad_norm": 51.59941101074219, + "learning_rate": 0.0004939761141666139, + "epoch": 0.06995963866999808, + "step": 78 }, { - "loss": 2.1984, - "grad_norm": 1.6048341989517212, - "learning_rate": 5e-05, - "epoch": 0.6707434206313633, - "step": 10475 + "loss": 396.7059, + "grad_norm": 55.84555435180664, + "learning_rate": 0.0004938213088966872, + "epoch": 0.07085655711448523, + "step": 79 }, { - "loss": 2.1651, - "grad_norm": 1.6907062530517578, - "learning_rate": 5e-05, - "epoch": 0.6710635845552924, - "step": 10480 + "loss": 392.0196, + "grad_norm": 55.808250427246094, + "learning_rate": 0.0004936645645244033, + "epoch": 0.0717534755589724, + "step": 80 }, { - "loss": 2.1792, - "grad_norm": 1.6291903257369995, - "learning_rate": 5e-05, - "epoch": 0.6713837484792213, - "step": 10485 + "loss": 395.5785, + "grad_norm": 53.83452224731445, + "learning_rate": 0.0004935058822963453, + "epoch": 0.07265039400345955, + "step": 81 }, { - "loss": 2.1838, - "grad_norm": 1.5670826435089111, - "learning_rate": 5e-05, - "epoch": 0.6717039124031504, - "step": 10490 + "loss": 398.3966, + "grad_norm": 61.950626373291016, + "learning_rate": 0.000493345263474507, + "epoch": 0.0735473124479467, + "step": 82 }, { - "loss": 2.1827, - "grad_norm": 1.6459773778915405, - "learning_rate": 5e-05, - "epoch": 0.6720240763270795, - "step": 10495 + "loss": 399.4866, + "grad_norm": 65.6949462890625, + "learning_rate": 0.0004931827093362844, + "epoch": 0.07444423089243385, + "step": 83 }, { - "loss": 2.1948, - "grad_norm": 1.6180630922317505, - "learning_rate": 5e-05, - "epoch": 0.6723442402510085, - "step": 10500 + "loss": 393.8017, + "grad_norm": 54.928836822509766, + "learning_rate": 0.0004930182211744649, + "epoch": 0.075341149336921, + "step": 84 }, { - "loss": 2.1589, - "grad_norm": 1.5975276231765747, - "learning_rate": 5e-05, - "epoch": 0.6726644041749376, - "step": 10505 + "loss": 398.1347, + "grad_norm": 59.81849670410156, + "learning_rate": 0.0004928518002972172, + "epoch": 0.07623806778140817, + "step": 85 }, { - "loss": 2.1806, - "grad_norm": 1.6482629776000977, - "learning_rate": 5e-05, - "epoch": 0.6729845680988666, - "step": 10510 + "loss": 392.8837, + "grad_norm": 57.970462799072266, + "learning_rate": 0.0004926834480280805, + "epoch": 0.07713498622589532, + "step": 86 }, { - "loss": 2.2065, - "grad_norm": 1.535201907157898, - "learning_rate": 5e-05, - "epoch": 0.6733047320227957, - "step": 10515 + "loss": 394.3792, + "grad_norm": 57.43026351928711, + "learning_rate": 0.0004925131657059547, + "epoch": 0.07803190467038247, + "step": 87 }, { - "loss": 2.1731, - "grad_norm": 1.6452282667160034, - "learning_rate": 5e-05, - "epoch": 0.6736248959467247, - "step": 10520 + "loss": 395.7612, + "grad_norm": 57.73651123046875, + "learning_rate": 0.0004923409546850891, + "epoch": 0.07892882311486962, + "step": 88 }, { - "loss": 2.1929, - "grad_norm": 1.5697953701019287, - "learning_rate": 5e-05, - "epoch": 0.6739450598706538, - "step": 10525 + "loss": 396.5627, + "grad_norm": 58.27775573730469, + "learning_rate": 0.000492166816335072, + "epoch": 0.07982574155935677, + "step": 89 }, { - "loss": 2.1677, - "grad_norm": 1.689011812210083, - "learning_rate": 5e-05, - "epoch": 0.6742652237945829, - "step": 10530 + "loss": 398.5615, + "grad_norm": 53.49543762207031, + "learning_rate": 0.0004919907520408196, + "epoch": 0.08072266000384394, + "step": 90 }, { - "loss": 2.1907, - "grad_norm": 1.5526947975158691, - "learning_rate": 5e-05, - "epoch": 0.6745853877185118, - "step": 10535 + "loss": 398.6497, + "grad_norm": 57.175514221191406, + "learning_rate": 0.000491812763202565, + "epoch": 0.08161957844833109, + "step": 91 }, { - "loss": 2.1569, - "grad_norm": 1.6372793912887573, - "learning_rate": 5e-05, - "epoch": 0.6749055516424409, - "step": 10540 + "loss": 392.5616, + "grad_norm": 58.206119537353516, + "learning_rate": 0.0004916328512358472, + "epoch": 0.08251649689281824, + "step": 92 }, { - "loss": 2.185, - "grad_norm": 1.670624852180481, - "learning_rate": 5e-05, - "epoch": 0.67522571556637, - "step": 10545 + "loss": 390.17, + "grad_norm": 56.978179931640625, + "learning_rate": 0.0004914510175714999, + "epoch": 0.0834134153373054, + "step": 93 }, { - "loss": 2.1731, - "grad_norm": 1.6175780296325684, - "learning_rate": 5e-05, - "epoch": 0.675545879490299, - "step": 10550 + "loss": 391.477, + "grad_norm": 59.842369079589844, + "learning_rate": 0.0004912672636556397, + "epoch": 0.08431033378179256, + "step": 94 }, { - "loss": 2.1483, - "grad_norm": 1.5563071966171265, - "learning_rate": 5e-05, - "epoch": 0.6758660434142281, - "step": 10555 + "loss": 394.4383, + "grad_norm": 52.20112609863281, + "learning_rate": 0.0004910815909496555, + "epoch": 0.08520725222627971, + "step": 95 }, { - "loss": 2.1656, - "grad_norm": 1.6369822025299072, - "learning_rate": 5e-05, - "epoch": 0.6761862073381572, - "step": 10560 + "loss": 390.8443, + "grad_norm": 61.12334060668945, + "learning_rate": 0.0004908940009301954, + "epoch": 0.08610417067076687, + "step": 96 }, { - "loss": 2.1781, - "grad_norm": 1.5494641065597534, - "learning_rate": 5e-05, - "epoch": 0.6765063712620862, - "step": 10565 + "loss": 395.9276, + "grad_norm": 55.49872589111328, + "learning_rate": 0.0004907044950891565, + "epoch": 0.08700108911525402, + "step": 97 }, { - "loss": 2.175, - "grad_norm": 1.6584258079528809, - "learning_rate": 5e-05, - "epoch": 0.6768265351860152, - "step": 10570 + "loss": 394.7866, + "grad_norm": 59.71890640258789, + "learning_rate": 0.000490513074933672, + "epoch": 0.08789800755974117, + "step": 98 }, { - "loss": 2.1616, - "grad_norm": 1.6767727136611938, - "learning_rate": 5e-05, - "epoch": 0.6771466991099443, - "step": 10575 + "loss": 388.5464, + "grad_norm": 55.72919845581055, + "learning_rate": 0.0004903197419860999, + "epoch": 0.08879492600422834, + "step": 99 }, { - "loss": 2.1636, - "grad_norm": 1.555019497871399, - "learning_rate": 5e-05, - "epoch": 0.6774668630338734, - "step": 10580 + "loss": 392.9969, + "grad_norm": 61.6799430847168, + "learning_rate": 0.0004901244977840103, + "epoch": 0.08969184444871549, + "step": 100 }, { - "loss": 2.1667, - "grad_norm": 1.597027063369751, - "learning_rate": 5e-05, - "epoch": 0.6777870269578024, - "step": 10585 + "eval_loss": 1.7485355138778687, + "eval_runtime": 49.5113, + "eval_samples_per_second": 41.364, + "eval_steps_per_second": 2.585, + "epoch": 0.08969184444871549, + "step": 100 }, { - "loss": 2.1898, - "grad_norm": 1.5965244770050049, - "learning_rate": 5e-05, - "epoch": 0.6781071908817314, - "step": 10590 + "loss": 393.0805, + "grad_norm": 58.71113204956055, + "learning_rate": 0.0004899273438801734, + "epoch": 0.09058876289320264, + "step": 101 }, { - "loss": 2.1712, - "grad_norm": 1.53036630153656, - "learning_rate": 5e-05, - "epoch": 0.6784273548056605, - "step": 10595 + "loss": 391.5116, + "grad_norm": 54.11758804321289, + "learning_rate": 0.0004897282818425474, + "epoch": 0.09148568133768979, + "step": 102 }, { - "loss": 2.1438, - "grad_norm": 1.5849616527557373, - "learning_rate": 5e-05, - "epoch": 0.6787475187295896, - "step": 10600 + "loss": 394.4952, + "grad_norm": 53.54176712036133, + "learning_rate": 0.0004895273132542658, + "epoch": 0.09238259978217694, + "step": 103 }, { - "eval_loss": 2.038008213043213, - "eval_runtime": 9.1765, - "eval_samples_per_second": 223.178, - "eval_steps_per_second": 27.897, - "epoch": 0.6787475187295896, - "step": 10600 + "loss": 392.5484, + "grad_norm": 51.26163101196289, + "learning_rate": 0.0004893244397136246, + "epoch": 0.09327951822666411, + "step": 104 }, { - "loss": 2.1674, - "grad_norm": 1.6421003341674805, - "learning_rate": 5e-05, - "epoch": 0.6790676826535186, - "step": 10605 + "loss": 392.7574, + "grad_norm": 57.158973693847656, + "learning_rate": 0.0004891196628340703, + "epoch": 0.09417643667115126, + "step": 105 }, { - "loss": 2.1808, - "grad_norm": 1.602632761001587, - "learning_rate": 5e-05, - "epoch": 0.6793878465774477, - "step": 10610 + "loss": 392.1094, + "grad_norm": 51.87057113647461, + "learning_rate": 0.0004889129842441859, + "epoch": 0.09507335511563841, + "step": 106 }, { - "loss": 2.1703, - "grad_norm": 1.7214833498001099, - "learning_rate": 5e-05, - "epoch": 0.6797080105013767, - "step": 10615 + "loss": 391.9873, + "grad_norm": 62.71110534667969, + "learning_rate": 0.0004887044055876793, + "epoch": 0.09597027356012557, + "step": 107 }, { - "loss": 2.2029, - "grad_norm": 1.5542736053466797, - "learning_rate": 5e-05, - "epoch": 0.6800281744253057, - "step": 10620 + "loss": 393.0227, + "grad_norm": 61.41956329345703, + "learning_rate": 0.0004884939285233691, + "epoch": 0.09686719200461272, + "step": 108 }, { - "loss": 2.1652, - "grad_norm": 1.7546051740646362, - "learning_rate": 5e-05, - "epoch": 0.6803483383492348, - "step": 10625 + "loss": 389.2371, + "grad_norm": 59.030765533447266, + "learning_rate": 0.0004882815547251721, + "epoch": 0.09776411044909988, + "step": 109 }, { - "loss": 2.1935, - "grad_norm": 1.6220976114273071, - "learning_rate": 5e-05, - "epoch": 0.6806685022731639, - "step": 10630 + "loss": 394.932, + "grad_norm": 60.926448822021484, + "learning_rate": 0.00048806728588208966, + "epoch": 0.09866102889358704, + "step": 110 }, { - "loss": 2.1631, - "grad_norm": 1.7736234664916992, - "learning_rate": 5e-05, - "epoch": 0.6809886661970929, - "step": 10635 + "loss": 389.2965, + "grad_norm": 59.546268463134766, + "learning_rate": 0.0004878511236981945, + "epoch": 0.09955794733807419, + "step": 111 }, { - "loss": 2.1659, - "grad_norm": 1.6324067115783691, - "learning_rate": 5e-05, - "epoch": 0.681308830121022, - "step": 10640 + "loss": 389.0897, + "grad_norm": 56.25603103637695, + "learning_rate": 0.0004876330698926169, + "epoch": 0.10045486578256134, + "step": 112 }, { - "loss": 2.1806, - "grad_norm": 1.6361428499221802, - "learning_rate": 5e-05, - "epoch": 0.681628994044951, - "step": 10645 + "loss": 391.7546, + "grad_norm": 63.1163444519043, + "learning_rate": 0.00048741312619953104, + "epoch": 0.10135178422704849, + "step": 113 }, { - "loss": 2.1511, - "grad_norm": 1.695727825164795, - "learning_rate": 5e-05, - "epoch": 0.6819491579688801, - "step": 10650 + "loss": 392.0137, + "grad_norm": 70.23162078857422, + "learning_rate": 0.00048719129436814156, + "epoch": 0.10224870267153566, + "step": 114 }, { - "loss": 2.1771, - "grad_norm": 1.5814673900604248, - "learning_rate": 5e-05, - "epoch": 0.6822693218928091, - "step": 10655 + "loss": 390.5738, + "grad_norm": 60.9749755859375, + "learning_rate": 0.00048696757616266927, + "epoch": 0.10314562111602281, + "step": 115 }, { - "loss": 2.156, - "grad_norm": 1.5904582738876343, - "learning_rate": 5e-05, - "epoch": 0.6825894858167382, - "step": 10660 + "loss": 387.7592, + "grad_norm": 60.2146110534668, + "learning_rate": 0.0004867419733623372, + "epoch": 0.10404253956050996, + "step": 116 }, { - "loss": 2.1845, - "grad_norm": 1.6580817699432373, - "learning_rate": 5e-05, - "epoch": 0.6829096497406673, - "step": 10665 + "loss": 390.6403, + "grad_norm": 59.26010513305664, + "learning_rate": 0.00048651448776135654, + "epoch": 0.10493945800499711, + "step": 117 }, { - "loss": 2.1571, - "grad_norm": 1.5745543241500854, - "learning_rate": 5e-05, - "epoch": 0.6832298136645962, - "step": 10670 + "loss": 391.4545, + "grad_norm": 55.02613067626953, + "learning_rate": 0.00048628512116891234, + "epoch": 0.10583637644948427, + "step": 118 }, { - "loss": 2.1504, - "grad_norm": 1.5689435005187988, - "learning_rate": 5e-05, - "epoch": 0.6835499775885253, - "step": 10675 + "loss": 388.2937, + "grad_norm": 56.28743362426758, + "learning_rate": 0.00048605387540914916, + "epoch": 0.10673329489397143, + "step": 119 }, { - "loss": 2.1806, - "grad_norm": 1.5868749618530273, - "learning_rate": 5e-05, - "epoch": 0.6838701415124544, - "step": 10680 + "loss": 389.2755, + "grad_norm": 55.22878646850586, + "learning_rate": 0.0004858207523211563, + "epoch": 0.10763021333845858, + "step": 120 }, { - "loss": 2.1685, - "grad_norm": 1.6363295316696167, - "learning_rate": 5e-05, - "epoch": 0.6841903054363835, - "step": 10685 + "loss": 392.9062, + "grad_norm": 55.45512771606445, + "learning_rate": 0.00048558575375895377, + "epoch": 0.10852713178294573, + "step": 121 }, { - "loss": 2.1508, - "grad_norm": 1.5658912658691406, - "learning_rate": 5e-05, - "epoch": 0.6845104693603125, - "step": 10690 + "loss": 388.4548, + "grad_norm": 58.8115119934082, + "learning_rate": 0.0004853488815914767, + "epoch": 0.10942405022743289, + "step": 122 }, { - "loss": 2.1798, - "grad_norm": 1.73909330368042, - "learning_rate": 5e-05, - "epoch": 0.6848306332842415, - "step": 10695 + "loss": 390.1011, + "grad_norm": 55.49444580078125, + "learning_rate": 0.00048511013770256134, + "epoch": 0.11032096867192005, + "step": 123 }, { - "loss": 2.1349, - "grad_norm": 1.5258785486221313, - "learning_rate": 5e-05, - "epoch": 0.6851507972081706, - "step": 10700 + "loss": 388.7439, + "grad_norm": 54.36104202270508, + "learning_rate": 0.00048486952399092945, + "epoch": 0.1112178871164072, + "step": 124 }, { - "loss": 2.1775, - "grad_norm": 1.6583811044692993, - "learning_rate": 5e-05, - "epoch": 0.6854709611320996, - "step": 10705 + "loss": 391.1307, + "grad_norm": 52.75822067260742, + "learning_rate": 0.0004846270423701734, + "epoch": 0.11211480556089436, + "step": 125 }, { - "loss": 2.1723, - "grad_norm": 1.623826503753662, - "learning_rate": 5e-05, - "epoch": 0.6857911250560287, - "step": 10710 + "loss": 388.8095, + "grad_norm": 55.67084884643555, + "learning_rate": 0.0004843826947687411, + "epoch": 0.11301172400538151, + "step": 126 }, { - "loss": 2.162, - "grad_norm": 1.5844995975494385, - "learning_rate": 5e-05, - "epoch": 0.6861112889799578, - "step": 10715 + "loss": 388.7104, + "grad_norm": 58.483211517333984, + "learning_rate": 0.0004841364831299206, + "epoch": 0.11390864244986866, + "step": 127 }, { - "loss": 2.1719, - "grad_norm": 1.5476335287094116, - "learning_rate": 5e-05, - "epoch": 0.6864314529038867, - "step": 10720 + "loss": 392.5351, + "grad_norm": 54.69878387451172, + "learning_rate": 0.00048388840941182435, + "epoch": 0.11480556089435583, + "step": 128 }, { - "loss": 2.1687, - "grad_norm": 1.6284759044647217, - "learning_rate": 5e-05, - "epoch": 0.6867516168278158, - "step": 10725 + "loss": 389.9329, + "grad_norm": 56.85935974121094, + "learning_rate": 0.00048363847558737395, + "epoch": 0.11570247933884298, + "step": 129 }, { - "loss": 2.1917, - "grad_norm": 1.6278412342071533, - "learning_rate": 5e-05, - "epoch": 0.6870717807517449, - "step": 10730 + "loss": 389.8976, + "grad_norm": 55.818260192871094, + "learning_rate": 0.0004833866836442844, + "epoch": 0.11659939778333013, + "step": 130 }, { - "loss": 2.1692, - "grad_norm": 1.6887261867523193, - "learning_rate": 5e-05, - "epoch": 0.687391944675674, - "step": 10735 + "loss": 389.0714, + "grad_norm": 69.33192443847656, + "learning_rate": 0.0004831330355850483, + "epoch": 0.11749631622781728, + "step": 131 }, { - "loss": 2.1672, - "grad_norm": 1.6040128469467163, - "learning_rate": 5e-05, - "epoch": 0.687712108599603, - "step": 10740 + "loss": 387.675, + "grad_norm": 59.69966506958008, + "learning_rate": 0.0004828775334269198, + "epoch": 0.11839323467230443, + "step": 132 }, { - "loss": 2.1755, - "grad_norm": 1.6104363203048706, - "learning_rate": 5e-05, - "epoch": 0.688032272523532, - "step": 10745 + "loss": 389.1474, + "grad_norm": 63.28241729736328, + "learning_rate": 0.0004826201792018986, + "epoch": 0.1192901531167916, + "step": 133 }, { - "loss": 2.1828, - "grad_norm": 1.5533658266067505, - "learning_rate": 5e-05, - "epoch": 0.6883524364474611, - "step": 10750 + "loss": 386.0185, + "grad_norm": 60.13338851928711, + "learning_rate": 0.0004823609749567138, + "epoch": 0.12018707156127875, + "step": 134 }, { - "loss": 2.1762, - "grad_norm": 1.6379051208496094, - "learning_rate": 5e-05, - "epoch": 0.6886726003713901, - "step": 10755 + "loss": 393.0312, + "grad_norm": 50.345890045166016, + "learning_rate": 0.0004820999227528079, + "epoch": 0.1210839900057659, + "step": 135 }, { - "loss": 2.1811, - "grad_norm": 1.6067684888839722, - "learning_rate": 5e-05, - "epoch": 0.6889927642953192, - "step": 10760 + "loss": 388.9017, + "grad_norm": 54.398582458496094, + "learning_rate": 0.00048183702466631986, + "epoch": 0.12198090845025306, + "step": 136 }, { - "loss": 2.1711, - "grad_norm": 1.6196955442428589, - "learning_rate": 5e-05, - "epoch": 0.6893129282192483, - "step": 10765 + "loss": 390.3952, + "grad_norm": 58.791343688964844, + "learning_rate": 0.0004815722827880689, + "epoch": 0.12287782689474021, + "step": 137 }, { - "loss": 2.1666, - "grad_norm": 1.7209603786468506, - "learning_rate": 5e-05, - "epoch": 0.6896330921431774, - "step": 10770 + "loss": 391.5972, + "grad_norm": 56.27891540527344, + "learning_rate": 0.000481305699223538, + "epoch": 0.12377474533922737, + "step": 138 }, { - "loss": 2.1632, - "grad_norm": 1.6124151945114136, - "learning_rate": 5e-05, - "epoch": 0.6899532560671063, - "step": 10775 + "loss": 390.4619, + "grad_norm": 57.29872512817383, + "learning_rate": 0.000481037276092857, + "epoch": 0.12467166378371453, + "step": 139 }, { - "loss": 2.1812, - "grad_norm": 1.6349486112594604, - "learning_rate": 5e-05, - "epoch": 0.6902734199910354, - "step": 10780 + "loss": 386.5269, + "grad_norm": 56.40953826904297, + "learning_rate": 0.0004807670155307856, + "epoch": 0.1255685822282017, + "step": 140 }, { - "loss": 2.1459, - "grad_norm": 1.5787373781204224, - "learning_rate": 5e-05, - "epoch": 0.6905935839149645, - "step": 10785 + "loss": 386.9588, + "grad_norm": 56.36626434326172, + "learning_rate": 0.0004804949196866967, + "epoch": 0.12646550067268883, + "step": 141 }, { - "loss": 2.1705, - "grad_norm": 1.658933401107788, - "learning_rate": 5e-05, - "epoch": 0.6909137478388935, - "step": 10790 + "loss": 390.6064, + "grad_norm": 59.941890716552734, + "learning_rate": 0.00048022099072455893, + "epoch": 0.127362419117176, + "step": 142 }, { - "loss": 2.2042, - "grad_norm": 1.6257023811340332, - "learning_rate": 5e-05, - "epoch": 0.6912339117628226, - "step": 10795 + "loss": 389.5639, + "grad_norm": 55.42548370361328, + "learning_rate": 0.0004799452308229199, + "epoch": 0.12825933756166313, + "step": 143 }, { - "loss": 2.156, - "grad_norm": 1.6555736064910889, - "learning_rate": 5e-05, - "epoch": 0.6915540756867516, - "step": 10800 + "loss": 389.1144, + "grad_norm": 59.46462631225586, + "learning_rate": 0.0004796676421748883, + "epoch": 0.1291562560061503, + "step": 144 }, { - "eval_loss": 2.0406501293182373, - "eval_runtime": 9.4746, - "eval_samples_per_second": 216.157, - "eval_steps_per_second": 27.02, - "epoch": 0.6915540756867516, - "step": 10800 + "loss": 387.238, + "grad_norm": 61.307960510253906, + "learning_rate": 0.0004793882269881172, + "epoch": 0.13005317445063747, + "step": 145 }, { - "loss": 2.1915, - "grad_norm": 1.5673022270202637, - "learning_rate": 5e-05, - "epoch": 0.6918742396106806, - "step": 10805 + "loss": 385.9282, + "grad_norm": 53.019859313964844, + "learning_rate": 0.00047910698748478565, + "epoch": 0.1309500928951246, + "step": 146 }, { - "loss": 2.1848, - "grad_norm": 1.6205099821090698, - "learning_rate": 5e-05, - "epoch": 0.6921944035346097, - "step": 10810 + "loss": 388.6133, + "grad_norm": 59.57033920288086, + "learning_rate": 0.00047882392590158166, + "epoch": 0.13184701133961177, + "step": 147 }, { - "loss": 2.1478, - "grad_norm": 1.605733871459961, - "learning_rate": 5e-05, - "epoch": 0.6925145674585388, - "step": 10815 + "loss": 385.2765, + "grad_norm": 55.921993255615234, + "learning_rate": 0.000478539044489684, + "epoch": 0.1327439297840989, + "step": 148 }, { - "loss": 2.1869, - "grad_norm": 1.6565288305282593, - "learning_rate": 5e-05, - "epoch": 0.6928347313824679, - "step": 10820 + "loss": 387.315, + "grad_norm": 53.27146911621094, + "learning_rate": 0.0004782523455147448, + "epoch": 0.13364084822858607, + "step": 149 }, { - "loss": 2.1886, - "grad_norm": 1.5877238512039185, - "learning_rate": 5e-05, - "epoch": 0.6931548953063968, - "step": 10825 + "loss": 384.9127, + "grad_norm": 61.21531295776367, + "learning_rate": 0.0004779638312568708, + "epoch": 0.13453776667307324, + "step": 150 }, { - "loss": 2.1702, - "grad_norm": 1.612343668937683, - "learning_rate": 5e-05, - "epoch": 0.6934750592303259, - "step": 10830 + "eval_loss": 1.7258449792861938, + "eval_runtime": 36.7008, + "eval_samples_per_second": 55.803, + "eval_steps_per_second": 3.488, + "epoch": 0.13453776667307324, + "step": 150 }, { - "loss": 2.1676, - "grad_norm": 1.5942809581756592, - "learning_rate": 5e-05, - "epoch": 0.693795223154255, - "step": 10835 + "loss": 385.8539, + "grad_norm": 60.04133605957031, + "learning_rate": 0.00047767350401060606, + "epoch": 0.13543468511756038, + "step": 151 }, { - "loss": 2.1859, - "grad_norm": 1.575735330581665, - "learning_rate": 5e-05, - "epoch": 0.694115387078184, - "step": 10840 + "loss": 384.8003, + "grad_norm": 59.11763000488281, + "learning_rate": 0.0004773813660849128, + "epoch": 0.13633160356204754, + "step": 152 }, { - "loss": 2.1989, - "grad_norm": 1.6750792264938354, - "learning_rate": 5e-05, - "epoch": 0.6944355510021131, - "step": 10845 + "loss": 387.7485, + "grad_norm": 56.51465606689453, + "learning_rate": 0.0004770874198031538, + "epoch": 0.13722852200653468, + "step": 153 }, { - "loss": 2.188, - "grad_norm": 1.6374013423919678, - "learning_rate": 5e-05, - "epoch": 0.6947557149260422, - "step": 10850 + "loss": 383.2278, + "grad_norm": 56.18191146850586, + "learning_rate": 0.0004767916675030736, + "epoch": 0.13812544045102185, + "step": 154 }, { - "loss": 2.1745, - "grad_norm": 1.6327176094055176, - "learning_rate": 5e-05, - "epoch": 0.6950758788499712, - "step": 10855 + "loss": 383.6736, + "grad_norm": 57.308799743652344, + "learning_rate": 0.00047649411153678, + "epoch": 0.139022358895509, + "step": 155 }, { - "loss": 2.1858, - "grad_norm": 1.6038893461227417, - "learning_rate": 5e-05, - "epoch": 0.6953960427739002, - "step": 10860 + "loss": 383.3135, + "grad_norm": 56.1787109375, + "learning_rate": 0.0004761947542707251, + "epoch": 0.13991927733999615, + "step": 156 }, { - "loss": 2.1917, - "grad_norm": 1.6516578197479248, - "learning_rate": 5e-05, - "epoch": 0.6957162066978293, - "step": 10865 + "loss": 380.7021, + "grad_norm": 59.29663848876953, + "learning_rate": 0.0004758935980856868, + "epoch": 0.14081619578448332, + "step": 157 }, { - "loss": 2.1508, - "grad_norm": 1.6180627346038818, - "learning_rate": 5e-05, - "epoch": 0.6960363706217584, - "step": 10870 + "loss": 388.3537, + "grad_norm": 56.997901916503906, + "learning_rate": 0.00047559064537674973, + "epoch": 0.14171311422897045, + "step": 158 }, { - "loss": 2.155, - "grad_norm": 1.7027835845947266, - "learning_rate": 5e-05, - "epoch": 0.6963565345456874, - "step": 10875 + "loss": 382.6107, + "grad_norm": 54.997398376464844, + "learning_rate": 0.0004752858985532862, + "epoch": 0.14261003267345762, + "step": 159 }, { - "loss": 2.1702, - "grad_norm": 1.6208568811416626, - "learning_rate": 5e-05, - "epoch": 0.6966766984696164, - "step": 10880 + "loss": 390.4788, + "grad_norm": 61.30497360229492, + "learning_rate": 0.00047497936003893713, + "epoch": 0.1435069511179448, + "step": 160 }, { - "loss": 2.1676, - "grad_norm": 1.5398364067077637, - "learning_rate": 5e-05, - "epoch": 0.6969968623935455, - "step": 10885 + "loss": 383.9597, + "grad_norm": 56.59492492675781, + "learning_rate": 0.0004746710322715926, + "epoch": 0.14440386956243192, + "step": 161 }, { - "loss": 2.1617, - "grad_norm": 1.579201340675354, - "learning_rate": 5e-05, - "epoch": 0.6973170263174745, - "step": 10890 + "loss": 392.4949, + "grad_norm": 63.977073669433594, + "learning_rate": 0.0004743609177033725, + "epoch": 0.1453007880069191, + "step": 162 }, { - "loss": 2.2142, - "grad_norm": 1.629888653755188, - "learning_rate": 5e-05, - "epoch": 0.6976371902414036, - "step": 10895 + "loss": 385.7721, + "grad_norm": 63.132537841796875, + "learning_rate": 0.0004740490188006072, + "epoch": 0.14619770645140623, + "step": 163 }, { - "loss": 2.1654, - "grad_norm": 1.5622855424880981, - "learning_rate": 5e-05, - "epoch": 0.6979573541653327, - "step": 10900 + "loss": 385.057, + "grad_norm": 61.54987716674805, + "learning_rate": 0.0004737353380438178, + "epoch": 0.1470946248958934, + "step": 164 }, { - "loss": 2.1655, - "grad_norm": 1.6659269332885742, - "learning_rate": 5e-05, - "epoch": 0.6982775180892618, - "step": 10905 + "loss": 384.8288, + "grad_norm": 64.65653228759766, + "learning_rate": 0.00047341987792769635, + "epoch": 0.14799154334038056, + "step": 165 }, { - "loss": 2.1779, - "grad_norm": 1.5977221727371216, - "learning_rate": 5e-05, - "epoch": 0.6985976820131907, - "step": 10910 + "loss": 385.061, + "grad_norm": 52.979087829589844, + "learning_rate": 0.0004731026409610863, + "epoch": 0.1488884617848677, + "step": 166 }, { - "loss": 2.1613, - "grad_norm": 1.604508638381958, - "learning_rate": 5e-05, - "epoch": 0.6989178459371198, - "step": 10915 + "loss": 385.9828, + "grad_norm": 66.97553253173828, + "learning_rate": 0.00047278362966696197, + "epoch": 0.14978538022935486, + "step": 167 }, { - "loss": 2.1914, - "grad_norm": 1.6567248106002808, - "learning_rate": 5e-05, - "epoch": 0.6992380098610489, - "step": 10920 + "loss": 381.6645, + "grad_norm": 49.72977066040039, + "learning_rate": 0.00047246284658240925, + "epoch": 0.150682298673842, + "step": 168 }, { - "loss": 2.1479, - "grad_norm": 1.681601881980896, - "learning_rate": 5e-05, - "epoch": 0.6995581737849779, - "step": 10925 + "loss": 387.0713, + "grad_norm": 59.0352668762207, + "learning_rate": 0.0004721402942586046, + "epoch": 0.15157921711832917, + "step": 169 }, { - "loss": 2.1561, - "grad_norm": 1.6984683275222778, - "learning_rate": 5e-05, - "epoch": 0.699878337708907, - "step": 10930 + "loss": 388.6861, + "grad_norm": 56.49056625366211, + "learning_rate": 0.0004718159752607955, + "epoch": 0.15247613556281633, + "step": 170 }, { - "loss": 2.1897, - "grad_norm": 1.6151689291000366, - "learning_rate": 5e-05, - "epoch": 0.700198501632836, - "step": 10935 + "loss": 386.6622, + "grad_norm": 61.9783935546875, + "learning_rate": 0.00047148989216827964, + "epoch": 0.15337305400730347, + "step": 171 }, { - "loss": 2.1693, - "grad_norm": 1.680700659751892, - "learning_rate": 5e-05, - "epoch": 0.7005186655567651, - "step": 10940 + "loss": 385.3264, + "grad_norm": 60.84406280517578, + "learning_rate": 0.0004711620475743844, + "epoch": 0.15426997245179064, + "step": 172 }, { - "loss": 2.2056, - "grad_norm": 1.6594295501708984, - "learning_rate": 5e-05, - "epoch": 0.7008388294806941, - "step": 10945 + "loss": 383.2025, + "grad_norm": 55.59370803833008, + "learning_rate": 0.00047083244408644646, + "epoch": 0.15516689089627778, + "step": 173 }, { - "loss": 2.1944, - "grad_norm": 1.569491982460022, - "learning_rate": 5e-05, - "epoch": 0.7011589934046232, - "step": 10950 + "loss": 383.7802, + "grad_norm": 59.102760314941406, + "learning_rate": 0.0004705010843257908, + "epoch": 0.15606380934076494, + "step": 174 }, { - "loss": 2.1825, - "grad_norm": 1.6589845418930054, - "learning_rate": 5e-05, - "epoch": 0.7014791573285523, - "step": 10955 + "loss": 387.181, + "grad_norm": 63.97918701171875, + "learning_rate": 0.00047016797092771004, + "epoch": 0.1569607277852521, + "step": 175 }, { - "loss": 2.1556, - "grad_norm": 1.6408551931381226, - "learning_rate": 5e-05, - "epoch": 0.7017993212524812, - "step": 10960 + "loss": 382.4706, + "grad_norm": 58.40498733520508, + "learning_rate": 0.0004698331065414434, + "epoch": 0.15785764622973925, + "step": 176 }, { - "loss": 2.1657, - "grad_norm": 1.580729365348816, - "learning_rate": 5e-05, - "epoch": 0.7021194851764103, - "step": 10965 + "loss": 374.7974, + "grad_norm": 57.276405334472656, + "learning_rate": 0.0004694964938301556, + "epoch": 0.1587545646742264, + "step": 177 }, { - "loss": 2.203, - "grad_norm": 2.0288913249969482, - "learning_rate": 5e-05, - "epoch": 0.7024396491003394, - "step": 10970 + "loss": 383.6686, + "grad_norm": 65.17239379882812, + "learning_rate": 0.0004691581354709159, + "epoch": 0.15965148311871355, + "step": 178 }, { - "loss": 2.1871, - "grad_norm": 2.0533010959625244, - "learning_rate": 5e-05, - "epoch": 0.7027598130242684, - "step": 10975 + "loss": 382.2492, + "grad_norm": 54.67914962768555, + "learning_rate": 0.0004688180341546765, + "epoch": 0.16054840156320072, + "step": 179 }, { - "loss": 2.1798, - "grad_norm": 1.7345025539398193, - "learning_rate": 5e-05, - "epoch": 0.7030799769481975, - "step": 10980 + "loss": 379.0845, + "grad_norm": 61.17100524902344, + "learning_rate": 0.0004684761925862512, + "epoch": 0.16144532000768788, + "step": 180 }, { - "loss": 2.1758, - "grad_norm": 1.7670248746871948, - "learning_rate": 5e-05, - "epoch": 0.7034001408721265, - "step": 10985 + "loss": 380.5147, + "grad_norm": 53.48952102661133, + "learning_rate": 0.00046813261348429403, + "epoch": 0.16234223845217502, + "step": 181 }, { - "loss": 2.1745, - "grad_norm": 1.7378064393997192, - "learning_rate": 5e-05, - "epoch": 0.7037203047960556, - "step": 10990 + "loss": 388.3456, + "grad_norm": 62.524898529052734, + "learning_rate": 0.0004677872995812778, + "epoch": 0.16323915689666219, + "step": 182 }, { - "loss": 2.1861, - "grad_norm": 1.7046862840652466, - "learning_rate": 5e-05, - "epoch": 0.7040404687199846, - "step": 10995 + "loss": 384.9105, + "grad_norm": 55.23896026611328, + "learning_rate": 0.00046744025362347174, + "epoch": 0.16413607534114932, + "step": 183 }, { - "loss": 2.1677, - "grad_norm": 1.7800571918487549, - "learning_rate": 5e-05, - "epoch": 0.7043606326439137, - "step": 11000 + "loss": 388.0769, + "grad_norm": 58.2794075012207, + "learning_rate": 0.0004670914783709203, + "epoch": 0.1650329937856365, + "step": 184 }, { - "eval_loss": 2.030477523803711, - "eval_runtime": 11.977, - "eval_samples_per_second": 170.995, - "eval_steps_per_second": 21.374, - "epoch": 0.7043606326439137, - "step": 11000 + "loss": 375.4843, + "grad_norm": 57.62440872192383, + "learning_rate": 0.00046674097659742087, + "epoch": 0.16592991223012366, + "step": 185 }, { - "loss": 2.1441, - "grad_norm": 1.6233124732971191, - "learning_rate": 5e-05, - "epoch": 0.7046807965678428, - "step": 11005 + "loss": 388.4005, + "grad_norm": 54.49860763549805, + "learning_rate": 0.00046638875109050184, + "epoch": 0.1668268306746108, + "step": 186 }, { - "loss": 2.1677, - "grad_norm": 1.6337502002716064, - "learning_rate": 5e-05, - "epoch": 0.7050009604917717, - "step": 11010 + "loss": 379.2246, + "grad_norm": 56.57727813720703, + "learning_rate": 0.00046603480465140035, + "epoch": 0.16772374911909796, + "step": 187 }, { - "loss": 2.1558, - "grad_norm": 1.602023720741272, - "learning_rate": 5e-05, - "epoch": 0.7053211244157008, - "step": 11015 + "loss": 390.5371, + "grad_norm": 53.35488510131836, + "learning_rate": 0.0004656791400950401, + "epoch": 0.16862066756358512, + "step": 188 }, { - "loss": 2.2029, - "grad_norm": 1.642838478088379, - "learning_rate": 5e-05, - "epoch": 0.7056412883396299, - "step": 11020 + "loss": 376.5087, + "grad_norm": 57.38853454589844, + "learning_rate": 0.0004653217602500088, + "epoch": 0.16951758600807226, + "step": 189 }, { - "loss": 2.1435, - "grad_norm": 1.6262296438217163, - "learning_rate": 5e-05, - "epoch": 0.705961452263559, - "step": 11025 + "loss": 383.3448, + "grad_norm": 53.162269592285156, + "learning_rate": 0.00046496266795853606, + "epoch": 0.17041450445255943, + "step": 190 }, { - "loss": 2.1539, - "grad_norm": 1.6026281118392944, - "learning_rate": 5e-05, - "epoch": 0.706281616187488, - "step": 11030 + "loss": 385.954, + "grad_norm": 56.76969528198242, + "learning_rate": 0.0004646018660764701, + "epoch": 0.17131142289704657, + "step": 191 }, { - "loss": 2.1486, - "grad_norm": 1.722970962524414, - "learning_rate": 5e-05, - "epoch": 0.706601780111417, - "step": 11035 + "loss": 380.8749, + "grad_norm": 55.99345016479492, + "learning_rate": 0.0004642393574732559, + "epoch": 0.17220834134153373, + "step": 192 }, { - "loss": 2.1827, - "grad_norm": 1.606521487236023, - "learning_rate": 5e-05, - "epoch": 0.7069219440353461, - "step": 11040 + "loss": 379.5312, + "grad_norm": 49.73320770263672, + "learning_rate": 0.0004638751450319116, + "epoch": 0.1731052597860209, + "step": 193 }, { - "loss": 2.1569, - "grad_norm": 1.6021209955215454, - "learning_rate": 5e-05, - "epoch": 0.7072421079592751, - "step": 11045 + "loss": 385.7988, + "grad_norm": 56.80336380004883, + "learning_rate": 0.00046350923164900604, + "epoch": 0.17400217823050804, + "step": 194 }, { - "loss": 2.1489, - "grad_norm": 1.5476322174072266, - "learning_rate": 5e-05, - "epoch": 0.7075622718832042, - "step": 11050 + "loss": 380.8796, + "grad_norm": 57.32421875, + "learning_rate": 0.0004631416202346357, + "epoch": 0.1748990966749952, + "step": 195 }, { - "loss": 2.1817, - "grad_norm": 1.5742813348770142, - "learning_rate": 5e-05, - "epoch": 0.7078824358071333, - "step": 11055 + "loss": 382.128, + "grad_norm": 62.81551742553711, + "learning_rate": 0.00046277231371240113, + "epoch": 0.17579601511948234, + "step": 196 }, { - "loss": 2.1811, - "grad_norm": 1.5461931228637695, - "learning_rate": 5e-05, - "epoch": 0.7082025997310623, - "step": 11060 + "loss": 383.9042, + "grad_norm": 60.5498046875, + "learning_rate": 0.00046240131501938436, + "epoch": 0.1766929335639695, + "step": 197 }, { - "loss": 2.1569, - "grad_norm": 1.5782997608184814, - "learning_rate": 5e-05, - "epoch": 0.7085227636549913, - "step": 11065 + "loss": 380.0457, + "grad_norm": 54.78828811645508, + "learning_rate": 0.000462028627106125, + "epoch": 0.17758985200845667, + "step": 198 }, { - "loss": 2.1863, - "grad_norm": 1.652904748916626, - "learning_rate": 5e-05, - "epoch": 0.7088429275789204, - "step": 11070 + "loss": 383.6067, + "grad_norm": 60.62177276611328, + "learning_rate": 0.00046165425293659694, + "epoch": 0.1784867704529438, + "step": 199 }, { - "loss": 2.1902, - "grad_norm": 1.6661841869354248, - "learning_rate": 5e-05, - "epoch": 0.7091630915028495, - "step": 11075 + "loss": 385.004, + "grad_norm": 53.65549850463867, + "learning_rate": 0.00046127819548818507, + "epoch": 0.17938368889743098, + "step": 200 }, { - "loss": 2.1641, - "grad_norm": 1.5823413133621216, - "learning_rate": 5e-05, - "epoch": 0.7094832554267785, - "step": 11080 + "eval_loss": 1.6973483562469482, + "eval_runtime": 57.4311, + "eval_samples_per_second": 35.66, + "eval_steps_per_second": 2.229, + "epoch": 0.17938368889743098, + "step": 200 }, { - "loss": 2.1793, - "grad_norm": 1.6849126815795898, - "learning_rate": 5e-05, - "epoch": 0.7098034193507076, - "step": 11085 + "loss": 381.3797, + "grad_norm": 60.24985885620117, + "learning_rate": 0.0004609004577516609, + "epoch": 0.18028060734191811, + "step": 201 }, { - "loss": 2.1686, - "grad_norm": 1.4970881938934326, - "learning_rate": 5e-05, - "epoch": 0.7101235832746366, - "step": 11090 + "loss": 384.8868, + "grad_norm": 55.66313552856445, + "learning_rate": 0.00046052104273115957, + "epoch": 0.18117752578640528, + "step": 202 }, { - "loss": 2.1678, - "grad_norm": 1.5892386436462402, - "learning_rate": 5e-05, - "epoch": 0.7104437471985656, - "step": 11095 + "loss": 381.8181, + "grad_norm": 58.7210807800293, + "learning_rate": 0.0004601399534441556, + "epoch": 0.18207444423089245, + "step": 203 }, { - "loss": 2.1718, - "grad_norm": 1.6064002513885498, - "learning_rate": 5e-05, - "epoch": 0.7107639111224947, - "step": 11100 + "loss": 381.6777, + "grad_norm": 51.48910903930664, + "learning_rate": 0.0004597571929214386, + "epoch": 0.18297136267537958, + "step": 204 }, { - "loss": 2.1564, - "grad_norm": 1.6332510709762573, - "learning_rate": 5e-05, - "epoch": 0.7110840750464238, - "step": 11105 + "loss": 389.5296, + "grad_norm": 55.63520050048828, + "learning_rate": 0.00045937276420708985, + "epoch": 0.18386828111986675, + "step": 205 }, { - "loss": 2.1548, - "grad_norm": 1.5667170286178589, - "learning_rate": 5e-05, - "epoch": 0.7114042389703529, - "step": 11110 + "loss": 379.7319, + "grad_norm": 56.91200637817383, + "learning_rate": 0.00045898667035845726, + "epoch": 0.1847651995643539, + "step": 206 }, { - "loss": 2.1598, - "grad_norm": 1.6204004287719727, - "learning_rate": 5e-05, - "epoch": 0.7117244028942818, - "step": 11115 + "loss": 383.4648, + "grad_norm": 60.174800872802734, + "learning_rate": 0.0004585989144461319, + "epoch": 0.18566211800884105, + "step": 207 }, { - "loss": 2.1704, - "grad_norm": 1.6428627967834473, - "learning_rate": 5e-05, - "epoch": 0.7120445668182109, - "step": 11120 + "loss": 381.6614, + "grad_norm": 46.41486740112305, + "learning_rate": 0.00045820949955392286, + "epoch": 0.18655903645332822, + "step": 208 }, { - "loss": 2.1742, - "grad_norm": 1.6737961769104004, - "learning_rate": 5e-05, - "epoch": 0.71236473074214, - "step": 11125 + "loss": 388.843, + "grad_norm": 66.20514678955078, + "learning_rate": 0.0004578184287788333, + "epoch": 0.18745595489781536, + "step": 209 }, { - "loss": 2.1952, - "grad_norm": 1.6475317478179932, - "learning_rate": 5e-05, - "epoch": 0.712684894666069, - "step": 11130 + "loss": 382.3195, + "grad_norm": 52.08879470825195, + "learning_rate": 0.0004574257052310355, + "epoch": 0.18835287334230252, + "step": 210 }, { - "loss": 2.1933, - "grad_norm": 1.6665571928024292, - "learning_rate": 5e-05, - "epoch": 0.7130050585899981, - "step": 11135 + "loss": 376.9011, + "grad_norm": 59.04060363769531, + "learning_rate": 0.00045703133203384594, + "epoch": 0.18924979178678966, + "step": 211 }, { - "loss": 2.1678, - "grad_norm": 1.5727708339691162, - "learning_rate": 5e-05, - "epoch": 0.7133252225139272, - "step": 11140 + "loss": 382.9858, + "grad_norm": 57.139583587646484, + "learning_rate": 0.000456635312323701, + "epoch": 0.19014671023127683, + "step": 212 }, { - "loss": 2.1553, - "grad_norm": 1.5942537784576416, - "learning_rate": 5e-05, - "epoch": 0.7136453864378561, - "step": 11145 + "loss": 386.4098, + "grad_norm": 56.69694137573242, + "learning_rate": 0.00045623764925013154, + "epoch": 0.191043628675764, + "step": 213 }, { - "loss": 2.1724, - "grad_norm": 1.6233826875686646, - "learning_rate": 5e-05, - "epoch": 0.7139655503617852, - "step": 11150 + "loss": 381.0145, + "grad_norm": 54.969146728515625, + "learning_rate": 0.00045583834597573826, + "epoch": 0.19194054712025113, + "step": 214 }, { - "loss": 2.17, - "grad_norm": 1.6468729972839355, - "learning_rate": 5e-05, - "epoch": 0.7142857142857143, - "step": 11155 + "loss": 386.2006, + "grad_norm": 55.187095642089844, + "learning_rate": 0.000455437405676166, + "epoch": 0.1928374655647383, + "step": 215 }, { - "loss": 2.1759, - "grad_norm": 1.6635937690734863, - "learning_rate": 5e-05, - "epoch": 0.7146058782096434, - "step": 11160 + "loss": 385.4291, + "grad_norm": 56.27381896972656, + "learning_rate": 0.000455034831540079, + "epoch": 0.19373438400922544, + "step": 216 }, { - "loss": 2.1676, - "grad_norm": 1.67452073097229, - "learning_rate": 5e-05, - "epoch": 0.7149260421335724, - "step": 11165 + "loss": 382.2878, + "grad_norm": 55.81896209716797, + "learning_rate": 0.00045463062676913527, + "epoch": 0.1946313024537126, + "step": 217 }, { - "loss": 2.1679, - "grad_norm": 1.6762311458587646, - "learning_rate": 5e-05, - "epoch": 0.7152462060575014, - "step": 11170 + "loss": 381.0126, + "grad_norm": 60.54517364501953, + "learning_rate": 0.0004542247945779613, + "epoch": 0.19552822089819977, + "step": 218 }, { - "loss": 2.2012, - "grad_norm": 1.6281007528305054, - "learning_rate": 5e-05, - "epoch": 0.7155663699814305, - "step": 11175 + "loss": 382.4228, + "grad_norm": 51.44652557373047, + "learning_rate": 0.0004538173381941264, + "epoch": 0.1964251393426869, + "step": 219 }, { - "loss": 2.1857, - "grad_norm": 1.6250513792037964, - "learning_rate": 5e-05, - "epoch": 0.7158865339053595, - "step": 11180 + "loss": 374.3478, + "grad_norm": 57.77920150756836, + "learning_rate": 0.0004534082608581168, + "epoch": 0.19732205778717407, + "step": 220 }, { - "loss": 2.1537, - "grad_norm": 1.57022225856781, - "learning_rate": 5e-05, - "epoch": 0.7162066978292886, - "step": 11185 + "loss": 379.4279, + "grad_norm": 52.3509635925293, + "learning_rate": 0.0004529975658233104, + "epoch": 0.1982189762316612, + "step": 221 }, { - "loss": 2.1607, - "grad_norm": 1.6798429489135742, - "learning_rate": 5e-05, - "epoch": 0.7165268617532177, - "step": 11190 + "loss": 380.0542, + "grad_norm": 53.75742721557617, + "learning_rate": 0.0004525852563559505, + "epoch": 0.19911589467614838, + "step": 222 }, { - "loss": 2.1764, - "grad_norm": 1.6130719184875488, - "learning_rate": 5e-05, - "epoch": 0.7168470256771468, - "step": 11195 + "loss": 387.0319, + "grad_norm": 59.18511199951172, + "learning_rate": 0.0004521713357351198, + "epoch": 0.20001281312063554, + "step": 223 }, { - "loss": 2.1805, - "grad_norm": 1.6312038898468018, - "learning_rate": 5e-05, - "epoch": 0.7171671896010757, - "step": 11200 + "loss": 375.638, + "grad_norm": 53.67622375488281, + "learning_rate": 0.00045175580725271457, + "epoch": 0.20090973156512268, + "step": 224 }, { - "eval_loss": 2.0418291091918945, - "eval_runtime": 9.4156, - "eval_samples_per_second": 217.512, - "eval_steps_per_second": 27.189, - "epoch": 0.7171671896010757, - "step": 11200 + "loss": 383.951, + "grad_norm": 67.28981018066406, + "learning_rate": 0.00045133867421341835, + "epoch": 0.20180665000960984, + "step": 225 }, { - "loss": 2.1583, - "grad_norm": 1.6148508787155151, - "learning_rate": 5e-05, - "epoch": 0.7174873535250048, - "step": 11205 + "loss": 380.0722, + "grad_norm": 62.926700592041016, + "learning_rate": 0.00045091993993467554, + "epoch": 0.20270356845409698, + "step": 226 }, { - "loss": 2.156, - "grad_norm": 1.6483427286148071, - "learning_rate": 5e-05, - "epoch": 0.7178075174489339, - "step": 11210 + "loss": 377.9981, + "grad_norm": 53.50834274291992, + "learning_rate": 0.0004504996077466654, + "epoch": 0.20360048689858415, + "step": 227 }, { - "loss": 2.1894, - "grad_norm": 1.7447383403778076, - "learning_rate": 5e-05, - "epoch": 0.7181276813728629, - "step": 11215 + "loss": 380.4308, + "grad_norm": 61.55268096923828, + "learning_rate": 0.0004500776809922751, + "epoch": 0.20449740534307131, + "step": 228 }, { - "loss": 2.1905, - "grad_norm": 1.605284571647644, - "learning_rate": 5e-05, - "epoch": 0.718447845296792, - "step": 11220 + "loss": 375.9146, + "grad_norm": 55.11613845825195, + "learning_rate": 0.0004496541630270733, + "epoch": 0.20539432378755845, + "step": 229 }, { - "loss": 2.1626, - "grad_norm": 1.5911649465560913, - "learning_rate": 5e-05, - "epoch": 0.718768009220721, - "step": 11225 + "loss": 381.8729, + "grad_norm": 61.67683410644531, + "learning_rate": 0.00044922905721928366, + "epoch": 0.20629124223204562, + "step": 230 }, { - "loss": 2.1717, - "grad_norm": 1.535005807876587, - "learning_rate": 5e-05, - "epoch": 0.71908817314465, - "step": 11230 + "loss": 377.6188, + "grad_norm": 55.07930374145508, + "learning_rate": 0.00044880236694975773, + "epoch": 0.20718816067653276, + "step": 231 }, { - "loss": 2.1844, - "grad_norm": 1.5297882556915283, - "learning_rate": 5e-05, - "epoch": 0.7194083370685791, - "step": 11235 + "loss": 383.7285, + "grad_norm": 56.17093276977539, + "learning_rate": 0.0004483740956119485, + "epoch": 0.20808507912101992, + "step": 232 }, { - "loss": 2.1602, - "grad_norm": 1.6361292600631714, - "learning_rate": 5e-05, - "epoch": 0.7197285009925082, - "step": 11240 + "loss": 379.3219, + "grad_norm": 57.20262908935547, + "learning_rate": 0.0004479442466118828, + "epoch": 0.2089819975655071, + "step": 233 }, { - "loss": 2.1776, - "grad_norm": 1.6264761686325073, - "learning_rate": 5e-05, - "epoch": 0.7200486649164373, - "step": 11245 + "loss": 378.996, + "grad_norm": 52.91606521606445, + "learning_rate": 0.0004475128233681349, + "epoch": 0.20987891600999423, + "step": 234 }, { - "loss": 2.1692, - "grad_norm": 1.6878418922424316, - "learning_rate": 5e-05, - "epoch": 0.7203688288403662, - "step": 11250 + "loss": 376.5712, + "grad_norm": 53.59124755859375, + "learning_rate": 0.00044707982931179856, + "epoch": 0.2107758344544814, + "step": 235 }, { - "loss": 2.1613, - "grad_norm": 1.6586750745773315, - "learning_rate": 5e-05, - "epoch": 0.7206889927642953, - "step": 11255 + "loss": 385.7614, + "grad_norm": 57.6840705871582, + "learning_rate": 0.00044664526788646064, + "epoch": 0.21167275289896853, + "step": 236 }, { - "loss": 2.1647, - "grad_norm": 1.6286550760269165, - "learning_rate": 5e-05, - "epoch": 0.7210091566882244, - "step": 11260 + "loss": 381.0049, + "grad_norm": 54.7835578918457, + "learning_rate": 0.0004462091425481728, + "epoch": 0.2125696713434557, + "step": 237 }, { - "loss": 2.1485, - "grad_norm": 1.6325013637542725, - "learning_rate": 5e-05, - "epoch": 0.7213293206121534, - "step": 11265 + "loss": 380.4299, + "grad_norm": 56.61455535888672, + "learning_rate": 0.0004457714567654247, + "epoch": 0.21346658978794286, + "step": 238 }, { - "loss": 2.1846, - "grad_norm": 1.5908008813858032, - "learning_rate": 5e-05, - "epoch": 0.7216494845360825, - "step": 11270 + "loss": 377.3007, + "grad_norm": 54.04520797729492, + "learning_rate": 0.0004453322140191162, + "epoch": 0.21436350823243, + "step": 239 }, { - "loss": 2.1725, - "grad_norm": 1.5738271474838257, - "learning_rate": 5e-05, - "epoch": 0.7219696484600115, - "step": 11275 + "loss": 376.2494, + "grad_norm": 61.18534469604492, + "learning_rate": 0.0004448914178025293, + "epoch": 0.21526042667691717, + "step": 240 }, { - "loss": 2.1878, - "grad_norm": 1.6095255613327026, - "learning_rate": 5e-05, - "epoch": 0.7222898123839406, - "step": 11280 + "loss": 379.0678, + "grad_norm": 58.791934967041016, + "learning_rate": 0.000444449071621301, + "epoch": 0.21615734512140433, + "step": 241 }, { - "loss": 2.1599, - "grad_norm": 1.5834318399429321, - "learning_rate": 5e-05, - "epoch": 0.7226099763078696, - "step": 11285 + "loss": 383.8186, + "grad_norm": 54.751407623291016, + "learning_rate": 0.0004440051789933951, + "epoch": 0.21705426356589147, + "step": 242 }, { - "loss": 2.2047, - "grad_norm": 1.5938414335250854, - "learning_rate": 5e-05, - "epoch": 0.7229301402317987, - "step": 11290 + "loss": 374.9797, + "grad_norm": 54.97734451293945, + "learning_rate": 0.0004435597434490741, + "epoch": 0.21795118201037864, + "step": 243 }, { - "loss": 2.1833, - "grad_norm": 1.62465238571167, - "learning_rate": 5e-05, - "epoch": 0.7232503041557278, - "step": 11295 + "loss": 381.2922, + "grad_norm": 55.37065887451172, + "learning_rate": 0.00044311276853087144, + "epoch": 0.21884810045486577, + "step": 244 }, { - "loss": 2.1639, - "grad_norm": 1.6351125240325928, - "learning_rate": 5e-05, - "epoch": 0.7235704680796567, - "step": 11300 + "loss": 378.8845, + "grad_norm": 58.74147033691406, + "learning_rate": 0.0004426642577935629, + "epoch": 0.21974501889935294, + "step": 245 }, { - "loss": 2.1567, - "grad_norm": 1.6405686140060425, - "learning_rate": 5e-05, - "epoch": 0.7238906320035858, - "step": 11305 + "loss": 386.1524, + "grad_norm": 58.316097259521484, + "learning_rate": 0.0004422142148041388, + "epoch": 0.2206419373438401, + "step": 246 }, { - "loss": 2.1425, - "grad_norm": 1.6778993606567383, - "learning_rate": 5e-05, - "epoch": 0.7242107959275149, - "step": 11310 + "loss": 378.2374, + "grad_norm": 54.42732238769531, + "learning_rate": 0.00044176264314177535, + "epoch": 0.22153885578832724, + "step": 247 }, { - "loss": 2.1388, - "grad_norm": 1.5974764823913574, - "learning_rate": 5e-05, - "epoch": 0.7245309598514439, - "step": 11315 + "loss": 378.246, + "grad_norm": 56.714080810546875, + "learning_rate": 0.00044130954639780615, + "epoch": 0.2224357742328144, + "step": 248 }, { - "loss": 2.1866, - "grad_norm": 1.6215441226959229, - "learning_rate": 5e-05, - "epoch": 0.724851123775373, - "step": 11320 + "loss": 373.9691, + "grad_norm": 51.52580642700195, + "learning_rate": 0.0004408549281756937, + "epoch": 0.22333269267730155, + "step": 249 }, { - "loss": 2.1612, - "grad_norm": 1.638525366783142, - "learning_rate": 5e-05, - "epoch": 0.7251712876993021, - "step": 11325 + "loss": 377.4944, + "grad_norm": 61.44560241699219, + "learning_rate": 0.0004403987920910011, + "epoch": 0.2242296111217887, + "step": 250 }, { - "loss": 2.1588, - "grad_norm": 1.677040934562683, - "learning_rate": 5e-05, - "epoch": 0.7254914516232311, - "step": 11330 + "eval_loss": 1.6841200590133667, + "eval_runtime": 35.8648, + "eval_samples_per_second": 57.103, + "eval_steps_per_second": 3.569, + "epoch": 0.2242296111217887, + "step": 250 }, { - "loss": 2.1894, - "grad_norm": 1.6689181327819824, - "learning_rate": 5e-05, - "epoch": 0.7258116155471601, - "step": 11335 + "loss": 372.7726, + "grad_norm": 52.64440155029297, + "learning_rate": 0.00043994114177136245, + "epoch": 0.22512652956627588, + "step": 251 }, { - "loss": 2.1748, - "grad_norm": 1.6328116655349731, - "learning_rate": 5e-05, - "epoch": 0.7261317794710892, - "step": 11340 + "loss": 374.3314, + "grad_norm": 57.64458084106445, + "learning_rate": 0.0004394819808564549, + "epoch": 0.22602344801076302, + "step": 252 }, { - "loss": 2.154, - "grad_norm": 1.6823488473892212, - "learning_rate": 5e-05, - "epoch": 0.7264519433950183, - "step": 11345 + "loss": 380.1327, + "grad_norm": 48.348487854003906, + "learning_rate": 0.00043902131299796923, + "epoch": 0.22692036645525018, + "step": 253 }, { - "loss": 2.1432, - "grad_norm": 1.6382944583892822, - "learning_rate": 5e-05, - "epoch": 0.7267721073189473, - "step": 11350 + "loss": 376.8272, + "grad_norm": 55.306766510009766, + "learning_rate": 0.00043855914185958066, + "epoch": 0.22781728489973732, + "step": 254 }, { - "loss": 2.1632, - "grad_norm": 1.6119322776794434, - "learning_rate": 5e-05, - "epoch": 0.7270922712428763, - "step": 11355 + "loss": 373.5811, + "grad_norm": 50.16413879394531, + "learning_rate": 0.0004380954711169202, + "epoch": 0.2287142033442245, + "step": 255 }, { - "loss": 2.1636, - "grad_norm": 1.6790704727172852, - "learning_rate": 5e-05, - "epoch": 0.7274124351668054, - "step": 11360 + "loss": 380.8544, + "grad_norm": 52.902305603027344, + "learning_rate": 0.00043763030445754516, + "epoch": 0.22961112178871165, + "step": 256 }, { - "loss": 2.1905, - "grad_norm": 1.6697032451629639, - "learning_rate": 5e-05, - "epoch": 0.7277325990907345, - "step": 11365 + "loss": 380.7617, + "grad_norm": 55.323490142822266, + "learning_rate": 0.0004371636455809096, + "epoch": 0.2305080402331988, + "step": 257 }, { - "loss": 2.1917, - "grad_norm": 1.6095983982086182, - "learning_rate": 5e-05, - "epoch": 0.7280527630146635, - "step": 11370 + "loss": 378.9308, + "grad_norm": 53.362361907958984, + "learning_rate": 0.00043669549819833536, + "epoch": 0.23140495867768596, + "step": 258 }, { - "loss": 2.1417, - "grad_norm": 1.5918678045272827, - "learning_rate": 5e-05, - "epoch": 0.7283729269385926, - "step": 11375 + "loss": 378.0917, + "grad_norm": 51.511932373046875, + "learning_rate": 0.0004362258660329822, + "epoch": 0.2323018771221731, + "step": 259 }, { - "loss": 2.1782, - "grad_norm": 1.5360814332962036, - "learning_rate": 5e-05, - "epoch": 0.7286930908625217, - "step": 11380 + "loss": 374.3557, + "grad_norm": 60.112728118896484, + "learning_rate": 0.0004357547528198184, + "epoch": 0.23319879556666026, + "step": 260 }, { - "loss": 2.1869, - "grad_norm": 1.613787293434143, - "learning_rate": 5e-05, - "epoch": 0.7290132547864506, - "step": 11385 + "loss": 382.0044, + "grad_norm": 52.59751510620117, + "learning_rate": 0.0004352821623055908, + "epoch": 0.23409571401114743, + "step": 261 }, { - "loss": 2.1888, - "grad_norm": 1.5174287557601929, - "learning_rate": 5e-05, - "epoch": 0.7293334187103797, - "step": 11390 + "loss": 379.4641, + "grad_norm": 54.482444763183594, + "learning_rate": 0.0004348080982487953, + "epoch": 0.23499263245563456, + "step": 262 }, { - "loss": 2.1803, - "grad_norm": 1.625603437423706, - "learning_rate": 5e-05, - "epoch": 0.7296535826343088, - "step": 11395 + "loss": 376.0202, + "grad_norm": 57.2796516418457, + "learning_rate": 0.0004343325644196468, + "epoch": 0.23588955090012173, + "step": 263 }, { - "loss": 2.1788, - "grad_norm": 1.572482943534851, - "learning_rate": 5e-05, - "epoch": 0.7299737465582378, - "step": 11400 + "loss": 380.4021, + "grad_norm": 51.36527633666992, + "learning_rate": 0.0004338555646000492, + "epoch": 0.23678646934460887, + "step": 264 }, { - "eval_loss": 2.03629732131958, - "eval_runtime": 9.2282, - "eval_samples_per_second": 221.929, - "eval_steps_per_second": 27.741, - "epoch": 0.7299737465582378, - "step": 11400 + "loss": 382.1948, + "grad_norm": 54.246639251708984, + "learning_rate": 0.0004333771025835655, + "epoch": 0.23768338778909603, + "step": 265 }, { - "loss": 2.1694, - "grad_norm": 1.6652473211288452, - "learning_rate": 5e-05, - "epoch": 0.7302939104821669, - "step": 11405 + "loss": 376.0016, + "grad_norm": 53.845367431640625, + "learning_rate": 0.0004328971821753873, + "epoch": 0.2385803062335832, + "step": 266 }, { - "loss": 2.1589, - "grad_norm": 1.5308523178100586, - "learning_rate": 5e-05, - "epoch": 0.7306140744060959, - "step": 11410 + "loss": 378.0241, + "grad_norm": 55.82734298706055, + "learning_rate": 0.0004324158071923049, + "epoch": 0.23947722467807034, + "step": 267 }, { - "loss": 2.1666, - "grad_norm": 1.578356385231018, - "learning_rate": 5e-05, - "epoch": 0.730934238330025, - "step": 11415 + "loss": 376.6841, + "grad_norm": 52.28315734863281, + "learning_rate": 0.0004319329814626768, + "epoch": 0.2403741431225575, + "step": 268 }, { - "loss": 2.1538, - "grad_norm": 1.6113513708114624, - "learning_rate": 5e-05, - "epoch": 0.731254402253954, - "step": 11420 + "loss": 376.4868, + "grad_norm": 59.60106658935547, + "learning_rate": 0.00043144870882639907, + "epoch": 0.24127106156704464, + "step": 269 }, { - "loss": 2.184, - "grad_norm": 1.6025482416152954, - "learning_rate": 5e-05, - "epoch": 0.7315745661778831, - "step": 11425 + "loss": 376.3779, + "grad_norm": 58.55453109741211, + "learning_rate": 0.0004309629931348752, + "epoch": 0.2421679800115318, + "step": 270 }, { - "loss": 2.1466, - "grad_norm": 1.674232006072998, - "learning_rate": 5e-05, - "epoch": 0.7318947301018122, - "step": 11430 + "loss": 379.1783, + "grad_norm": 52.10798263549805, + "learning_rate": 0.0004304758382509849, + "epoch": 0.24306489845601897, + "step": 271 }, { - "loss": 2.1636, - "grad_norm": 1.6667330265045166, - "learning_rate": 5e-05, - "epoch": 0.7322148940257411, - "step": 11435 + "loss": 379.3161, + "grad_norm": 53.941673278808594, + "learning_rate": 0.0004299872480490542, + "epoch": 0.2439618169005061, + "step": 272 }, { - "loss": 2.1873, - "grad_norm": 1.5927730798721313, - "learning_rate": 5e-05, - "epoch": 0.7325350579496702, - "step": 11440 + "loss": 379.5319, + "grad_norm": 53.70753860473633, + "learning_rate": 0.00042949722641482383, + "epoch": 0.24485873534499328, + "step": 273 }, { - "loss": 2.1408, - "grad_norm": 1.6314619779586792, - "learning_rate": 5e-05, - "epoch": 0.7328552218735993, - "step": 11445 + "loss": 379.6953, + "grad_norm": 61.60326385498047, + "learning_rate": 0.0004290057772454187, + "epoch": 0.24575565378948042, + "step": 274 }, { - "loss": 2.1728, - "grad_norm": 1.6256287097930908, - "learning_rate": 5e-05, - "epoch": 0.7331753857975284, - "step": 11450 + "loss": 379.7555, + "grad_norm": 57.09893798828125, + "learning_rate": 0.0004285129044493169, + "epoch": 0.24665257223396758, + "step": 275 }, { - "loss": 2.1653, - "grad_norm": 1.556431531906128, - "learning_rate": 5e-05, - "epoch": 0.7334955497214574, - "step": 11455 + "loss": 381.1754, + "grad_norm": 60.31880187988281, + "learning_rate": 0.0004280186119463186, + "epoch": 0.24754949067845475, + "step": 276 }, { - "loss": 2.1638, - "grad_norm": 1.600176453590393, - "learning_rate": 5e-05, - "epoch": 0.7338157136453864, - "step": 11460 + "loss": 379.8077, + "grad_norm": 57.53593826293945, + "learning_rate": 0.0004275229036675148, + "epoch": 0.24844640912294189, + "step": 277 }, { - "loss": 2.152, - "grad_norm": 1.5737305879592896, - "learning_rate": 5e-05, - "epoch": 0.7341358775693155, - "step": 11465 + "loss": 381.0815, + "grad_norm": 56.55409240722656, + "learning_rate": 0.00042702578355525615, + "epoch": 0.24934332756742905, + "step": 278 }, { - "loss": 2.1757, - "grad_norm": 1.5820810794830322, - "learning_rate": 5e-05, - "epoch": 0.7344560414932445, - "step": 11470 + "loss": 378.2445, + "grad_norm": 50.37730026245117, + "learning_rate": 0.00042652725556312156, + "epoch": 0.2502402460119162, + "step": 279 }, { - "loss": 2.1806, - "grad_norm": 1.7114322185516357, - "learning_rate": 5e-05, - "epoch": 0.7347762054171736, - "step": 11475 + "loss": 376.4951, + "grad_norm": 50.24005889892578, + "learning_rate": 0.0004260273236558867, + "epoch": 0.2511371644564034, + "step": 280 }, { - "loss": 2.1672, - "grad_norm": 1.5795398950576782, - "learning_rate": 5e-05, - "epoch": 0.7350963693411027, - "step": 11480 + "loss": 379.3927, + "grad_norm": 52.99737548828125, + "learning_rate": 0.0004255259918094926, + "epoch": 0.2520340829008905, + "step": 281 }, { - "loss": 2.178, - "grad_norm": 1.6970211267471313, - "learning_rate": 5e-05, - "epoch": 0.7354165332650316, - "step": 11485 + "loss": 379.7873, + "grad_norm": 53.95462417602539, + "learning_rate": 0.00042502326401101386, + "epoch": 0.25293100134537766, + "step": 282 }, { - "loss": 2.1437, - "grad_norm": 1.6389212608337402, - "learning_rate": 5e-05, - "epoch": 0.7357366971889607, - "step": 11490 + "loss": 370.9284, + "grad_norm": 51.21118927001953, + "learning_rate": 0.0004245191442586273, + "epoch": 0.2538279197898648, + "step": 283 }, { - "loss": 2.1463, - "grad_norm": 1.5622590780258179, - "learning_rate": 5e-05, - "epoch": 0.7360568611128898, - "step": 11495 + "loss": 374.7379, + "grad_norm": 53.918975830078125, + "learning_rate": 0.00042401363656157954, + "epoch": 0.254724838234352, + "step": 284 }, { - "loss": 2.1763, - "grad_norm": 1.6667590141296387, - "learning_rate": 5e-05, - "epoch": 0.7363770250368189, - "step": 11500 + "loss": 373.7905, + "grad_norm": 51.7956428527832, + "learning_rate": 0.00042350674494015566, + "epoch": 0.25562175667883913, + "step": 285 }, { - "loss": 2.1737, - "grad_norm": 1.6279217004776, - "learning_rate": 5e-05, - "epoch": 0.7366971889607479, - "step": 11505 + "loss": 376.9342, + "grad_norm": 51.80348205566406, + "learning_rate": 0.0004229984734256471, + "epoch": 0.25651867512332627, + "step": 286 }, { - "loss": 2.1736, - "grad_norm": 1.5891218185424805, - "learning_rate": 5e-05, - "epoch": 0.737017352884677, - "step": 11510 + "loss": 378.537, + "grad_norm": 53.50684356689453, + "learning_rate": 0.0004224888260603195, + "epoch": 0.25741559356781346, + "step": 287 }, { - "loss": 2.1645, - "grad_norm": 1.5445228815078735, - "learning_rate": 5e-05, - "epoch": 0.737337516808606, - "step": 11515 + "loss": 374.9467, + "grad_norm": 52.037200927734375, + "learning_rate": 0.0004219778068973804, + "epoch": 0.2583125120123006, + "step": 288 }, { - "loss": 2.1715, - "grad_norm": 1.6425042152404785, - "learning_rate": 5e-05, - "epoch": 0.737657680732535, - "step": 11520 + "loss": 382.1371, + "grad_norm": 48.98027420043945, + "learning_rate": 0.0004214654200009475, + "epoch": 0.25920943045678774, + "step": 289 }, { - "loss": 2.1548, - "grad_norm": 1.6075410842895508, - "learning_rate": 5e-05, - "epoch": 0.7379778446564641, - "step": 11525 + "loss": 378.7361, + "grad_norm": 51.1038818359375, + "learning_rate": 0.0004209516694460157, + "epoch": 0.26010634890127493, + "step": 290 }, { - "loss": 2.1311, - "grad_norm": 1.6689494848251343, - "learning_rate": 5e-05, - "epoch": 0.7382980085803932, - "step": 11530 + "loss": 379.9825, + "grad_norm": 53.03129577636719, + "learning_rate": 0.0004204365593184255, + "epoch": 0.26100326734576207, + "step": 291 }, { - "loss": 2.1788, - "grad_norm": 1.6945607662200928, - "learning_rate": 5e-05, - "epoch": 0.7386181725043223, - "step": 11535 + "loss": 376.35, + "grad_norm": 54.52887725830078, + "learning_rate": 0.0004199200937148297, + "epoch": 0.2619001857902492, + "step": 292 }, { - "loss": 2.1646, - "grad_norm": 1.7109571695327759, - "learning_rate": 5e-05, - "epoch": 0.7389383364282512, - "step": 11540 + "loss": 376.654, + "grad_norm": 51.10536575317383, + "learning_rate": 0.00041940227674266105, + "epoch": 0.26279710423473635, + "step": 293 }, { - "loss": 2.1656, - "grad_norm": 1.6874438524246216, - "learning_rate": 5e-05, - "epoch": 0.7392585003521803, - "step": 11545 + "loss": 372.8873, + "grad_norm": 57.231117248535156, + "learning_rate": 0.0004188831125201, + "epoch": 0.26369402267922354, + "step": 294 }, { - "loss": 2.1719, - "grad_norm": 1.6662932634353638, - "learning_rate": 5e-05, - "epoch": 0.7395786642761094, - "step": 11550 + "loss": 372.2591, + "grad_norm": 54.170921325683594, + "learning_rate": 0.0004183626051760415, + "epoch": 0.2645909411237107, + "step": 295 }, { - "loss": 2.1489, - "grad_norm": 1.5764780044555664, - "learning_rate": 5e-05, - "epoch": 0.7398988282000384, - "step": 11555 + "loss": 376.232, + "grad_norm": 48.81595230102539, + "learning_rate": 0.0004178407588500621, + "epoch": 0.2654878595681978, + "step": 296 }, { - "loss": 2.1715, - "grad_norm": 1.6159734725952148, - "learning_rate": 5e-05, - "epoch": 0.7402189921239675, - "step": 11560 + "loss": 377.493, + "grad_norm": 51.22395324707031, + "learning_rate": 0.00041731757769238764, + "epoch": 0.266384778012685, + "step": 297 }, { - "loss": 2.1861, - "grad_norm": 1.5857573747634888, - "learning_rate": 5e-05, - "epoch": 0.7405391560478966, - "step": 11565 + "loss": 373.4135, + "grad_norm": 50.80076217651367, + "learning_rate": 0.00041679306586385944, + "epoch": 0.26728169645717215, + "step": 298 }, { - "loss": 2.1622, - "grad_norm": 1.5534696578979492, - "learning_rate": 5e-05, - "epoch": 0.7408593199718255, - "step": 11570 + "loss": 373.3929, + "grad_norm": 52.78483581542969, + "learning_rate": 0.00041626722753590185, + "epoch": 0.2681786149016593, + "step": 299 }, { - "loss": 2.144, - "grad_norm": 1.5971225500106812, - "learning_rate": 5e-05, - "epoch": 0.7411794838957546, - "step": 11575 + "loss": 374.4973, + "grad_norm": 59.0179328918457, + "learning_rate": 0.0004157400668904887, + "epoch": 0.2690755333461465, + "step": 300 }, { - "loss": 2.1669, - "grad_norm": 1.6196457147598267, - "learning_rate": 5e-05, - "epoch": 0.7414996478196837, - "step": 11580 + "eval_loss": 1.6736700534820557, + "eval_runtime": 48.4303, + "eval_samples_per_second": 42.288, + "eval_steps_per_second": 2.643, + "epoch": 0.2690755333461465, + "step": 300 }, { - "loss": 2.1851, - "grad_norm": 1.584476351737976, - "learning_rate": 5e-05, - "epoch": 0.7418198117436128, - "step": 11585 + "loss": 370.586, + "grad_norm": 51.39365005493164, + "learning_rate": 0.0004152115881201102, + "epoch": 0.2699724517906336, + "step": 301 }, { - "loss": 2.1444, - "grad_norm": 1.6179357767105103, - "learning_rate": 5e-05, - "epoch": 0.7421399756675418, - "step": 11590 + "loss": 371.1306, + "grad_norm": 53.13943862915039, + "learning_rate": 0.0004146817954277395, + "epoch": 0.27086937023512075, + "step": 302 }, { - "loss": 2.1497, - "grad_norm": 1.549419641494751, - "learning_rate": 5e-05, - "epoch": 0.7424601395914708, - "step": 11595 + "loss": 375.8091, + "grad_norm": 46.9393310546875, + "learning_rate": 0.0004141506930267995, + "epoch": 0.2717662886796079, + "step": 303 }, { - "loss": 2.1902, - "grad_norm": 1.6180243492126465, - "learning_rate": 5e-05, - "epoch": 0.7427803035153999, - "step": 11600 + "loss": 378.5063, + "grad_norm": 56.166954040527344, + "learning_rate": 0.00041361828514112884, + "epoch": 0.2726632071240951, + "step": 304 }, { - "eval_loss": 2.032823085784912, - "eval_runtime": 12.5035, - "eval_samples_per_second": 163.795, - "eval_steps_per_second": 20.474, - "epoch": 0.7427803035153999, - "step": 11600 + "loss": 372.5772, + "grad_norm": 52.24879455566406, + "learning_rate": 0.00041308457600494917, + "epoch": 0.2735601255685822, + "step": 305 }, { - "loss": 2.1372, - "grad_norm": 1.7094162702560425, - "learning_rate": 5e-05, - "epoch": 0.7431004674393289, - "step": 11605 + "loss": 371.29, + "grad_norm": 53.966949462890625, + "learning_rate": 0.00041254956986283044, + "epoch": 0.27445704401306936, + "step": 306 }, { - "loss": 2.169, - "grad_norm": 1.5947761535644531, - "learning_rate": 5e-05, - "epoch": 0.743420631363258, - "step": 11610 + "loss": 376.5358, + "grad_norm": 51.999046325683594, + "learning_rate": 0.0004120132709696578, + "epoch": 0.27535396245755656, + "step": 307 }, { - "loss": 2.1527, - "grad_norm": 1.634129524230957, - "learning_rate": 5e-05, - "epoch": 0.7437407952871871, - "step": 11615 + "loss": 377.9629, + "grad_norm": 53.83307647705078, + "learning_rate": 0.0004114756835905976, + "epoch": 0.2762508809020437, + "step": 308 }, { - "loss": 2.1599, - "grad_norm": 1.6361533403396606, - "learning_rate": 5e-05, - "epoch": 0.7440609592111161, - "step": 11620 + "loss": 372.8809, + "grad_norm": 55.104217529296875, + "learning_rate": 0.0004109368120010636, + "epoch": 0.27714779934653083, + "step": 309 }, { - "loss": 2.1682, - "grad_norm": 1.6670771837234497, - "learning_rate": 5e-05, - "epoch": 0.7443811231350451, - "step": 11625 + "loss": 377.9377, + "grad_norm": 51.1360969543457, + "learning_rate": 0.00041039666048668265, + "epoch": 0.278044717791018, + "step": 310 }, { - "loss": 2.1661, - "grad_norm": 1.6897422075271606, - "learning_rate": 5e-05, - "epoch": 0.7447012870589742, - "step": 11630 + "loss": 377.1788, + "grad_norm": 50.87997817993164, + "learning_rate": 0.00040985523334326093, + "epoch": 0.27894163623550516, + "step": 311 }, { - "loss": 2.1425, - "grad_norm": 1.5701279640197754, - "learning_rate": 5e-05, - "epoch": 0.7450214509829033, - "step": 11635 + "loss": 375.3121, + "grad_norm": 49.86625289916992, + "learning_rate": 0.00040931253487674955, + "epoch": 0.2798385546799923, + "step": 312 }, { - "loss": 2.1747, - "grad_norm": 1.6329246759414673, - "learning_rate": 5e-05, - "epoch": 0.7453416149068323, - "step": 11640 + "loss": 373.2664, + "grad_norm": 51.52640151977539, + "learning_rate": 0.00040876856940321056, + "epoch": 0.28073547312447944, + "step": 313 }, { - "loss": 2.186, - "grad_norm": 1.651111364364624, - "learning_rate": 5e-05, - "epoch": 0.7456617788307613, - "step": 11645 + "loss": 373.2856, + "grad_norm": 49.00104904174805, + "learning_rate": 0.00040822334124878236, + "epoch": 0.28163239156896663, + "step": 314 }, { - "loss": 2.1843, - "grad_norm": 1.5741336345672607, - "learning_rate": 5e-05, - "epoch": 0.7459819427546904, - "step": 11650 + "loss": 377.6501, + "grad_norm": 52.83418655395508, + "learning_rate": 0.00040767685474964535, + "epoch": 0.28252931001345377, + "step": 315 }, { - "loss": 2.1641, - "grad_norm": 1.6159652471542358, - "learning_rate": 5e-05, - "epoch": 0.7463021066786194, - "step": 11655 + "loss": 370.6684, + "grad_norm": 49.96600341796875, + "learning_rate": 0.00040712911425198764, + "epoch": 0.2834262284579409, + "step": 316 }, { - "loss": 2.1791, - "grad_norm": 1.538552165031433, - "learning_rate": 5e-05, - "epoch": 0.7466222706025485, - "step": 11660 + "loss": 376.3713, + "grad_norm": 50.470123291015625, + "learning_rate": 0.0004065801241119702, + "epoch": 0.2843231469024281, + "step": 317 }, { - "loss": 2.1707, - "grad_norm": 1.616956353187561, - "learning_rate": 5e-05, - "epoch": 0.7469424345264776, - "step": 11665 + "loss": 374.6679, + "grad_norm": 47.91783142089844, + "learning_rate": 0.0004060298886956926, + "epoch": 0.28522006534691524, + "step": 318 }, { - "loss": 2.1778, - "grad_norm": 1.530639410018921, - "learning_rate": 5e-05, - "epoch": 0.7472625984504067, - "step": 11670 + "loss": 376.8799, + "grad_norm": 52.6668586730957, + "learning_rate": 0.0004054784123791577, + "epoch": 0.2861169837914024, + "step": 319 }, { - "loss": 2.1523, - "grad_norm": 1.5348337888717651, - "learning_rate": 5e-05, - "epoch": 0.7475827623743356, - "step": 11675 + "loss": 371.9651, + "grad_norm": 50.082279205322266, + "learning_rate": 0.00040492569954823763, + "epoch": 0.2870139022358896, + "step": 320 }, { - "loss": 2.1787, - "grad_norm": 1.6515288352966309, - "learning_rate": 5e-05, - "epoch": 0.7479029262982647, - "step": 11680 + "loss": 373.8972, + "grad_norm": 56.001190185546875, + "learning_rate": 0.0004043717545986381, + "epoch": 0.2879108206803767, + "step": 321 }, { - "loss": 2.167, - "grad_norm": 1.654701828956604, - "learning_rate": 5e-05, - "epoch": 0.7482230902221938, - "step": 11685 + "loss": 370.1523, + "grad_norm": 53.00112533569336, + "learning_rate": 0.0004038165819358639, + "epoch": 0.28880773912486385, + "step": 322 }, { - "loss": 2.149, - "grad_norm": 1.5613808631896973, - "learning_rate": 5e-05, - "epoch": 0.7485432541461228, - "step": 11690 + "loss": 377.1375, + "grad_norm": 52.706729888916016, + "learning_rate": 0.0004032601859751839, + "epoch": 0.28970465756935104, + "step": 323 }, { - "loss": 2.1814, - "grad_norm": 1.6129508018493652, - "learning_rate": 5e-05, - "epoch": 0.7488634180700519, - "step": 11695 + "loss": 375.1089, + "grad_norm": 51.362571716308594, + "learning_rate": 0.00040270257114159583, + "epoch": 0.2906015760138382, + "step": 324 }, { - "loss": 2.1738, - "grad_norm": 1.5563007593154907, - "learning_rate": 5e-05, - "epoch": 0.7491835819939809, - "step": 11700 + "loss": 370.7276, + "grad_norm": 54.43815994262695, + "learning_rate": 0.00040214374186979074, + "epoch": 0.2914984944583253, + "step": 325 }, { - "loss": 2.1543, - "grad_norm": 1.5803391933441162, - "learning_rate": 5e-05, - "epoch": 0.74950374591791, - "step": 11705 + "loss": 375.119, + "grad_norm": 51.00381851196289, + "learning_rate": 0.0004015837026041186, + "epoch": 0.29239541290281246, + "step": 326 }, { - "loss": 2.19, - "grad_norm": 1.61357581615448, - "learning_rate": 5e-05, - "epoch": 0.749823909841839, - "step": 11710 + "loss": 371.2367, + "grad_norm": 57.776222229003906, + "learning_rate": 0.000401022457798552, + "epoch": 0.29329233134729965, + "step": 327 }, { - "loss": 2.1295, - "grad_norm": 1.5979573726654053, - "learning_rate": 5e-05, - "epoch": 0.7501440737657681, - "step": 11715 + "loss": 380.1667, + "grad_norm": 53.284149169921875, + "learning_rate": 0.0004004600119166513, + "epoch": 0.2941892497917868, + "step": 328 }, { - "loss": 2.1445, - "grad_norm": 1.6031203269958496, - "learning_rate": 5e-05, - "epoch": 0.7504642376896972, - "step": 11720 + "loss": 369.6853, + "grad_norm": 56.30731964111328, + "learning_rate": 0.000399896369431529, + "epoch": 0.2950861682362739, + "step": 329 }, { - "loss": 2.1808, - "grad_norm": 1.645012378692627, - "learning_rate": 5e-05, - "epoch": 0.7507844016136261, - "step": 11725 + "loss": 374.0436, + "grad_norm": 54.28211975097656, + "learning_rate": 0.00039933153482581406, + "epoch": 0.2959830866807611, + "step": 330 }, { - "loss": 2.161, - "grad_norm": 1.629136562347412, - "learning_rate": 5e-05, - "epoch": 0.7511045655375552, - "step": 11730 + "loss": 372.2117, + "grad_norm": 50.88725280761719, + "learning_rate": 0.00039876551259161643, + "epoch": 0.29688000512524826, + "step": 331 }, { - "loss": 2.1554, - "grad_norm": 1.5675849914550781, - "learning_rate": 5e-05, - "epoch": 0.7514247294614843, - "step": 11735 + "loss": 374.7655, + "grad_norm": 54.17941665649414, + "learning_rate": 0.00039819830723049105, + "epoch": 0.2977769235697354, + "step": 332 }, { - "loss": 2.1769, - "grad_norm": 1.6842530965805054, - "learning_rate": 5e-05, - "epoch": 0.7517448933854133, - "step": 11740 + "loss": 376.0198, + "grad_norm": 52.40755081176758, + "learning_rate": 0.0003976299232534024, + "epoch": 0.2986738420142226, + "step": 333 }, { - "loss": 2.1571, - "grad_norm": 1.645548939704895, - "learning_rate": 5e-05, - "epoch": 0.7520650573093424, - "step": 11745 + "loss": 371.5096, + "grad_norm": 50.74897384643555, + "learning_rate": 0.0003970603651806886, + "epoch": 0.29957076045870973, + "step": 334 }, { - "loss": 2.1565, - "grad_norm": 1.6353763341903687, - "learning_rate": 5e-05, - "epoch": 0.7523852212332715, - "step": 11750 + "loss": 375.5447, + "grad_norm": 47.52690124511719, + "learning_rate": 0.00039648963754202496, + "epoch": 0.30046767890319687, + "step": 335 }, { - "loss": 2.1574, - "grad_norm": 1.6045830249786377, - "learning_rate": 5e-05, - "epoch": 0.7527053851572005, - "step": 11755 + "loss": 376.1951, + "grad_norm": 52.93135070800781, + "learning_rate": 0.0003959177448763883, + "epoch": 0.301364597347684, + "step": 336 }, { - "loss": 2.1557, - "grad_norm": 1.6563011407852173, - "learning_rate": 5e-05, - "epoch": 0.7530255490811295, - "step": 11760 + "loss": 371.1348, + "grad_norm": 50.335418701171875, + "learning_rate": 0.0003953446917320214, + "epoch": 0.3022615157921712, + "step": 337 }, { - "loss": 2.1776, - "grad_norm": 1.662995457649231, - "learning_rate": 5e-05, - "epoch": 0.7533457130050586, - "step": 11765 + "loss": 375.4595, + "grad_norm": 51.26169204711914, + "learning_rate": 0.0003947704826663955, + "epoch": 0.30315843423665834, + "step": 338 }, { - "loss": 2.1611, - "grad_norm": 1.6755454540252686, - "learning_rate": 5e-05, - "epoch": 0.7536658769289877, - "step": 11770 + "loss": 372.898, + "grad_norm": 54.89933776855469, + "learning_rate": 0.0003941951222461756, + "epoch": 0.3040553526811455, + "step": 339 }, { - "loss": 2.164, - "grad_norm": 1.6755766868591309, - "learning_rate": 5e-05, - "epoch": 0.7539860408529167, - "step": 11775 + "loss": 370.8462, + "grad_norm": 54.09654235839844, + "learning_rate": 0.00039361861504718276, + "epoch": 0.30495227112563267, + "step": 340 }, { - "loss": 2.1476, - "grad_norm": 1.637885570526123, - "learning_rate": 5e-05, - "epoch": 0.7543062047768457, - "step": 11780 + "loss": 373.6092, + "grad_norm": 52.41168975830078, + "learning_rate": 0.0003930409656543588, + "epoch": 0.3058491895701198, + "step": 341 }, { - "loss": 2.1591, - "grad_norm": 1.5392520427703857, - "learning_rate": 5e-05, - "epoch": 0.7546263687007748, - "step": 11785 + "loss": 374.9025, + "grad_norm": 45.53563690185547, + "learning_rate": 0.00039246217866172907, + "epoch": 0.30674610801460694, + "step": 342 }, { - "loss": 2.1723, - "grad_norm": 1.702089548110962, - "learning_rate": 5e-05, - "epoch": 0.7549465326247039, - "step": 11790 + "loss": 376.0628, + "grad_norm": 51.11941146850586, + "learning_rate": 0.00039188225867236643, + "epoch": 0.30764302645909414, + "step": 343 }, { - "loss": 2.1572, - "grad_norm": 1.5574604272842407, - "learning_rate": 5e-05, - "epoch": 0.7552666965486329, - "step": 11795 + "loss": 374.4197, + "grad_norm": 50.10179901123047, + "learning_rate": 0.0003913012102983542, + "epoch": 0.3085399449035813, + "step": 344 }, { - "loss": 2.1696, - "grad_norm": 1.565976858139038, - "learning_rate": 5e-05, - "epoch": 0.755586860472562, - "step": 11800 + "loss": 370.0171, + "grad_norm": 50.524696350097656, + "learning_rate": 0.00039071903816074977, + "epoch": 0.3094368633480684, + "step": 345 }, { - "eval_loss": 2.033466339111328, - "eval_runtime": 9.2054, - "eval_samples_per_second": 222.478, - "eval_steps_per_second": 27.81, - "epoch": 0.755586860472562, - "step": 11800 + "loss": 371.2375, + "grad_norm": 51.18245315551758, + "learning_rate": 0.00039013574688954793, + "epoch": 0.31033378179255555, + "step": 346 }, { - "loss": 2.154, - "grad_norm": 1.5538219213485718, - "learning_rate": 5e-05, - "epoch": 0.755907024396491, - "step": 11805 + "loss": 374.7748, + "grad_norm": 64.64472198486328, + "learning_rate": 0.0003895513411236438, + "epoch": 0.31123070023704275, + "step": 347 }, { - "loss": 2.175, - "grad_norm": 1.6611093282699585, - "learning_rate": 5e-05, - "epoch": 0.75622718832042, - "step": 11810 + "loss": 377.3275, + "grad_norm": 56.01545715332031, + "learning_rate": 0.0003889658255107959, + "epoch": 0.3121276186815299, + "step": 348 }, { - "loss": 2.1702, - "grad_norm": 1.61954665184021, - "learning_rate": 5e-05, - "epoch": 0.7565473522443491, - "step": 11815 + "loss": 369.5843, + "grad_norm": 56.439754486083984, + "learning_rate": 0.0003883792047075896, + "epoch": 0.313024537126017, + "step": 349 }, { - "loss": 2.1554, - "grad_norm": 1.5703262090682983, - "learning_rate": 5e-05, - "epoch": 0.7568675161682782, - "step": 11820 + "loss": 368.456, + "grad_norm": 58.23375701904297, + "learning_rate": 0.0003877914833793996, + "epoch": 0.3139214555705042, + "step": 350 }, { - "loss": 2.1639, - "grad_norm": 1.5392646789550781, - "learning_rate": 5e-05, - "epoch": 0.7571876800922072, - "step": 11825 + "eval_loss": 1.661989450454712, + "eval_runtime": 36.2255, + "eval_samples_per_second": 56.535, + "eval_steps_per_second": 3.533, + "epoch": 0.3139214555705042, + "step": 350 }, { - "loss": 2.1582, - "grad_norm": 1.6835620403289795, - "learning_rate": 5e-05, - "epoch": 0.7575078440161362, - "step": 11830 + "loss": 374.9042, + "grad_norm": 52.63510513305664, + "learning_rate": 0.00038720266620035314, + "epoch": 0.31481837401499135, + "step": 351 }, { - "loss": 2.1526, - "grad_norm": 1.6214685440063477, - "learning_rate": 5e-05, - "epoch": 0.7578280079400653, - "step": 11835 + "loss": 367.9091, + "grad_norm": 55.49558639526367, + "learning_rate": 0.0003866127578532927, + "epoch": 0.3157152924594785, + "step": 352 }, { - "loss": 2.1829, - "grad_norm": 1.613537311553955, - "learning_rate": 5e-05, - "epoch": 0.7581481718639944, - "step": 11840 + "loss": 374.5601, + "grad_norm": 52.941497802734375, + "learning_rate": 0.0003860217630297387, + "epoch": 0.3166122109039657, + "step": 353 }, { - "loss": 2.1452, - "grad_norm": 1.5517228841781616, - "learning_rate": 5e-05, - "epoch": 0.7584683357879234, - "step": 11845 + "loss": 371.4058, + "grad_norm": 44.237648010253906, + "learning_rate": 0.0003854296864298523, + "epoch": 0.3175091293484528, + "step": 354 }, { - "loss": 2.1803, - "grad_norm": 1.6012214422225952, - "learning_rate": 5e-05, - "epoch": 0.7587884997118525, - "step": 11850 + "loss": 376.094, + "grad_norm": 52.86402893066406, + "learning_rate": 0.00038483653276239816, + "epoch": 0.31840604779293996, + "step": 355 }, { - "loss": 2.1673, - "grad_norm": 1.5846041440963745, - "learning_rate": 5e-05, - "epoch": 0.7591086636357816, - "step": 11855 + "loss": 374.3872, + "grad_norm": 49.61796569824219, + "learning_rate": 0.0003842423067447066, + "epoch": 0.3193029662374271, + "step": 356 }, { - "loss": 2.1348, - "grad_norm": 1.588087558746338, - "learning_rate": 5e-05, - "epoch": 0.7594288275597105, - "step": 11860 + "loss": 371.5387, + "grad_norm": 49.825504302978516, + "learning_rate": 0.0003836470131026365, + "epoch": 0.3201998846819143, + "step": 357 }, { - "loss": 2.1479, - "grad_norm": 1.585292100906372, - "learning_rate": 5e-05, - "epoch": 0.7597489914836396, - "step": 11865 + "loss": 371.4422, + "grad_norm": 53.598228454589844, + "learning_rate": 0.0003830506565705372, + "epoch": 0.32109680312640143, + "step": 358 }, { - "loss": 2.1432, - "grad_norm": 1.5393071174621582, - "learning_rate": 5e-05, - "epoch": 0.7600691554075687, - "step": 11870 + "loss": 371.03, + "grad_norm": 48.73537063598633, + "learning_rate": 0.00038245324189121153, + "epoch": 0.32199372157088857, + "step": 359 }, { - "loss": 2.1886, - "grad_norm": 1.5727068185806274, - "learning_rate": 5e-05, - "epoch": 0.7603893193314978, - "step": 11875 + "loss": 377.8967, + "grad_norm": 48.377281188964844, + "learning_rate": 0.00038185477381587763, + "epoch": 0.32289064001537576, + "step": 360 }, { - "loss": 2.177, - "grad_norm": 1.6157071590423584, - "learning_rate": 5e-05, - "epoch": 0.7607094832554268, - "step": 11880 + "loss": 374.9411, + "grad_norm": 53.932228088378906, + "learning_rate": 0.0003812552571041311, + "epoch": 0.3237875584598629, + "step": 361 }, { - "loss": 2.1629, - "grad_norm": 1.6175097227096558, - "learning_rate": 5e-05, - "epoch": 0.7610296471793558, - "step": 11885 + "loss": 374.6432, + "grad_norm": 52.54889678955078, + "learning_rate": 0.00038065469652390736, + "epoch": 0.32468447690435004, + "step": 362 }, { - "loss": 2.1848, - "grad_norm": 1.7654550075531006, - "learning_rate": 5e-05, - "epoch": 0.7613498111032849, - "step": 11890 + "loss": 371.9634, + "grad_norm": 53.84141159057617, + "learning_rate": 0.000380053096851444, + "epoch": 0.32558139534883723, + "step": 363 }, { - "loss": 2.1423, - "grad_norm": 1.6491674184799194, - "learning_rate": 5e-05, - "epoch": 0.7616699750272139, - "step": 11895 + "loss": 371.487, + "grad_norm": 49.041019439697266, + "learning_rate": 0.00037945046287124197, + "epoch": 0.32647831379332437, + "step": 364 }, { - "loss": 2.1446, - "grad_norm": 1.5910719633102417, - "learning_rate": 5e-05, - "epoch": 0.761990138951143, - "step": 11900 + "loss": 370.3628, + "grad_norm": 51.356388092041016, + "learning_rate": 0.00037884679937602827, + "epoch": 0.3273752322378115, + "step": 365 }, { - "loss": 2.1554, - "grad_norm": 1.5420751571655273, - "learning_rate": 5e-05, - "epoch": 0.7623103028750721, - "step": 11905 + "loss": 371.4878, + "grad_norm": 49.55571746826172, + "learning_rate": 0.0003782421111667178, + "epoch": 0.32827215068229865, + "step": 366 }, { - "loss": 2.1404, - "grad_norm": 1.6134288311004639, - "learning_rate": 5e-05, - "epoch": 0.762630466799001, - "step": 11910 + "loss": 373.209, + "grad_norm": 51.30101013183594, + "learning_rate": 0.00037763640305237456, + "epoch": 0.32916906912678584, + "step": 367 }, { - "loss": 2.1559, - "grad_norm": 1.4922846555709839, - "learning_rate": 5e-05, - "epoch": 0.7629506307229301, - "step": 11915 + "loss": 369.0127, + "grad_norm": 51.14597702026367, + "learning_rate": 0.000377029679850174, + "epoch": 0.330065987571273, + "step": 368 }, { - "loss": 2.152, - "grad_norm": 1.5739011764526367, - "learning_rate": 5e-05, - "epoch": 0.7632707946468592, - "step": 11920 + "loss": 374.4203, + "grad_norm": 51.925132751464844, + "learning_rate": 0.00037642194638536487, + "epoch": 0.3309629060157601, + "step": 369 }, { - "loss": 2.1655, - "grad_norm": 1.6074965000152588, - "learning_rate": 5e-05, - "epoch": 0.7635909585707883, - "step": 11925 + "loss": 370.4622, + "grad_norm": 53.620052337646484, + "learning_rate": 0.00037581320749123, + "epoch": 0.3318598244602473, + "step": 370 }, { - "loss": 2.1552, - "grad_norm": 1.5739712715148926, - "learning_rate": 5e-05, - "epoch": 0.7639111224947173, - "step": 11930 + "loss": 369.0265, + "grad_norm": 47.18992233276367, + "learning_rate": 0.0003752034680090485, + "epoch": 0.33275674290473445, + "step": 371 }, { - "loss": 2.1671, - "grad_norm": 1.595373272895813, - "learning_rate": 5e-05, - "epoch": 0.7642312864186463, - "step": 11935 + "loss": 372.8077, + "grad_norm": 56.7562141418457, + "learning_rate": 0.0003745927327880574, + "epoch": 0.3336536613492216, + "step": 372 }, { - "loss": 2.1588, - "grad_norm": 1.5630054473876953, - "learning_rate": 5e-05, - "epoch": 0.7645514503425754, - "step": 11940 + "loss": 368.2184, + "grad_norm": 56.05765914916992, + "learning_rate": 0.00037398100668541227, + "epoch": 0.3345505797937088, + "step": 373 }, { - "loss": 2.159, - "grad_norm": 1.6202868223190308, - "learning_rate": 5e-05, - "epoch": 0.7648716142665044, - "step": 11945 + "loss": 376.1522, + "grad_norm": 50.888771057128906, + "learning_rate": 0.00037336829456614975, + "epoch": 0.3354474982381959, + "step": 374 }, { - "loss": 2.2056, - "grad_norm": 1.671276330947876, - "learning_rate": 5e-05, - "epoch": 0.7651917781904335, - "step": 11950 + "loss": 371.1161, + "grad_norm": 49.758975982666016, + "learning_rate": 0.0003727546013031478, + "epoch": 0.33634441668268306, + "step": 375 }, { - "loss": 2.1376, - "grad_norm": 1.714341163635254, - "learning_rate": 5e-05, - "epoch": 0.7655119421143626, - "step": 11955 + "loss": 371.6988, + "grad_norm": 53.891990661621094, + "learning_rate": 0.00037213993177708746, + "epoch": 0.33724133512717025, + "step": 376 }, { - "loss": 2.165, - "grad_norm": 1.734993815422058, - "learning_rate": 5e-05, - "epoch": 0.7658321060382917, - "step": 11960 + "loss": 370.6019, + "grad_norm": 50.557762145996094, + "learning_rate": 0.000371524290876414, + "epoch": 0.3381382535716574, + "step": 377 }, { - "loss": 2.1515, - "grad_norm": 1.7148008346557617, - "learning_rate": 5e-05, - "epoch": 0.7661522699622206, - "step": 11965 + "loss": 373.2912, + "grad_norm": 51.6466064453125, + "learning_rate": 0.00037090768349729833, + "epoch": 0.3390351720161445, + "step": 378 }, { - "loss": 2.1716, - "grad_norm": 1.6618858575820923, - "learning_rate": 5e-05, - "epoch": 0.7664724338861497, - "step": 11970 + "loss": 372.9784, + "grad_norm": 48.213077545166016, + "learning_rate": 0.00037029011454359695, + "epoch": 0.33993209046063166, + "step": 379 }, { - "loss": 2.1366, - "grad_norm": 1.603656530380249, - "learning_rate": 5e-05, - "epoch": 0.7667925978100788, - "step": 11975 + "loss": 368.0577, + "grad_norm": 49.39459991455078, + "learning_rate": 0.0003696715889268145, + "epoch": 0.34082900890511886, + "step": 380 }, { - "loss": 2.1583, - "grad_norm": 1.618535041809082, - "learning_rate": 5e-05, - "epoch": 0.7671127617340078, - "step": 11980 + "loss": 371.9662, + "grad_norm": 49.54859924316406, + "learning_rate": 0.00036905211156606344, + "epoch": 0.341725927349606, + "step": 381 }, { - "loss": 2.14, - "grad_norm": 1.553106665611267, - "learning_rate": 5e-05, - "epoch": 0.7674329256579369, - "step": 11985 + "loss": 376.1466, + "grad_norm": 54.29618835449219, + "learning_rate": 0.00036843168738802574, + "epoch": 0.34262284579409313, + "step": 382 }, { - "loss": 2.1562, - "grad_norm": 1.5241880416870117, - "learning_rate": 5e-05, - "epoch": 0.7677530895818659, - "step": 11990 + "loss": 372.8206, + "grad_norm": 47.55562210083008, + "learning_rate": 0.00036781032132691304, + "epoch": 0.3435197642385803, + "step": 383 }, { - "loss": 2.1738, - "grad_norm": 1.5153862237930298, - "learning_rate": 5e-05, - "epoch": 0.7680732535057949, - "step": 11995 + "loss": 370.9735, + "grad_norm": 49.289615631103516, + "learning_rate": 0.00036718801832442814, + "epoch": 0.34441668268306747, + "step": 384 }, { - "loss": 2.1916, - "grad_norm": 1.577313780784607, - "learning_rate": 5e-05, - "epoch": 0.768393417429724, - "step": 12000 + "loss": 370.5686, + "grad_norm": 50.339176177978516, + "learning_rate": 0.000366564783329725, + "epoch": 0.3453136011275546, + "step": 385 }, { - "eval_loss": 2.0328667163848877, - "eval_runtime": 12.6548, - "eval_samples_per_second": 161.836, - "eval_steps_per_second": 20.23, - "epoch": 0.768393417429724, - "step": 12000 + "loss": 371.3257, + "grad_norm": 49.51339340209961, + "learning_rate": 0.00036594062129936974, + "epoch": 0.3462105195720418, + "step": 386 }, { - "loss": 2.1293, - "grad_norm": 1.6270264387130737, - "learning_rate": 5e-05, - "epoch": 0.7687135813536531, - "step": 12005 + "loss": 366.3475, + "grad_norm": 48.21767044067383, + "learning_rate": 0.0003653155371973012, + "epoch": 0.34710743801652894, + "step": 387 }, { - "loss": 2.1822, - "grad_norm": 1.679459571838379, - "learning_rate": 5e-05, - "epoch": 0.7690337452775822, - "step": 12010 + "loss": 369.8744, + "grad_norm": 52.45291519165039, + "learning_rate": 0.0003646895359947915, + "epoch": 0.3480043564610161, + "step": 388 }, { - "loss": 2.1355, - "grad_norm": 1.6691720485687256, - "learning_rate": 5e-05, - "epoch": 0.7693539092015111, - "step": 12015 + "loss": 372.5318, + "grad_norm": 49.45993423461914, + "learning_rate": 0.00036406262267040624, + "epoch": 0.3489012749055032, + "step": 389 }, { - "loss": 2.1708, - "grad_norm": 1.6165096759796143, - "learning_rate": 5e-05, - "epoch": 0.7696740731254402, - "step": 12020 + "loss": 369.184, + "grad_norm": 48.8317756652832, + "learning_rate": 0.0003634348022099652, + "epoch": 0.3497981933499904, + "step": 390 }, { - "loss": 2.1629, - "grad_norm": 1.6087703704833984, - "learning_rate": 5e-05, - "epoch": 0.7699942370493693, - "step": 12025 + "loss": 373.9739, + "grad_norm": 50.6275634765625, + "learning_rate": 0.0003628060796065027, + "epoch": 0.35069511179447754, + "step": 391 }, { - "loss": 2.169, - "grad_norm": 1.5682085752487183, - "learning_rate": 5e-05, - "epoch": 0.7703144009732983, - "step": 12030 + "loss": 372.0473, + "grad_norm": 48.547447204589844, + "learning_rate": 0.00036217645986022756, + "epoch": 0.3515920302389647, + "step": 392 }, { - "loss": 2.1344, - "grad_norm": 1.5505659580230713, - "learning_rate": 5e-05, - "epoch": 0.7706345648972274, - "step": 12035 + "loss": 364.9705, + "grad_norm": 48.18462371826172, + "learning_rate": 0.0003615459479784837, + "epoch": 0.3524889486834519, + "step": 393 }, { - "loss": 2.1576, - "grad_norm": 1.5263081789016724, - "learning_rate": 5e-05, - "epoch": 0.7709547288211565, - "step": 12040 + "loss": 369.6471, + "grad_norm": 46.10414123535156, + "learning_rate": 0.0003609145489757101, + "epoch": 0.353385867127939, + "step": 394 }, { - "loss": 2.1756, - "grad_norm": 1.700647234916687, - "learning_rate": 5e-05, - "epoch": 0.7712748927450855, - "step": 12045 + "loss": 371.7173, + "grad_norm": 46.38992691040039, + "learning_rate": 0.0003602822678734008, + "epoch": 0.35428278557242615, + "step": 395 }, { - "loss": 2.1783, - "grad_norm": 1.6649285554885864, - "learning_rate": 5e-05, - "epoch": 0.7715950566690145, - "step": 12050 + "loss": 367.3975, + "grad_norm": 45.87107467651367, + "learning_rate": 0.00035964910970006557, + "epoch": 0.35517970401691334, + "step": 396 }, { - "loss": 2.1548, - "grad_norm": 1.6890583038330078, - "learning_rate": 5e-05, - "epoch": 0.7719152205929436, - "step": 12055 + "loss": 371.2871, + "grad_norm": 46.54446029663086, + "learning_rate": 0.00035901507949118915, + "epoch": 0.3560766224614005, + "step": 397 }, { - "loss": 2.1775, - "grad_norm": 1.7615318298339844, - "learning_rate": 5e-05, - "epoch": 0.7722353845168727, - "step": 12060 + "loss": 368.7915, + "grad_norm": 45.7996826171875, + "learning_rate": 0.0003583801822891917, + "epoch": 0.3569735409058876, + "step": 398 }, { - "loss": 2.1643, - "grad_norm": 1.7467012405395508, - "learning_rate": 5e-05, - "epoch": 0.7725555484408017, - "step": 12065 + "loss": 371.0395, + "grad_norm": 48.34632873535156, + "learning_rate": 0.0003577444231433885, + "epoch": 0.35787045935037476, + "step": 399 }, { - "loss": 2.158, - "grad_norm": 1.6698206663131714, - "learning_rate": 5e-05, - "epoch": 0.7728757123647307, - "step": 12070 + "loss": 374.4672, + "grad_norm": 48.63014221191406, + "learning_rate": 0.00035710780710994985, + "epoch": 0.35876737779486195, + "step": 400 }, { - "loss": 2.1472, - "grad_norm": 1.592130422592163, - "learning_rate": 5e-05, - "epoch": 0.7731958762886598, - "step": 12075 + "eval_loss": 1.6527702808380127, + "eval_runtime": 51.2432, + "eval_samples_per_second": 39.966, + "eval_steps_per_second": 2.498, + "epoch": 0.35876737779486195, + "step": 400 }, { - "loss": 2.1408, - "grad_norm": 1.6456536054611206, - "learning_rate": 5e-05, - "epoch": 0.7735160402125888, - "step": 12080 + "loss": 369.2286, + "grad_norm": 50.575950622558594, + "learning_rate": 0.00035647033925186066, + "epoch": 0.3596642962393491, + "step": 401 }, { - "loss": 2.1562, - "grad_norm": 1.637117862701416, - "learning_rate": 5e-05, - "epoch": 0.7738362041365179, - "step": 12085 + "loss": 366.6179, + "grad_norm": 50.074954986572266, + "learning_rate": 0.0003558320246388808, + "epoch": 0.36056121468383623, + "step": 402 }, { - "loss": 2.1518, - "grad_norm": 1.6462478637695312, - "learning_rate": 5e-05, - "epoch": 0.774156368060447, - "step": 12090 + "loss": 370.1017, + "grad_norm": 51.92937088012695, + "learning_rate": 0.00035519286834750403, + "epoch": 0.3614581331283234, + "step": 403 }, { - "loss": 2.1468, - "grad_norm": 1.6634985208511353, - "learning_rate": 5e-05, - "epoch": 0.774476531984376, - "step": 12095 + "loss": 366.74, + "grad_norm": 52.75185775756836, + "learning_rate": 0.00035455287546091785, + "epoch": 0.36235505157281056, + "step": 404 }, { - "loss": 2.1616, - "grad_norm": 1.602612853050232, - "learning_rate": 5e-05, - "epoch": 0.774796695908305, - "step": 12100 + "loss": 369.307, + "grad_norm": 50.451271057128906, + "learning_rate": 0.0003539120510689636, + "epoch": 0.3632519700172977, + "step": 405 }, { - "loss": 2.141, - "grad_norm": 1.580141305923462, - "learning_rate": 5e-05, - "epoch": 0.7751168598322341, - "step": 12105 + "loss": 374.2456, + "grad_norm": 56.06875228881836, + "learning_rate": 0.0003532704002680951, + "epoch": 0.3641488884617849, + "step": 406 }, { - "loss": 2.1513, - "grad_norm": 1.6583237648010254, - "learning_rate": 5e-05, - "epoch": 0.7754370237561632, - "step": 12110 + "loss": 371.9364, + "grad_norm": 49.18859100341797, + "learning_rate": 0.0003526279281613388, + "epoch": 0.36504580690627203, + "step": 407 }, { - "loss": 2.1497, - "grad_norm": 1.6343486309051514, - "learning_rate": 5e-05, - "epoch": 0.7757571876800922, - "step": 12115 + "loss": 375.3452, + "grad_norm": 60.49544143676758, + "learning_rate": 0.00035198463985825303, + "epoch": 0.36594272535075917, + "step": 408 }, { - "loss": 2.1309, - "grad_norm": 1.5499143600463867, - "learning_rate": 5e-05, - "epoch": 0.7760773516040212, - "step": 12120 + "loss": 364.7332, + "grad_norm": 55.390960693359375, + "learning_rate": 0.0003513405404748872, + "epoch": 0.3668396437952463, + "step": 409 }, { - "loss": 2.1546, - "grad_norm": 1.6126841306686401, - "learning_rate": 5e-05, - "epoch": 0.7763975155279503, - "step": 12125 + "loss": 367.328, + "grad_norm": 45.79146194458008, + "learning_rate": 0.00035069563513374105, + "epoch": 0.3677365622397335, + "step": 410 }, { - "loss": 2.1419, - "grad_norm": 1.5695915222167969, - "learning_rate": 5e-05, - "epoch": 0.7767176794518794, - "step": 12130 + "loss": 372.7194, + "grad_norm": 50.601531982421875, + "learning_rate": 0.0003500499289637243, + "epoch": 0.36863348068422064, + "step": 411 }, { - "loss": 2.165, - "grad_norm": 1.7308309078216553, - "learning_rate": 5e-05, - "epoch": 0.7770378433758084, - "step": 12135 + "loss": 373.3177, + "grad_norm": 58.5416374206543, + "learning_rate": 0.0003494034271001158, + "epoch": 0.3695303991287078, + "step": 412 }, { - "loss": 2.1722, - "grad_norm": 1.6199977397918701, - "learning_rate": 5e-05, - "epoch": 0.7773580072997375, - "step": 12140 + "loss": 367.5529, + "grad_norm": 48.93236541748047, + "learning_rate": 0.00034875613468452203, + "epoch": 0.37042731757319497, + "step": 413 }, { - "loss": 2.1572, - "grad_norm": 1.6317228078842163, - "learning_rate": 5e-05, - "epoch": 0.7776781712236666, - "step": 12145 + "loss": 368.6186, + "grad_norm": 49.043251037597656, + "learning_rate": 0.00034810805686483713, + "epoch": 0.3713242360176821, + "step": 414 }, { - "loss": 2.163, - "grad_norm": 1.6066442728042603, - "learning_rate": 5e-05, - "epoch": 0.7779983351475955, - "step": 12150 + "loss": 363.3611, + "grad_norm": 48.577144622802734, + "learning_rate": 0.0003474591987952013, + "epoch": 0.37222115446216925, + "step": 415 }, { - "loss": 2.1893, - "grad_norm": 1.6254138946533203, - "learning_rate": 5e-05, - "epoch": 0.7783184990715246, - "step": 12155 + "loss": 368.0312, + "grad_norm": 48.73127746582031, + "learning_rate": 0.0003468095656359601, + "epoch": 0.37311807290665644, + "step": 416 }, { - "loss": 2.1504, - "grad_norm": 1.650006890296936, - "learning_rate": 5e-05, - "epoch": 0.7786386629954537, - "step": 12160 + "loss": 367.3114, + "grad_norm": 51.46812057495117, + "learning_rate": 0.0003461591625536234, + "epoch": 0.3740149913511436, + "step": 417 }, { - "loss": 2.1524, - "grad_norm": 1.605966567993164, - "learning_rate": 5e-05, - "epoch": 0.7789588269193827, - "step": 12165 + "loss": 375.6931, + "grad_norm": 49.236141204833984, + "learning_rate": 0.0003455079947208242, + "epoch": 0.3749119097956307, + "step": 418 }, { - "loss": 2.1521, - "grad_norm": 1.5684596300125122, - "learning_rate": 5e-05, - "epoch": 0.7792789908433118, - "step": 12170 + "loss": 365.6711, + "grad_norm": 48.81379318237305, + "learning_rate": 0.00034485606731627755, + "epoch": 0.37580882824011785, + "step": 419 }, { - "loss": 2.1581, - "grad_norm": 1.5677785873413086, - "learning_rate": 5e-05, - "epoch": 0.7795991547672408, - "step": 12175 + "loss": 364.9393, + "grad_norm": 51.185340881347656, + "learning_rate": 0.0003442033855247394, + "epoch": 0.37670574668460505, + "step": 420 }, { - "loss": 2.1275, - "grad_norm": 1.6513335704803467, - "learning_rate": 5e-05, - "epoch": 0.7799193186911699, - "step": 12180 + "loss": 369.8553, + "grad_norm": 53.58812713623047, + "learning_rate": 0.000343549954536965, + "epoch": 0.3776026651290922, + "step": 421 }, { - "loss": 2.1446, - "grad_norm": 1.650525450706482, - "learning_rate": 5e-05, - "epoch": 0.7802394826150989, - "step": 12185 + "loss": 372.3922, + "grad_norm": 51.472042083740234, + "learning_rate": 0.0003428957795496685, + "epoch": 0.3784995835735793, + "step": 422 }, { - "loss": 2.152, - "grad_norm": 1.576680302619934, - "learning_rate": 5e-05, - "epoch": 0.780559646539028, - "step": 12190 + "loss": 371.9807, + "grad_norm": 54.97187805175781, + "learning_rate": 0.0003422408657654805, + "epoch": 0.3793965020180665, + "step": 423 }, { - "loss": 2.1527, - "grad_norm": 1.634871244430542, - "learning_rate": 5e-05, - "epoch": 0.7808798104629571, - "step": 12195 + "loss": 370.048, + "grad_norm": 54.97746276855469, + "learning_rate": 0.0003415852183929077, + "epoch": 0.38029342046255366, + "step": 424 }, { - "loss": 2.1764, - "grad_norm": 1.6193811893463135, - "learning_rate": 5e-05, - "epoch": 0.781199974386886, - "step": 12200 + "loss": 370.0667, + "grad_norm": 46.41242980957031, + "learning_rate": 0.0003409288426462904, + "epoch": 0.3811903389070408, + "step": 425 }, { - "eval_loss": 2.030806303024292, - "eval_runtime": 10.0212, - "eval_samples_per_second": 204.366, - "eval_steps_per_second": 25.546, - "epoch": 0.781199974386886, - "step": 12200 + "loss": 366.4669, + "grad_norm": 51.722904205322266, + "learning_rate": 0.0003402717437457624, + "epoch": 0.382087257351528, + "step": 426 }, { - "loss": 2.1441, - "grad_norm": 1.6034654378890991, - "learning_rate": 5e-05, - "epoch": 0.7815201383108151, - "step": 12205 + "loss": 367.8651, + "grad_norm": 51.60542678833008, + "learning_rate": 0.00033961392691720803, + "epoch": 0.3829841757960151, + "step": 427 }, { - "loss": 2.1311, - "grad_norm": 1.5852737426757812, - "learning_rate": 5e-05, - "epoch": 0.7818403022347442, - "step": 12210 + "loss": 364.8575, + "grad_norm": 46.896331787109375, + "learning_rate": 0.0003389553973922217, + "epoch": 0.38388109424050226, + "step": 428 }, { - "loss": 2.1637, - "grad_norm": 1.6145589351654053, - "learning_rate": 5e-05, - "epoch": 0.7821604661586733, - "step": 12215 + "loss": 366.1106, + "grad_norm": 47.48381042480469, + "learning_rate": 0.00033829616040806566, + "epoch": 0.38477801268498946, + "step": 429 }, { - "loss": 2.1387, - "grad_norm": 1.6558140516281128, - "learning_rate": 5e-05, - "epoch": 0.7824806300826023, - "step": 12220 + "loss": 369.6983, + "grad_norm": 47.15787124633789, + "learning_rate": 0.0003376362212076287, + "epoch": 0.3856749311294766, + "step": 430 }, { - "loss": 2.1663, - "grad_norm": 1.5966055393218994, - "learning_rate": 5e-05, - "epoch": 0.7828007940065314, - "step": 12225 + "loss": 372.8012, + "grad_norm": 49.67255401611328, + "learning_rate": 0.0003369755850393841, + "epoch": 0.38657184957396373, + "step": 431 }, { - "loss": 2.1405, - "grad_norm": 1.6753573417663574, - "learning_rate": 5e-05, - "epoch": 0.7831209579304604, - "step": 12230 + "loss": 369.0824, + "grad_norm": 50.87350082397461, + "learning_rate": 0.0003363142571573484, + "epoch": 0.38746876801845087, + "step": 432 }, { - "loss": 2.1673, - "grad_norm": 1.6698678731918335, - "learning_rate": 5e-05, - "epoch": 0.7834411218543894, - "step": 12235 + "loss": 368.5385, + "grad_norm": 52.32754135131836, + "learning_rate": 0.0003356522428210391, + "epoch": 0.38836568646293806, + "step": 433 }, { - "loss": 2.1487, - "grad_norm": 1.6776996850967407, - "learning_rate": 5e-05, - "epoch": 0.7837612857783185, - "step": 12240 + "loss": 370.1974, + "grad_norm": 46.638084411621094, + "learning_rate": 0.0003349895472954331, + "epoch": 0.3892626049074252, + "step": 434 }, { - "loss": 2.1743, - "grad_norm": 1.6791410446166992, - "learning_rate": 5e-05, - "epoch": 0.7840814497022476, - "step": 12245 + "loss": 367.2549, + "grad_norm": 51.39384460449219, + "learning_rate": 0.00033432617585092467, + "epoch": 0.39015952335191234, + "step": 435 }, { - "loss": 2.2024, - "grad_norm": 1.6838544607162476, - "learning_rate": 5e-05, - "epoch": 0.7844016136261766, - "step": 12250 + "loss": 368.2899, + "grad_norm": 49.1676139831543, + "learning_rate": 0.00033366213376328396, + "epoch": 0.39105644179639953, + "step": 436 }, { - "loss": 2.1468, - "grad_norm": 1.572723627090454, - "learning_rate": 5e-05, - "epoch": 0.7847217775501056, - "step": 12255 + "loss": 372.2977, + "grad_norm": 51.6141242980957, + "learning_rate": 0.0003329974263136144, + "epoch": 0.3919533602408867, + "step": 437 }, { - "loss": 2.1581, - "grad_norm": 1.5989056825637817, - "learning_rate": 5e-05, - "epoch": 0.7850419414740347, - "step": 12260 + "loss": 368.3735, + "grad_norm": 49.94230270385742, + "learning_rate": 0.0003323320587883111, + "epoch": 0.3928502786853738, + "step": 438 }, { - "loss": 2.1342, - "grad_norm": 1.5742263793945312, - "learning_rate": 5e-05, - "epoch": 0.7853621053979638, - "step": 12265 + "loss": 370.6481, + "grad_norm": 49.947837829589844, + "learning_rate": 0.0003316660364790188, + "epoch": 0.393747197129861, + "step": 439 }, { - "loss": 2.1455, - "grad_norm": 1.6334121227264404, - "learning_rate": 5e-05, - "epoch": 0.7856822693218928, - "step": 12270 + "loss": 369.6432, + "grad_norm": 48.53517532348633, + "learning_rate": 0.0003309993646825896, + "epoch": 0.39464411557434814, + "step": 440 }, { - "loss": 2.1602, - "grad_norm": 1.6674188375473022, - "learning_rate": 5e-05, - "epoch": 0.7860024332458219, - "step": 12275 + "loss": 366.7539, + "grad_norm": 50.93443298339844, + "learning_rate": 0.00033033204870104116, + "epoch": 0.3955410340188353, + "step": 441 }, { - "loss": 2.1542, - "grad_norm": 1.6382502317428589, - "learning_rate": 5e-05, - "epoch": 0.786322597169751, - "step": 12280 + "loss": 367.3075, + "grad_norm": 49.63651657104492, + "learning_rate": 0.000329664093841514, + "epoch": 0.3964379524633224, + "step": 442 }, { - "loss": 2.1901, - "grad_norm": 1.642731785774231, - "learning_rate": 5e-05, - "epoch": 0.7866427610936799, - "step": 12285 + "loss": 369.597, + "grad_norm": 48.85470962524414, + "learning_rate": 0.00032899550541623, + "epoch": 0.3973348709078096, + "step": 443 }, { - "loss": 2.1802, - "grad_norm": 1.6009544134140015, - "learning_rate": 5e-05, - "epoch": 0.786962925017609, - "step": 12290 + "loss": 366.1455, + "grad_norm": 49.675559997558594, + "learning_rate": 0.0003283262887424494, + "epoch": 0.39823178935229675, + "step": 444 }, { - "loss": 2.1362, - "grad_norm": 1.54159414768219, - "learning_rate": 5e-05, - "epoch": 0.7872830889415381, - "step": 12295 + "loss": 362.2254, + "grad_norm": 48.583370208740234, + "learning_rate": 0.0003276564491424292, + "epoch": 0.3991287077967839, + "step": 445 }, { - "loss": 2.1792, - "grad_norm": 1.5700483322143555, - "learning_rate": 5e-05, - "epoch": 0.7876032528654672, - "step": 12300 + "loss": 372.5689, + "grad_norm": 50.507293701171875, + "learning_rate": 0.0003269859919433802, + "epoch": 0.4000256262412711, + "step": 446 }, { - "loss": 2.1434, - "grad_norm": 1.6987546682357788, - "learning_rate": 5e-05, - "epoch": 0.7879234167893961, - "step": 12305 + "loss": 366.7801, + "grad_norm": 50.75261688232422, + "learning_rate": 0.0003263149224774251, + "epoch": 0.4009225446857582, + "step": 447 }, { - "loss": 2.1475, - "grad_norm": 1.595435380935669, - "learning_rate": 5e-05, - "epoch": 0.7882435807133252, - "step": 12310 + "loss": 369.5224, + "grad_norm": 49.42384719848633, + "learning_rate": 0.00032564324608155604, + "epoch": 0.40181946313024536, + "step": 448 }, { - "loss": 2.1381, - "grad_norm": 1.5674047470092773, - "learning_rate": 5e-05, - "epoch": 0.7885637446372543, - "step": 12315 + "loss": 369.6519, + "grad_norm": 49.12044143676758, + "learning_rate": 0.00032497096809759184, + "epoch": 0.40271638157473255, + "step": 449 }, { - "loss": 2.186, - "grad_norm": 1.5283838510513306, - "learning_rate": 5e-05, - "epoch": 0.7888839085611833, - "step": 12320 + "loss": 370.9763, + "grad_norm": 53.04697036743164, + "learning_rate": 0.0003242980938721359, + "epoch": 0.4036133000192197, + "step": 450 }, { - "loss": 2.1748, - "grad_norm": 1.6678552627563477, - "learning_rate": 5e-05, - "epoch": 0.7892040724851124, - "step": 12325 + "eval_loss": 1.6399173736572266, + "eval_runtime": 36.1587, + "eval_samples_per_second": 56.639, + "eval_steps_per_second": 3.54, + "epoch": 0.4036133000192197, + "step": 450 }, { - "loss": 2.1484, - "grad_norm": 1.5990204811096191, - "learning_rate": 5e-05, - "epoch": 0.7895242364090415, - "step": 12330 + "loss": 367.9265, + "grad_norm": 52.0450553894043, + "learning_rate": 0.00032362462875653355, + "epoch": 0.4045102184637068, + "step": 451 }, { - "loss": 2.1156, - "grad_norm": 1.5516057014465332, - "learning_rate": 5e-05, - "epoch": 0.7898444003329704, - "step": 12335 + "loss": 372.4974, + "grad_norm": 48.33359146118164, + "learning_rate": 0.0003229505781068291, + "epoch": 0.40540713690819397, + "step": 452 }, { - "loss": 2.1274, - "grad_norm": 1.4775924682617188, - "learning_rate": 5e-05, - "epoch": 0.7901645642568995, - "step": 12340 + "loss": 366.6081, + "grad_norm": 49.462974548339844, + "learning_rate": 0.00032227594728372397, + "epoch": 0.40630405535268116, + "step": 453 }, { - "loss": 2.1675, - "grad_norm": 1.6825110912322998, - "learning_rate": 5e-05, - "epoch": 0.7904847281808286, - "step": 12345 + "loss": 366.3152, + "grad_norm": 48.31398391723633, + "learning_rate": 0.0003216007416525335, + "epoch": 0.4072009737971683, + "step": 454 }, { - "loss": 2.144, - "grad_norm": 1.600658655166626, - "learning_rate": 5e-05, - "epoch": 0.7908048921047577, - "step": 12350 + "loss": 369.983, + "grad_norm": 47.523338317871094, + "learning_rate": 0.0003209249665831445, + "epoch": 0.40809789224165544, + "step": 455 }, { - "loss": 2.1516, - "grad_norm": 1.5016052722930908, - "learning_rate": 5e-05, - "epoch": 0.7911250560286867, - "step": 12355 + "loss": 366.8036, + "grad_norm": 45.295806884765625, + "learning_rate": 0.00032024862744997265, + "epoch": 0.40899481068614263, + "step": 456 }, { - "loss": 2.1524, - "grad_norm": 1.5548697710037231, - "learning_rate": 5e-05, - "epoch": 0.7914452199526157, - "step": 12360 + "loss": 366.4848, + "grad_norm": 49.89873504638672, + "learning_rate": 0.0003195717296319193, + "epoch": 0.40989172913062977, + "step": 457 }, { - "loss": 2.1653, - "grad_norm": 1.5483704805374146, - "learning_rate": 5e-05, - "epoch": 0.7917653838765448, - "step": 12365 + "loss": 365.4414, + "grad_norm": 46.948055267333984, + "learning_rate": 0.00031889427851232915, + "epoch": 0.4107886475751169, + "step": 458 }, { - "loss": 2.1624, - "grad_norm": 1.5267717838287354, - "learning_rate": 5e-05, - "epoch": 0.7920855478004738, - "step": 12370 + "loss": 369.7285, + "grad_norm": 48.40359115600586, + "learning_rate": 0.0003182162794789474, + "epoch": 0.4116855660196041, + "step": 459 }, { - "loss": 2.1488, - "grad_norm": 1.738539457321167, - "learning_rate": 5e-05, - "epoch": 0.7924057117244029, - "step": 12375 + "loss": 370.345, + "grad_norm": 48.55045700073242, + "learning_rate": 0.0003175377379238767, + "epoch": 0.41258248446409124, + "step": 460 }, { - "loss": 2.1473, - "grad_norm": 1.6237080097198486, - "learning_rate": 5e-05, - "epoch": 0.792725875648332, - "step": 12380 + "loss": 366.95, + "grad_norm": 47.37104415893555, + "learning_rate": 0.0003168586592435341, + "epoch": 0.4134794029085784, + "step": 461 }, { - "loss": 2.166, - "grad_norm": 1.6005204916000366, - "learning_rate": 5e-05, - "epoch": 0.793046039572261, - "step": 12385 + "loss": 370.2368, + "grad_norm": 51.285888671875, + "learning_rate": 0.00031617904883860903, + "epoch": 0.4143763213530655, + "step": 462 }, { - "loss": 2.1519, - "grad_norm": 1.5341179370880127, - "learning_rate": 5e-05, - "epoch": 0.79336620349619, - "step": 12390 + "loss": 365.4067, + "grad_norm": 50.595340728759766, + "learning_rate": 0.000315498912114019, + "epoch": 0.4152732397975527, + "step": 463 }, { - "loss": 2.1747, - "grad_norm": 1.566355586051941, - "learning_rate": 5e-05, - "epoch": 0.7936863674201191, - "step": 12395 + "loss": 366.4186, + "grad_norm": 45.943519592285156, + "learning_rate": 0.0003148182544788678, + "epoch": 0.41617015824203984, + "step": 464 }, { - "loss": 2.1942, - "grad_norm": 1.706238865852356, - "learning_rate": 5e-05, - "epoch": 0.7940065313440482, - "step": 12400 + "loss": 362.8856, + "grad_norm": 52.45280075073242, + "learning_rate": 0.0003141370813464018, + "epoch": 0.417067076686527, + "step": 465 }, { - "eval_loss": 2.0258755683898926, - "eval_runtime": 11.8297, - "eval_samples_per_second": 173.124, - "eval_steps_per_second": 21.64, - "epoch": 0.7940065313440482, - "step": 12400 + "loss": 366.827, + "grad_norm": 47.95954132080078, + "learning_rate": 0.0003134553981339672, + "epoch": 0.4179639951310142, + "step": 466 }, { - "loss": 2.1421, - "grad_norm": 1.6088368892669678, - "learning_rate": 5e-05, - "epoch": 0.7943266952679772, - "step": 12405 + "loss": 370.8824, + "grad_norm": 51.57919692993164, + "learning_rate": 0.00031277321026296657, + "epoch": 0.4188609135755013, + "step": 467 }, { - "loss": 2.1716, - "grad_norm": 1.6068791151046753, - "learning_rate": 5e-05, - "epoch": 0.7946468591919063, - "step": 12410 + "loss": 368.826, + "grad_norm": 51.78611755371094, + "learning_rate": 0.0003120905231588164, + "epoch": 0.41975783201998845, + "step": 468 }, { - "loss": 2.1201, - "grad_norm": 1.5572909116744995, - "learning_rate": 5e-05, - "epoch": 0.7949670231158353, - "step": 12415 + "loss": 369.1159, + "grad_norm": 46.962074279785156, + "learning_rate": 0.0003114073422509034, + "epoch": 0.42065475046447565, + "step": 469 }, { - "loss": 2.1499, - "grad_norm": 1.5897774696350098, - "learning_rate": 5e-05, - "epoch": 0.7952871870397643, - "step": 12420 + "loss": 361.8488, + "grad_norm": 46.85802459716797, + "learning_rate": 0.0003107236729725414, + "epoch": 0.4215516689089628, + "step": 470 }, { - "loss": 2.1673, - "grad_norm": 1.5571075677871704, - "learning_rate": 5e-05, - "epoch": 0.7956073509636934, - "step": 12425 + "loss": 367.4666, + "grad_norm": 54.017906188964844, + "learning_rate": 0.0003100395207609284, + "epoch": 0.4224485873534499, + "step": 471 }, { - "loss": 2.1511, - "grad_norm": 1.6140809059143066, - "learning_rate": 5e-05, - "epoch": 0.7959275148876225, - "step": 12430 + "loss": 366.9775, + "grad_norm": 53.34091567993164, + "learning_rate": 0.000309354891057103, + "epoch": 0.42334550579793706, + "step": 472 }, { - "loss": 2.1885, - "grad_norm": 1.743863821029663, - "learning_rate": 5e-05, - "epoch": 0.7962476788115516, - "step": 12435 + "loss": 366.0834, + "grad_norm": 47.76055908203125, + "learning_rate": 0.00030866978930590126, + "epoch": 0.42424242424242425, + "step": 473 }, { - "loss": 2.1563, - "grad_norm": 1.5255663394927979, - "learning_rate": 5e-05, - "epoch": 0.7965678427354805, - "step": 12440 + "loss": 368.5773, + "grad_norm": 49.945613861083984, + "learning_rate": 0.00030798422095591364, + "epoch": 0.4251393426869114, + "step": 474 }, { - "loss": 2.1199, - "grad_norm": 1.540199875831604, - "learning_rate": 5e-05, - "epoch": 0.7968880066594096, - "step": 12445 + "loss": 363.8445, + "grad_norm": 48.995609283447266, + "learning_rate": 0.00030729819145944114, + "epoch": 0.42603626113139853, + "step": 475 }, { - "loss": 2.1226, - "grad_norm": 1.5515542030334473, - "learning_rate": 5e-05, - "epoch": 0.7972081705833387, - "step": 12450 + "loss": 362.6448, + "grad_norm": 45.06385040283203, + "learning_rate": 0.00030661170627245256, + "epoch": 0.4269331795758857, + "step": 476 }, { - "loss": 2.1358, - "grad_norm": 1.60235595703125, - "learning_rate": 5e-05, - "epoch": 0.7975283345072677, - "step": 12455 + "loss": 364.0858, + "grad_norm": 49.73957061767578, + "learning_rate": 0.00030592477085454047, + "epoch": 0.42783009802037286, + "step": 477 }, { - "loss": 2.1982, - "grad_norm": 1.604455590248108, - "learning_rate": 5e-05, - "epoch": 0.7978484984311968, - "step": 12460 + "loss": 371.1085, + "grad_norm": 49.45321273803711, + "learning_rate": 0.00030523739066887836, + "epoch": 0.42872701646486, + "step": 478 }, { - "loss": 2.148, - "grad_norm": 1.5921660661697388, - "learning_rate": 5e-05, - "epoch": 0.7981686623551258, - "step": 12465 + "loss": 363.6934, + "grad_norm": 49.325355529785156, + "learning_rate": 0.00030454957118217674, + "epoch": 0.4296239349093472, + "step": 479 }, { - "loss": 2.1461, - "grad_norm": 1.583845615386963, - "learning_rate": 5e-05, - "epoch": 0.7984888262790549, - "step": 12470 + "loss": 368.4297, + "grad_norm": 47.509742736816406, + "learning_rate": 0.0003038613178646401, + "epoch": 0.43052085335383433, + "step": 480 }, { - "loss": 2.1552, - "grad_norm": 1.599627137184143, - "learning_rate": 5e-05, - "epoch": 0.7988089902029839, - "step": 12475 + "loss": 366.2455, + "grad_norm": 48.50214767456055, + "learning_rate": 0.000303172636189923, + "epoch": 0.43141777179832147, + "step": 481 }, { - "loss": 2.1764, - "grad_norm": 1.6735512018203735, - "learning_rate": 5e-05, - "epoch": 0.799129154126913, - "step": 12480 + "loss": 362.4247, + "grad_norm": 46.59059143066406, + "learning_rate": 0.00030248353163508674, + "epoch": 0.43231469024280866, + "step": 482 }, { - "loss": 2.1477, - "grad_norm": 1.571876049041748, - "learning_rate": 5e-05, - "epoch": 0.7994493180508421, - "step": 12485 + "loss": 368.7481, + "grad_norm": 47.74319839477539, + "learning_rate": 0.0003017940096805557, + "epoch": 0.4332116086872958, + "step": 483 }, { - "loss": 2.1418, - "grad_norm": 1.580018162727356, - "learning_rate": 5e-05, - "epoch": 0.799769481974771, - "step": 12490 + "loss": 365.7433, + "grad_norm": 53.59490203857422, + "learning_rate": 0.0003011040758100741, + "epoch": 0.43410852713178294, + "step": 484 }, { - "loss": 2.1611, - "grad_norm": 1.6157264709472656, - "learning_rate": 5e-05, - "epoch": 0.8000896458987001, - "step": 12495 + "loss": 366.9239, + "grad_norm": 49.87615966796875, + "learning_rate": 0.00030041373551066173, + "epoch": 0.4350054455762701, + "step": 485 }, { - "loss": 2.1363, - "grad_norm": 1.5641580820083618, - "learning_rate": 5e-05, - "epoch": 0.8004098098226292, - "step": 12500 + "loss": 360.9555, + "grad_norm": 44.795536041259766, + "learning_rate": 0.0002997229942725711, + "epoch": 0.43590236402075727, + "step": 486 }, { - "loss": 2.1749, - "grad_norm": 1.6098390817642212, - "learning_rate": 5e-05, - "epoch": 0.8007299737465582, - "step": 12505 + "loss": 370.6934, + "grad_norm": 56.454227447509766, + "learning_rate": 0.000299031857589243, + "epoch": 0.4367992824652444, + "step": 487 }, { - "loss": 2.1095, - "grad_norm": 1.5591392517089844, - "learning_rate": 5e-05, - "epoch": 0.8010501376704873, - "step": 12510 + "loss": 369.9133, + "grad_norm": 48.472312927246094, + "learning_rate": 0.00029834033095726335, + "epoch": 0.43769620090973155, + "step": 488 }, { - "loss": 2.1488, - "grad_norm": 1.5563864707946777, - "learning_rate": 5e-05, - "epoch": 0.8013703015944164, - "step": 12515 + "loss": 361.5723, + "grad_norm": 51.665260314941406, + "learning_rate": 0.00029764841987631933, + "epoch": 0.43859311935421874, + "step": 489 }, { - "loss": 2.1685, - "grad_norm": 1.564354419708252, - "learning_rate": 5e-05, - "epoch": 0.8016904655183454, - "step": 12520 + "loss": 366.223, + "grad_norm": 51.25084686279297, + "learning_rate": 0.0002969561298491557, + "epoch": 0.4394900377987059, + "step": 490 }, { - "loss": 2.1467, - "grad_norm": 1.567794680595398, - "learning_rate": 5e-05, - "epoch": 0.8020106294422744, - "step": 12525 + "loss": 367.7071, + "grad_norm": 50.52541732788086, + "learning_rate": 0.00029626346638153073, + "epoch": 0.440386956243193, + "step": 491 }, { - "loss": 2.1129, - "grad_norm": 1.5561848878860474, - "learning_rate": 5e-05, - "epoch": 0.8023307933662035, - "step": 12530 + "loss": 367.0807, + "grad_norm": 50.71653366088867, + "learning_rate": 0.0002955704349821729, + "epoch": 0.4412838746876802, + "step": 492 }, { - "loss": 2.1407, - "grad_norm": 1.5778709650039673, - "learning_rate": 5e-05, - "epoch": 0.8026509572901326, - "step": 12535 + "loss": 366.5776, + "grad_norm": 44.603485107421875, + "learning_rate": 0.0002948770411627367, + "epoch": 0.44218079313216735, + "step": 493 }, { - "loss": 2.1594, - "grad_norm": 1.5555260181427002, - "learning_rate": 5e-05, - "epoch": 0.8029711212140616, - "step": 12540 + "loss": 367.2019, + "grad_norm": 49.68048858642578, + "learning_rate": 0.0002941832904377589, + "epoch": 0.4430777115766545, + "step": 494 }, { - "loss": 2.1593, - "grad_norm": 1.5886701345443726, - "learning_rate": 5e-05, - "epoch": 0.8032912851379906, - "step": 12545 + "loss": 367.4325, + "grad_norm": 56.277896881103516, + "learning_rate": 0.000293489188324615, + "epoch": 0.4439746300211416, + "step": 495 }, { - "loss": 2.1298, - "grad_norm": 1.5618678331375122, - "learning_rate": 5e-05, - "epoch": 0.8036114490619197, - "step": 12550 + "loss": 369.3215, + "grad_norm": 46.4665412902832, + "learning_rate": 0.00029279474034347465, + "epoch": 0.4448715484656288, + "step": 496 }, { - "loss": 2.1498, - "grad_norm": 1.7027949094772339, - "learning_rate": 5e-05, - "epoch": 0.8039316129858488, - "step": 12555 + "loss": 368.6407, + "grad_norm": 51.84563446044922, + "learning_rate": 0.00029209995201725836, + "epoch": 0.44576846691011596, + "step": 497 }, { - "loss": 2.1509, - "grad_norm": 1.625081181526184, - "learning_rate": 5e-05, - "epoch": 0.8042517769097778, - "step": 12560 + "loss": 366.8856, + "grad_norm": 55.93694305419922, + "learning_rate": 0.0002914048288715937, + "epoch": 0.4466653853546031, + "step": 498 }, { - "loss": 2.1231, - "grad_norm": 1.631125569343567, - "learning_rate": 5e-05, - "epoch": 0.8045719408337069, - "step": 12565 + "loss": 367.8516, + "grad_norm": 50.97298812866211, + "learning_rate": 0.00029070937643477056, + "epoch": 0.4475623037990903, + "step": 499 }, { - "loss": 2.165, - "grad_norm": 1.5852395296096802, - "learning_rate": 5e-05, - "epoch": 0.804892104757636, - "step": 12570 + "loss": 364.7996, + "grad_norm": 53.179847717285156, + "learning_rate": 0.000290013600237698, + "epoch": 0.4484592222435774, + "step": 500 }, { - "loss": 2.1538, - "grad_norm": 1.7171579599380493, - "learning_rate": 5e-05, - "epoch": 0.8052122686815649, - "step": 12575 + "eval_loss": 1.6293703317642212, + "eval_runtime": 47.4683, + "eval_samples_per_second": 43.145, + "eval_steps_per_second": 2.697, + "epoch": 0.4484592222435774, + "step": 500 }, { - "loss": 2.1685, - "grad_norm": 1.660180687904358, - "learning_rate": 5e-05, - "epoch": 0.805532432605494, - "step": 12580 + "loss": 364.7999, + "grad_norm": 53.32307434082031, + "learning_rate": 0.00028931750581385975, + "epoch": 0.44935614068806456, + "step": 501 }, { - "loss": 2.1729, - "grad_norm": 1.589772343635559, - "learning_rate": 5e-05, - "epoch": 0.8058525965294231, - "step": 12585 + "loss": 368.2321, + "grad_norm": 48.1343994140625, + "learning_rate": 0.00028862109869927057, + "epoch": 0.45025305913255176, + "step": 502 }, { - "loss": 2.1709, - "grad_norm": 1.6301699876785278, - "learning_rate": 5e-05, - "epoch": 0.8061727604533521, - "step": 12590 + "loss": 363.4522, + "grad_norm": 48.97591781616211, + "learning_rate": 0.00028792438443243175, + "epoch": 0.4511499775770389, + "step": 503 }, { - "loss": 2.1447, - "grad_norm": 1.6717979907989502, - "learning_rate": 5e-05, - "epoch": 0.8064929243772812, - "step": 12595 + "loss": 367.3519, + "grad_norm": 48.5214729309082, + "learning_rate": 0.00028722736855428755, + "epoch": 0.45204689602152603, + "step": 504 }, { - "loss": 2.1402, - "grad_norm": 1.5903136730194092, - "learning_rate": 5e-05, - "epoch": 0.8068130883012102, - "step": 12600 + "loss": 366.9135, + "grad_norm": 48.30058288574219, + "learning_rate": 0.00028653005660818115, + "epoch": 0.4529438144660132, + "step": 505 }, { - "eval_loss": 2.0219223499298096, - "eval_runtime": 9.19, - "eval_samples_per_second": 222.851, - "eval_steps_per_second": 27.856, - "epoch": 0.8068130883012102, - "step": 12600 + "loss": 365.4208, + "grad_norm": 48.56584548950195, + "learning_rate": 0.00028583245413980993, + "epoch": 0.45384073291050037, + "step": 506 }, { - "loss": 2.1761, - "grad_norm": 1.5364222526550293, - "learning_rate": 5e-05, - "epoch": 0.8071332522251393, - "step": 12605 + "loss": 366.6342, + "grad_norm": 44.84033203125, + "learning_rate": 0.0002851345666971819, + "epoch": 0.4547376513549875, + "step": 507 }, { - "loss": 2.1475, - "grad_norm": 1.5884590148925781, - "learning_rate": 5e-05, - "epoch": 0.8074534161490683, - "step": 12610 + "loss": 366.2589, + "grad_norm": 46.03631591796875, + "learning_rate": 0.0002844363998305717, + "epoch": 0.45563456979947464, + "step": 508 }, { - "loss": 2.1438, - "grad_norm": 1.6371623277664185, - "learning_rate": 5e-05, - "epoch": 0.8077735800729974, - "step": 12615 + "loss": 368.2724, + "grad_norm": 52.3626708984375, + "learning_rate": 0.0002837379590924759, + "epoch": 0.45653148824396184, + "step": 509 }, { - "loss": 2.1583, - "grad_norm": 1.5629013776779175, - "learning_rate": 5e-05, - "epoch": 0.8080937439969265, - "step": 12620 + "loss": 366.9325, + "grad_norm": 42.26225280761719, + "learning_rate": 0.0002830392500375694, + "epoch": 0.457428406688449, + "step": 510 }, { - "loss": 2.1505, - "grad_norm": 1.6306496858596802, - "learning_rate": 5e-05, - "epoch": 0.8084139079208554, - "step": 12625 + "loss": 363.1102, + "grad_norm": 47.719661712646484, + "learning_rate": 0.0002823402782226608, + "epoch": 0.4583253251329361, + "step": 511 }, { - "loss": 2.1618, - "grad_norm": 1.6803566217422485, - "learning_rate": 5e-05, - "epoch": 0.8087340718447845, - "step": 12630 + "loss": 369.943, + "grad_norm": 48.35748291015625, + "learning_rate": 0.00028164104920664864, + "epoch": 0.4592222435774233, + "step": 512 }, { - "loss": 2.155, - "grad_norm": 1.5759366750717163, - "learning_rate": 5e-05, - "epoch": 0.8090542357687136, - "step": 12635 + "loss": 366.7622, + "grad_norm": 47.81887435913086, + "learning_rate": 0.00028094156855047687, + "epoch": 0.46011916202191044, + "step": 513 }, { - "loss": 2.1685, - "grad_norm": 1.5736042261123657, - "learning_rate": 5e-05, - "epoch": 0.8093743996926427, - "step": 12640 + "loss": 369.4684, + "grad_norm": 51.35517883300781, + "learning_rate": 0.0002802418418170908, + "epoch": 0.4610160804663976, + "step": 514 }, { - "loss": 2.1334, - "grad_norm": 1.639487862586975, - "learning_rate": 5e-05, - "epoch": 0.8096945636165717, - "step": 12645 + "loss": 367.9245, + "grad_norm": 52.903011322021484, + "learning_rate": 0.0002795418745713925, + "epoch": 0.4619129989108847, + "step": 515 }, { - "loss": 2.1347, - "grad_norm": 1.5598664283752441, - "learning_rate": 5e-05, - "epoch": 0.8100147275405007, - "step": 12650 + "loss": 363.503, + "grad_norm": 50.455223083496094, + "learning_rate": 0.00027884167238019714, + "epoch": 0.4628099173553719, + "step": 516 }, { - "loss": 2.163, - "grad_norm": 1.6298346519470215, - "learning_rate": 5e-05, - "epoch": 0.8103348914644298, - "step": 12655 + "loss": 361.0208, + "grad_norm": 48.27017593383789, + "learning_rate": 0.0002781412408121884, + "epoch": 0.46370683579985905, + "step": 517 }, { - "loss": 2.1457, - "grad_norm": 1.593445062637329, - "learning_rate": 5e-05, - "epoch": 0.8106550553883588, - "step": 12660 + "loss": 364.5886, + "grad_norm": 49.851619720458984, + "learning_rate": 0.0002774405854378739, + "epoch": 0.4646037542443462, + "step": 518 }, { - "loss": 2.199, - "grad_norm": 1.5960693359375, - "learning_rate": 5e-05, - "epoch": 0.8109752193122879, - "step": 12665 + "loss": 359.5211, + "grad_norm": 49.12308120727539, + "learning_rate": 0.00027673971182954157, + "epoch": 0.4655006726888334, + "step": 519 }, { - "loss": 2.1398, - "grad_norm": 1.595033884048462, - "learning_rate": 5e-05, - "epoch": 0.811295383236217, - "step": 12670 + "loss": 366.8299, + "grad_norm": 47.60043716430664, + "learning_rate": 0.00027603862556121463, + "epoch": 0.4663975911333205, + "step": 520 }, { - "loss": 2.1474, - "grad_norm": 1.6295945644378662, - "learning_rate": 5e-05, - "epoch": 0.811615547160146, - "step": 12675 + "loss": 368.2267, + "grad_norm": 41.944801330566406, + "learning_rate": 0.0002753373322086077, + "epoch": 0.46729450957780766, + "step": 521 }, { - "loss": 2.1516, - "grad_norm": 1.6431686878204346, - "learning_rate": 5e-05, - "epoch": 0.811935711084075, - "step": 12680 + "loss": 368.1608, + "grad_norm": 45.84396743774414, + "learning_rate": 0.00027463583734908234, + "epoch": 0.46819142802229485, + "step": 522 }, { - "loss": 2.1303, - "grad_norm": 1.6344900131225586, - "learning_rate": 5e-05, - "epoch": 0.8122558750080041, - "step": 12685 + "loss": 359.4468, + "grad_norm": 44.122989654541016, + "learning_rate": 0.0002739341465616026, + "epoch": 0.469088346466782, + "step": 523 }, { - "loss": 2.1446, - "grad_norm": 1.6356950998306274, - "learning_rate": 5e-05, - "epoch": 0.8125760389319332, - "step": 12690 + "loss": 367.6043, + "grad_norm": 44.97038269042969, + "learning_rate": 0.000273232265426691, + "epoch": 0.46998526491126913, + "step": 524 }, { - "loss": 2.1369, - "grad_norm": 1.6093062162399292, - "learning_rate": 5e-05, - "epoch": 0.8128962028558622, - "step": 12695 + "loss": 367.8859, + "grad_norm": 49.4835319519043, + "learning_rate": 0.0002725301995263835, + "epoch": 0.47088218335575627, + "step": 525 }, { - "loss": 2.1676, - "grad_norm": 1.7104160785675049, - "learning_rate": 5e-05, - "epoch": 0.8132163667797913, - "step": 12700 + "loss": 365.9901, + "grad_norm": 46.08525466918945, + "learning_rate": 0.00027182795444418583, + "epoch": 0.47177910180024346, + "step": 526 }, { - "loss": 2.1573, - "grad_norm": 1.6704941987991333, - "learning_rate": 5e-05, - "epoch": 0.8135365307037203, - "step": 12705 + "loss": 362.7762, + "grad_norm": 45.26884841918945, + "learning_rate": 0.0002711255357650286, + "epoch": 0.4726760202447306, + "step": 527 }, { - "loss": 2.1416, - "grad_norm": 1.568360686302185, - "learning_rate": 5e-05, - "epoch": 0.8138566946276493, - "step": 12710 + "loss": 363.5254, + "grad_norm": 52.6630973815918, + "learning_rate": 0.0002704229490752229, + "epoch": 0.47357293868921774, + "step": 528 }, { - "loss": 2.1544, - "grad_norm": 1.5690499544143677, - "learning_rate": 5e-05, - "epoch": 0.8141768585515784, - "step": 12715 + "loss": 362.2083, + "grad_norm": 49.639488220214844, + "learning_rate": 0.00026972019996241635, + "epoch": 0.47446985713370493, + "step": 529 }, { - "loss": 2.1495, - "grad_norm": 1.6691917181015015, - "learning_rate": 5e-05, - "epoch": 0.8144970224755075, - "step": 12720 + "loss": 370.2541, + "grad_norm": 51.361610412597656, + "learning_rate": 0.00026901729401554805, + "epoch": 0.47536677557819207, + "step": 530 }, { - "loss": 2.1458, - "grad_norm": 1.5928118228912354, - "learning_rate": 5e-05, - "epoch": 0.8148171863994366, - "step": 12725 + "loss": 364.9506, + "grad_norm": 45.84967803955078, + "learning_rate": 0.00026831423682480425, + "epoch": 0.4762636940226792, + "step": 531 }, { - "loss": 2.1354, - "grad_norm": 1.573602557182312, - "learning_rate": 5e-05, - "epoch": 0.8151373503233655, - "step": 12730 + "loss": 373.7259, + "grad_norm": 48.99913024902344, + "learning_rate": 0.00026761103398157456, + "epoch": 0.4771606124671664, + "step": 532 }, { - "loss": 2.1671, - "grad_norm": 1.608485221862793, - "learning_rate": 5e-05, - "epoch": 0.8154575142472946, - "step": 12735 + "loss": 367.0407, + "grad_norm": 53.0494270324707, + "learning_rate": 0.00026690769107840634, + "epoch": 0.47805753091165354, + "step": 533 }, { - "loss": 2.1242, - "grad_norm": 1.6055208444595337, - "learning_rate": 5e-05, - "epoch": 0.8157776781712237, - "step": 12740 + "loss": 366.3498, + "grad_norm": 46.16975784301758, + "learning_rate": 0.00026620421370896136, + "epoch": 0.4789544493561407, + "step": 534 }, { - "loss": 2.1676, - "grad_norm": 1.4978792667388916, - "learning_rate": 5e-05, - "epoch": 0.8160978420951527, - "step": 12745 + "loss": 363.5735, + "grad_norm": 45.147125244140625, + "learning_rate": 0.00026550060746797057, + "epoch": 0.47985136780062787, + "step": 535 }, { - "loss": 2.1317, - "grad_norm": 1.5075442790985107, - "learning_rate": 5e-05, - "epoch": 0.8164180060190818, - "step": 12750 + "loss": 362.9278, + "grad_norm": 47.262821197509766, + "learning_rate": 0.0002647968779511897, + "epoch": 0.480748286245115, + "step": 536 }, { - "loss": 2.153, - "grad_norm": 1.5905604362487793, - "learning_rate": 5e-05, - "epoch": 0.8167381699430108, - "step": 12755 + "loss": 366.6017, + "grad_norm": 49.1768913269043, + "learning_rate": 0.00026409303075535504, + "epoch": 0.48164520468960215, + "step": 537 }, { - "loss": 2.1718, - "grad_norm": 1.6042873859405518, - "learning_rate": 5e-05, - "epoch": 0.8170583338669398, - "step": 12760 + "loss": 363.7893, + "grad_norm": 47.41939163208008, + "learning_rate": 0.00026338907147813894, + "epoch": 0.4825421231340893, + "step": 538 }, { - "loss": 2.1638, - "grad_norm": 1.515039086341858, - "learning_rate": 5e-05, - "epoch": 0.8173784977908689, - "step": 12765 + "loss": 362.325, + "grad_norm": 45.2095947265625, + "learning_rate": 0.0002626850057181048, + "epoch": 0.4834390415785765, + "step": 539 }, { - "loss": 2.1604, - "grad_norm": 1.7022101879119873, - "learning_rate": 5e-05, - "epoch": 0.817698661714798, - "step": 12770 + "loss": 368.0108, + "grad_norm": 44.87570571899414, + "learning_rate": 0.000261980839074663, + "epoch": 0.4843359600230636, + "step": 540 }, { - "loss": 2.1893, - "grad_norm": 1.5627135038375854, - "learning_rate": 5e-05, - "epoch": 0.8180188256387271, - "step": 12775 + "loss": 363.8844, + "grad_norm": 44.87836456298828, + "learning_rate": 0.0002612765771480264, + "epoch": 0.48523287846755075, + "step": 541 }, { - "loss": 2.1391, - "grad_norm": 1.6236927509307861, - "learning_rate": 5e-05, - "epoch": 0.818338989562656, - "step": 12780 + "loss": 366.2256, + "grad_norm": 52.47968292236328, + "learning_rate": 0.00026057222553916545, + "epoch": 0.48612979691203795, + "step": 542 }, { - "loss": 2.1661, - "grad_norm": 1.627759337425232, - "learning_rate": 5e-05, - "epoch": 0.8186591534865851, - "step": 12785 + "loss": 364.6898, + "grad_norm": 49.18819808959961, + "learning_rate": 0.0002598677898497638, + "epoch": 0.4870267153565251, + "step": 543 }, { - "loss": 2.1579, - "grad_norm": 1.5399779081344604, - "learning_rate": 5e-05, - "epoch": 0.8189793174105142, - "step": 12790 + "loss": 364.0697, + "grad_norm": 47.542850494384766, + "learning_rate": 0.00025916327568217416, + "epoch": 0.4879236338010122, + "step": 544 }, { - "loss": 2.1772, - "grad_norm": 1.5167617797851562, - "learning_rate": 5e-05, - "epoch": 0.8192994813344432, - "step": 12795 + "loss": 362.7703, + "grad_norm": 44.471256256103516, + "learning_rate": 0.0002584586886393729, + "epoch": 0.4888205522454994, + "step": 545 }, { - "loss": 2.1573, - "grad_norm": 1.5164883136749268, - "learning_rate": 5e-05, - "epoch": 0.8196196452583723, - "step": 12800 + "loss": 370.4043, + "grad_norm": 46.374263763427734, + "learning_rate": 0.0002577540343249162, + "epoch": 0.48971747068998656, + "step": 546 }, { - "eval_loss": 2.017529010772705, - "eval_runtime": 10.1774, - "eval_samples_per_second": 201.231, - "eval_steps_per_second": 25.154, - "epoch": 0.8196196452583723, - "step": 12800 + "loss": 362.8738, + "grad_norm": 44.021278381347656, + "learning_rate": 0.0002570493183428952, + "epoch": 0.4906143891344737, + "step": 547 }, { - "loss": 2.1762, - "grad_norm": 1.6219836473464966, - "learning_rate": 5e-05, - "epoch": 0.8199398091823014, - "step": 12805 + "loss": 365.418, + "grad_norm": 47.044212341308594, + "learning_rate": 0.00025634454629789156, + "epoch": 0.49151130757896083, + "step": 548 }, { - "loss": 2.1406, - "grad_norm": 1.5230660438537598, - "learning_rate": 5e-05, - "epoch": 0.8202599731062304, - "step": 12810 + "loss": 363.5009, + "grad_norm": 48.60353469848633, + "learning_rate": 0.00025563972379493273, + "epoch": 0.492408226023448, + "step": 549 }, { - "loss": 2.1179, - "grad_norm": 1.6092220544815063, - "learning_rate": 5e-05, - "epoch": 0.8205801370301594, - "step": 12815 + "loss": 365.955, + "grad_norm": 47.8569221496582, + "learning_rate": 0.00025493485643944753, + "epoch": 0.49330514446793516, + "step": 550 }, { - "loss": 2.157, - "grad_norm": 1.5099693536758423, - "learning_rate": 5e-05, - "epoch": 0.8209003009540885, - "step": 12820 + "eval_loss": 1.6247297525405884, + "eval_runtime": 36.2552, + "eval_samples_per_second": 56.488, + "eval_steps_per_second": 3.531, + "epoch": 0.49330514446793516, + "step": 550 }, { - "loss": 2.1253, - "grad_norm": 1.5344569683074951, - "learning_rate": 5e-05, - "epoch": 0.8212204648780176, - "step": 12825 + "loss": 361.769, + "grad_norm": 52.47264099121094, + "learning_rate": 0.00025422994983722127, + "epoch": 0.4942020629124223, + "step": 551 }, { - "loss": 2.1538, - "grad_norm": 1.6269365549087524, - "learning_rate": 5e-05, - "epoch": 0.8215406288019466, - "step": 12830 + "loss": 369.0356, + "grad_norm": 51.903358459472656, + "learning_rate": 0.0002535250095943517, + "epoch": 0.4950989813569095, + "step": 552 }, { - "loss": 2.1528, - "grad_norm": 1.5916081666946411, - "learning_rate": 5e-05, - "epoch": 0.8218607927258756, - "step": 12835 + "loss": 362.5946, + "grad_norm": 55.91824722290039, + "learning_rate": 0.0002528200413172039, + "epoch": 0.49599589980139663, + "step": 553 }, { - "loss": 2.1554, - "grad_norm": 1.5954616069793701, - "learning_rate": 5e-05, - "epoch": 0.8221809566498047, - "step": 12840 + "loss": 364.1907, + "grad_norm": 49.117069244384766, + "learning_rate": 0.00025211505061236583, + "epoch": 0.49689281824588377, + "step": 554 }, { - "loss": 2.1217, - "grad_norm": 1.5432181358337402, - "learning_rate": 5e-05, - "epoch": 0.8225011205737337, - "step": 12845 + "loss": 363.2774, + "grad_norm": 44.69606018066406, + "learning_rate": 0.00025141004308660414, + "epoch": 0.49778973669037097, + "step": 555 }, { - "loss": 2.1388, - "grad_norm": 1.579157829284668, - "learning_rate": 5e-05, - "epoch": 0.8228212844976628, - "step": 12850 + "loss": 363.2139, + "grad_norm": 52.18587112426758, + "learning_rate": 0.00025070502434681915, + "epoch": 0.4986866551348581, + "step": 556 }, { - "loss": 2.1628, - "grad_norm": 1.5736826658248901, - "learning_rate": 5e-05, - "epoch": 0.8231414484215919, - "step": 12855 + "loss": 365.6665, + "grad_norm": 57.393428802490234, + "learning_rate": 0.00025, + "epoch": 0.49958357357934524, + "step": 557 }, { - "loss": 2.1405, - "grad_norm": 1.5886597633361816, - "learning_rate": 5e-05, - "epoch": 0.823461612345521, - "step": 12860 + "loss": 363.4536, + "grad_norm": 52.89313507080078, + "learning_rate": 0.0002492949756531809, + "epoch": 0.5004804920238324, + "step": 558 }, { - "loss": 2.1396, - "grad_norm": 1.6655110120773315, - "learning_rate": 5e-05, - "epoch": 0.8237817762694499, - "step": 12865 + "loss": 363.2097, + "grad_norm": 51.265533447265625, + "learning_rate": 0.00024858995691339587, + "epoch": 0.5013774104683195, + "step": 559 }, { - "loss": 2.157, - "grad_norm": 1.6713991165161133, - "learning_rate": 5e-05, - "epoch": 0.824101940193379, - "step": 12870 + "loss": 366.4611, + "grad_norm": 56.473567962646484, + "learning_rate": 0.0002478849493876342, + "epoch": 0.5022743289128068, + "step": 560 }, { - "loss": 2.1416, - "grad_norm": 1.5701757669448853, - "learning_rate": 5e-05, - "epoch": 0.8244221041173081, - "step": 12875 + "loss": 361.8987, + "grad_norm": 49.68058776855469, + "learning_rate": 0.0002471799586827962, + "epoch": 0.5031712473572939, + "step": 561 }, { - "loss": 2.1355, - "grad_norm": 1.609489917755127, - "learning_rate": 5e-05, - "epoch": 0.8247422680412371, - "step": 12880 + "loss": 360.8694, + "grad_norm": 42.74179458618164, + "learning_rate": 0.00024647499040564844, + "epoch": 0.504068165801781, + "step": 562 }, { - "loss": 2.1341, - "grad_norm": 1.637588620185852, - "learning_rate": 5e-05, - "epoch": 0.8250624319651662, - "step": 12885 + "loss": 364.9089, + "grad_norm": 45.61265563964844, + "learning_rate": 0.00024577005016277885, + "epoch": 0.5049650842462682, + "step": 563 }, { - "loss": 2.1274, - "grad_norm": 1.5560768842697144, - "learning_rate": 5e-05, - "epoch": 0.8253825958890952, - "step": 12890 + "loss": 365.8124, + "grad_norm": 46.97050857543945, + "learning_rate": 0.0002450651435605526, + "epoch": 0.5058620026907553, + "step": 564 }, { - "loss": 2.1296, - "grad_norm": 1.547018051147461, - "learning_rate": 5e-05, - "epoch": 0.8257027598130243, - "step": 12895 + "loss": 360.1623, + "grad_norm": 46.26262664794922, + "learning_rate": 0.0002443602762050673, + "epoch": 0.5067589211352425, + "step": 565 }, { - "loss": 2.1605, - "grad_norm": 1.576536774635315, - "learning_rate": 5e-05, - "epoch": 0.8260229237369533, - "step": 12900 + "loss": 363.2248, + "grad_norm": 44.43347930908203, + "learning_rate": 0.00024365545370210842, + "epoch": 0.5076558395797296, + "step": 566 }, { - "loss": 2.1281, - "grad_norm": 1.5521260499954224, - "learning_rate": 5e-05, - "epoch": 0.8263430876608824, - "step": 12905 + "loss": 365.1527, + "grad_norm": 46.19889831542969, + "learning_rate": 0.00024295068165710478, + "epoch": 0.5085527580242168, + "step": 567 }, { - "loss": 2.1792, - "grad_norm": 1.5745978355407715, - "learning_rate": 5e-05, - "epoch": 0.8266632515848115, - "step": 12910 + "loss": 365.0658, + "grad_norm": 49.645484924316406, + "learning_rate": 0.00024224596567508385, + "epoch": 0.509449676468704, + "step": 568 }, { - "loss": 2.1682, - "grad_norm": 1.5464338064193726, - "learning_rate": 5e-05, - "epoch": 0.8269834155087404, - "step": 12915 + "loss": 362.5722, + "grad_norm": 47.69388961791992, + "learning_rate": 0.00024154131136062715, + "epoch": 0.5103465949131911, + "step": 569 }, { - "loss": 2.1368, - "grad_norm": 1.6071354150772095, - "learning_rate": 5e-05, - "epoch": 0.8273035794326695, - "step": 12920 + "loss": 361.0171, + "grad_norm": 44.855857849121094, + "learning_rate": 0.00024083672431782585, + "epoch": 0.5112435133576783, + "step": 570 }, { - "loss": 2.1642, - "grad_norm": 1.6548957824707031, - "learning_rate": 5e-05, - "epoch": 0.8276237433565986, - "step": 12925 + "loss": 361.5502, + "grad_norm": 48.860435485839844, + "learning_rate": 0.00024013221015023619, + "epoch": 0.5121404318021654, + "step": 571 }, { - "loss": 2.1636, - "grad_norm": 1.6200032234191895, - "learning_rate": 5e-05, - "epoch": 0.8279439072805276, - "step": 12930 + "loss": 360.8487, + "grad_norm": 45.69166564941406, + "learning_rate": 0.0002394277744608346, + "epoch": 0.5130373502466525, + "step": 572 }, { - "loss": 2.1587, - "grad_norm": 1.5933823585510254, - "learning_rate": 5e-05, - "epoch": 0.8282640712044567, - "step": 12935 + "loss": 361.6857, + "grad_norm": 45.67158889770508, + "learning_rate": 0.00023872342285197366, + "epoch": 0.5139342686911397, + "step": 573 }, { - "loss": 2.1459, - "grad_norm": 1.6406372785568237, - "learning_rate": 5e-05, - "epoch": 0.8285842351283857, - "step": 12940 + "loss": 364.0296, + "grad_norm": 51.487369537353516, + "learning_rate": 0.00023801916092533706, + "epoch": 0.5148311871356269, + "step": 574 }, { - "loss": 2.1455, - "grad_norm": 1.5688544511795044, - "learning_rate": 5e-05, - "epoch": 0.8289043990523148, - "step": 12945 + "loss": 366.4655, + "grad_norm": 49.884727478027344, + "learning_rate": 0.0002373149942818953, + "epoch": 0.5157281055801141, + "step": 575 }, { - "loss": 2.1704, - "grad_norm": 1.5574911832809448, - "learning_rate": 5e-05, - "epoch": 0.8292245629762438, - "step": 12950 + "loss": 360.9107, + "grad_norm": 42.73551940917969, + "learning_rate": 0.00023661092852186118, + "epoch": 0.5166250240246012, + "step": 576 }, { - "loss": 2.1366, - "grad_norm": 1.5636203289031982, - "learning_rate": 5e-05, - "epoch": 0.8295447269001729, - "step": 12955 + "loss": 364.7719, + "grad_norm": 44.425777435302734, + "learning_rate": 0.000235906969244645, + "epoch": 0.5175219424690883, + "step": 577 }, { - "loss": 2.1216, - "grad_norm": 1.5622069835662842, - "learning_rate": 5e-05, - "epoch": 0.829864890824102, - "step": 12960 + "loss": 362.6983, + "grad_norm": 52.82978057861328, + "learning_rate": 0.00023520312204881045, + "epoch": 0.5184188609135755, + "step": 578 }, { - "loss": 2.1306, - "grad_norm": 1.5027891397476196, - "learning_rate": 5e-05, - "epoch": 0.830185054748031, - "step": 12965 + "loss": 359.655, + "grad_norm": 46.826904296875, + "learning_rate": 0.0002344993925320295, + "epoch": 0.5193157793580626, + "step": 579 }, { - "loss": 2.1621, - "grad_norm": 1.565948486328125, - "learning_rate": 5e-05, - "epoch": 0.83050521867196, - "step": 12970 + "loss": 364.8085, + "grad_norm": 42.24338150024414, + "learning_rate": 0.00023379578629103865, + "epoch": 0.5202126978025499, + "step": 580 }, { - "loss": 2.1489, - "grad_norm": 1.7101725339889526, - "learning_rate": 5e-05, - "epoch": 0.8308253825958891, - "step": 12975 + "loss": 358.4188, + "grad_norm": 49.714271545410156, + "learning_rate": 0.00023309230892159364, + "epoch": 0.521109616247037, + "step": 581 }, { - "loss": 2.1543, - "grad_norm": 1.6061644554138184, - "learning_rate": 5e-05, - "epoch": 0.8311455465198182, - "step": 12980 + "loss": 364.1614, + "grad_norm": 47.561073303222656, + "learning_rate": 0.0002323889660184255, + "epoch": 0.5220065346915241, + "step": 582 }, { - "loss": 2.1695, - "grad_norm": 1.5707058906555176, - "learning_rate": 5e-05, - "epoch": 0.8314657104437472, - "step": 12985 + "loss": 361.0988, + "grad_norm": 45.20221710205078, + "learning_rate": 0.00023168576317519576, + "epoch": 0.5229034531360113, + "step": 583 }, { - "loss": 2.1433, - "grad_norm": 1.5309937000274658, - "learning_rate": 5e-05, - "epoch": 0.8317858743676763, - "step": 12990 + "loss": 367.0533, + "grad_norm": 47.38787078857422, + "learning_rate": 0.00023098270598445204, + "epoch": 0.5238003715804984, + "step": 584 }, { - "loss": 2.1318, - "grad_norm": 1.538222074508667, - "learning_rate": 5e-05, - "epoch": 0.8321060382916053, - "step": 12995 + "loss": 366.2763, + "grad_norm": 47.23054122924805, + "learning_rate": 0.00023027980003758363, + "epoch": 0.5246972900249856, + "step": 585 }, { - "loss": 2.1446, - "grad_norm": 1.738500714302063, - "learning_rate": 5e-05, - "epoch": 0.8324262022155343, - "step": 13000 + "loss": 365.6816, + "grad_norm": 43.855403900146484, + "learning_rate": 0.0002295770509247771, + "epoch": 0.5255942084694727, + "step": 586 }, { - "eval_loss": 2.0216612815856934, - "eval_runtime": 9.2438, - "eval_samples_per_second": 221.554, - "eval_steps_per_second": 27.694, - "epoch": 0.8324262022155343, - "step": 13000 + "loss": 365.6198, + "grad_norm": 51.30084228515625, + "learning_rate": 0.00022887446423497146, + "epoch": 0.5264911269139599, + "step": 587 }, { - "loss": 2.1179, - "grad_norm": 1.625812292098999, - "learning_rate": 5e-05, - "epoch": 0.8327463661394634, - "step": 13005 + "loss": 362.4194, + "grad_norm": 50.142330169677734, + "learning_rate": 0.00022817204555581418, + "epoch": 0.5273880453584471, + "step": 588 }, { - "loss": 2.1423, - "grad_norm": 1.795238733291626, - "learning_rate": 5e-05, - "epoch": 0.8330665300633925, - "step": 13010 + "loss": 364.2704, + "grad_norm": 46.52515411376953, + "learning_rate": 0.00022746980047361654, + "epoch": 0.5282849638029342, + "step": 589 }, { - "loss": 2.1532, - "grad_norm": 1.6200240850448608, - "learning_rate": 5e-05, - "epoch": 0.8333866939873215, - "step": 13015 + "loss": 362.0045, + "grad_norm": 48.26958465576172, + "learning_rate": 0.00022676773457330906, + "epoch": 0.5291818822474214, + "step": 590 }, { - "loss": 2.1766, - "grad_norm": 1.5638982057571411, - "learning_rate": 5e-05, - "epoch": 0.8337068579112505, - "step": 13020 + "loss": 364.3056, + "grad_norm": 45.78593063354492, + "learning_rate": 0.0002260658534383974, + "epoch": 0.5300788006919085, + "step": 591 }, { - "loss": 2.1575, - "grad_norm": 1.6555575132369995, - "learning_rate": 5e-05, - "epoch": 0.8340270218351796, - "step": 13025 + "loss": 364.2805, + "grad_norm": 47.130184173583984, + "learning_rate": 0.00022536416265091775, + "epoch": 0.5309757191363956, + "step": 592 }, { - "loss": 2.1378, - "grad_norm": 1.560592532157898, - "learning_rate": 5e-05, - "epoch": 0.8343471857591087, - "step": 13030 + "loss": 362.9882, + "grad_norm": 43.309181213378906, + "learning_rate": 0.0002246626677913923, + "epoch": 0.5318726375808829, + "step": 593 }, { - "loss": 2.144, - "grad_norm": 1.5742806196212769, - "learning_rate": 5e-05, - "epoch": 0.8346673496830377, - "step": 13035 + "loss": 362.9743, + "grad_norm": 40.39152145385742, + "learning_rate": 0.00022396137443878535, + "epoch": 0.53276955602537, + "step": 594 }, { - "loss": 2.1508, - "grad_norm": 1.6514239311218262, - "learning_rate": 5e-05, - "epoch": 0.8349875136069668, - "step": 13040 + "loss": 359.4163, + "grad_norm": 47.722068786621094, + "learning_rate": 0.00022326028817045844, + "epoch": 0.5336664744698572, + "step": 595 }, { - "loss": 2.1589, - "grad_norm": 1.5992977619171143, - "learning_rate": 5e-05, - "epoch": 0.8353076775308959, - "step": 13045 + "loss": 364.6919, + "grad_norm": 42.61846160888672, + "learning_rate": 0.00022255941456212605, + "epoch": 0.5345633929143443, + "step": 596 }, { - "loss": 2.1343, - "grad_norm": 1.783820629119873, - "learning_rate": 5e-05, - "epoch": 0.8356278414548248, - "step": 13050 + "loss": 368.3342, + "grad_norm": 44.96833038330078, + "learning_rate": 0.00022185875918781162, + "epoch": 0.5354603113588314, + "step": 597 }, { - "loss": 2.1822, - "grad_norm": 1.6316050291061401, - "learning_rate": 5e-05, - "epoch": 0.8359480053787539, - "step": 13055 + "loss": 363.2259, + "grad_norm": 43.944881439208984, + "learning_rate": 0.00022115832761980287, + "epoch": 0.5363572298033186, + "step": 598 }, { - "loss": 2.1543, - "grad_norm": 1.723607063293457, - "learning_rate": 5e-05, - "epoch": 0.836268169302683, - "step": 13060 + "loss": 362.7245, + "grad_norm": 47.073341369628906, + "learning_rate": 0.00022045812542860756, + "epoch": 0.5372541482478057, + "step": 599 }, { - "loss": 2.1511, - "grad_norm": 1.5759859085083008, - "learning_rate": 5e-05, - "epoch": 0.8365883332266121, - "step": 13065 + "loss": 363.0497, + "grad_norm": 44.11311721801758, + "learning_rate": 0.00021975815818290928, + "epoch": 0.538151066692293, + "step": 600 }, { - "loss": 2.162, - "grad_norm": 1.5511044263839722, - "learning_rate": 5e-05, - "epoch": 0.836908497150541, - "step": 13070 + "eval_loss": 1.61993408203125, + "eval_runtime": 65.3564, + "eval_samples_per_second": 31.336, + "eval_steps_per_second": 1.958, + "epoch": 0.538151066692293, + "step": 600 }, { - "loss": 2.1452, - "grad_norm": 1.5764498710632324, - "learning_rate": 5e-05, - "epoch": 0.8372286610744701, - "step": 13075 + "loss": 360.9368, + "grad_norm": 45.97838592529297, + "learning_rate": 0.00021905843144952316, + "epoch": 0.5390479851367801, + "step": 601 }, { - "loss": 2.1494, - "grad_norm": 1.6332306861877441, - "learning_rate": 5e-05, - "epoch": 0.8375488249983992, - "step": 13080 + "loss": 363.959, + "grad_norm": 45.36203384399414, + "learning_rate": 0.0002183589507933514, + "epoch": 0.5399449035812672, + "step": 602 }, { - "loss": 2.1495, - "grad_norm": 1.5492876768112183, - "learning_rate": 5e-05, - "epoch": 0.8378689889223282, - "step": 13085 + "loss": 363.9291, + "grad_norm": 43.02581024169922, + "learning_rate": 0.00021765972177733924, + "epoch": 0.5408418220257544, + "step": 603 }, { - "loss": 2.1547, - "grad_norm": 1.6958086490631104, - "learning_rate": 5e-05, - "epoch": 0.8381891528462573, - "step": 13090 + "loss": 363.5491, + "grad_norm": 47.46310806274414, + "learning_rate": 0.0002169607499624307, + "epoch": 0.5417387404702415, + "step": 604 }, { - "loss": 2.1328, - "grad_norm": 1.6327171325683594, - "learning_rate": 5e-05, - "epoch": 0.8385093167701864, - "step": 13095 + "loss": 367.6017, + "grad_norm": 47.89605712890625, + "learning_rate": 0.00021626204090752422, + "epoch": 0.5426356589147286, + "step": 605 }, { - "loss": 2.1196, - "grad_norm": 1.4885859489440918, - "learning_rate": 5e-05, - "epoch": 0.8388294806941153, - "step": 13100 + "loss": 364.9732, + "grad_norm": 45.463443756103516, + "learning_rate": 0.00021556360016942842, + "epoch": 0.5435325773592158, + "step": 606 }, { - "loss": 2.1367, - "grad_norm": 1.5167481899261475, - "learning_rate": 5e-05, - "epoch": 0.8391496446180444, - "step": 13105 + "loss": 364.4341, + "grad_norm": 43.64617919921875, + "learning_rate": 0.00021486543330281812, + "epoch": 0.544429495803703, + "step": 607 }, { - "loss": 2.1373, - "grad_norm": 1.5418943166732788, - "learning_rate": 5e-05, - "epoch": 0.8394698085419735, - "step": 13110 + "loss": 366.3894, + "grad_norm": 41.575531005859375, + "learning_rate": 0.0002141675458601901, + "epoch": 0.5453264142481902, + "step": 608 }, { - "loss": 2.1341, - "grad_norm": 1.5980948209762573, - "learning_rate": 5e-05, - "epoch": 0.8397899724659026, - "step": 13115 + "loss": 363.112, + "grad_norm": 46.79388427734375, + "learning_rate": 0.00021346994339181883, + "epoch": 0.5462233326926773, + "step": 609 }, { - "loss": 2.1283, - "grad_norm": 1.5357916355133057, - "learning_rate": 5e-05, - "epoch": 0.8401101363898316, - "step": 13120 + "loss": 361.5751, + "grad_norm": 48.13455581665039, + "learning_rate": 0.0002127726314457124, + "epoch": 0.5471202511371644, + "step": 610 }, { - "loss": 2.1357, - "grad_norm": 1.6638524532318115, - "learning_rate": 5e-05, - "epoch": 0.8404303003137606, - "step": 13125 + "loss": 361.1321, + "grad_norm": 45.220550537109375, + "learning_rate": 0.0002120756155675683, + "epoch": 0.5480171695816516, + "step": 611 }, { - "loss": 2.1465, - "grad_norm": 1.622886061668396, - "learning_rate": 5e-05, - "epoch": 0.8407504642376897, - "step": 13130 + "loss": 365.0866, + "grad_norm": 46.22264099121094, + "learning_rate": 0.0002113789013007295, + "epoch": 0.5489140880261387, + "step": 612 }, { - "loss": 2.1373, - "grad_norm": 1.618874430656433, - "learning_rate": 5e-05, - "epoch": 0.8410706281616187, - "step": 13135 + "loss": 360.2099, + "grad_norm": 47.99028015136719, + "learning_rate": 0.00021068249418614027, + "epoch": 0.549811006470626, + "step": 613 }, { - "loss": 2.1567, - "grad_norm": 1.6146790981292725, - "learning_rate": 5e-05, - "epoch": 0.8413907920855478, - "step": 13140 + "loss": 362.4004, + "grad_norm": 45.35298538208008, + "learning_rate": 0.00020998639976230202, + "epoch": 0.5507079249151131, + "step": 614 }, { - "loss": 2.1464, - "grad_norm": 1.5791168212890625, - "learning_rate": 5e-05, - "epoch": 0.8417109560094769, - "step": 13145 + "loss": 362.9482, + "grad_norm": 45.84006118774414, + "learning_rate": 0.00020929062356522942, + "epoch": 0.5516048433596002, + "step": 615 }, { - "loss": 2.158, - "grad_norm": 1.506654143333435, - "learning_rate": 5e-05, - "epoch": 0.842031119933406, - "step": 13150 + "loss": 361.6893, + "grad_norm": 46.06373977661133, + "learning_rate": 0.00020859517112840637, + "epoch": 0.5525017618040874, + "step": 616 }, { - "loss": 2.1604, - "grad_norm": 1.5425060987472534, - "learning_rate": 5e-05, - "epoch": 0.8423512838573349, - "step": 13155 + "loss": 368.1667, + "grad_norm": 43.56032180786133, + "learning_rate": 0.00020790004798274165, + "epoch": 0.5533986802485745, + "step": 617 }, { - "loss": 2.156, - "grad_norm": 1.5820544958114624, - "learning_rate": 5e-05, - "epoch": 0.842671447781264, - "step": 13160 + "loss": 363.2073, + "grad_norm": 43.215370178222656, + "learning_rate": 0.00020720525965652544, + "epoch": 0.5542955986930617, + "step": 618 }, { - "loss": 2.136, - "grad_norm": 1.6569517850875854, - "learning_rate": 5e-05, - "epoch": 0.8429916117051931, - "step": 13165 + "loss": 358.3785, + "grad_norm": 47.84462356567383, + "learning_rate": 0.00020651081167538508, + "epoch": 0.5551925171375488, + "step": 619 }, { - "loss": 2.135, - "grad_norm": 1.7146352529525757, - "learning_rate": 5e-05, - "epoch": 0.8433117756291221, - "step": 13170 + "loss": 365.6581, + "grad_norm": 49.96092987060547, + "learning_rate": 0.00020581670956224113, + "epoch": 0.556089435582036, + "step": 620 }, { - "loss": 2.1443, - "grad_norm": 1.6757546663284302, - "learning_rate": 5e-05, - "epoch": 0.8436319395530512, - "step": 13175 + "loss": 363.1918, + "grad_norm": 44.61714172363281, + "learning_rate": 0.00020512295883726338, + "epoch": 0.5569863540265232, + "step": 621 }, { - "loss": 2.1559, - "grad_norm": 1.5445369482040405, - "learning_rate": 5e-05, - "epoch": 0.8439521034769802, - "step": 13180 + "loss": 363.2948, + "grad_norm": 44.841495513916016, + "learning_rate": 0.00020442956501782713, + "epoch": 0.5578832724710103, + "step": 622 }, { - "loss": 2.1461, - "grad_norm": 1.6682908535003662, - "learning_rate": 5e-05, - "epoch": 0.8442722674009092, - "step": 13185 + "loss": 358.7636, + "grad_norm": 46.29624938964844, + "learning_rate": 0.00020373653361846925, + "epoch": 0.5587801909154975, + "step": 623 }, { - "loss": 2.1599, - "grad_norm": 1.5197023153305054, - "learning_rate": 5e-05, - "epoch": 0.8445924313248383, - "step": 13190 + "loss": 362.0233, + "grad_norm": 43.61477279663086, + "learning_rate": 0.0002030438701508443, + "epoch": 0.5596771093599846, + "step": 624 }, { - "loss": 2.1202, - "grad_norm": 1.605768084526062, - "learning_rate": 5e-05, - "epoch": 0.8449125952487674, - "step": 13195 + "loss": 366.3086, + "grad_norm": 44.28224182128906, + "learning_rate": 0.00020235158012368065, + "epoch": 0.5605740278044717, + "step": 625 }, { - "loss": 2.1589, - "grad_norm": 1.7030861377716064, - "learning_rate": 5e-05, - "epoch": 0.8452327591726965, - "step": 13200 + "loss": 357.9655, + "grad_norm": 43.08799362182617, + "learning_rate": 0.00020165966904273666, + "epoch": 0.5614709462489589, + "step": 626 }, { - "eval_loss": 2.025972843170166, - "eval_runtime": 8.9924, - "eval_samples_per_second": 227.748, - "eval_steps_per_second": 28.469, - "epoch": 0.8452327591726965, - "step": 13200 + "loss": 364.1879, + "grad_norm": 45.73900604248047, + "learning_rate": 0.00020096814241075703, + "epoch": 0.5623678646934461, + "step": 627 }, { - "loss": 2.1577, - "grad_norm": 1.5597788095474243, - "learning_rate": 5e-05, - "epoch": 0.8455529230966254, - "step": 13205 + "loss": 359.9633, + "grad_norm": 48.213985443115234, + "learning_rate": 0.00020027700572742895, + "epoch": 0.5632647831379333, + "step": 628 }, { - "loss": 2.1244, - "grad_norm": 1.5069013833999634, - "learning_rate": 5e-05, - "epoch": 0.8458730870205545, - "step": 13210 + "loss": 365.9498, + "grad_norm": 43.3817253112793, + "learning_rate": 0.00019958626448933825, + "epoch": 0.5641617015824204, + "step": 629 }, { - "loss": 2.1457, - "grad_norm": 1.4982860088348389, - "learning_rate": 5e-05, - "epoch": 0.8461932509444836, - "step": 13215 + "loss": 362.1366, + "grad_norm": 42.70503234863281, + "learning_rate": 0.00019889592418992594, + "epoch": 0.5650586200269075, + "step": 630 }, { - "loss": 2.1392, - "grad_norm": 1.5416525602340698, - "learning_rate": 5e-05, - "epoch": 0.8465134148684126, - "step": 13220 + "loss": 361.433, + "grad_norm": 46.60575485229492, + "learning_rate": 0.00019820599031944436, + "epoch": 0.5659555384713947, + "step": 631 }, { - "loss": 2.1341, - "grad_norm": 1.5496692657470703, - "learning_rate": 5e-05, - "epoch": 0.8468335787923417, - "step": 13225 + "loss": 364.1061, + "grad_norm": 42.36573791503906, + "learning_rate": 0.00019751646836491338, + "epoch": 0.5668524569158818, + "step": 632 }, { - "loss": 2.1419, - "grad_norm": 1.55703604221344, - "learning_rate": 5e-05, - "epoch": 0.8471537427162708, - "step": 13230 + "loss": 360.4161, + "grad_norm": 43.14451599121094, + "learning_rate": 0.00019682736381007707, + "epoch": 0.5677493753603691, + "step": 633 }, { - "loss": 2.1623, - "grad_norm": 1.5972099304199219, - "learning_rate": 5e-05, - "epoch": 0.8474739066401998, - "step": 13235 + "loss": 357.0567, + "grad_norm": 44.19496154785156, + "learning_rate": 0.00019613868213535997, + "epoch": 0.5686462938048562, + "step": 634 }, { - "loss": 2.15, - "grad_norm": 1.569321870803833, - "learning_rate": 5e-05, - "epoch": 0.8477940705641288, - "step": 13240 + "loss": 361.1339, + "grad_norm": 42.32905960083008, + "learning_rate": 0.00019545042881782333, + "epoch": 0.5695432122493433, + "step": 635 }, { - "loss": 2.1483, - "grad_norm": 1.549758791923523, - "learning_rate": 5e-05, - "epoch": 0.8481142344880579, - "step": 13245 + "loss": 361.2873, + "grad_norm": 47.53689956665039, + "learning_rate": 0.00019476260933112163, + "epoch": 0.5704401306938305, + "step": 636 }, { - "loss": 2.156, - "grad_norm": 1.6335357427597046, - "learning_rate": 5e-05, - "epoch": 0.848434398411987, - "step": 13250 + "loss": 362.2348, + "grad_norm": 47.5960578918457, + "learning_rate": 0.00019407522914545957, + "epoch": 0.5713370491383176, + "step": 637 }, { - "loss": 2.1435, - "grad_norm": 1.6042168140411377, - "learning_rate": 5e-05, - "epoch": 0.848754562335916, - "step": 13255 + "loss": 366.9183, + "grad_norm": 43.92160415649414, + "learning_rate": 0.00019338829372754745, + "epoch": 0.5722339675828048, + "step": 638 }, { - "loss": 2.1489, - "grad_norm": 1.5949348211288452, - "learning_rate": 5e-05, - "epoch": 0.849074726259845, - "step": 13260 + "loss": 361.6643, + "grad_norm": 46.373863220214844, + "learning_rate": 0.0001927018085405588, + "epoch": 0.5731308860272919, + "step": 639 }, { - "loss": 2.1586, - "grad_norm": 1.5524771213531494, - "learning_rate": 5e-05, - "epoch": 0.8493948901837741, - "step": 13265 + "loss": 362.9005, + "grad_norm": 45.955814361572266, + "learning_rate": 0.0001920157790440864, + "epoch": 0.5740278044717791, + "step": 640 }, { - "loss": 2.1251, - "grad_norm": 1.6360365152359009, - "learning_rate": 5e-05, - "epoch": 0.8497150541077031, - "step": 13270 + "loss": 360.8845, + "grad_norm": 46.01215362548828, + "learning_rate": 0.00019133021069409872, + "epoch": 0.5749247229162663, + "step": 641 }, { - "loss": 2.1508, - "grad_norm": 1.500728964805603, - "learning_rate": 5e-05, - "epoch": 0.8500352180316322, - "step": 13275 + "loss": 361.9622, + "grad_norm": 46.09065628051758, + "learning_rate": 0.00019064510894289705, + "epoch": 0.5758216413607534, + "step": 642 }, { - "loss": 2.1457, - "grad_norm": 1.6618624925613403, - "learning_rate": 5e-05, - "epoch": 0.8503553819555613, - "step": 13280 + "loss": 363.2684, + "grad_norm": 45.370140075683594, + "learning_rate": 0.00018996047923907166, + "epoch": 0.5767185598052406, + "step": 643 }, { - "loss": 2.1543, - "grad_norm": 1.5445833206176758, - "learning_rate": 5e-05, - "epoch": 0.8506755458794903, - "step": 13285 + "loss": 362.285, + "grad_norm": 43.416664123535156, + "learning_rate": 0.00018927632702745866, + "epoch": 0.5776154782497277, + "step": 644 }, { - "loss": 2.1769, - "grad_norm": 1.5497047901153564, - "learning_rate": 5e-05, - "epoch": 0.8509957098034193, - "step": 13290 + "loss": 360.188, + "grad_norm": 44.63084030151367, + "learning_rate": 0.00018859265774909668, + "epoch": 0.5785123966942148, + "step": 645 }, { - "loss": 2.1584, - "grad_norm": 1.6006513833999634, - "learning_rate": 5e-05, - "epoch": 0.8513158737273484, - "step": 13295 + "loss": 362.1082, + "grad_norm": 43.95875930786133, + "learning_rate": 0.00018790947684118364, + "epoch": 0.5794093151387021, + "step": 646 }, { - "loss": 2.1348, - "grad_norm": 1.671229600906372, - "learning_rate": 5e-05, - "epoch": 0.8516360376512775, - "step": 13300 + "loss": 364.6595, + "grad_norm": 46.196041107177734, + "learning_rate": 0.00018722678973703355, + "epoch": 0.5803062335831892, + "step": 647 }, { - "loss": 2.1527, - "grad_norm": 1.5722646713256836, - "learning_rate": 5e-05, - "epoch": 0.8519562015752065, - "step": 13305 + "loss": 367.5318, + "grad_norm": 52.50529479980469, + "learning_rate": 0.00018654460186603295, + "epoch": 0.5812031520276764, + "step": 648 }, { - "loss": 2.156, - "grad_norm": 1.5288598537445068, - "learning_rate": 5e-05, - "epoch": 0.8522763654991355, - "step": 13310 + "loss": 364.7477, + "grad_norm": 44.10645294189453, + "learning_rate": 0.00018586291865359822, + "epoch": 0.5821000704721635, + "step": 649 }, { - "loss": 2.1362, - "grad_norm": 1.6392875909805298, - "learning_rate": 5e-05, - "epoch": 0.8525965294230646, - "step": 13315 + "loss": 362.5089, + "grad_norm": 42.808326721191406, + "learning_rate": 0.00018518174552113216, + "epoch": 0.5829969889166506, + "step": 650 }, { - "loss": 2.1416, - "grad_norm": 1.5657522678375244, - "learning_rate": 5e-05, - "epoch": 0.8529166933469937, - "step": 13320 + "eval_loss": 1.6019372940063477, + "eval_runtime": 17.6903, + "eval_samples_per_second": 115.769, + "eval_steps_per_second": 14.471, + "epoch": 0.5829969889166506, + "step": 650 }, { - "loss": 2.1231, - "grad_norm": 1.6025290489196777, - "learning_rate": 5e-05, - "epoch": 0.8532368572709227, - "step": 13325 + "loss": 361.447, + "grad_norm": 45.0283088684082, + "learning_rate": 0.0001845010878859809, + "epoch": 0.5838939073611378, + "step": 651 }, { - "loss": 2.1308, - "grad_norm": 1.575181245803833, - "learning_rate": 5e-05, - "epoch": 0.8535570211948518, - "step": 13330 + "loss": 363.9907, + "grad_norm": 45.77663040161133, + "learning_rate": 0.00018382095116139098, + "epoch": 0.5847908258056249, + "step": 652 }, { - "loss": 2.1447, - "grad_norm": 1.7142658233642578, - "learning_rate": 5e-05, - "epoch": 0.8538771851187809, - "step": 13335 + "loss": 358.2193, + "grad_norm": 47.19649124145508, + "learning_rate": 0.00018314134075646582, + "epoch": 0.5856877442501122, + "step": 653 }, { - "loss": 2.1016, - "grad_norm": 1.573530912399292, - "learning_rate": 5e-05, - "epoch": 0.8541973490427098, - "step": 13340 + "loss": 362.618, + "grad_norm": 45.46641540527344, + "learning_rate": 0.00018246226207612338, + "epoch": 0.5865846626945993, + "step": 654 }, { - "loss": 2.1634, - "grad_norm": 1.64275324344635, - "learning_rate": 5e-05, - "epoch": 0.8545175129666389, - "step": 13345 + "loss": 364.6533, + "grad_norm": 45.993873596191406, + "learning_rate": 0.00018178372052105263, + "epoch": 0.5874815811390864, + "step": 655 }, { - "loss": 2.1234, - "grad_norm": 1.6251364946365356, - "learning_rate": 5e-05, - "epoch": 0.854837676890568, - "step": 13350 + "loss": 359.9103, + "grad_norm": 49.62721252441406, + "learning_rate": 0.00018110572148767089, + "epoch": 0.5883784995835736, + "step": 656 }, { - "loss": 2.1461, - "grad_norm": 1.5351976156234741, - "learning_rate": 5e-05, - "epoch": 0.855157840814497, - "step": 13355 + "loss": 362.929, + "grad_norm": 47.14739227294922, + "learning_rate": 0.00018042827036808074, + "epoch": 0.5892754180280607, + "step": 657 }, { - "loss": 2.1326, - "grad_norm": 1.557486891746521, - "learning_rate": 5e-05, - "epoch": 0.8554780047384261, - "step": 13360 + "loss": 364.1747, + "grad_norm": 46.9727897644043, + "learning_rate": 0.00017975137255002744, + "epoch": 0.5901723364725479, + "step": 658 }, { - "loss": 2.1336, - "grad_norm": 1.638458013534546, - "learning_rate": 5e-05, - "epoch": 0.8557981686623551, - "step": 13365 + "loss": 362.2029, + "grad_norm": 45.876277923583984, + "learning_rate": 0.0001790750334168555, + "epoch": 0.591069254917035, + "step": 659 }, { - "loss": 2.1435, - "grad_norm": 1.5943306684494019, - "learning_rate": 5e-05, - "epoch": 0.8561183325862842, - "step": 13370 + "loss": 359.2526, + "grad_norm": 42.93642807006836, + "learning_rate": 0.00017839925834746653, + "epoch": 0.5919661733615222, + "step": 660 }, { - "loss": 2.1446, - "grad_norm": 1.5555797815322876, - "learning_rate": 5e-05, - "epoch": 0.8564384965102132, - "step": 13375 + "loss": 363.6162, + "grad_norm": 41.57487487792969, + "learning_rate": 0.0001777240527162761, + "epoch": 0.5928630918060094, + "step": 661 }, { - "loss": 2.1498, - "grad_norm": 1.5393874645233154, - "learning_rate": 5e-05, - "epoch": 0.8567586604341423, - "step": 13380 + "loss": 361.9038, + "grad_norm": 46.25205993652344, + "learning_rate": 0.00017704942189317104, + "epoch": 0.5937600102504965, + "step": 662 }, { - "loss": 2.1332, - "grad_norm": 1.7171132564544678, - "learning_rate": 5e-05, - "epoch": 0.8570788243580714, - "step": 13385 + "loss": 358.8016, + "grad_norm": 45.354007720947266, + "learning_rate": 0.0001763753712434666, + "epoch": 0.5946569286949837, + "step": 663 }, { - "loss": 2.1282, - "grad_norm": 1.6407911777496338, - "learning_rate": 5e-05, - "epoch": 0.8573989882820003, - "step": 13390 + "loss": 361.5577, + "grad_norm": 42.980037689208984, + "learning_rate": 0.00017570190612786413, + "epoch": 0.5955538471394708, + "step": 664 }, { - "loss": 2.1315, - "grad_norm": 1.6605337858200073, - "learning_rate": 5e-05, - "epoch": 0.8577191522059294, - "step": 13395 + "loss": 361.3445, + "grad_norm": 44.7468147277832, + "learning_rate": 0.00017502903190240815, + "epoch": 0.5964507655839579, + "step": 665 }, { - "loss": 2.129, - "grad_norm": 1.655444860458374, - "learning_rate": 5e-05, - "epoch": 0.8580393161298585, - "step": 13400 + "loss": 360.489, + "grad_norm": 43.96569061279297, + "learning_rate": 0.00017435675391844397, + "epoch": 0.5973476840284452, + "step": 666 }, { - "eval_loss": 2.0151822566986084, - "eval_runtime": 12.5576, - "eval_samples_per_second": 163.088, - "eval_steps_per_second": 20.386, - "epoch": 0.8580393161298585, - "step": 13400 + "loss": 365.539, + "grad_norm": 45.040103912353516, + "learning_rate": 0.00017368507752257495, + "epoch": 0.5982446024729323, + "step": 667 }, { - "loss": 2.1229, - "grad_norm": 1.6577417850494385, - "learning_rate": 5e-05, - "epoch": 0.8583594800537876, - "step": 13405 + "loss": 363.3497, + "grad_norm": 45.93570327758789, + "learning_rate": 0.00017301400805661989, + "epoch": 0.5991415209174195, + "step": 668 }, { - "loss": 2.1458, - "grad_norm": 1.5821425914764404, - "learning_rate": 5e-05, - "epoch": 0.8586796439777166, - "step": 13410 + "loss": 356.2852, + "grad_norm": 41.94508743286133, + "learning_rate": 0.00017234355085757086, + "epoch": 0.6000384393619066, + "step": 669 }, { - "loss": 2.1436, - "grad_norm": 1.529543161392212, - "learning_rate": 5e-05, - "epoch": 0.8589998079016457, - "step": 13415 + "loss": 364.3321, + "grad_norm": 40.20936584472656, + "learning_rate": 0.00017167371125755064, + "epoch": 0.6009353578063937, + "step": 670 }, { - "loss": 2.1617, - "grad_norm": 1.5528123378753662, - "learning_rate": 5e-05, - "epoch": 0.8593199718255747, - "step": 13420 + "loss": 365.0333, + "grad_norm": 42.29598617553711, + "learning_rate": 0.00017100449458377003, + "epoch": 0.6018322762508809, + "step": 671 }, { - "loss": 2.1315, - "grad_norm": 1.5557422637939453, - "learning_rate": 5e-05, - "epoch": 0.8596401357495037, - "step": 13425 + "loss": 356.7194, + "grad_norm": 41.43622589111328, + "learning_rate": 0.00017033590615848598, + "epoch": 0.602729194695368, + "step": 672 }, { - "loss": 2.1229, - "grad_norm": 1.7199292182922363, - "learning_rate": 5e-05, - "epoch": 0.8599602996734328, - "step": 13430 + "loss": 362.7276, + "grad_norm": 44.03760528564453, + "learning_rate": 0.0001696679512989589, + "epoch": 0.6036261131398553, + "step": 673 }, { - "loss": 2.1343, - "grad_norm": 1.5561965703964233, - "learning_rate": 5e-05, - "epoch": 0.8602804635973619, - "step": 13435 + "loss": 359.1711, + "grad_norm": 39.68849182128906, + "learning_rate": 0.00016900063531741048, + "epoch": 0.6045230315843424, + "step": 674 }, { - "loss": 2.1587, - "grad_norm": 1.6795167922973633, - "learning_rate": 5e-05, - "epoch": 0.8606006275212909, - "step": 13440 + "loss": 357.2, + "grad_norm": 40.92485809326172, + "learning_rate": 0.0001683339635209813, + "epoch": 0.6054199500288295, + "step": 675 }, { - "loss": 2.1469, - "grad_norm": 1.5269910097122192, - "learning_rate": 5e-05, - "epoch": 0.8609207914452199, - "step": 13445 + "loss": 362.3214, + "grad_norm": 41.29072189331055, + "learning_rate": 0.000167667941211689, + "epoch": 0.6063168684733167, + "step": 676 }, { - "loss": 2.1506, - "grad_norm": 1.5669056177139282, - "learning_rate": 5e-05, - "epoch": 0.861240955369149, - "step": 13450 + "loss": 361.0124, + "grad_norm": 41.026676177978516, + "learning_rate": 0.00016700257368638572, + "epoch": 0.6072137869178038, + "step": 677 }, { - "loss": 2.1485, - "grad_norm": 1.6965998411178589, - "learning_rate": 5e-05, - "epoch": 0.8615611192930781, - "step": 13455 + "loss": 360.2582, + "grad_norm": 43.93520736694336, + "learning_rate": 0.0001663378662367161, + "epoch": 0.608110705362291, + "step": 678 }, { - "loss": 2.1213, - "grad_norm": 1.7064182758331299, - "learning_rate": 5e-05, - "epoch": 0.8618812832170071, - "step": 13460 + "loss": 358.0945, + "grad_norm": 43.4892578125, + "learning_rate": 0.00016567382414907532, + "epoch": 0.6090076238067781, + "step": 679 }, { - "loss": 2.1397, - "grad_norm": 1.587528944015503, - "learning_rate": 5e-05, - "epoch": 0.8622014471409362, - "step": 13465 + "loss": 360.7998, + "grad_norm": 43.67966842651367, + "learning_rate": 0.00016501045270456694, + "epoch": 0.6099045422512653, + "step": 680 }, { - "loss": 2.1396, - "grad_norm": 1.616944670677185, - "learning_rate": 5e-05, - "epoch": 0.8625216110648652, - "step": 13470 + "loss": 359.6815, + "grad_norm": 42.92584991455078, + "learning_rate": 0.0001643477571789609, + "epoch": 0.6108014606957525, + "step": 681 }, { - "loss": 2.1513, - "grad_norm": 1.6129498481750488, - "learning_rate": 5e-05, - "epoch": 0.8628417749887942, - "step": 13475 + "loss": 361.6625, + "grad_norm": 42.53407287597656, + "learning_rate": 0.00016368574284265165, + "epoch": 0.6116983791402396, + "step": 682 }, { - "loss": 2.1434, - "grad_norm": 1.612741231918335, - "learning_rate": 5e-05, - "epoch": 0.8631619389127233, - "step": 13480 + "loss": 363.5579, + "grad_norm": 41.2686767578125, + "learning_rate": 0.00016302441496061592, + "epoch": 0.6125952975847268, + "step": 683 }, { - "loss": 2.1399, - "grad_norm": 1.6746602058410645, - "learning_rate": 5e-05, - "epoch": 0.8634821028366524, - "step": 13485 + "loss": 360.9108, + "grad_norm": 42.09267044067383, + "learning_rate": 0.00016236377879237136, + "epoch": 0.6134922160292139, + "step": 684 }, { - "loss": 2.1382, - "grad_norm": 1.6259815692901611, - "learning_rate": 5e-05, - "epoch": 0.8638022667605815, - "step": 13490 + "loss": 360.2266, + "grad_norm": 42.135650634765625, + "learning_rate": 0.0001617038395919344, + "epoch": 0.614389134473701, + "step": 685 }, { - "loss": 2.1307, - "grad_norm": 1.5910407304763794, - "learning_rate": 5e-05, - "epoch": 0.8641224306845104, - "step": 13495 + "loss": 355.2124, + "grad_norm": 41.78007888793945, + "learning_rate": 0.00016104460260777837, + "epoch": 0.6152860529181883, + "step": 686 }, { - "loss": 2.1576, - "grad_norm": 1.6779932975769043, - "learning_rate": 5e-05, - "epoch": 0.8644425946084395, - "step": 13500 + "loss": 357.8339, + "grad_norm": 41.49577713012695, + "learning_rate": 0.00016038607308279198, + "epoch": 0.6161829713626754, + "step": 687 }, { - "loss": 2.1316, - "grad_norm": 1.5591740608215332, - "learning_rate": 5e-05, - "epoch": 0.8647627585323686, - "step": 13505 + "loss": 361.7785, + "grad_norm": 47.102848052978516, + "learning_rate": 0.00015972825625423765, + "epoch": 0.6170798898071626, + "step": 688 }, { - "loss": 2.1338, - "grad_norm": 1.5584458112716675, - "learning_rate": 5e-05, - "epoch": 0.8650829224562976, - "step": 13510 + "loss": 357.3535, + "grad_norm": 41.43706512451172, + "learning_rate": 0.0001590711573537096, + "epoch": 0.6179768082516497, + "step": 689 }, { - "loss": 2.147, - "grad_norm": 1.5230597257614136, - "learning_rate": 5e-05, - "epoch": 0.8654030863802267, - "step": 13515 + "loss": 359.8207, + "grad_norm": 40.92182540893555, + "learning_rate": 0.00015841478160709242, + "epoch": 0.6188737266961368, + "step": 690 }, { - "loss": 2.1538, - "grad_norm": 1.5795865058898926, - "learning_rate": 5e-05, - "epoch": 0.8657232503041558, - "step": 13520 + "loss": 358.1373, + "grad_norm": 49.461273193359375, + "learning_rate": 0.0001577591342345195, + "epoch": 0.619770645140624, + "step": 691 }, { - "loss": 2.1305, - "grad_norm": 1.5907702445983887, - "learning_rate": 5e-05, - "epoch": 0.8660434142280847, - "step": 13525 + "loss": 361.2856, + "grad_norm": 50.03120040893555, + "learning_rate": 0.00015710422045033158, + "epoch": 0.6206675635851111, + "step": 692 }, { - "loss": 2.1522, - "grad_norm": 1.5211584568023682, - "learning_rate": 5e-05, - "epoch": 0.8663635781520138, - "step": 13530 + "loss": 359.0531, + "grad_norm": 43.81147003173828, + "learning_rate": 0.00015645004546303493, + "epoch": 0.6215644820295984, + "step": 693 }, { - "loss": 2.1708, - "grad_norm": 1.5619218349456787, - "learning_rate": 5e-05, - "epoch": 0.8666837420759429, - "step": 13535 + "loss": 357.6739, + "grad_norm": 44.85881042480469, + "learning_rate": 0.00015579661447526067, + "epoch": 0.6224614004740855, + "step": 694 }, { - "loss": 2.121, - "grad_norm": 1.7085723876953125, - "learning_rate": 5e-05, - "epoch": 0.867003905999872, - "step": 13540 + "loss": 358.5413, + "grad_norm": 45.34134292602539, + "learning_rate": 0.00015514393268372247, + "epoch": 0.6233583189185726, + "step": 695 }, { - "loss": 2.1772, - "grad_norm": 1.5624220371246338, - "learning_rate": 5e-05, - "epoch": 0.867324069923801, - "step": 13545 + "loss": 362.4291, + "grad_norm": 44.94168472290039, + "learning_rate": 0.00015449200527917578, + "epoch": 0.6242552373630598, + "step": 696 }, { - "loss": 2.1555, - "grad_norm": 1.5626081228256226, - "learning_rate": 5e-05, - "epoch": 0.86764423384773, - "step": 13550 + "loss": 353.4212, + "grad_norm": 43.28814697265625, + "learning_rate": 0.00015384083744637663, + "epoch": 0.6251521558075469, + "step": 697 }, { - "loss": 2.1631, - "grad_norm": 1.6283838748931885, - "learning_rate": 5e-05, - "epoch": 0.8679643977716591, - "step": 13555 + "loss": 361.8906, + "grad_norm": 42.88665008544922, + "learning_rate": 0.00015319043436403992, + "epoch": 0.626049074252034, + "step": 698 }, { - "loss": 2.1582, - "grad_norm": 1.5527262687683105, - "learning_rate": 5e-05, - "epoch": 0.8682845616955881, - "step": 13560 + "loss": 357.3509, + "grad_norm": 46.005001068115234, + "learning_rate": 0.00015254080120479874, + "epoch": 0.6269459926965213, + "step": 699 }, { - "loss": 2.1323, - "grad_norm": 1.5154231786727905, - "learning_rate": 5e-05, - "epoch": 0.8686047256195172, - "step": 13565 + "loss": 356.4296, + "grad_norm": 44.4104118347168, + "learning_rate": 0.00015189194313516288, + "epoch": 0.6278429111410084, + "step": 700 }, { - "loss": 2.1506, - "grad_norm": 1.5625056028366089, - "learning_rate": 5e-05, - "epoch": 0.8689248895434463, - "step": 13570 + "eval_loss": 1.597915768623352, + "eval_runtime": 17.571, + "eval_samples_per_second": 116.555, + "eval_steps_per_second": 14.569, + "epoch": 0.6278429111410084, + "step": 700 }, { - "loss": 2.1419, - "grad_norm": 1.569746494293213, - "learning_rate": 5e-05, - "epoch": 0.8692450534673754, - "step": 13575 + "loss": 358.631, + "grad_norm": 43.341407775878906, + "learning_rate": 0.000151243865315478, + "epoch": 0.6287398295854956, + "step": 701 }, { - "loss": 2.1709, - "grad_norm": 1.5288923978805542, - "learning_rate": 5e-05, - "epoch": 0.8695652173913043, - "step": 13580 + "loss": 361.772, + "grad_norm": 43.18885803222656, + "learning_rate": 0.00015059657289988426, + "epoch": 0.6296367480299827, + "step": 702 }, { - "loss": 2.149, - "grad_norm": 1.5755292177200317, - "learning_rate": 5e-05, - "epoch": 0.8698853813152334, - "step": 13585 + "loss": 359.0464, + "grad_norm": 41.106483459472656, + "learning_rate": 0.00014995007103627567, + "epoch": 0.6305336664744698, + "step": 703 }, { - "loss": 2.1528, - "grad_norm": 1.5749847888946533, - "learning_rate": 5e-05, - "epoch": 0.8702055452391625, - "step": 13590 + "loss": 358.0773, + "grad_norm": 42.815834045410156, + "learning_rate": 0.00014930436486625907, + "epoch": 0.631430584918957, + "step": 704 }, { - "loss": 2.1523, - "grad_norm": 1.5499904155731201, - "learning_rate": 5e-05, - "epoch": 0.8705257091630915, - "step": 13595 + "loss": 358.7279, + "grad_norm": 39.7459602355957, + "learning_rate": 0.00014865945952511296, + "epoch": 0.6323275033634441, + "step": 705 }, { - "loss": 2.1692, - "grad_norm": 1.6534972190856934, - "learning_rate": 5e-05, - "epoch": 0.8708458730870205, - "step": 13600 + "loss": 358.3263, + "grad_norm": 42.54743576049805, + "learning_rate": 0.00014801536014174706, + "epoch": 0.6332244218079314, + "step": 706 }, { - "eval_loss": 2.002166748046875, - "eval_runtime": 9.601, - "eval_samples_per_second": 213.311, - "eval_steps_per_second": 26.664, - "epoch": 0.8708458730870205, - "step": 13600 + "loss": 365.4639, + "grad_norm": 45.69781494140625, + "learning_rate": 0.00014737207183866118, + "epoch": 0.6341213402524185, + "step": 707 }, { - "loss": 2.1695, - "grad_norm": 1.6838092803955078, - "learning_rate": 5e-05, - "epoch": 0.8711660370109496, - "step": 13605 + "loss": 357.4766, + "grad_norm": 44.834136962890625, + "learning_rate": 0.0001467295997319049, + "epoch": 0.6350182586969056, + "step": 708 }, { - "loss": 2.1425, - "grad_norm": 1.5831983089447021, - "learning_rate": 5e-05, - "epoch": 0.8714862009348786, - "step": 13610 + "loss": 361.5132, + "grad_norm": 40.79405975341797, + "learning_rate": 0.00014608794893103646, + "epoch": 0.6359151771413928, + "step": 709 }, { - "loss": 2.1434, - "grad_norm": 1.620985746383667, - "learning_rate": 5e-05, - "epoch": 0.8718063648588077, - "step": 13615 + "loss": 361.108, + "grad_norm": 40.1624870300293, + "learning_rate": 0.00014544712453908216, + "epoch": 0.6368120955858799, + "step": 710 }, { - "loss": 2.1263, - "grad_norm": 1.5088939666748047, - "learning_rate": 5e-05, - "epoch": 0.8721265287827368, - "step": 13620 + "loss": 357.4099, + "grad_norm": 42.602073669433594, + "learning_rate": 0.00014480713165249609, + "epoch": 0.6377090140303671, + "step": 711 }, { - "loss": 2.1572, - "grad_norm": 1.5807956457138062, - "learning_rate": 5e-05, - "epoch": 0.8724466927066659, - "step": 13625 + "loss": 360.979, + "grad_norm": 43.97264099121094, + "learning_rate": 0.00014416797536111919, + "epoch": 0.6386059324748542, + "step": 712 }, { - "loss": 2.1683, - "grad_norm": 1.6206260919570923, - "learning_rate": 5e-05, - "epoch": 0.8727668566305948, - "step": 13630 + "loss": 361.3081, + "grad_norm": 40.94137191772461, + "learning_rate": 0.00014352966074813932, + "epoch": 0.6395028509193414, + "step": 713 }, { - "loss": 2.1358, - "grad_norm": 1.5690107345581055, - "learning_rate": 5e-05, - "epoch": 0.8730870205545239, - "step": 13635 + "loss": 359.9567, + "grad_norm": 40.18381881713867, + "learning_rate": 0.00014289219289005027, + "epoch": 0.6403997693638286, + "step": 714 }, { - "loss": 2.1412, - "grad_norm": 1.515356183052063, - "learning_rate": 5e-05, - "epoch": 0.873407184478453, - "step": 13640 + "loss": 353.732, + "grad_norm": 45.907203674316406, + "learning_rate": 0.0001422555768566115, + "epoch": 0.6412966878083157, + "step": 715 }, { - "loss": 2.1233, - "grad_norm": 1.603214979171753, - "learning_rate": 5e-05, - "epoch": 0.873727348402382, - "step": 13645 + "loss": 358.1761, + "grad_norm": 46.9672737121582, + "learning_rate": 0.0001416198177108083, + "epoch": 0.6421936062528029, + "step": 716 }, { - "loss": 2.1411, - "grad_norm": 1.5593687295913696, - "learning_rate": 5e-05, - "epoch": 0.8740475123263111, - "step": 13650 + "loss": 358.2166, + "grad_norm": 40.92546081542969, + "learning_rate": 0.0001409849205088109, + "epoch": 0.64309052469729, + "step": 717 }, { - "loss": 2.1334, - "grad_norm": 1.5858949422836304, - "learning_rate": 5e-05, - "epoch": 0.8743676762502401, - "step": 13655 + "loss": 358.0281, + "grad_norm": 39.04634475708008, + "learning_rate": 0.00014035089029993444, + "epoch": 0.6439874431417771, + "step": 718 }, { - "loss": 2.1357, - "grad_norm": 1.5124623775482178, - "learning_rate": 5e-05, - "epoch": 0.8746878401741692, - "step": 13660 + "loss": 358.9151, + "grad_norm": 41.55719757080078, + "learning_rate": 0.00013971773212659929, + "epoch": 0.6448843615862644, + "step": 719 }, { - "loss": 2.1547, - "grad_norm": 1.6396076679229736, - "learning_rate": 5e-05, - "epoch": 0.8750080040980982, - "step": 13665 + "loss": 356.5345, + "grad_norm": 41.81498336791992, + "learning_rate": 0.00013908545102429, + "epoch": 0.6457812800307515, + "step": 720 }, { - "loss": 2.1258, - "grad_norm": 1.53853440284729, - "learning_rate": 5e-05, - "epoch": 0.8753281680220273, - "step": 13670 + "loss": 358.3629, + "grad_norm": 40.042484283447266, + "learning_rate": 0.00013845405202151637, + "epoch": 0.6466781984752387, + "step": 721 }, { - "loss": 2.1557, - "grad_norm": 1.5771784782409668, - "learning_rate": 5e-05, - "epoch": 0.8756483319459564, - "step": 13675 + "loss": 360.9086, + "grad_norm": 44.207122802734375, + "learning_rate": 0.00013782354013977245, + "epoch": 0.6475751169197258, + "step": 722 }, { - "loss": 2.1493, - "grad_norm": 1.5845115184783936, - "learning_rate": 5e-05, - "epoch": 0.8759684958698853, - "step": 13680 + "loss": 357.7452, + "grad_norm": 45.20026779174805, + "learning_rate": 0.00013719392039349734, + "epoch": 0.6484720353642129, + "step": 723 }, { - "loss": 2.1492, - "grad_norm": 1.4543273448944092, - "learning_rate": 5e-05, - "epoch": 0.8762886597938144, - "step": 13685 + "loss": 358.4982, + "grad_norm": 41.07488250732422, + "learning_rate": 0.00013656519779003476, + "epoch": 0.6493689538087001, + "step": 724 }, { - "loss": 2.1413, - "grad_norm": 1.622414469718933, - "learning_rate": 5e-05, - "epoch": 0.8766088237177435, - "step": 13690 + "loss": 361.3215, + "grad_norm": 43.69713592529297, + "learning_rate": 0.00013593737732959382, + "epoch": 0.6502658722531872, + "step": 725 }, { - "loss": 2.1324, - "grad_norm": 1.5697697401046753, - "learning_rate": 5e-05, - "epoch": 0.8769289876416725, - "step": 13695 + "loss": 356.6879, + "grad_norm": 45.356109619140625, + "learning_rate": 0.00013531046400520858, + "epoch": 0.6511627906976745, + "step": 726 }, { - "loss": 2.138, - "grad_norm": 1.5314058065414429, - "learning_rate": 5e-05, - "epoch": 0.8772491515656016, - "step": 13700 + "loss": 363.6577, + "grad_norm": 44.325103759765625, + "learning_rate": 0.0001346844628026988, + "epoch": 0.6520597091421616, + "step": 727 }, { - "loss": 2.1417, - "grad_norm": 1.576690673828125, - "learning_rate": 5e-05, - "epoch": 0.8775693154895307, - "step": 13705 + "loss": 358.3399, + "grad_norm": 40.79582595825195, + "learning_rate": 0.0001340593787006303, + "epoch": 0.6529566275866487, + "step": 728 }, { - "loss": 2.1656, - "grad_norm": 1.596118450164795, - "learning_rate": 5e-05, - "epoch": 0.8778894794134597, - "step": 13710 + "loss": 360.8162, + "grad_norm": 40.47697448730469, + "learning_rate": 0.0001334352166702751, + "epoch": 0.6538535460311359, + "step": 729 }, { - "loss": 2.1618, - "grad_norm": 1.6418870687484741, - "learning_rate": 5e-05, - "epoch": 0.8782096433373887, - "step": 13715 + "loss": 356.254, + "grad_norm": 43.549407958984375, + "learning_rate": 0.00013281198167557185, + "epoch": 0.654750464475623, + "step": 730 }, { - "loss": 2.1169, - "grad_norm": 1.5763425827026367, - "learning_rate": 5e-05, - "epoch": 0.8785298072613178, - "step": 13720 + "loss": 356.3695, + "grad_norm": 41.08717727661133, + "learning_rate": 0.00013218967867308694, + "epoch": 0.6556473829201102, + "step": 731 }, { - "loss": 2.1293, - "grad_norm": 1.6006461381912231, - "learning_rate": 5e-05, - "epoch": 0.8788499711852469, - "step": 13725 + "loss": 359.2961, + "grad_norm": 44.06740951538086, + "learning_rate": 0.00013156831261197438, + "epoch": 0.6565443013645973, + "step": 732 }, { - "loss": 2.1461, - "grad_norm": 1.6010429859161377, - "learning_rate": 5e-05, - "epoch": 0.8791701351091759, - "step": 13730 + "loss": 354.8276, + "grad_norm": 44.14928436279297, + "learning_rate": 0.00013094788843393657, + "epoch": 0.6574412198090845, + "step": 733 }, { - "loss": 2.1372, - "grad_norm": 1.5433728694915771, - "learning_rate": 5e-05, - "epoch": 0.8794902990331049, - "step": 13735 + "loss": 356.655, + "grad_norm": 41.25139236450195, + "learning_rate": 0.0001303284110731856, + "epoch": 0.6583381382535717, + "step": 734 }, { - "loss": 2.1334, - "grad_norm": 1.6275326013565063, - "learning_rate": 5e-05, - "epoch": 0.879810462957034, - "step": 13740 + "loss": 359.9945, + "grad_norm": 43.141475677490234, + "learning_rate": 0.00012970988545640307, + "epoch": 0.6592350566980588, + "step": 735 }, { - "loss": 2.1458, - "grad_norm": 1.6575015783309937, - "learning_rate": 5e-05, - "epoch": 0.8801306268809631, - "step": 13745 + "loss": 354.7369, + "grad_norm": 45.27100372314453, + "learning_rate": 0.0001290923165027017, + "epoch": 0.660131975142546, + "step": 736 }, { - "loss": 2.1217, - "grad_norm": 1.591966152191162, - "learning_rate": 5e-05, - "epoch": 0.8804507908048921, - "step": 13750 + "loss": 357.4191, + "grad_norm": 41.795658111572266, + "learning_rate": 0.0001284757091235859, + "epoch": 0.6610288935870331, + "step": 737 }, { - "loss": 2.1051, - "grad_norm": 1.569419503211975, - "learning_rate": 5e-05, - "epoch": 0.8807709547288212, - "step": 13755 + "loss": 353.508, + "grad_norm": 43.1330680847168, + "learning_rate": 0.0001278600682229126, + "epoch": 0.6619258120315202, + "step": 738 }, { - "loss": 2.1461, - "grad_norm": 1.5138541460037231, - "learning_rate": 5e-05, - "epoch": 0.8810911186527502, - "step": 13760 + "loss": 356.3365, + "grad_norm": 43.488121032714844, + "learning_rate": 0.00012724539869685226, + "epoch": 0.6628227304760075, + "step": 739 }, { - "loss": 2.1666, - "grad_norm": 1.5544683933258057, - "learning_rate": 5e-05, - "epoch": 0.8814112825766792, - "step": 13765 + "loss": 357.6046, + "grad_norm": 42.182777404785156, + "learning_rate": 0.0001266317054338503, + "epoch": 0.6637196489204946, + "step": 740 }, { - "loss": 2.1572, - "grad_norm": 1.6272661685943604, - "learning_rate": 5e-05, - "epoch": 0.8817314465006083, - "step": 13770 + "loss": 358.7371, + "grad_norm": 43.06134796142578, + "learning_rate": 0.00012601899331458777, + "epoch": 0.6646165673649818, + "step": 741 }, { - "loss": 2.1285, - "grad_norm": 1.59254789352417, - "learning_rate": 5e-05, - "epoch": 0.8820516104245374, - "step": 13775 + "loss": 358.2452, + "grad_norm": 40.01738357543945, + "learning_rate": 0.00012540726721194266, + "epoch": 0.6655134858094689, + "step": 742 }, { - "loss": 2.132, - "grad_norm": 1.5939842462539673, - "learning_rate": 5e-05, - "epoch": 0.8823717743484664, - "step": 13780 + "loss": 361.5233, + "grad_norm": 40.66733169555664, + "learning_rate": 0.0001247965319909515, + "epoch": 0.666410404253956, + "step": 743 }, { - "loss": 2.1404, - "grad_norm": 1.512245774269104, - "learning_rate": 5e-05, - "epoch": 0.8826919382723954, - "step": 13785 + "loss": 354.1553, + "grad_norm": 39.47666931152344, + "learning_rate": 0.0001241867925087701, + "epoch": 0.6673073226984432, + "step": 744 }, { - "loss": 2.1267, - "grad_norm": 1.4971529245376587, - "learning_rate": 5e-05, - "epoch": 0.8830121021963245, - "step": 13790 + "loss": 358.3203, + "grad_norm": 39.22403335571289, + "learning_rate": 0.00012357805361463514, + "epoch": 0.6682042411429303, + "step": 745 }, { - "loss": 2.1288, - "grad_norm": 1.554547667503357, - "learning_rate": 5e-05, - "epoch": 0.8833322661202536, - "step": 13795 + "loss": 357.0617, + "grad_norm": 39.071529388427734, + "learning_rate": 0.00012297032014982597, + "epoch": 0.6691011595874176, + "step": 746 }, { - "loss": 2.124, - "grad_norm": 1.5361340045928955, - "learning_rate": 5e-05, - "epoch": 0.8836524300441826, - "step": 13800 + "loss": 362.905, + "grad_norm": 40.75625228881836, + "learning_rate": 0.0001223635969476255, + "epoch": 0.6699980780319047, + "step": 747 }, { - "eval_loss": 2.003274917602539, - "eval_runtime": 9.0968, - "eval_samples_per_second": 225.134, - "eval_steps_per_second": 28.142, - "epoch": 0.8836524300441826, - "step": 13800 + "loss": 354.9351, + "grad_norm": 42.89009094238281, + "learning_rate": 0.00012175788883328232, + "epoch": 0.6708949964763918, + "step": 748 }, { - "loss": 2.1452, - "grad_norm": 1.6627310514450073, - "learning_rate": 5e-05, - "epoch": 0.8839725939681117, - "step": 13805 + "loss": 359.415, + "grad_norm": 43.072513580322266, + "learning_rate": 0.0001211532006239718, + "epoch": 0.671791914920879, + "step": 749 }, { - "loss": 2.1283, - "grad_norm": 1.6165390014648438, - "learning_rate": 5e-05, - "epoch": 0.8842927578920408, - "step": 13810 + "loss": 357.7546, + "grad_norm": 40.25785446166992, + "learning_rate": 0.00012054953712875807, + "epoch": 0.6726888333653661, + "step": 750 }, { - "loss": 2.1567, - "grad_norm": 1.6117839813232422, - "learning_rate": 5e-05, - "epoch": 0.8846129218159697, - "step": 13815 + "eval_loss": 1.609327793121338, + "eval_runtime": 17.5285, + "eval_samples_per_second": 116.839, + "eval_steps_per_second": 14.605, + "epoch": 0.6726888333653661, + "step": 750 }, { - "loss": 2.1299, - "grad_norm": 1.6143478155136108, - "learning_rate": 5e-05, - "epoch": 0.8849330857398988, - "step": 13820 + "loss": 357.2794, + "grad_norm": 41.602596282958984, + "learning_rate": 0.00011994690314855598, + "epoch": 0.6735857518098533, + "step": 751 }, { - "loss": 2.1314, - "grad_norm": 1.5183966159820557, - "learning_rate": 5e-05, - "epoch": 0.8852532496638279, - "step": 13825 + "loss": 361.091, + "grad_norm": 41.749717712402344, + "learning_rate": 0.00011934530347609257, + "epoch": 0.6744826702543405, + "step": 752 }, { - "loss": 2.1788, - "grad_norm": 1.5318809747695923, - "learning_rate": 5e-05, - "epoch": 0.885573413587757, - "step": 13830 + "loss": 362.0817, + "grad_norm": 39.51606369018555, + "learning_rate": 0.00011874474289586895, + "epoch": 0.6753795886988276, + "step": 753 }, { - "loss": 2.1037, - "grad_norm": 1.6234662532806396, - "learning_rate": 5e-05, - "epoch": 0.885893577511686, - "step": 13835 + "loss": 356.8317, + "grad_norm": 40.00758743286133, + "learning_rate": 0.00011814522618412235, + "epoch": 0.6762765071433148, + "step": 754 }, { - "loss": 2.1407, - "grad_norm": 1.6068276166915894, - "learning_rate": 5e-05, - "epoch": 0.886213741435615, - "step": 13840 + "loss": 359.7722, + "grad_norm": 41.676292419433594, + "learning_rate": 0.00011754675810878845, + "epoch": 0.6771734255878019, + "step": 755 }, { - "loss": 2.1296, - "grad_norm": 1.6118892431259155, - "learning_rate": 5e-05, - "epoch": 0.8865339053595441, - "step": 13845 + "loss": 359.641, + "grad_norm": 41.25587463378906, + "learning_rate": 0.00011694934342946287, + "epoch": 0.678070344032289, + "step": 756 }, { - "loss": 2.1491, - "grad_norm": 1.5626516342163086, - "learning_rate": 5e-05, - "epoch": 0.8868540692834731, - "step": 13850 + "loss": 352.955, + "grad_norm": 40.348514556884766, + "learning_rate": 0.00011635298689736357, + "epoch": 0.6789672624767762, + "step": 757 }, { - "loss": 2.1442, - "grad_norm": 1.560748815536499, - "learning_rate": 5e-05, - "epoch": 0.8871742332074022, - "step": 13855 + "loss": 362.8987, + "grad_norm": 43.387184143066406, + "learning_rate": 0.00011575769325529342, + "epoch": 0.6798641809212633, + "step": 758 }, { - "loss": 2.1449, - "grad_norm": 1.6533335447311401, - "learning_rate": 5e-05, - "epoch": 0.8874943971313313, - "step": 13860 + "loss": 357.0482, + "grad_norm": 40.06668472290039, + "learning_rate": 0.00011516346723760193, + "epoch": 0.6807610993657506, + "step": 759 }, { - "loss": 2.1301, - "grad_norm": 1.671207070350647, - "learning_rate": 5e-05, - "epoch": 0.8878145610552602, - "step": 13865 + "loss": 359.7377, + "grad_norm": 39.39516830444336, + "learning_rate": 0.00011457031357014772, + "epoch": 0.6816580178102377, + "step": 760 }, { - "loss": 2.1246, - "grad_norm": 1.6996030807495117, - "learning_rate": 5e-05, - "epoch": 0.8881347249791893, - "step": 13870 + "loss": 362.0869, + "grad_norm": 39.07398223876953, + "learning_rate": 0.0001139782369702614, + "epoch": 0.6825549362547249, + "step": 761 }, { - "loss": 2.1249, - "grad_norm": 1.563635230064392, - "learning_rate": 5e-05, - "epoch": 0.8884548889031184, - "step": 13875 + "loss": 357.4482, + "grad_norm": 42.54057312011719, + "learning_rate": 0.00011338724214670734, + "epoch": 0.683451854699212, + "step": 762 }, { - "loss": 2.1062, - "grad_norm": 1.5765130519866943, - "learning_rate": 5e-05, - "epoch": 0.8887750528270475, - "step": 13880 + "loss": 360.6057, + "grad_norm": 40.7839241027832, + "learning_rate": 0.00011279733379964691, + "epoch": 0.6843487731436991, + "step": 763 }, { - "loss": 2.1372, - "grad_norm": 1.61628258228302, - "learning_rate": 5e-05, - "epoch": 0.8890952167509765, - "step": 13885 + "loss": 362.9106, + "grad_norm": 41.402889251708984, + "learning_rate": 0.00011220851662060047, + "epoch": 0.6852456915881863, + "step": 764 }, { - "loss": 2.1548, - "grad_norm": 1.6101058721542358, - "learning_rate": 5e-05, - "epoch": 0.8894153806749056, - "step": 13890 + "loss": 357.1811, + "grad_norm": 41.3732795715332, + "learning_rate": 0.00011162079529241042, + "epoch": 0.6861426100326734, + "step": 765 }, { - "loss": 2.1294, - "grad_norm": 1.6023646593093872, - "learning_rate": 5e-05, - "epoch": 0.8897355445988346, - "step": 13895 + "loss": 358.0857, + "grad_norm": 42.31522750854492, + "learning_rate": 0.00011103417448920406, + "epoch": 0.6870395284771607, + "step": 766 }, { - "loss": 2.1555, - "grad_norm": 1.5916895866394043, - "learning_rate": 5e-05, - "epoch": 0.8900557085227636, - "step": 13900 + "loss": 357.946, + "grad_norm": 38.36897277832031, + "learning_rate": 0.00011044865887635625, + "epoch": 0.6879364469216478, + "step": 767 }, { - "loss": 2.1373, - "grad_norm": 1.6304248571395874, - "learning_rate": 5e-05, - "epoch": 0.8903758724466927, - "step": 13905 + "loss": 360.9647, + "grad_norm": 43.01420974731445, + "learning_rate": 0.00010986425311045212, + "epoch": 0.6888333653661349, + "step": 768 }, { - "loss": 2.1543, - "grad_norm": 1.578668475151062, - "learning_rate": 5e-05, - "epoch": 0.8906960363706218, - "step": 13910 + "loss": 362.1032, + "grad_norm": 40.731163024902344, + "learning_rate": 0.00010928096183925024, + "epoch": 0.6897302838106221, + "step": 769 }, { - "loss": 2.1368, - "grad_norm": 1.7462356090545654, - "learning_rate": 5e-05, - "epoch": 0.8910162002945509, - "step": 13915 + "loss": 363.3222, + "grad_norm": 41.69025421142578, + "learning_rate": 0.00010869878970164587, + "epoch": 0.6906272022551092, + "step": 770 }, { - "loss": 2.126, - "grad_norm": 1.5746142864227295, - "learning_rate": 5e-05, - "epoch": 0.8913363642184798, - "step": 13920 + "loss": 358.3542, + "grad_norm": 37.463043212890625, + "learning_rate": 0.00010811774132763366, + "epoch": 0.6915241206995963, + "step": 771 }, { - "loss": 2.1295, - "grad_norm": 1.598989486694336, - "learning_rate": 5e-05, - "epoch": 0.8916565281424089, - "step": 13925 + "loss": 364.5648, + "grad_norm": 38.481815338134766, + "learning_rate": 0.00010753782133827093, + "epoch": 0.6924210391440836, + "step": 772 }, { - "loss": 2.1271, - "grad_norm": 1.582812786102295, - "learning_rate": 5e-05, - "epoch": 0.891976692066338, - "step": 13930 + "loss": 361.0055, + "grad_norm": 39.70282745361328, + "learning_rate": 0.00010695903434564124, + "epoch": 0.6933179575885707, + "step": 773 }, { - "loss": 2.1282, - "grad_norm": 1.6400415897369385, - "learning_rate": 5e-05, - "epoch": 0.892296855990267, - "step": 13935 + "loss": 359.3154, + "grad_norm": 38.182132720947266, + "learning_rate": 0.00010638138495281725, + "epoch": 0.6942148760330579, + "step": 774 }, { - "loss": 2.1327, - "grad_norm": 1.5493074655532837, - "learning_rate": 5e-05, - "epoch": 0.8926170199141961, - "step": 13940 + "loss": 356.322, + "grad_norm": 37.12331008911133, + "learning_rate": 0.00010580487775382449, + "epoch": 0.695111794477545, + "step": 775 }, { - "loss": 2.1679, - "grad_norm": 1.6458313465118408, - "learning_rate": 5e-05, - "epoch": 0.8929371838381251, - "step": 13945 + "loss": 356.3972, + "grad_norm": 40.065006256103516, + "learning_rate": 0.00010522951733360456, + "epoch": 0.6960087129220321, + "step": 776 }, { - "loss": 2.1539, - "grad_norm": 1.5993520021438599, - "learning_rate": 5e-05, - "epoch": 0.8932573477620541, - "step": 13950 + "loss": 351.4366, + "grad_norm": 40.21229553222656, + "learning_rate": 0.0001046553082679787, + "epoch": 0.6969056313665193, + "step": 777 }, { - "loss": 2.1376, - "grad_norm": 1.5654582977294922, - "learning_rate": 5e-05, - "epoch": 0.8935775116859832, - "step": 13955 + "loss": 356.3872, + "grad_norm": 39.17121124267578, + "learning_rate": 0.00010408225512361171, + "epoch": 0.6978025498110064, + "step": 778 }, { - "loss": 2.1552, - "grad_norm": 1.5213743448257446, - "learning_rate": 5e-05, - "epoch": 0.8938976756099123, - "step": 13960 + "loss": 358.5863, + "grad_norm": 38.62257766723633, + "learning_rate": 0.0001035103624579751, + "epoch": 0.6986994682554937, + "step": 779 }, { - "loss": 2.1289, - "grad_norm": 1.5369318723678589, - "learning_rate": 5e-05, - "epoch": 0.8942178395338414, - "step": 13965 + "loss": 359.1902, + "grad_norm": 39.73896408081055, + "learning_rate": 0.00010293963481931143, + "epoch": 0.6995963866999808, + "step": 780 }, { - "loss": 2.1416, - "grad_norm": 1.6027822494506836, - "learning_rate": 5e-05, - "epoch": 0.8945380034577703, - "step": 13970 + "loss": 357.0757, + "grad_norm": 38.72207260131836, + "learning_rate": 0.00010237007674659752, + "epoch": 0.700493305144468, + "step": 781 }, { - "loss": 2.1395, - "grad_norm": 1.5644904375076294, - "learning_rate": 5e-05, - "epoch": 0.8948581673816994, - "step": 13975 + "loss": 359.07, + "grad_norm": 39.15367126464844, + "learning_rate": 0.00010180169276950899, + "epoch": 0.7013902235889551, + "step": 782 }, { - "loss": 2.1543, - "grad_norm": 1.6486353874206543, - "learning_rate": 5e-05, - "epoch": 0.8951783313056285, - "step": 13980 + "loss": 357.7226, + "grad_norm": 39.2513542175293, + "learning_rate": 0.00010123448740838367, + "epoch": 0.7022871420334422, + "step": 783 }, { - "loss": 2.1565, - "grad_norm": 1.6289517879486084, - "learning_rate": 5e-05, - "epoch": 0.8954984952295575, - "step": 13985 + "loss": 359.4571, + "grad_norm": 41.660953521728516, + "learning_rate": 0.00010066846517418596, + "epoch": 0.7031840604779294, + "step": 784 }, { - "loss": 2.1493, - "grad_norm": 1.700780987739563, - "learning_rate": 5e-05, - "epoch": 0.8958186591534866, - "step": 13990 + "loss": 358.3033, + "grad_norm": 40.074806213378906, + "learning_rate": 0.00010010363056847103, + "epoch": 0.7040809789224165, + "step": 785 }, { - "loss": 2.1413, - "grad_norm": 1.6621733903884888, - "learning_rate": 5e-05, - "epoch": 0.8961388230774157, - "step": 13995 + "loss": 358.5859, + "grad_norm": 40.53306198120117, + "learning_rate": 9.953998808334874e-05, + "epoch": 0.7049778973669037, + "step": 786 }, { - "loss": 2.1535, - "grad_norm": 1.5872445106506348, - "learning_rate": 5e-05, - "epoch": 0.8964589870013447, - "step": 14000 + "loss": 353.3639, + "grad_norm": 43.58430099487305, + "learning_rate": 9.8977542201448e-05, + "epoch": 0.7058748158113909, + "step": 787 }, { - "eval_loss": 2.003190517425537, - "eval_runtime": 9.0419, - "eval_samples_per_second": 226.5, - "eval_steps_per_second": 28.313, - "epoch": 0.8964589870013447, - "step": 14000 + "loss": 359.5676, + "grad_norm": 39.986785888671875, + "learning_rate": 9.841629739588145e-05, + "epoch": 0.706771734255878, + "step": 788 }, { - "loss": 2.1151, - "grad_norm": 1.5972435474395752, - "learning_rate": 5e-05, - "epoch": 0.8967791509252737, - "step": 14005 + "loss": 361.0522, + "grad_norm": 41.356590270996094, + "learning_rate": 9.785625813020923e-05, + "epoch": 0.7076686527003652, + "step": 789 }, { - "loss": 2.1639, - "grad_norm": 1.5537108182907104, - "learning_rate": 5e-05, - "epoch": 0.8970993148492028, - "step": 14010 + "loss": 355.244, + "grad_norm": 40.596397399902344, + "learning_rate": 9.729742885840429e-05, + "epoch": 0.7085655711448523, + "step": 790 }, { - "loss": 2.1425, - "grad_norm": 1.5643787384033203, - "learning_rate": 5e-05, - "epoch": 0.8974194787731319, - "step": 14015 + "loss": 358.6471, + "grad_norm": 39.8510627746582, + "learning_rate": 9.673981402481619e-05, + "epoch": 0.7094624895893394, + "step": 791 }, { - "loss": 2.1339, - "grad_norm": 1.5768754482269287, - "learning_rate": 5e-05, - "epoch": 0.8977396426970609, - "step": 14020 + "loss": 355.7997, + "grad_norm": 37.443397521972656, + "learning_rate": 9.618341806413614e-05, + "epoch": 0.7103594080338267, + "step": 792 }, { - "loss": 2.1362, - "grad_norm": 1.6462067365646362, - "learning_rate": 5e-05, - "epoch": 0.8980598066209899, - "step": 14025 + "loss": 358.5055, + "grad_norm": 38.937034606933594, + "learning_rate": 9.562824540136192e-05, + "epoch": 0.7112563264783138, + "step": 793 }, { - "loss": 2.1413, - "grad_norm": 1.540134072303772, - "learning_rate": 5e-05, - "epoch": 0.898379970544919, - "step": 14030 + "loss": 357.9367, + "grad_norm": 39.378326416015625, + "learning_rate": 9.507430045176238e-05, + "epoch": 0.712153244922801, + "step": 794 }, { - "loss": 2.1417, - "grad_norm": 1.5940558910369873, - "learning_rate": 5e-05, - "epoch": 0.898700134468848, - "step": 14035 + "loss": 356.7012, + "grad_norm": 40.44821548461914, + "learning_rate": 9.452158762084228e-05, + "epoch": 0.7130501633672881, + "step": 795 }, { - "loss": 2.1372, - "grad_norm": 1.5724999904632568, - "learning_rate": 5e-05, - "epoch": 0.8990202983927771, - "step": 14040 + "loss": 361.7253, + "grad_norm": 39.721378326416016, + "learning_rate": 9.397011130430741e-05, + "epoch": 0.7139470818117752, + "step": 796 }, { - "loss": 2.1628, - "grad_norm": 1.611562728881836, - "learning_rate": 5e-05, - "epoch": 0.8993404623167062, - "step": 14045 + "loss": 359.5762, + "grad_norm": 40.48420333862305, + "learning_rate": 9.341987588802984e-05, + "epoch": 0.7148440002562624, + "step": 797 }, { - "loss": 2.1253, - "grad_norm": 1.5111039876937866, - "learning_rate": 5e-05, - "epoch": 0.8996606262406353, - "step": 14050 + "loss": 355.1304, + "grad_norm": 38.8956413269043, + "learning_rate": 9.287088574801248e-05, + "epoch": 0.7157409187007495, + "step": 798 }, { - "loss": 2.17, - "grad_norm": 1.5524070262908936, - "learning_rate": 5e-05, - "epoch": 0.8999807901645642, - "step": 14055 + "loss": 360.5678, + "grad_norm": 41.26605987548828, + "learning_rate": 9.23231452503547e-05, + "epoch": 0.7166378371452368, + "step": 799 }, { - "loss": 2.1206, - "grad_norm": 1.5320378541946411, - "learning_rate": 4.999919096867105e-05, - "epoch": 0.9003009540884933, - "step": 14060 + "loss": 359.8319, + "grad_norm": 36.14881134033203, + "learning_rate": 9.177665875121774e-05, + "epoch": 0.7175347555897239, + "step": 800 }, { - "loss": 2.1422, - "grad_norm": 1.5573452711105347, - "learning_rate": 4.99959043686394e-05, - "epoch": 0.9006211180124224, - "step": 14065 + "eval_loss": 1.5968618392944336, + "eval_runtime": 17.8479, + "eval_samples_per_second": 114.747, + "eval_steps_per_second": 14.343, + "epoch": 0.7175347555897239, + "step": 800 }, { - "loss": 2.1634, - "grad_norm": 1.5653362274169922, - "learning_rate": 4.999008996756062e-05, - "epoch": 0.9009412819363514, - "step": 14070 + "loss": 361.1777, + "grad_norm": 40.25320053100586, + "learning_rate": 9.123143059678952e-05, + "epoch": 0.718431674034211, + "step": 801 }, { - "loss": 2.129, - "grad_norm": 1.6500377655029297, - "learning_rate": 4.998174835343699e-05, - "epoch": 0.9012614458602805, - "step": 14075 + "loss": 355.5561, + "grad_norm": 39.248783111572266, + "learning_rate": 9.068746512325046e-05, + "epoch": 0.7193285924786982, + "step": 802 }, { - "loss": 2.1201, - "grad_norm": 1.5795848369598389, - "learning_rate": 4.9970880369844344e-05, - "epoch": 0.9015816097842095, - "step": 14080 + "loss": 353.493, + "grad_norm": 41.21136474609375, + "learning_rate": 9.014476665673915e-05, + "epoch": 0.7202255109231853, + "step": 803 }, { - "loss": 2.1409, - "grad_norm": 1.48246431350708, - "learning_rate": 4.995748711584676e-05, - "epoch": 0.9019017737081386, - "step": 14085 + "loss": 355.8681, + "grad_norm": 38.923973083496094, + "learning_rate": 8.960333951331739e-05, + "epoch": 0.7211224293676725, + "step": 804 }, { - "loss": 2.1532, - "grad_norm": 1.6394940614700317, - "learning_rate": 4.9941569945885383e-05, - "epoch": 0.9022219376320676, - "step": 14090 + "loss": 355.0969, + "grad_norm": 43.01164627075195, + "learning_rate": 8.906318799893648e-05, + "epoch": 0.7220193478121597, + "step": 805 }, { - "loss": 2.1489, - "grad_norm": 1.6183077096939087, - "learning_rate": 4.992313046964147e-05, - "epoch": 0.9025421015559967, - "step": 14095 + "loss": 354.1833, + "grad_norm": 39.02459716796875, + "learning_rate": 8.852431640940247e-05, + "epoch": 0.7229162662566468, + "step": 806 }, { - "loss": 2.1459, - "grad_norm": 1.5216248035430908, - "learning_rate": 4.990217055187362e-05, - "epoch": 0.9028622654799258, - "step": 14100 + "loss": 359.125, + "grad_norm": 37.63704299926758, + "learning_rate": 8.798672903034225e-05, + "epoch": 0.723813184701134, + "step": 807 }, { - "loss": 2.1391, - "grad_norm": 1.5191268920898438, - "learning_rate": 4.987869231222917e-05, - "epoch": 0.9031824294038547, - "step": 14105 + "loss": 355.6418, + "grad_norm": 38.401512145996094, + "learning_rate": 8.745043013716955e-05, + "epoch": 0.7247101031456211, + "step": 808 }, { - "loss": 2.1464, - "grad_norm": 1.555911660194397, - "learning_rate": 4.985269812502983e-05, - "epoch": 0.9035025933277838, - "step": 14110 + "loss": 358.6194, + "grad_norm": 37.391685485839844, + "learning_rate": 8.691542399505081e-05, + "epoch": 0.7256070215901083, + "step": 809 }, { - "loss": 2.1511, - "grad_norm": 1.577181339263916, - "learning_rate": 4.9824190619031616e-05, - "epoch": 0.9038227572517129, - "step": 14115 + "loss": 359.1611, + "grad_norm": 40.48008728027344, + "learning_rate": 8.638171485887111e-05, + "epoch": 0.7265039400345954, + "step": 810 }, { - "loss": 2.1753, - "grad_norm": 1.580073595046997, - "learning_rate": 4.979317267715895e-05, - "epoch": 0.9041429211756419, - "step": 14120 + "loss": 359.4613, + "grad_norm": 40.47174835205078, + "learning_rate": 8.584930697320053e-05, + "epoch": 0.7274008584790825, + "step": 811 }, { - "loss": 2.1283, - "grad_norm": 1.5809470415115356, - "learning_rate": 4.975964743621318e-05, - "epoch": 0.904463085099571, - "step": 14125 + "loss": 351.1801, + "grad_norm": 39.59210968017578, + "learning_rate": 8.531820457226055e-05, + "epoch": 0.7282977769235698, + "step": 812 }, { - "loss": 2.1146, - "grad_norm": 1.4928776025772095, - "learning_rate": 4.972361828655526e-05, - "epoch": 0.9047832490235, - "step": 14130 + "loss": 355.662, + "grad_norm": 36.89620590209961, + "learning_rate": 8.478841187988992e-05, + "epoch": 0.7291946953680569, + "step": 813 }, { - "loss": 2.1686, - "grad_norm": 1.5114343166351318, - "learning_rate": 4.968508887176303e-05, - "epoch": 0.9051034129474291, - "step": 14135 + "loss": 361.7194, + "grad_norm": 38.956214904785156, + "learning_rate": 8.425993310951132e-05, + "epoch": 0.7300916138125441, + "step": 814 }, { - "loss": 2.1386, - "grad_norm": 1.6162500381469727, - "learning_rate": 4.964406308826261e-05, - "epoch": 0.9054235768713581, - "step": 14140 + "loss": 359.9547, + "grad_norm": 36.15619659423828, + "learning_rate": 8.373277246409818e-05, + "epoch": 0.7309885322570312, + "step": 815 }, { - "loss": 2.1504, - "grad_norm": 1.7190195322036743, - "learning_rate": 4.960054508493442e-05, - "epoch": 0.9057437407952872, - "step": 14145 + "loss": 353.2803, + "grad_norm": 41.085899353027344, + "learning_rate": 8.320693413614053e-05, + "epoch": 0.7318854507015183, + "step": 816 }, { - "loss": 2.0969, - "grad_norm": 1.6564409732818604, - "learning_rate": 4.9554539262693636e-05, - "epoch": 0.9060639047192163, - "step": 14150 + "loss": 356.6743, + "grad_norm": 40.31721878051758, + "learning_rate": 8.268242230761239e-05, + "epoch": 0.7327823691460055, + "step": 817 }, { - "loss": 2.1156, - "grad_norm": 1.5606110095977783, - "learning_rate": 4.9506050274045076e-05, - "epoch": 0.9063840686431452, - "step": 14155 + "loss": 356.205, + "grad_norm": 41.351558685302734, + "learning_rate": 8.215924114993792e-05, + "epoch": 0.7336792875904926, + "step": 818 }, { - "loss": 2.133, - "grad_norm": 1.4608800411224365, - "learning_rate": 4.945508302261271e-05, - "epoch": 0.9067042325670743, - "step": 14160 + "loss": 360.4526, + "grad_norm": 39.119476318359375, + "learning_rate": 8.163739482395851e-05, + "epoch": 0.7345762060349799, + "step": 819 }, { - "loss": 2.113, - "grad_norm": 1.5593559741973877, - "learning_rate": 4.940164266264382e-05, - "epoch": 0.9070243964910034, - "step": 14165 + "loss": 361.5057, + "grad_norm": 38.80229949951172, + "learning_rate": 8.111688747990001e-05, + "epoch": 0.735473124479467, + "step": 820 }, { - "loss": 2.1405, - "grad_norm": 1.5246703624725342, - "learning_rate": 4.934573459848768e-05, - "epoch": 0.9073445604149325, - "step": 14170 + "loss": 352.7518, + "grad_norm": 40.22185134887695, + "learning_rate": 8.059772325733899e-05, + "epoch": 0.7363700429239541, + "step": 821 }, { - "loss": 2.1462, - "grad_norm": 1.6130108833312988, - "learning_rate": 4.928736448404907e-05, - "epoch": 0.9076647243388615, - "step": 14175 + "loss": 356.2066, + "grad_norm": 40.426979064941406, + "learning_rate": 8.007990628517034e-05, + "epoch": 0.7372669613684413, + "step": 822 }, { - "loss": 2.1721, - "grad_norm": 1.5642423629760742, - "learning_rate": 4.9226538222216476e-05, - "epoch": 0.9079848882627906, - "step": 14180 + "loss": 358.5974, + "grad_norm": 39.50589370727539, + "learning_rate": 7.956344068157443e-05, + "epoch": 0.7381638798129284, + "step": 823 }, { - "loss": 2.1491, - "grad_norm": 1.6237996816635132, - "learning_rate": 4.9163261964265184e-05, - "epoch": 0.9083050521867196, - "step": 14185 + "loss": 360.1032, + "grad_norm": 38.537113189697266, + "learning_rate": 7.904833055398428e-05, + "epoch": 0.7390607982574156, + "step": 824 }, { - "loss": 2.1562, - "grad_norm": 1.5854694843292236, - "learning_rate": 4.909754210923515e-05, - "epoch": 0.9086252161106486, - "step": 14190 + "loss": 358.6521, + "grad_norm": 38.09297180175781, + "learning_rate": 7.853457999905264e-05, + "epoch": 0.7399577167019028, + "step": 825 }, { - "loss": 2.1569, - "grad_norm": 1.578338861465454, - "learning_rate": 4.902938530328393e-05, - "epoch": 0.9089453800345777, - "step": 14195 + "loss": 358.724, + "grad_norm": 38.27792739868164, + "learning_rate": 7.802219310261965e-05, + "epoch": 0.7408546351463899, + "step": 826 }, { - "loss": 2.119, - "grad_norm": 1.6499290466308594, - "learning_rate": 4.895879843901451e-05, - "epoch": 0.9092655439585068, - "step": 14200 + "loss": 361.0538, + "grad_norm": 40.946353912353516, + "learning_rate": 7.75111739396806e-05, + "epoch": 0.7417515535908771, + "step": 827 }, { - "eval_loss": 1.9983034133911133, - "eval_runtime": 9.4635, - "eval_samples_per_second": 216.41, - "eval_steps_per_second": 27.051, - "epoch": 0.9092655439585068, - "step": 14200 + "loss": 354.2574, + "grad_norm": 37.80830764770508, + "learning_rate": 7.700152657435297e-05, + "epoch": 0.7426484720353642, + "step": 828 }, { - "loss": 2.1602, - "grad_norm": 1.5854569673538208, - "learning_rate": 4.888578865477831e-05, - "epoch": 0.9095857078824358, - "step": 14205 + "loss": 356.4567, + "grad_norm": 39.698429107666016, + "learning_rate": 7.649325505984434e-05, + "epoch": 0.7435453904798514, + "step": 829 }, { - "loss": 2.1274, - "grad_norm": 1.5863550901412964, - "learning_rate": 4.881036333395329e-05, - "epoch": 0.9099058718063648, - "step": 14210 + "loss": 355.0162, + "grad_norm": 38.21966552734375, + "learning_rate": 7.598636343842053e-05, + "epoch": 0.7444423089243385, + "step": 830 }, { - "loss": 2.1412, - "grad_norm": 1.5058035850524902, - "learning_rate": 4.873253010419724e-05, - "epoch": 0.9102260357302939, - "step": 14215 + "loss": 356.4822, + "grad_norm": 39.37642288208008, + "learning_rate": 7.548085574137273e-05, + "epoch": 0.7453392273688256, + "step": 831 }, { - "loss": 2.133, - "grad_norm": 1.5665518045425415, - "learning_rate": 4.8652296836676435e-05, - "epoch": 0.910546199654223, - "step": 14220 + "loss": 357.8192, + "grad_norm": 37.3087158203125, + "learning_rate": 7.497673598898613e-05, + "epoch": 0.7462361458133129, + "step": 832 }, { - "loss": 2.1322, - "grad_norm": 1.5273241996765137, - "learning_rate": 4.856967164526966e-05, - "epoch": 0.910866363578152, - "step": 14225 + "loss": 363.7517, + "grad_norm": 35.9515266418457, + "learning_rate": 7.447400819050751e-05, + "epoch": 0.7471330642578, + "step": 833 }, { - "loss": 2.1584, - "grad_norm": 2.971830129623413, - "learning_rate": 4.84846628857476e-05, - "epoch": 0.9111865275020811, - "step": 14230 + "loss": 355.3728, + "grad_norm": 36.964534759521484, + "learning_rate": 7.397267634411337e-05, + "epoch": 0.7480299827022872, + "step": 834 }, { - "loss": 2.1475, - "grad_norm": 1.5898001194000244, - "learning_rate": 4.83972791549279e-05, - "epoch": 0.9115066914260102, - "step": 14235 + "loss": 354.5074, + "grad_norm": 39.167415618896484, + "learning_rate": 7.347274443687855e-05, + "epoch": 0.7489269011467743, + "step": 835 }, { - "loss": 2.1378, - "grad_norm": 1.674157977104187, - "learning_rate": 4.8307529289805706e-05, - "epoch": 0.9118268553499391, - "step": 14240 + "loss": 361.1248, + "grad_norm": 40.1679801940918, + "learning_rate": 7.297421644474387e-05, + "epoch": 0.7498238195912614, + "step": 836 }, { - "loss": 2.1262, - "grad_norm": 1.5460021495819092, - "learning_rate": 4.821542236666009e-05, - "epoch": 0.9121470192738682, - "step": 14245 + "loss": 357.9431, + "grad_norm": 38.67217254638672, + "learning_rate": 7.247709633248526e-05, + "epoch": 0.7507207380357486, + "step": 837 }, { - "loss": 2.1465, - "grad_norm": 1.5996989011764526, - "learning_rate": 4.812096770013609e-05, - "epoch": 0.9124671831977973, - "step": 14250 + "loss": 360.9297, + "grad_norm": 37.734153747558594, + "learning_rate": 7.198138805368143e-05, + "epoch": 0.7516176564802357, + "step": 838 }, { - "loss": 2.1211, - "grad_norm": 1.5312010049819946, - "learning_rate": 4.802417484230277e-05, - "epoch": 0.9127873471217264, - "step": 14255 + "loss": 350.7899, + "grad_norm": 36.58796691894531, + "learning_rate": 7.148709555068314e-05, + "epoch": 0.752514574924723, + "step": 839 }, { - "loss": 2.1254, - "grad_norm": 1.5802662372589111, - "learning_rate": 4.792505358168723e-05, - "epoch": 0.9131075110456554, - "step": 14260 + "loss": 358.5099, + "grad_norm": 37.6004753112793, + "learning_rate": 7.09942227545814e-05, + "epoch": 0.7534114933692101, + "step": 840 }, { - "loss": 2.1521, - "grad_norm": 1.5600544214248657, - "learning_rate": 4.782361394228472e-05, - "epoch": 0.9134276749695844, - "step": 14265 + "loss": 350.2813, + "grad_norm": 39.31602096557617, + "learning_rate": 7.05027735851762e-05, + "epoch": 0.7543084118136972, + "step": 841 }, { - "loss": 2.1041, - "grad_norm": 1.5304535627365112, - "learning_rate": 4.7719866182544894e-05, - "epoch": 0.9137478388935135, - "step": 14270 + "loss": 361.4473, + "grad_norm": 37.72463607788086, + "learning_rate": 7.001275195094581e-05, + "epoch": 0.7552053302581844, + "step": 842 }, { - "loss": 2.1418, - "grad_norm": 1.540677785873413, - "learning_rate": 4.761382079433441e-05, - "epoch": 0.9140680028174425, - "step": 14275 + "loss": 356.7912, + "grad_norm": 36.68344497680664, + "learning_rate": 6.952416174901504e-05, + "epoch": 0.7561022487026715, + "step": 843 }, { - "loss": 2.1354, - "grad_norm": 1.5304155349731445, - "learning_rate": 4.7505488501875907e-05, - "epoch": 0.9143881667413716, - "step": 14280 + "loss": 360.7002, + "grad_norm": 39.82998275756836, + "learning_rate": 6.903700686512485e-05, + "epoch": 0.7569991671471586, + "step": 844 }, { - "loss": 2.1853, - "grad_norm": 1.581351637840271, - "learning_rate": 4.739488026066347e-05, - "epoch": 0.9147083306653007, - "step": 14285 + "loss": 357.1058, + "grad_norm": 39.26710510253906, + "learning_rate": 6.855129117360095e-05, + "epoch": 0.7578960855916459, + "step": 845 }, { - "loss": 2.1387, - "grad_norm": 1.5637761354446411, - "learning_rate": 4.728200725635469e-05, - "epoch": 0.9150284945892296, - "step": 14290 + "loss": 356.4349, + "grad_norm": 37.95897674560547, + "learning_rate": 6.806701853732319e-05, + "epoch": 0.758793004036133, + "step": 846 }, { - "loss": 2.1256, - "grad_norm": 1.5060229301452637, - "learning_rate": 4.716688090363953e-05, - "epoch": 0.9153486585131587, - "step": 14295 + "loss": 353.9336, + "grad_norm": 36.72467041015625, + "learning_rate": 6.75841928076951e-05, + "epoch": 0.7596899224806202, + "step": 847 }, { - "loss": 2.1226, - "grad_norm": 1.548572301864624, - "learning_rate": 4.7049512845085954e-05, - "epoch": 0.9156688224370878, - "step": 14300 + "loss": 355.9283, + "grad_norm": 38.29819869995117, + "learning_rate": 6.710281782461275e-05, + "epoch": 0.7605868409251073, + "step": 848 }, { - "loss": 2.1307, - "grad_norm": 1.6175365447998047, - "learning_rate": 4.692991494996247e-05, - "epoch": 0.9159889863610169, - "step": 14305 + "loss": 357.5876, + "grad_norm": 39.196720123291016, + "learning_rate": 6.662289741643454e-05, + "epoch": 0.7614837593695944, + "step": 849 }, { - "loss": 2.1273, - "grad_norm": 1.5475963354110718, - "learning_rate": 4.680809931303792e-05, - "epoch": 0.9163091502849459, - "step": 14310 + "loss": 359.8077, + "grad_norm": 40.00128936767578, + "learning_rate": 6.614443539995078e-05, + "epoch": 0.7623806778140816, + "step": 850 }, { - "loss": 2.159, - "grad_norm": 1.644508719444275, - "learning_rate": 4.668407825335823e-05, - "epoch": 0.916629314208875, - "step": 14315 + "eval_loss": 1.582360863685608, + "eval_runtime": 18.4592, + "eval_samples_per_second": 110.947, + "eval_steps_per_second": 13.868, + "epoch": 0.7623806778140816, + "step": 850 }, { - "loss": 2.1625, - "grad_norm": 1.582242727279663, - "learning_rate": 4.6557864313000695e-05, - "epoch": 0.916949478132804, - "step": 14320 + "loss": 355.6048, + "grad_norm": 38.59453582763672, + "learning_rate": 6.56674355803532e-05, + "epoch": 0.7632775962585687, + "step": 851 }, { - "loss": 2.1283, - "grad_norm": 1.6576043367385864, - "learning_rate": 4.642947025580559e-05, - "epoch": 0.917269642056733, - "step": 14325 + "loss": 360.1093, + "grad_norm": 39.37229537963867, + "learning_rate": 6.519190175120473e-05, + "epoch": 0.764174514703056, + "step": 852 }, { - "loss": 2.1409, - "grad_norm": 1.511075735092163, - "learning_rate": 4.629890906608536e-05, - "epoch": 0.9175898059806621, - "step": 14330 + "loss": 357.6195, + "grad_norm": 36.07246017456055, + "learning_rate": 6.47178376944092e-05, + "epoch": 0.7650714331475431, + "step": 853 }, { - "loss": 2.1281, - "grad_norm": 1.5506491661071777, - "learning_rate": 4.6166193947311544e-05, - "epoch": 0.9179099699045912, - "step": 14335 + "loss": 357.4596, + "grad_norm": 36.77618408203125, + "learning_rate": 6.424524718018163e-05, + "epoch": 0.7659683515920302, + "step": 854 }, { - "loss": 2.1398, - "grad_norm": 1.5191504955291748, - "learning_rate": 4.6031338320779534e-05, - "epoch": 0.9182301338285203, - "step": 14340 + "loss": 359.593, + "grad_norm": 36.766483306884766, + "learning_rate": 6.377413396701781e-05, + "epoch": 0.7668652700365174, + "step": 855 }, { - "loss": 2.1247, - "grad_norm": 1.588230848312378, - "learning_rate": 4.589435582425131e-05, - "epoch": 0.9185502977524492, - "step": 14345 + "loss": 356.4777, + "grad_norm": 43.47877502441406, + "learning_rate": 6.330450180166464e-05, + "epoch": 0.7677621884810045, + "step": 856 }, { - "loss": 2.1537, - "grad_norm": 1.5594102144241333, - "learning_rate": 4.5755260310576234e-05, - "epoch": 0.9188704616763783, - "step": 14350 + "loss": 353.8591, + "grad_norm": 39.65815353393555, + "learning_rate": 6.283635441909044e-05, + "epoch": 0.7686591069254917, + "step": 857 }, { - "loss": 2.1339, - "grad_norm": 1.5035042762756348, - "learning_rate": 4.561406584629018e-05, - "epoch": 0.9191906256003074, - "step": 14355 + "loss": 358.9107, + "grad_norm": 42.22090148925781, + "learning_rate": 6.236969554245486e-05, + "epoch": 0.7695560253699789, + "step": 858 }, { - "loss": 2.1301, - "grad_norm": 1.648313283920288, - "learning_rate": 4.547078671019294e-05, - "epoch": 0.9195107895242364, - "step": 14360 + "loss": 361.3808, + "grad_norm": 37.009342193603516, + "learning_rate": 6.19045288830798e-05, + "epoch": 0.770452943814466, + "step": 859 }, { - "loss": 2.1223, - "grad_norm": 1.5859969854354858, - "learning_rate": 4.5325437391904316e-05, - "epoch": 0.9198309534481655, - "step": 14365 + "loss": 359.7101, + "grad_norm": 36.62922668457031, + "learning_rate": 6.144085814041941e-05, + "epoch": 0.7713498622589532, + "step": 860 }, { - "loss": 2.1238, - "grad_norm": 1.5037835836410522, - "learning_rate": 4.51780325903987e-05, - "epoch": 0.9201511173720945, - "step": 14370 + "loss": 360.3506, + "grad_norm": 35.92998123168945, + "learning_rate": 6.097868700203082e-05, + "epoch": 0.7722467807034403, + "step": 861 }, { - "loss": 2.1529, - "grad_norm": 1.5427237749099731, - "learning_rate": 4.5028587212518705e-05, - "epoch": 0.9204712812960235, - "step": 14375 + "loss": 352.6364, + "grad_norm": 40.08286666870117, + "learning_rate": 6.05180191435451e-05, + "epoch": 0.7731436991479275, + "step": 862 }, { - "loss": 2.1016, - "grad_norm": 1.4729586839675903, - "learning_rate": 4.487711637146754e-05, - "epoch": 0.9207914452199526, - "step": 14380 + "loss": 356.8879, + "grad_norm": 38.76757049560547, + "learning_rate": 6.0058858228637605e-05, + "epoch": 0.7740406175924146, + "step": 863 }, { - "loss": 2.1257, - "grad_norm": 1.5223755836486816, - "learning_rate": 4.4723635385280724e-05, - "epoch": 0.9211116091438817, - "step": 14385 + "loss": 355.7852, + "grad_norm": 37.80318069458008, + "learning_rate": 5.960120790899895e-05, + "epoch": 0.7749375360369017, + "step": 864 }, { - "loss": 2.1473, - "grad_norm": 1.5194729566574097, - "learning_rate": 4.456815977527694e-05, - "epoch": 0.9214317730678108, - "step": 14390 + "loss": 357.245, + "grad_norm": 36.61247253417969, + "learning_rate": 5.914507182430626e-05, + "epoch": 0.775834454481389, + "step": 865 }, { - "loss": 2.141, - "grad_norm": 1.5059860944747925, - "learning_rate": 4.4410705264488415e-05, - "epoch": 0.9217519369917397, - "step": 14395 + "loss": 355.3506, + "grad_norm": 37.76987838745117, + "learning_rate": 5.869045360219391e-05, + "epoch": 0.7767313729258761, + "step": 866 }, { - "loss": 2.1138, - "grad_norm": 1.543553113937378, - "learning_rate": 4.425128777607084e-05, - "epoch": 0.9220721009156688, - "step": 14400 + "loss": 351.2185, + "grad_norm": 37.881492614746094, + "learning_rate": 5.8237356858224704e-05, + "epoch": 0.7776282913703633, + "step": 867 }, { - "eval_loss": 2.00039005279541, - "eval_runtime": 13.1104, - "eval_samples_per_second": 156.212, - "eval_steps_per_second": 19.526, - "epoch": 0.9220721009156688, - "step": 14400 + "loss": 360.2768, + "grad_norm": 39.45249557495117, + "learning_rate": 5.7785785195861194e-05, + "epoch": 0.7785252098148504, + "step": 868 }, { - "loss": 2.1468, - "grad_norm": 1.5583864450454712, - "learning_rate": 4.4089923431693136e-05, - "epoch": 0.9223922648395979, - "step": 14405 + "loss": 353.9251, + "grad_norm": 39.94224548339844, + "learning_rate": 5.733574220643712e-05, + "epoch": 0.7794221282593375, + "step": 869 }, { - "loss": 2.1244, - "grad_norm": 1.5649158954620361, - "learning_rate": 4.392662854990702e-05, - "epoch": 0.9227124287635269, - "step": 14410 + "loss": 355.1441, + "grad_norm": 37.91038513183594, + "learning_rate": 5.688723146912858e-05, + "epoch": 0.7803190467038247, + "step": 870 }, { - "loss": 2.1579, - "grad_norm": 1.5519460439682007, - "learning_rate": 4.376141964449681e-05, - "epoch": 0.923032592687456, - "step": 14415 + "loss": 359.303, + "grad_norm": 36.14017105102539, + "learning_rate": 5.644025655092591e-05, + "epoch": 0.7812159651483118, + "step": 871 }, { - "loss": 2.1185, - "grad_norm": 1.6381224393844604, - "learning_rate": 4.359431342280935e-05, - "epoch": 0.923352756611385, - "step": 14420 + "loss": 359.8912, + "grad_norm": 37.15394592285156, + "learning_rate": 5.5994821006604965e-05, + "epoch": 0.7821128835927991, + "step": 872 }, { - "loss": 2.1166, - "grad_norm": 1.5199573040008545, - "learning_rate": 4.342532678406444e-05, - "epoch": 0.9236729205353141, - "step": 14425 + "loss": 360.2237, + "grad_norm": 35.74496078491211, + "learning_rate": 5.555092837869902e-05, + "epoch": 0.7830098020372862, + "step": 873 }, { - "loss": 2.1598, - "grad_norm": 1.5303763151168823, - "learning_rate": 4.325447681764586e-05, - "epoch": 0.9239930844592431, - "step": 14430 + "loss": 352.0333, + "grad_norm": 37.32427215576172, + "learning_rate": 5.5108582197470784e-05, + "epoch": 0.7839067204817733, + "step": 874 }, { - "loss": 2.1426, - "grad_norm": 1.6057510375976562, - "learning_rate": 4.3081780801373104e-05, - "epoch": 0.9243132483831722, - "step": 14435 + "loss": 359.9949, + "grad_norm": 40.355411529541016, + "learning_rate": 5.4667785980883897e-05, + "epoch": 0.7848036389262605, + "step": 875 }, { - "loss": 2.0851, - "grad_norm": 1.566265344619751, - "learning_rate": 4.290725619975413e-05, - "epoch": 0.9246334123071013, - "step": 14440 + "loss": 351.2752, + "grad_norm": 36.727745056152344, + "learning_rate": 5.422854323457527e-05, + "epoch": 0.7857005573707476, + "step": 876 }, { - "loss": 2.118, - "grad_norm": 1.5874476432800293, - "learning_rate": 4.27309206622192e-05, - "epoch": 0.9249535762310303, - "step": 14445 + "loss": 352.9948, + "grad_norm": 37.40601348876953, + "learning_rate": 5.379085745182721e-05, + "epoch": 0.7865974758152348, + "step": 877 }, { - "loss": 2.1075, - "grad_norm": 1.531272530555725, - "learning_rate": 4.255279202133598e-05, - "epoch": 0.9252737401549593, - "step": 14450 + "loss": 357.7682, + "grad_norm": 36.147159576416016, + "learning_rate": 5.335473211353942e-05, + "epoch": 0.787494394259722, + "step": 878 }, { - "loss": 2.1417, - "grad_norm": 1.5390820503234863, - "learning_rate": 4.237288829100622e-05, - "epoch": 0.9255939040788884, - "step": 14455 + "loss": 360.3233, + "grad_norm": 36.26030349731445, + "learning_rate": 5.29201706882014e-05, + "epoch": 0.7883913127042091, + "step": 879 }, { - "loss": 2.1287, - "grad_norm": 1.5704469680786133, - "learning_rate": 4.219122766464396e-05, - "epoch": 0.9259140680028174, - "step": 14460 + "loss": 354.8234, + "grad_norm": 34.958744049072266, + "learning_rate": 5.2487176631865114e-05, + "epoch": 0.7892882311486963, + "step": 880 }, { - "loss": 2.1114, - "grad_norm": 1.4786990880966187, - "learning_rate": 4.200782851333571e-05, - "epoch": 0.9262342319267465, - "step": 14465 + "loss": 358.086, + "grad_norm": 36.89348602294922, + "learning_rate": 5.205575338811719e-05, + "epoch": 0.7901851495931834, + "step": 881 }, { - "loss": 2.1369, - "grad_norm": 1.518187403678894, - "learning_rate": 4.1822709383982607e-05, - "epoch": 0.9265543958506756, - "step": 14470 + "loss": 357.6668, + "grad_norm": 39.996177673339844, + "learning_rate": 5.1625904388051564e-05, + "epoch": 0.7910820680376706, + "step": 882 }, { - "loss": 2.1398, - "grad_norm": 1.5425680875778198, - "learning_rate": 4.163588899742474e-05, - "epoch": 0.9268745597746046, - "step": 14475 + "loss": 353.7882, + "grad_norm": 36.440711975097656, + "learning_rate": 5.119763305024225e-05, + "epoch": 0.7919789864821577, + "step": 883 }, { - "loss": 2.1393, - "grad_norm": 1.5290255546569824, - "learning_rate": 4.1447386246547995e-05, - "epoch": 0.9271947236985336, - "step": 14480 + "loss": 356.1277, + "grad_norm": 36.0537223815918, + "learning_rate": 5.077094278071642e-05, + "epoch": 0.7928759049266448, + "step": 884 }, { - "loss": 2.1433, - "grad_norm": 1.6013634204864502, - "learning_rate": 4.1257220194373424e-05, - "epoch": 0.9275148876224627, - "step": 14485 + "loss": 359.5157, + "grad_norm": 35.76783752441406, + "learning_rate": 5.034583697292674e-05, + "epoch": 0.7937728233711321, + "step": 885 }, { - "loss": 2.1047, - "grad_norm": 1.5584988594055176, - "learning_rate": 4.106541007212942e-05, - "epoch": 0.9278350515463918, - "step": 14490 + "loss": 353.6391, + "grad_norm": 34.94169998168945, + "learning_rate": 4.9922319007724954e-05, + "epoch": 0.7946697418156192, + "step": 886 }, { - "loss": 2.1444, - "grad_norm": 1.5811387300491333, - "learning_rate": 4.0871975277306894e-05, - "epoch": 0.9281552154703208, - "step": 14495 + "loss": 361.0958, + "grad_norm": 38.87442398071289, + "learning_rate": 4.9500392253334635e-05, + "epoch": 0.7955666602601064, + "step": 887 }, { - "loss": 2.1093, - "grad_norm": 1.5312178134918213, - "learning_rate": 4.067693537169764e-05, - "epoch": 0.9284753793942498, - "step": 14500 + "loss": 357.8425, + "grad_norm": 36.01359558105469, + "learning_rate": 4.908006006532445e-05, + "epoch": 0.7964635787045935, + "step": 888 }, { - "loss": 2.1307, - "grad_norm": 1.6398024559020996, - "learning_rate": 4.048031007941607e-05, - "epoch": 0.9287955433181789, - "step": 14505 + "loss": 358.4057, + "grad_norm": 39.11752700805664, + "learning_rate": 4.866132578658172e-05, + "epoch": 0.7973604971490806, + "step": 889 }, { - "loss": 2.1452, - "grad_norm": 1.535327434539795, - "learning_rate": 4.028211928490454e-05, - "epoch": 0.929115707242108, - "step": 14510 + "loss": 355.1286, + "grad_norm": 37.169158935546875, + "learning_rate": 4.8244192747285507e-05, + "epoch": 0.7982574155935678, + "step": 890 }, { - "loss": 2.1035, - "grad_norm": 1.545807123184204, - "learning_rate": 4.008238303092249e-05, - "epoch": 0.929435871166037, - "step": 14515 + "loss": 356.0285, + "grad_norm": 35.89703369140625, + "learning_rate": 4.7828664264880254e-05, + "epoch": 0.7991543340380549, + "step": 891 }, { - "loss": 2.1376, - "grad_norm": 1.4855430126190186, - "learning_rate": 3.98811215165195e-05, - "epoch": 0.9297560350899661, - "step": 14520 + "loss": 353.9138, + "grad_norm": 35.52785873413086, + "learning_rate": 4.741474364404955e-05, + "epoch": 0.8000512524825422, + "step": 892 }, { - "loss": 2.1208, - "grad_norm": 1.5296348333358765, - "learning_rate": 3.9678355094992644e-05, - "epoch": 0.9300761990138952, - "step": 14525 + "loss": 359.8646, + "grad_norm": 35.992713928222656, + "learning_rate": 4.7002434176689564e-05, + "epoch": 0.8009481709270293, + "step": 893 }, { - "loss": 2.1218, - "grad_norm": 1.5518286228179932, - "learning_rate": 3.9474104271828126e-05, - "epoch": 0.9303963629378241, - "step": 14530 + "loss": 360.1763, + "grad_norm": 36.50730514526367, + "learning_rate": 4.659173914188319e-05, + "epoch": 0.8018450893715164, + "step": 894 }, { - "loss": 2.1161, - "grad_norm": 1.6967196464538574, - "learning_rate": 3.926838970262765e-05, - "epoch": 0.9307165268617532, - "step": 14535 + "loss": 356.7962, + "grad_norm": 36.77907180786133, + "learning_rate": 4.618266180587363e-05, + "epoch": 0.8027420078160036, + "step": 895 }, { - "loss": 2.124, - "grad_norm": 1.6289931535720825, - "learning_rate": 3.906123219101952e-05, - "epoch": 0.9310366907856823, - "step": 14540 + "loss": 354.5534, + "grad_norm": 36.69013214111328, + "learning_rate": 4.5775205422038695e-05, + "epoch": 0.8036389262604907, + "step": 896 }, { - "loss": 2.1255, - "grad_norm": 1.6077406406402588, - "learning_rate": 3.885265268655478e-05, - "epoch": 0.9313568547096113, - "step": 14545 + "loss": 355.8555, + "grad_norm": 36.079769134521484, + "learning_rate": 4.536937323086479e-05, + "epoch": 0.8045358447049779, + "step": 897 }, { - "loss": 2.1284, - "grad_norm": 1.5821962356567383, - "learning_rate": 3.864267228258866e-05, - "epoch": 0.9316770186335404, - "step": 14550 + "loss": 352.4216, + "grad_norm": 36.98958969116211, + "learning_rate": 4.4965168459921076e-05, + "epoch": 0.8054327631494651, + "step": 898 }, { - "loss": 2.1048, - "grad_norm": 1.696872591972351, - "learning_rate": 3.843131221414738e-05, - "epoch": 0.9319971825574694, - "step": 14555 + "loss": 354.3763, + "grad_norm": 36.339656829833984, + "learning_rate": 4.456259432383408e-05, + "epoch": 0.8063296815939522, + "step": 899 }, { - "loss": 2.1074, - "grad_norm": 1.5824540853500366, - "learning_rate": 3.8218593855780746e-05, - "epoch": 0.9323173464813985, - "step": 14560 + "loss": 353.9048, + "grad_norm": 35.602909088134766, + "learning_rate": 4.4161654024261756e-05, + "epoch": 0.8072266000384394, + "step": 900 }, { - "loss": 2.1297, - "grad_norm": 1.553502082824707, - "learning_rate": 3.800453871940049e-05, - "epoch": 0.9326375104053275, - "step": 14565 + "eval_loss": 1.581258773803711, + "eval_runtime": 19.1453, + "eval_samples_per_second": 106.971, + "eval_steps_per_second": 13.371, + "epoch": 0.8072266000384394, + "step": 900 }, { - "loss": 2.106, - "grad_norm": 1.4843323230743408, - "learning_rate": 3.778916845210487e-05, - "epoch": 0.9329576743292566, - "step": 14570 + "loss": 353.9864, + "grad_norm": 37.425819396972656, + "learning_rate": 4.3762350749868425e-05, + "epoch": 0.8081235184829265, + "step": 901 }, { - "loss": 2.1326, - "grad_norm": 1.5960259437561035, - "learning_rate": 3.757250483398952e-05, - "epoch": 0.9332778382531857, - "step": 14575 + "loss": 352.1746, + "grad_norm": 36.96770095825195, + "learning_rate": 4.336468767629906e-05, + "epoch": 0.8090204369274137, + "step": 902 }, { - "loss": 2.1133, - "grad_norm": 1.5438355207443237, - "learning_rate": 3.735456977594481e-05, - "epoch": 0.9335980021771146, - "step": 14580 + "loss": 362.0162, + "grad_norm": 36.64163589477539, + "learning_rate": 4.296866796615406e-05, + "epoch": 0.8099173553719008, + "step": 903 }, { - "loss": 2.1361, - "grad_norm": 1.5123685598373413, - "learning_rate": 3.71353853174401e-05, - "epoch": 0.9339181661010437, - "step": 14585 + "loss": 356.8323, + "grad_norm": 37.755550384521484, + "learning_rate": 4.257429476896454e-05, + "epoch": 0.8108142738163879, + "step": 904 }, { - "loss": 2.1555, - "grad_norm": 1.6334389448165894, - "learning_rate": 3.691497362429485e-05, - "epoch": 0.9342383300249728, - "step": 14590 + "loss": 355.0851, + "grad_norm": 35.74870300292969, + "learning_rate": 4.2181571221166696e-05, + "epoch": 0.8117111922608752, + "step": 905 }, { - "loss": 2.1254, - "grad_norm": 1.5371829271316528, - "learning_rate": 3.669335698643704e-05, - "epoch": 0.9345584939489019, - "step": 14595 + "loss": 354.1617, + "grad_norm": 35.670047760009766, + "learning_rate": 4.179050044607713e-05, + "epoch": 0.8126081107053623, + "step": 906 }, { - "loss": 2.1017, - "grad_norm": 1.5454310178756714, - "learning_rate": 3.64705578156491e-05, - "epoch": 0.9348786578728309, - "step": 14600 + "loss": 354.9214, + "grad_norm": 36.92220687866211, + "learning_rate": 4.140108555386812e-05, + "epoch": 0.8135050291498495, + "step": 907 }, { - "eval_loss": 1.994555950164795, - "eval_runtime": 9.3529, - "eval_samples_per_second": 218.97, - "eval_steps_per_second": 27.371, - "epoch": 0.9348786578728309, - "step": 14600 + "loss": 351.6111, + "grad_norm": 38.204166412353516, + "learning_rate": 4.101332964154275e-05, + "epoch": 0.8144019475943366, + "step": 908 }, { - "loss": 2.1203, - "grad_norm": 1.554782509803772, - "learning_rate": 3.624659864330129e-05, - "epoch": 0.93519882179676, - "step": 14605 + "loss": 355.9622, + "grad_norm": 35.54768753051758, + "learning_rate": 4.0627235792910224e-05, + "epoch": 0.8152988660388237, + "step": 909 }, { - "loss": 2.1085, - "grad_norm": 1.4939922094345093, - "learning_rate": 3.602150211807326e-05, - "epoch": 0.935518985720689, - "step": 14610 + "loss": 359.8922, + "grad_norm": 37.4915771484375, + "learning_rate": 4.024280707856134e-05, + "epoch": 0.8161957844833109, + "step": 910 }, { - "loss": 2.1079, - "grad_norm": 1.5748850107192993, - "learning_rate": 3.5795291003663575e-05, - "epoch": 0.935839149644618, - "step": 14615 + "loss": 356.2166, + "grad_norm": 36.84100341796875, + "learning_rate": 3.9860046555844406e-05, + "epoch": 0.8170927029277981, + "step": 911 }, { - "loss": 2.109, - "grad_norm": 1.5259861946105957, - "learning_rate": 3.556798817648763e-05, - "epoch": 0.9361593135685471, - "step": 14620 + "loss": 355.0562, + "grad_norm": 35.636878967285156, + "learning_rate": 3.947895726884038e-05, + "epoch": 0.8179896213722853, + "step": 912 }, { - "loss": 2.1514, - "grad_norm": 1.5462098121643066, - "learning_rate": 3.533961662336424e-05, - "epoch": 0.9364794774924762, - "step": 14625 + "loss": 360.0903, + "grad_norm": 36.50727081298828, + "learning_rate": 3.909954224833911e-05, + "epoch": 0.8188865398167724, + "step": 913 }, { - "loss": 2.1115, - "grad_norm": 1.5656838417053223, - "learning_rate": 3.511019943919098e-05, - "epoch": 0.9367996414164051, - "step": 14630 + "loss": 359.0554, + "grad_norm": 37.51554489135742, + "learning_rate": 3.8721804511815007e-05, + "epoch": 0.8197834582612595, + "step": 914 }, { - "loss": 2.123, - "grad_norm": 1.528436541557312, - "learning_rate": 3.487975982460863e-05, - "epoch": 0.9371198053403342, - "step": 14635 + "loss": 356.6491, + "grad_norm": 36.2037239074707, + "learning_rate": 3.834574706340302e-05, + "epoch": 0.8206803767057467, + "step": 915 }, { - "loss": 2.1343, - "grad_norm": 1.5504084825515747, - "learning_rate": 3.4648321083654935e-05, - "epoch": 0.9374399692642633, - "step": 14640 + "loss": 357.358, + "grad_norm": 39.62883758544922, + "learning_rate": 3.797137289387503e-05, + "epoch": 0.8215772951502338, + "step": 916 }, { - "loss": 2.1361, - "grad_norm": 1.524232029914856, - "learning_rate": 3.441590662140792e-05, - "epoch": 0.9377601331881924, - "step": 14645 + "loss": 356.6225, + "grad_norm": 35.792728424072266, + "learning_rate": 3.7598684980615694e-05, + "epoch": 0.822474213594721, + "step": 917 }, { - "loss": 2.1085, - "grad_norm": 1.4713977575302124, - "learning_rate": 3.418253994161892e-05, - "epoch": 0.9380802971121214, - "step": 14650 + "loss": 351.0151, + "grad_norm": 35.77069854736328, + "learning_rate": 3.7227686287598874e-05, + "epoch": 0.8233711320392082, + "step": 918 }, { - "loss": 2.115, - "grad_norm": 1.5077587366104126, - "learning_rate": 3.3948244644335735e-05, - "epoch": 0.9384004610360505, - "step": 14655 + "loss": 356.1569, + "grad_norm": 36.655330657958984, + "learning_rate": 3.685837976536435e-05, + "epoch": 0.8242680504836953, + "step": 919 }, { - "loss": 2.1197, - "grad_norm": 1.5058708190917969, - "learning_rate": 3.3713044423515946e-05, - "epoch": 0.9387206249599795, - "step": 14660 + "loss": 356.6186, + "grad_norm": 35.82206726074219, + "learning_rate": 3.649076835099399e-05, + "epoch": 0.8251649689281825, + "step": 920 }, { - "loss": 2.1231, - "grad_norm": 1.5846128463745117, - "learning_rate": 3.3476963064630786e-05, - "epoch": 0.9390407888839085, - "step": 14665 + "loss": 352.9849, + "grad_norm": 36.314361572265625, + "learning_rate": 3.612485496808843e-05, + "epoch": 0.8260618873726696, + "step": 921 }, { - "loss": 2.0968, - "grad_norm": 1.511231541633606, - "learning_rate": 3.324002444225976e-05, - "epoch": 0.9393609528078376, - "step": 14670 + "loss": 355.4819, + "grad_norm": 37.96638870239258, + "learning_rate": 3.57606425267441e-05, + "epoch": 0.8269588058171568, + "step": 922 }, { - "loss": 2.1139, - "grad_norm": 1.600589632987976, - "learning_rate": 3.3002252517676244e-05, - "epoch": 0.9396811167317667, - "step": 14675 + "loss": 358.6233, + "grad_norm": 36.10899353027344, + "learning_rate": 3.539813392352989e-05, + "epoch": 0.8278557242616439, + "step": 923 }, { - "loss": 2.1021, - "grad_norm": 1.4865034818649292, - "learning_rate": 3.27636713364243e-05, - "epoch": 0.9400012806556958, - "step": 14680 + "loss": 353.3172, + "grad_norm": 34.54022216796875, + "learning_rate": 3.5037332041464e-05, + "epoch": 0.828752642706131, + "step": 924 }, { - "loss": 2.1081, - "grad_norm": 1.5039657354354858, - "learning_rate": 3.2524305025887e-05, - "epoch": 0.9403214445796247, - "step": 14685 + "loss": 357.7184, + "grad_norm": 36.95024108886719, + "learning_rate": 3.467823974999115e-05, + "epoch": 0.8296495611506183, + "step": 925 }, { - "loss": 2.1285, - "grad_norm": 1.5387564897537231, - "learning_rate": 3.228417779284643e-05, - "epoch": 0.9406416085035538, - "step": 14690 + "loss": 352.9876, + "grad_norm": 37.89804458618164, + "learning_rate": 3.4320859904959924e-05, + "epoch": 0.8305464795951054, + "step": 926 }, { - "loss": 2.1274, - "grad_norm": 1.550644040107727, - "learning_rate": 3.2043313921035743e-05, - "epoch": 0.9409617724274829, - "step": 14695 + "loss": 354.4651, + "grad_norm": 36.63965606689453, + "learning_rate": 3.3965195348599626e-05, + "epoch": 0.8314433980395926, + "step": 927 }, { - "loss": 2.1279, - "grad_norm": 1.5030955076217651, - "learning_rate": 3.180173776868331e-05, - "epoch": 0.9412819363514119, - "step": 14700 + "loss": 356.9139, + "grad_norm": 35.67973709106445, + "learning_rate": 3.361124890949816e-05, + "epoch": 0.8323403164840797, + "step": 928 }, { - "loss": 2.1246, - "grad_norm": 1.5117207765579224, - "learning_rate": 3.155947376604948e-05, - "epoch": 0.941602100275341, - "step": 14705 + "loss": 358.1943, + "grad_norm": 35.843719482421875, + "learning_rate": 3.325902340257914e-05, + "epoch": 0.8332372349285668, + "step": 929 }, { - "loss": 2.1481, - "grad_norm": 1.5794004201889038, - "learning_rate": 3.13165464129559e-05, - "epoch": 0.94192226419927, - "step": 14710 + "loss": 352.4489, + "grad_norm": 36.6231803894043, + "learning_rate": 3.2908521629079704e-05, + "epoch": 0.834134153373054, + "step": 930 }, { - "loss": 2.1415, - "grad_norm": 1.4979078769683838, - "learning_rate": 3.107298027630797e-05, - "epoch": 0.942242428123199, - "step": 14715 + "loss": 350.1209, + "grad_norm": 34.934112548828125, + "learning_rate": 3.255974637652828e-05, + "epoch": 0.8350310718175412, + "step": 931 }, { - "loss": 2.1283, - "grad_norm": 1.5171302556991577, - "learning_rate": 3.082879998761035e-05, - "epoch": 0.9425625920471281, - "step": 14720 + "loss": 356.8803, + "grad_norm": 34.707252502441406, + "learning_rate": 3.2212700418722265e-05, + "epoch": 0.8359279902620284, + "step": 932 }, { - "loss": 2.1219, - "grad_norm": 1.5499953031539917, - "learning_rate": 3.058403024047607e-05, - "epoch": 0.9428827559710572, - "step": 14725 + "loss": 356.7214, + "grad_norm": 35.543949127197266, + "learning_rate": 3.186738651570595e-05, + "epoch": 0.8368249087065155, + "step": 933 }, { - "loss": 2.111, - "grad_norm": 1.4895777702331543, - "learning_rate": 3.033869578812924e-05, - "epoch": 0.9432029198949863, - "step": 14730 + "loss": 354.0534, + "grad_norm": 35.74333572387695, + "learning_rate": 3.1523807413748887e-05, + "epoch": 0.8377218271510026, + "step": 934 }, { - "loss": 2.1037, - "grad_norm": 1.5428434610366821, - "learning_rate": 3.0092821440901857e-05, - "epoch": 0.9435230838189153, - "step": 14735 + "loss": 350.9949, + "grad_norm": 36.81149673461914, + "learning_rate": 3.118196584532359e-05, + "epoch": 0.8386187455954898, + "step": 935 }, { - "loss": 2.1368, - "grad_norm": 1.5094553232192993, - "learning_rate": 2.984643206372471e-05, - "epoch": 0.9438432477428443, - "step": 14740 + "loss": 355.0341, + "grad_norm": 36.43380355834961, + "learning_rate": 3.084186452908411e-05, + "epoch": 0.8395156640399769, + "step": 936 }, { - "loss": 2.1381, - "grad_norm": 1.5818499326705933, - "learning_rate": 2.959955257361286e-05, - "epoch": 0.9441634116667734, - "step": 14745 + "loss": 357.6827, + "grad_norm": 35.787872314453125, + "learning_rate": 3.0503506169844373e-05, + "epoch": 0.840412582484464, + "step": 937 }, { - "loss": 2.0922, - "grad_norm": 1.513429880142212, - "learning_rate": 2.935220793714582e-05, - "epoch": 0.9444835755907024, - "step": 14750 + "loss": 353.5415, + "grad_norm": 35.96485137939453, + "learning_rate": 3.0166893458556666e-05, + "epoch": 0.8413095009289513, + "step": 938 }, { - "loss": 2.1022, - "grad_norm": 1.556632161140442, - "learning_rate": 2.9104423167942678e-05, - "epoch": 0.9448037395146315, - "step": 14755 + "loss": 357.3773, + "grad_norm": 33.9022216796875, + "learning_rate": 2.983202907228999e-05, + "epoch": 0.8422064193734384, + "step": 939 }, { - "loss": 2.1063, - "grad_norm": 1.5335612297058105, - "learning_rate": 2.885622332413256e-05, - "epoch": 0.9451239034385606, - "step": 14760 + "loss": 355.6847, + "grad_norm": 36.94380187988281, + "learning_rate": 2.949891567420923e-05, + "epoch": 0.8431033378179256, + "step": 940 }, { - "loss": 2.1308, - "grad_norm": 1.5133774280548096, - "learning_rate": 2.8607633505820504e-05, - "epoch": 0.9454440673624896, - "step": 14765 + "loss": 352.4488, + "grad_norm": 36.33073043823242, + "learning_rate": 2.9167555913553577e-05, + "epoch": 0.8440002562624127, + "step": 941 }, { - "loss": 2.0859, - "grad_norm": 1.4861382246017456, - "learning_rate": 2.835867885254912e-05, - "epoch": 0.9457642312864186, - "step": 14770 + "loss": 355.2479, + "grad_norm": 34.81533432006836, + "learning_rate": 2.88379524256156e-05, + "epoch": 0.8448971747068998, + "step": 942 }, { - "loss": 2.1228, - "grad_norm": 1.5895521640777588, - "learning_rate": 2.8109384540756267e-05, - "epoch": 0.9460843952103477, - "step": 14775 + "loss": 359.0098, + "grad_norm": 34.85913848876953, + "learning_rate": 2.8510107831720393e-05, + "epoch": 0.845794093151387, + "step": 943 }, { - "loss": 2.0916, - "grad_norm": 1.5556671619415283, - "learning_rate": 2.7859775781229013e-05, - "epoch": 0.9464045591342768, - "step": 14780 + "loss": 355.3041, + "grad_norm": 35.2500114440918, + "learning_rate": 2.8184024739204534e-05, + "epoch": 0.8466910115958741, + "step": 944 }, { - "loss": 2.1495, - "grad_norm": 1.494764804840088, - "learning_rate": 2.7609877816554085e-05, - "epoch": 0.9467247230582058, - "step": 14785 + "loss": 357.6105, + "grad_norm": 36.625144958496094, + "learning_rate": 2.7859705741395403e-05, + "epoch": 0.8475879300403614, + "step": 945 }, { - "loss": 2.1671, - "grad_norm": 1.5063127279281616, - "learning_rate": 2.7359715918565103e-05, - "epoch": 0.9470448869821348, - "step": 14790 + "loss": 355.7482, + "grad_norm": 34.630428314208984, + "learning_rate": 2.7537153417590803e-05, + "epoch": 0.8484848484848485, + "step": 946 }, { - "loss": 2.1287, - "grad_norm": 1.612107515335083, - "learning_rate": 2.710931538578692e-05, - "epoch": 0.9473650509060639, - "step": 14795 + "loss": 358.0374, + "grad_norm": 35.17256164550781, + "learning_rate": 2.721637033303803e-05, + "epoch": 0.8493817669293356, + "step": 947 }, { - "loss": 2.1083, - "grad_norm": 1.5210696458816528, - "learning_rate": 2.6858701540877185e-05, - "epoch": 0.9476852148299929, - "step": 14800 + "loss": 352.4902, + "grad_norm": 36.90748596191406, + "learning_rate": 2.6897359038913716e-05, + "epoch": 0.8502786853738228, + "step": 948 }, { - "eval_loss": 1.9744480848312378, - "eval_runtime": 9.307, - "eval_samples_per_second": 220.049, - "eval_steps_per_second": 27.506, - "epoch": 0.9476852148299929, - "step": 14800 + "loss": 356.3272, + "grad_norm": 35.69559097290039, + "learning_rate": 2.6580122072303647e-05, + "epoch": 0.8511756038183099, + "step": 949 }, { - "loss": 2.1447, - "grad_norm": 1.492371916770935, - "learning_rate": 2.660789972806551e-05, - "epoch": 0.948005378753922, - "step": 14805 + "loss": 351.9118, + "grad_norm": 34.44248580932617, + "learning_rate": 2.6264661956182212e-05, + "epoch": 0.8520725222627971, + "step": 950 }, { - "loss": 2.1099, - "grad_norm": 1.541387677192688, - "learning_rate": 2.635693531059043e-05, - "epoch": 0.9483255426778511, - "step": 14810 + "eval_loss": 1.5959553718566895, + "eval_runtime": 18.4817, + "eval_samples_per_second": 110.812, + "eval_steps_per_second": 13.852, + "epoch": 0.8520725222627971, + "step": 950 }, { - "loss": 2.1105, - "grad_norm": 1.4616005420684814, - "learning_rate": 2.6105833668134473e-05, - "epoch": 0.9486457066017802, - "step": 14815 + "loss": 356.2447, + "grad_norm": 34.08928680419922, + "learning_rate": 2.5950981199392847e-05, + "epoch": 0.8529694407072843, + "step": 951 }, { - "loss": 2.1132, - "grad_norm": 1.4997398853302002, - "learning_rate": 2.5854620194257533e-05, - "epoch": 0.9489658705257091, - "step": 14820 + "loss": 357.2951, + "grad_norm": 35.93143844604492, + "learning_rate": 2.5639082296627537e-05, + "epoch": 0.8538663591517714, + "step": 952 }, { - "loss": 2.104, - "grad_norm": 1.5401780605316162, - "learning_rate": 2.5603320293828866e-05, - "epoch": 0.9492860344496382, - "step": 14825 + "loss": 357.1935, + "grad_norm": 34.351898193359375, + "learning_rate": 2.5328967728407454e-05, + "epoch": 0.8547632775962586, + "step": 953 }, { - "loss": 2.1319, - "grad_norm": 1.4212836027145386, - "learning_rate": 2.535195938045791e-05, - "epoch": 0.9496061983735673, - "step": 14830 + "loss": 352.3139, + "grad_norm": 36.010223388671875, + "learning_rate": 2.5020639961062853e-05, + "epoch": 0.8556601960407457, + "step": 954 }, { - "loss": 2.1123, - "grad_norm": 1.498403549194336, - "learning_rate": 2.5100562873924283e-05, - "epoch": 0.9499263622974963, - "step": 14835 + "loss": 356.4665, + "grad_norm": 34.825042724609375, + "learning_rate": 2.4714101446713793e-05, + "epoch": 0.8565571144852329, + "step": 955 }, { - "loss": 2.1251, - "grad_norm": 1.5054161548614502, - "learning_rate": 2.484915619760707e-05, - "epoch": 0.9502465262214254, - "step": 14840 + "loss": 354.6561, + "grad_norm": 35.965755462646484, + "learning_rate": 2.4409354623250307e-05, + "epoch": 0.85745403292972, + "step": 956 }, { - "loss": 2.1148, - "grad_norm": 1.5165938138961792, - "learning_rate": 2.4597764775913813e-05, - "epoch": 0.9505666901453544, - "step": 14845 + "loss": 350.8446, + "grad_norm": 34.73567199707031, + "learning_rate": 2.4106401914313238e-05, + "epoch": 0.8583509513742071, + "step": 957 }, { - "loss": 2.1301, - "grad_norm": 1.4614144563674927, - "learning_rate": 2.4346414031709386e-05, - "epoch": 0.9508868540692835, - "step": 14850 + "loss": 357.6875, + "grad_norm": 34.63365936279297, + "learning_rate": 2.3805245729274947e-05, + "epoch": 0.8592478698186944, + "step": 958 }, { - "loss": 2.0923, - "grad_norm": 1.522401213645935, - "learning_rate": 2.409512938374499e-05, - "epoch": 0.9512070179932125, - "step": 14855 + "loss": 352.3867, + "grad_norm": 37.33460235595703, + "learning_rate": 2.3505888463220047e-05, + "epoch": 0.8601447882631815, + "step": 959 }, { - "loss": 2.1349, - "grad_norm": 1.5170820951461792, - "learning_rate": 2.384393624408761e-05, - "epoch": 0.9515271819171416, - "step": 14860 + "loss": 357.7318, + "grad_norm": 35.54653549194336, + "learning_rate": 2.3208332496926387e-05, + "epoch": 0.8610417067076687, + "step": 960 }, { - "loss": 2.0957, - "grad_norm": 1.4755724668502808, - "learning_rate": 2.3592860015550146e-05, - "epoch": 0.9518473458410707, - "step": 14865 + "loss": 356.5225, + "grad_norm": 34.780433654785156, + "learning_rate": 2.2912580196846222e-05, + "epoch": 0.8619386251521558, + "step": 961 }, { - "loss": 2.1389, - "grad_norm": 1.517958164215088, - "learning_rate": 2.334192608912241e-05, - "epoch": 0.9521675097649996, - "step": 14870 + "loss": 358.1692, + "grad_norm": 37.751983642578125, + "learning_rate": 2.2618633915087282e-05, + "epoch": 0.8628355435966429, + "step": 962 }, { - "loss": 2.1245, - "grad_norm": 1.4963104724884033, - "learning_rate": 2.3091159841403398e-05, - "epoch": 0.9524876736889287, - "step": 14875 + "loss": 359.3351, + "grad_norm": 35.848167419433594, + "learning_rate": 2.2326495989393985e-05, + "epoch": 0.8637324620411301, + "step": 963 }, { - "loss": 2.1243, - "grad_norm": 1.4569158554077148, - "learning_rate": 2.2840586632035014e-05, - "epoch": 0.9528078376128578, - "step": 14880 + "loss": 354.9636, + "grad_norm": 34.292728424072266, + "learning_rate": 2.203616874312919e-05, + "epoch": 0.8646293804856173, + "step": 964 }, { - "loss": 2.1134, - "grad_norm": 1.542040467262268, - "learning_rate": 2.2590231801137447e-05, - "epoch": 0.9531280015367868, - "step": 14885 + "loss": 350.5273, + "grad_norm": 35.46641540527344, + "learning_rate": 2.174765448525523e-05, + "epoch": 0.8655262989301045, + "step": 965 }, { - "loss": 2.1003, - "grad_norm": 1.4777690172195435, - "learning_rate": 2.2340120666746577e-05, - "epoch": 0.9534481654607159, - "step": 14890 + "loss": 355.4344, + "grad_norm": 34.72315979003906, + "learning_rate": 2.1460955510315962e-05, + "epoch": 0.8664232173745916, + "step": 966 }, { - "loss": 2.1317, - "grad_norm": 1.4963622093200684, - "learning_rate": 2.2090278522253604e-05, - "epoch": 0.953768329384645, - "step": 14895 + "loss": 353.3275, + "grad_norm": 36.16691589355469, + "learning_rate": 2.1176074098418402e-05, + "epoch": 0.8673201358190787, + "step": 967 }, { - "loss": 2.1372, - "grad_norm": 1.5573043823242188, - "learning_rate": 2.1840730633847156e-05, - "epoch": 0.954088493308574, - "step": 14900 + "loss": 355.2486, + "grad_norm": 36.415794372558594, + "learning_rate": 2.0893012515214388e-05, + "epoch": 0.8682170542635659, + "step": 968 }, { - "loss": 2.1405, - "grad_norm": 1.5820651054382324, - "learning_rate": 2.1591502237958115e-05, - "epoch": 0.954408657232503, - "step": 14905 + "loss": 355.4182, + "grad_norm": 35.465538024902344, + "learning_rate": 2.06117730118828e-05, + "epoch": 0.869113972708053, + "step": 969 }, { - "loss": 2.1104, - "grad_norm": 1.5473122596740723, - "learning_rate": 2.134261853870757e-05, - "epoch": 0.9547288211564321, - "step": 14910 + "loss": 354.304, + "grad_norm": 35.425926208496094, + "learning_rate": 2.0332357825111668e-05, + "epoch": 0.8700108911525402, + "step": 970 }, { - "loss": 2.1043, - "grad_norm": 1.5598843097686768, - "learning_rate": 2.1094104705357908e-05, - "epoch": 0.9550489850803612, - "step": 14915 + "loss": 351.7629, + "grad_norm": 34.78888702392578, + "learning_rate": 2.0054769177080185e-05, + "epoch": 0.8709078095970274, + "step": 971 }, { - "loss": 2.1012, - "grad_norm": 1.5229307413101196, - "learning_rate": 2.0845985869767487e-05, - "epoch": 0.9553691490042902, - "step": 14920 + "loss": 358.8823, + "grad_norm": 35.0769157409668, + "learning_rate": 1.97790092754411e-05, + "epoch": 0.8718047280415145, + "step": 972 }, { - "loss": 2.0979, - "grad_norm": 1.5292434692382812, - "learning_rate": 2.0598287123849095e-05, - "epoch": 0.9556893129282192, - "step": 14925 + "loss": 353.2525, + "grad_norm": 35.73164749145508, + "learning_rate": 1.9505080313303365e-05, + "epoch": 0.8727016464860017, + "step": 973 }, { - "loss": 2.1127, - "grad_norm": 1.5072777271270752, - "learning_rate": 2.0351033517032427e-05, - "epoch": 0.9560094768521483, - "step": 14930 + "loss": 355.5436, + "grad_norm": 35.51607894897461, + "learning_rate": 1.9232984469214453e-05, + "epoch": 0.8735985649304888, + "step": 974 }, { - "loss": 2.1393, - "grad_norm": 1.4828133583068848, - "learning_rate": 2.0104250053730905e-05, - "epoch": 0.9563296407760774, - "step": 14935 + "loss": 353.8528, + "grad_norm": 35.09918975830078, + "learning_rate": 1.8962723907143044e-05, + "epoch": 0.874495483374976, + "step": 975 }, { - "loss": 2.1392, - "grad_norm": 1.572771668434143, - "learning_rate": 1.9857961690812945e-05, - "epoch": 0.9566498047000064, - "step": 14940 + "loss": 358.7514, + "grad_norm": 36.12480926513672, + "learning_rate": 1.869430077646203e-05, + "epoch": 0.8753924018194631, + "step": 976 }, { - "loss": 2.0975, - "grad_norm": 1.4665296077728271, - "learning_rate": 1.9612193335078193e-05, - "epoch": 0.9569699686239355, - "step": 14945 + "loss": 354.3459, + "grad_norm": 34.32866287231445, + "learning_rate": 1.8427717211931177e-05, + "epoch": 0.8762893202639502, + "step": 977 }, { - "loss": 2.1348, - "grad_norm": 1.47166109085083, - "learning_rate": 1.936696984073867e-05, - "epoch": 0.9572901325478645, - "step": 14950 + "loss": 350.5236, + "grad_norm": 35.1101188659668, + "learning_rate": 1.816297533368022e-05, + "epoch": 0.8771862387084375, + "step": 978 }, { - "loss": 2.0937, - "grad_norm": 1.4156910181045532, - "learning_rate": 1.9122316006905333e-05, - "epoch": 0.9576102964717935, - "step": 14955 + "loss": 353.4749, + "grad_norm": 36.59587478637695, + "learning_rate": 1.7900077247192087e-05, + "epoch": 0.8780831571529246, + "step": 979 }, { - "loss": 2.1046, - "grad_norm": 1.5509635210037231, - "learning_rate": 1.887825657508016e-05, - "epoch": 0.9579304603957226, - "step": 14960 + "loss": 353.3892, + "grad_norm": 34.86069869995117, + "learning_rate": 1.7639025043286155e-05, + "epoch": 0.8789800755974118, + "step": 980 }, { - "loss": 2.1508, - "grad_norm": 1.4977760314941406, - "learning_rate": 1.8634816226654074e-05, - "epoch": 0.9582506243196517, - "step": 14965 + "loss": 354.1761, + "grad_norm": 35.580291748046875, + "learning_rate": 1.7379820798101383e-05, + "epoch": 0.8798769940418989, + "step": 981 }, { - "loss": 2.1104, - "grad_norm": 1.4559874534606934, - "learning_rate": 1.839201958041096e-05, - "epoch": 0.9585707882435807, - "step": 14970 + "loss": 355.6291, + "grad_norm": 34.58673095703125, + "learning_rate": 1.7122466573080196e-05, + "epoch": 0.880773912486386, + "step": 982 }, { - "loss": 2.1156, - "grad_norm": 1.5095617771148682, - "learning_rate": 1.8149891190038e-05, - "epoch": 0.9588909521675097, - "step": 14975 + "loss": 357.7327, + "grad_norm": 33.76737976074219, + "learning_rate": 1.6866964414951698e-05, + "epoch": 0.8816708309308732, + "step": 983 }, { - "loss": 2.0877, - "grad_norm": 1.4174879789352417, - "learning_rate": 1.7908455541642584e-05, - "epoch": 0.9592111160914388, - "step": 14980 + "loss": 355.4995, + "grad_norm": 34.57607650756836, + "learning_rate": 1.6613316355715558e-05, + "epoch": 0.8825677493753604, + "step": 984 }, { - "loss": 2.1116, - "grad_norm": 1.5217187404632568, - "learning_rate": 1.7667737051276076e-05, - "epoch": 0.9595312800153679, - "step": 14985 + "loss": 357.9588, + "grad_norm": 34.49372100830078, + "learning_rate": 1.6361524412626088e-05, + "epoch": 0.8834646678198476, + "step": 985 }, { - "loss": 2.1417, - "grad_norm": 1.4732062816619873, - "learning_rate": 1.742776006246463e-05, - "epoch": 0.9598514439392969, - "step": 14990 + "loss": 357.0802, + "grad_norm": 34.17061996459961, + "learning_rate": 1.611159058817571e-05, + "epoch": 0.8843615862643347, + "step": 986 }, { - "loss": 2.1204, - "grad_norm": 1.4818220138549805, - "learning_rate": 1.71885488437474e-05, - "epoch": 0.960171607863226, - "step": 14995 + "loss": 354.1526, + "grad_norm": 36.93791198730469, + "learning_rate": 1.5863516870079418e-05, + "epoch": 0.8852585047088218, + "step": 987 }, { - "loss": 2.0826, - "grad_norm": 1.4875595569610596, - "learning_rate": 1.695012758622226e-05, - "epoch": 0.9604917717871551, - "step": 15000 + "loss": 358.1216, + "grad_norm": 35.566646575927734, + "learning_rate": 1.5617305231258898e-05, + "epoch": 0.886155423153309, + "step": 988 }, { - "eval_loss": 1.9857242107391357, - "eval_runtime": 15.0438, - "eval_samples_per_second": 136.136, - "eval_steps_per_second": 17.017, - "epoch": 0.9604917717871551, - "step": 15000 + "loss": 351.2595, + "grad_norm": 35.77732467651367, + "learning_rate": 1.5372957629826655e-05, + "epoch": 0.8870523415977961, + "step": 989 }, { - "loss": 2.0994, - "grad_norm": 1.5387358665466309, - "learning_rate": 1.6712520401099422e-05, - "epoch": 0.960811935711084, - "step": 15005 + "loss": 353.016, + "grad_norm": 37.376441955566406, + "learning_rate": 1.513047600907061e-05, + "epoch": 0.8879492600422833, + "step": 990 }, { - "loss": 2.1318, - "grad_norm": 1.4918915033340454, - "learning_rate": 1.6475751317263063e-05, - "epoch": 0.9611320996350131, - "step": 15010 + "loss": 352.4042, + "grad_norm": 34.55933380126953, + "learning_rate": 1.4889862297438688e-05, + "epoch": 0.8888461784867705, + "step": 991 }, { - "loss": 2.122, - "grad_norm": 1.4722011089324951, - "learning_rate": 1.6239844278841366e-05, - "epoch": 0.9614522635589422, - "step": 15015 + "loss": 352.0331, + "grad_norm": 34.30587387084961, + "learning_rate": 1.4651118408523317e-05, + "epoch": 0.8897430969312576, + "step": 992 }, { - "loss": 2.1167, - "grad_norm": 1.4381043910980225, - "learning_rate": 1.600482314278505e-05, - "epoch": 0.9617724274828713, - "step": 15020 + "loss": 356.2885, + "grad_norm": 34.28126525878906, + "learning_rate": 1.4414246241046286e-05, + "epoch": 0.8906400153757448, + "step": 993 }, { - "loss": 2.0594, - "grad_norm": 1.5075242519378662, - "learning_rate": 1.5770711676454767e-05, - "epoch": 0.9620925914068003, - "step": 15025 + "loss": 356.9485, + "grad_norm": 35.106529235839844, + "learning_rate": 1.4179247678843681e-05, + "epoch": 0.8915369338202319, + "step": 994 }, { - "loss": 2.1349, - "grad_norm": 1.49933660030365, - "learning_rate": 1.5537533555217525e-05, - "epoch": 0.9624127553307293, - "step": 15030 + "loss": 357.6618, + "grad_norm": 33.811737060546875, + "learning_rate": 1.3946124590850901e-05, + "epoch": 0.892433852264719, + "step": 995 }, { - "loss": 2.1218, - "grad_norm": 1.4700450897216797, - "learning_rate": 1.5305312360052442e-05, - "epoch": 0.9627329192546584, - "step": 15035 + "loss": 361.4888, + "grad_norm": 33.41731643676758, + "learning_rate": 1.3714878831087657e-05, + "epoch": 0.8933307707092062, + "step": 996 }, { - "loss": 2.102, - "grad_norm": 1.5126475095748901, - "learning_rate": 1.5074071575166057e-05, - "epoch": 0.9630530831785874, - "step": 15040 + "loss": 358.7178, + "grad_norm": 34.46256637573242, + "learning_rate": 1.3485512238643499e-05, + "epoch": 0.8942276891536933, + "step": 997 }, { - "loss": 2.0933, - "grad_norm": 1.501659870147705, - "learning_rate": 1.4843834585617333e-05, - "epoch": 0.9633732471025165, - "step": 15045 + "loss": 357.5736, + "grad_norm": 35.067893981933594, + "learning_rate": 1.3258026637662846e-05, + "epoch": 0.8951246075981806, + "step": 998 }, { - "loss": 2.1065, - "grad_norm": 1.4653816223144531, - "learning_rate": 1.4614624674952842e-05, - "epoch": 0.9636934110264456, - "step": 15050 + "loss": 353.149, + "grad_norm": 34.04292678833008, + "learning_rate": 1.3032423837330748e-05, + "epoch": 0.8960215260426677, + "step": 999 }, { - "loss": 2.1135, - "grad_norm": 1.5033470392227173, - "learning_rate": 1.4386465022852091e-05, - "epoch": 0.9640135749503745, - "step": 15055 + "loss": 356.1142, + "grad_norm": 34.39286422729492, + "learning_rate": 1.2808705631858459e-05, + "epoch": 0.8969184444871549, + "step": 1000 }, { - "loss": 2.1284, - "grad_norm": 1.539259672164917, - "learning_rate": 1.4159378702783404e-05, - "epoch": 0.9643337388743036, - "step": 15060 + "eval_loss": 1.586561918258667, + "eval_runtime": 20.2668, + "eval_samples_per_second": 101.052, + "eval_steps_per_second": 12.631, + "epoch": 0.8969184444871549, + "step": 1000 }, { - "loss": 2.1003, - "grad_norm": 1.4910892248153687, - "learning_rate": 1.3933388679670506e-05, - "epoch": 0.9646539027982327, - "step": 15065 + "loss": 354.0248, + "grad_norm": 36.2171516418457, + "learning_rate": 1.2586873800468996e-05, + "epoch": 0.897815362931642, + "step": 1001 }, { - "loss": 2.1035, - "grad_norm": 1.4681600332260132, - "learning_rate": 1.3708517807570171e-05, - "epoch": 0.9649740667221618, - "step": 15070 + "loss": 362.0434, + "grad_norm": 34.42704391479492, + "learning_rate": 1.2366930107383156e-05, + "epoch": 0.8987122813761291, + "step": 1002 }, { - "loss": 2.0992, - "grad_norm": 1.4621437788009644, - "learning_rate": 1.3484788827360955e-05, - "epoch": 0.9652942306460908, - "step": 15075 + "loss": 354.9637, + "grad_norm": 34.4918212890625, + "learning_rate": 1.2148876301805528e-05, + "epoch": 0.8996091998206163, + "step": 1003 }, { - "loss": 2.1292, - "grad_norm": 1.438450813293457, - "learning_rate": 1.3262224364443493e-05, - "epoch": 0.9656143945700199, - "step": 15080 + "loss": 348.8729, + "grad_norm": 34.57630157470703, + "learning_rate": 1.1932714117910386e-05, + "epoch": 0.9005061182651035, + "step": 1004 }, { - "loss": 2.0915, - "grad_norm": 1.4840413331985474, - "learning_rate": 1.3040846926452386e-05, - "epoch": 0.9659345584939489, - "step": 15085 + "loss": 352.9299, + "grad_norm": 35.46476745605469, + "learning_rate": 1.171844527482796e-05, + "epoch": 0.9014030367095907, + "step": 1005 }, { - "loss": 2.1174, - "grad_norm": 1.4679360389709473, - "learning_rate": 1.2820678900980093e-05, - "epoch": 0.9662547224178779, - "step": 15090 + "loss": 355.247, + "grad_norm": 34.4285888671875, + "learning_rate": 1.1506071476630964e-05, + "epoch": 0.9022999551540778, + "step": 1006 }, { - "loss": 2.1146, - "grad_norm": 1.46363365650177, - "learning_rate": 1.260174255331282e-05, - "epoch": 0.966574886341807, - "step": 15095 + "loss": 352.168, + "grad_norm": 34.935569763183594, + "learning_rate": 1.1295594412320754e-05, + "epoch": 0.9031968735985649, + "step": 1007 }, { - "loss": 2.1146, - "grad_norm": 1.5261812210083008, - "learning_rate": 1.2384060024178956e-05, - "epoch": 0.9668950502657361, - "step": 15100 + "loss": 357.9673, + "grad_norm": 33.162166595458984, + "learning_rate": 1.1087015755814084e-05, + "epoch": 0.9040937920430521, + "step": 1008 }, { - "loss": 2.1373, - "grad_norm": 1.5290420055389404, - "learning_rate": 1.2167653327509926e-05, - "epoch": 0.9672152141896652, - "step": 15105 + "loss": 350.8712, + "grad_norm": 34.0540657043457, + "learning_rate": 1.088033716592976e-05, + "epoch": 0.9049907104875392, + "step": 1009 }, { - "loss": 2.0911, - "grad_norm": 1.4681881666183472, - "learning_rate": 1.1952544348214028e-05, - "epoch": 0.9675353781135941, - "step": 15110 + "loss": 356.8466, + "grad_norm": 33.83312225341797, + "learning_rate": 1.0675560286375369e-05, + "epoch": 0.9058876289320263, + "step": 1010 }, { - "loss": 2.1215, - "grad_norm": 1.5597201585769653, - "learning_rate": 1.1738754839963159e-05, - "epoch": 0.9678555420375232, - "step": 15115 + "loss": 353.7512, + "grad_norm": 34.7866096496582, + "learning_rate": 1.0472686745734233e-05, + "epoch": 0.9067845473765136, + "step": 1011 }, { - "loss": 2.0924, - "grad_norm": 1.4340606927871704, - "learning_rate": 1.1526306422992994e-05, - "epoch": 0.9681757059614523, - "step": 15120 + "loss": 354.8209, + "grad_norm": 34.10197067260742, + "learning_rate": 1.027171815745262e-05, + "epoch": 0.9076814658210007, + "step": 1012 }, { - "loss": 2.1189, - "grad_norm": 1.51260244846344, - "learning_rate": 1.1315220581916477e-05, - "epoch": 0.9684958698853813, - "step": 15125 + "loss": 354.7816, + "grad_norm": 34.292598724365234, + "learning_rate": 1.0072656119826662e-05, + "epoch": 0.9085783842654879, + "step": 1013 }, { - "loss": 2.117, - "grad_norm": 1.4482096433639526, - "learning_rate": 1.1105518663551176e-05, - "epoch": 0.9688160338093104, - "step": 15130 + "loss": 356.8245, + "grad_norm": 34.5960693359375, + "learning_rate": 9.875502215989791e-06, + "epoch": 0.909475302709975, + "step": 1014 }, { - "loss": 2.1359, - "grad_norm": 1.4650722742080688, - "learning_rate": 1.0897221874760444e-05, - "epoch": 0.9691361977332394, - "step": 15135 + "loss": 353.8681, + "grad_norm": 33.786537170410156, + "learning_rate": 9.680258013900129e-06, + "epoch": 0.9103722211544621, + "step": 1015 }, { - "loss": 2.1113, - "grad_norm": 1.5193512439727783, - "learning_rate": 1.0690351280308877e-05, - "epoch": 0.9694563616571684, - "step": 15140 + "loss": 355.527, + "grad_norm": 35.2137565612793, + "learning_rate": 9.486925066327978e-06, + "epoch": 0.9112691395989493, + "step": 1016 }, { - "loss": 2.1186, - "grad_norm": 1.5083808898925781, - "learning_rate": 1.0484927800731984e-05, - "epoch": 0.9697765255810975, - "step": 15145 + "loss": 352.3827, + "grad_norm": 34.659767150878906, + "learning_rate": 9.295504910843522e-06, + "epoch": 0.9121660580434365, + "step": 1017 }, { - "loss": 2.088, - "grad_norm": 1.4598770141601562, - "learning_rate": 1.0280972210220578e-05, - "epoch": 0.9700966895050266, - "step": 15150 + "loss": 355.3458, + "grad_norm": 33.41202926635742, + "learning_rate": 9.10599906980461e-06, + "epoch": 0.9130629764879237, + "step": 1018 }, { - "loss": 2.091, - "grad_norm": 1.4730453491210938, - "learning_rate": 1.0078505134519874e-05, - "epoch": 0.9704168534289557, - "step": 15155 + "loss": 357.3716, + "grad_norm": 32.52941131591797, + "learning_rate": 8.91840905034455e-06, + "epoch": 0.9139598949324108, + "step": 1019 }, { - "loss": 2.0936, - "grad_norm": 1.4336142539978027, - "learning_rate": 9.87754704884369e-06, - "epoch": 0.9707370173528846, - "step": 15160 + "loss": 354.1408, + "grad_norm": 33.926963806152344, + "learning_rate": 8.732736344360198e-06, + "epoch": 0.914856813376898, + "step": 1020 }, { - "loss": 2.099, - "grad_norm": 1.4537841081619263, - "learning_rate": 9.678118275803749e-06, - "epoch": 0.9710571812768137, - "step": 15165 + "loss": 357.4122, + "grad_norm": 33.29584503173828, + "learning_rate": 8.548982428500163e-06, + "epoch": 0.9157537318213851, + "step": 1021 }, { - "loss": 2.0901, - "grad_norm": 1.4845715761184692, - "learning_rate": 9.480238983354515e-06, - "epoch": 0.9713773452007428, - "step": 15170 + "loss": 356.5175, + "grad_norm": 35.51197814941406, + "learning_rate": 8.367148764152843e-06, + "epoch": 0.9166506502658722, + "step": 1022 }, { - "loss": 2.1275, - "grad_norm": 1.5654568672180176, - "learning_rate": 9.283929182753659e-06, - "epoch": 0.9716975091246718, - "step": 15175 + "loss": 361.666, + "grad_norm": 35.082054138183594, + "learning_rate": 8.187236797435077e-06, + "epoch": 0.9175475687103594, + "step": 1023 }, { - "loss": 2.1, - "grad_norm": 1.4920252561569214, - "learning_rate": 9.089208726538304e-06, - "epoch": 0.9720176730486009, - "step": 15180 + "loss": 350.1344, + "grad_norm": 34.95941925048828, + "learning_rate": 8.009247959180482e-06, + "epoch": 0.9184444871548466, + "step": 1024 }, { - "loss": 2.1293, - "grad_norm": 1.5260412693023682, - "learning_rate": 8.896097306517388e-06, - "epoch": 0.97233783697253, - "step": 15185 + "loss": 359.1797, + "grad_norm": 34.81248474121094, + "learning_rate": 7.833183664928023e-06, + "epoch": 0.9193414055993337, + "step": 1025 }, { - "loss": 2.1188, - "grad_norm": 1.475035548210144, - "learning_rate": 8.70461445178025e-06, - "epoch": 0.972658000896459, - "step": 15190 + "loss": 352.5403, + "grad_norm": 34.408485412597656, + "learning_rate": 7.659045314910879e-06, + "epoch": 0.9202383240438209, + "step": 1026 }, { - "loss": 2.1094, - "grad_norm": 1.465287208557129, - "learning_rate": 8.514779526721713e-06, - "epoch": 0.972978164820388, - "step": 15195 + "loss": 353.7971, + "grad_norm": 34.32902526855469, + "learning_rate": 7.486834294045286e-06, + "epoch": 0.921135242488308, + "step": 1027 }, { - "loss": 2.1036, - "grad_norm": 1.4647490978240967, - "learning_rate": 8.32661172908373e-06, - "epoch": 0.9732983287443171, - "step": 15200 + "loss": 352.8156, + "grad_norm": 33.39252471923828, + "learning_rate": 7.316551971919522e-06, + "epoch": 0.9220321609327952, + "step": 1028 }, { - "eval_loss": 1.9738588333129883, - "eval_runtime": 13.379, - "eval_samples_per_second": 153.076, - "eval_steps_per_second": 19.134, - "epoch": 0.9732983287443171, - "step": 15200 + "loss": 355.1404, + "grad_norm": 35.65606689453125, + "learning_rate": 7.148199702782854e-06, + "epoch": 0.9229290793772823, + "step": 1029 }, { - "loss": 2.112, - "grad_norm": 1.4496986865997314, - "learning_rate": 8.140130088014008e-06, - "epoch": 0.9736184926682462, - "step": 15205 + "loss": 358.3244, + "grad_norm": 35.14055252075195, + "learning_rate": 6.981778825535079e-06, + "epoch": 0.9238259978217694, + "step": 1030 }, { - "loss": 2.0984, - "grad_norm": 1.439038634300232, - "learning_rate": 7.955353462141554e-06, - "epoch": 0.9739386565921752, - "step": 15210 + "loss": 356.6115, + "grad_norm": 32.90983581542969, + "learning_rate": 6.817290663715614e-06, + "epoch": 0.9247229162662567, + "step": 1031 }, { - "loss": 2.0996, - "grad_norm": 1.4874390363693237, - "learning_rate": 7.7723005376696e-06, - "epoch": 0.9742588205161042, - "step": 15215 + "loss": 354.6003, + "grad_norm": 33.653778076171875, + "learning_rate": 6.654736525493033e-06, + "epoch": 0.9256198347107438, + "step": 1032 }, { - "loss": 2.0868, - "grad_norm": 1.4709018468856812, - "learning_rate": 7.5909898264857895e-06, - "epoch": 0.9745789844400333, - "step": 15220 + "loss": 356.817, + "grad_norm": 35.58637619018555, + "learning_rate": 6.494117703654739e-06, + "epoch": 0.926516753155231, + "step": 1033 }, { - "loss": 2.1179, - "grad_norm": 1.4825295209884644, - "learning_rate": 7.411439664290226e-06, - "epoch": 0.9748991483639623, - "step": 15225 + "loss": 355.3286, + "grad_norm": 33.73952102661133, + "learning_rate": 6.335435475596646e-06, + "epoch": 0.9274136715997181, + "step": 1034 }, { - "loss": 2.0792, - "grad_norm": 1.4520797729492188, - "learning_rate": 7.2336682087410985e-06, - "epoch": 0.9752193122878914, - "step": 15230 + "loss": 355.2651, + "grad_norm": 33.62116241455078, + "learning_rate": 6.1786911033129e-06, + "epoch": 0.9283105900442052, + "step": 1035 }, { - "loss": 2.0882, - "grad_norm": 1.4495720863342285, - "learning_rate": 7.05769343761849e-06, - "epoch": 0.9755394762118205, - "step": 15235 + "loss": 357.9323, + "grad_norm": 33.39925003051758, + "learning_rate": 6.023885833386061e-06, + "epoch": 0.9292075084886924, + "step": 1036 }, { - "loss": 2.1166, - "grad_norm": 1.4639912843704224, - "learning_rate": 6.883533147006266e-06, - "epoch": 0.9758596401357496, - "step": 15240 + "loss": 351.2944, + "grad_norm": 34.47417068481445, + "learning_rate": 5.87102089697708e-06, + "epoch": 0.9301044269331796, + "step": 1037 }, { - "loss": 2.0994, - "grad_norm": 1.4839041233062744, - "learning_rate": 6.7112049494924364e-06, - "epoch": 0.9761798040596785, - "step": 15245 + "loss": 355.5925, + "grad_norm": 33.980857849121094, + "learning_rate": 5.720097509815392e-06, + "epoch": 0.9310013453776668, + "step": 1038 }, { - "loss": 2.0832, - "grad_norm": 1.4434202909469604, - "learning_rate": 6.540726272387926e-06, - "epoch": 0.9764999679836076, - "step": 15250 + "loss": 355.6397, + "grad_norm": 32.85739517211914, + "learning_rate": 5.571116872189475e-06, + "epoch": 0.9318982638221539, + "step": 1039 }, { - "loss": 2.0947, - "grad_norm": 1.4894647598266602, - "learning_rate": 6.372114355964293e-06, - "epoch": 0.9768201319075367, - "step": 15255 + "loss": 355.7616, + "grad_norm": 33.64262390136719, + "learning_rate": 5.424080168937112e-06, + "epoch": 0.932795182266641, + "step": 1040 }, { - "loss": 2.1189, - "grad_norm": 1.4748380184173584, - "learning_rate": 6.205386251710138e-06, - "epoch": 0.9771402958314657, - "step": 15260 + "loss": 357.7719, + "grad_norm": 34.275169372558594, + "learning_rate": 5.278988569436066e-06, + "epoch": 0.9336921007111282, + "step": 1041 }, { - "loss": 2.0734, - "grad_norm": 1.4855523109436035, - "learning_rate": 6.040558820606795e-06, - "epoch": 0.9774604597553948, - "step": 15265 + "loss": 357.6499, + "grad_norm": 34.75218963623047, + "learning_rate": 5.1358432275947775e-06, + "epoch": 0.9345890191556153, + "step": 1042 }, { - "loss": 2.1203, - "grad_norm": 1.4771287441253662, - "learning_rate": 5.877648731423133e-06, - "epoch": 0.9777806236793238, - "step": 15270 + "loss": 353.3368, + "grad_norm": 34.046241760253906, + "learning_rate": 4.994645281843152e-06, + "epoch": 0.9354859376001025, + "step": 1043 }, { - "loss": 2.0971, - "grad_norm": 1.4301459789276123, - "learning_rate": 5.716672459029926e-06, - "epoch": 0.9781007876032529, - "step": 15275 + "loss": 354.6295, + "grad_norm": 34.62663269042969, + "learning_rate": 4.855395855123512e-06, + "epoch": 0.9363828560445897, + "step": 1044 }, { - "loss": 2.0943, - "grad_norm": 1.4706257581710815, - "learning_rate": 5.557646282733725e-06, - "epoch": 0.9784209515271819, - "step": 15280 + "loss": 352.3897, + "grad_norm": 35.12565231323242, + "learning_rate": 4.718096054881688e-06, + "epoch": 0.9372797744890768, + "step": 1045 }, { - "loss": 2.0998, - "grad_norm": 1.485478401184082, - "learning_rate": 5.400586284630579e-06, - "epoch": 0.978741115451111, - "step": 15285 + "loss": 352.5993, + "grad_norm": 33.51365661621094, + "learning_rate": 4.582746973058216e-06, + "epoch": 0.938176692933564, + "step": 1046 }, { - "loss": 2.0991, - "grad_norm": 1.4847251176834106, - "learning_rate": 5.245508347979675e-06, - "epoch": 0.9790612793750401, - "step": 15290 + "loss": 354.0611, + "grad_norm": 33.32587814331055, + "learning_rate": 4.449349686079574e-06, + "epoch": 0.9390736113780511, + "step": 1047 }, { - "loss": 2.1203, - "grad_norm": 1.4288967847824097, - "learning_rate": 5.092428155597084e-06, - "epoch": 0.979381443298969, - "step": 15295 + "loss": 361.4709, + "grad_norm": 35.336490631103516, + "learning_rate": 4.317905254849791e-06, + "epoch": 0.9399705298225383, + "step": 1048 }, { - "loss": 2.1178, - "grad_norm": 1.4488484859466553, - "learning_rate": 4.941361188269775e-06, - "epoch": 0.9797016072228981, - "step": 15300 + "loss": 360.2202, + "grad_norm": 34.51678466796875, + "learning_rate": 4.188414724741768e-06, + "epoch": 0.9408674482670254, + "step": 1049 }, { - "loss": 2.1052, - "grad_norm": 1.464503288269043, - "learning_rate": 4.792322723190057e-06, - "epoch": 0.9800217711468272, - "step": 15305 + "loss": 354.1904, + "grad_norm": 34.459373474121094, + "learning_rate": 4.060879125589195e-06, + "epoch": 0.9417643667115125, + "step": 1050 }, { - "loss": 2.0986, - "grad_norm": 1.5340265035629272, - "learning_rate": 4.645327832410648e-06, - "epoch": 0.9803419350707562, - "step": 15310 + "eval_loss": 1.5787107944488525, + "eval_runtime": 18.3575, + "eval_samples_per_second": 111.562, + "eval_steps_per_second": 13.945, + "epoch": 0.9417643667115125, + "step": 1050 }, { - "loss": 2.096, - "grad_norm": 1.4559099674224854, - "learning_rate": 4.500391381320421e-06, - "epoch": 0.9806620989946853, - "step": 15315 + "loss": 353.3853, + "grad_norm": 33.25263214111328, + "learning_rate": 3.9352994716783105e-06, + "epoch": 0.9426612851559998, + "step": 1051 }, { - "loss": 2.1186, - "grad_norm": 1.4796267747879028, - "learning_rate": 4.357528027141117e-06, - "epoch": 0.9809822629186143, - "step": 15320 + "loss": 350.3391, + "grad_norm": 35.57413101196289, + "learning_rate": 3.8116767617396298e-06, + "epoch": 0.9435582036004869, + "step": 1052 }, { - "loss": 2.1225, - "grad_norm": 1.4935765266418457, - "learning_rate": 4.216752217445052e-06, - "epoch": 0.9813024268425434, - "step": 15325 + "loss": 356.2869, + "grad_norm": 33.38325881958008, + "learning_rate": 3.690011978940255e-06, + "epoch": 0.9444551220449741, + "step": 1053 }, { - "loss": 2.0997, - "grad_norm": 1.4788649082183838, - "learning_rate": 4.078078188694101e-06, - "epoch": 0.9816225907664724, - "step": 15330 + "loss": 356.4574, + "grad_norm": 34.5271110534668, + "learning_rate": 3.570306090876024e-06, + "epoch": 0.9453520404894612, + "step": 1054 }, { - "loss": 2.1163, - "grad_norm": 1.52225661277771, - "learning_rate": 3.941519964799928e-06, - "epoch": 0.9819427546904015, - "step": 15335 + "loss": 359.7423, + "grad_norm": 35.02552795410156, + "learning_rate": 3.4525600495636246e-06, + "epoch": 0.9462489589339483, + "step": 1055 }, { - "loss": 2.1381, - "grad_norm": 1.479812502861023, - "learning_rate": 3.807091355705811e-06, - "epoch": 0.9822629186143306, - "step": 15340 + "loss": 353.1874, + "grad_norm": 35.6952018737793, + "learning_rate": 3.3367747914331838e-06, + "epoch": 0.9471458773784355, + "step": 1056 }, { - "loss": 2.0806, - "grad_norm": 1.448622226715088, - "learning_rate": 3.674805955990032e-06, - "epoch": 0.9825830825382595, - "step": 15345 + "loss": 355.9973, + "grad_norm": 35.45086669921875, + "learning_rate": 3.222951237320915e-06, + "epoch": 0.9480427958229227, + "step": 1057 }, { - "loss": 2.1262, - "grad_norm": 1.514609694480896, - "learning_rate": 3.5446771434911096e-06, - "epoch": 0.9829032464621886, - "step": 15350 + "loss": 355.2783, + "grad_norm": 32.976966857910156, + "learning_rate": 3.1110902924615102e-06, + "epoch": 0.9489397142674099, + "step": 1058 }, { - "loss": 2.0836, - "grad_norm": 1.492081880569458, - "learning_rate": 3.416718077954864e-06, - "epoch": 0.9832234103861177, - "step": 15355 + "loss": 358.506, + "grad_norm": 34.06571960449219, + "learning_rate": 3.0011928464811213e-06, + "epoch": 0.949836632711897, + "step": 1059 }, { - "loss": 2.113, - "grad_norm": 1.4686152935028076, - "learning_rate": 3.290941699703651e-06, - "epoch": 0.9835435743100468, - "step": 15360 + "loss": 358.1763, + "grad_norm": 33.59235382080078, + "learning_rate": 2.8932597733903886e-06, + "epoch": 0.9507335511563841, + "step": 1060 }, { - "loss": 2.1112, - "grad_norm": 1.521332859992981, - "learning_rate": 3.1673607283276813e-06, - "epoch": 0.9838637382339758, - "step": 15365 + "loss": 357.5705, + "grad_norm": 32.182106018066406, + "learning_rate": 2.7872919315772017e-06, + "epoch": 0.9516304696008713, + "step": 1061 }, { - "loss": 2.1045, - "grad_norm": 1.4462871551513672, - "learning_rate": 3.045987661398736e-06, - "epoch": 0.9841839021579049, - "step": 15370 + "loss": 354.619, + "grad_norm": 35.46062469482422, + "learning_rate": 2.683290163800145e-06, + "epoch": 0.9525273880453584, + "step": 1062 }, { - "loss": 2.1058, - "grad_norm": 1.4568146467208862, - "learning_rate": 2.926834773206269e-06, - "epoch": 0.9845040660818339, - "step": 15375 + "loss": 350.0426, + "grad_norm": 32.130767822265625, + "learning_rate": 2.581255297181617e-06, + "epoch": 0.9534243064898456, + "step": 1063 }, { - "loss": 2.123, - "grad_norm": 1.486159324645996, - "learning_rate": 2.809914113516171e-06, - "epoch": 0.9848242300057629, - "step": 15380 + "loss": 351.98, + "grad_norm": 32.878875732421875, + "learning_rate": 2.4811881432013905e-06, + "epoch": 0.9543212249343328, + "step": 1064 }, { - "loss": 2.1106, - "grad_norm": 1.4326839447021484, - "learning_rate": 2.6952375063521467e-06, - "epoch": 0.985144393929692, - "step": 15385 + "loss": 353.1487, + "grad_norm": 33.90510559082031, + "learning_rate": 2.3830894976899774e-06, + "epoch": 0.9552181433788199, + "step": 1065 }, { - "loss": 2.1136, - "grad_norm": 1.4544931650161743, - "learning_rate": 2.582816548800002e-06, - "epoch": 0.9854645578536211, - "step": 15390 + "loss": 357.164, + "grad_norm": 34.16891860961914, + "learning_rate": 2.2869601408225805e-06, + "epoch": 0.9561150618233071, + "step": 1066 }, { - "loss": 2.0961, - "grad_norm": 1.5159916877746582, - "learning_rate": 2.472662609834825e-06, - "epoch": 0.98578472177755, - "step": 15395 + "loss": 351.2288, + "grad_norm": 33.57730484008789, + "learning_rate": 2.1928008371125406e-06, + "epoch": 0.9570119802677942, + "step": 1067 }, { - "loss": 2.0906, - "grad_norm": 1.4797563552856445, - "learning_rate": 2.364786829171281e-06, - "epoch": 0.9861048857014791, - "step": 15400 + "loss": 356.0024, + "grad_norm": 33.691978454589844, + "learning_rate": 2.1006123354055384e-06, + "epoch": 0.9579088987122814, + "step": 1068 }, { - "eval_loss": 1.969813346862793, - "eval_runtime": 9.427, - "eval_samples_per_second": 217.248, - "eval_steps_per_second": 27.156, - "epoch": 0.9861048857014791, - "step": 15400 + "loss": 361.7596, + "grad_norm": 33.60329055786133, + "learning_rate": 2.0103953688734853e-06, + "epoch": 0.9588058171567685, + "step": 1069 }, { - "loss": 2.1251, - "grad_norm": 1.4745688438415527, - "learning_rate": 2.2592001161370392e-06, - "epoch": 0.9864250496254082, - "step": 15405 + "loss": 354.5997, + "grad_norm": 35.25307083129883, + "learning_rate": 1.9221506550088365e-06, + "epoch": 0.9597027356012557, + "step": 1070 }, { - "loss": 2.0873, - "grad_norm": 1.4373728036880493, - "learning_rate": 2.155913148569558e-06, - "epoch": 0.9867452135493373, - "step": 15410 + "loss": 355.2119, + "grad_norm": 34.94419860839844, + "learning_rate": 1.83587889561862e-06, + "epoch": 0.9605996540457429, + "step": 1071 }, { - "loss": 2.0997, - "grad_norm": 1.454837679862976, - "learning_rate": 2.0549363717362215e-06, - "epoch": 0.9870653774732663, - "step": 15415 + "loss": 355.9485, + "grad_norm": 34.35773468017578, + "learning_rate": 1.7515807768192228e-06, + "epoch": 0.96149657249023, + "step": 1072 }, { - "loss": 2.1055, - "grad_norm": 1.472625970840454, - "learning_rate": 1.956279997278043e-06, - "epoch": 0.9873855413971954, - "step": 15420 + "loss": 353.5008, + "grad_norm": 33.7717170715332, + "learning_rate": 1.6692569690305859e-06, + "epoch": 0.9623934909347172, + "step": 1073 }, { - "loss": 2.1318, - "grad_norm": 1.3973727226257324, - "learning_rate": 1.8599540021769695e-06, - "epoch": 0.9877057053211244, - "step": 15425 + "loss": 357.9717, + "grad_norm": 35.07488250732422, + "learning_rate": 1.5889081269710726e-06, + "epoch": 0.9632904093792043, + "step": 1074 }, { - "loss": 2.0958, - "grad_norm": 1.4296380281448364, - "learning_rate": 1.765968127746928e-06, - "epoch": 0.9880258692450534, - "step": 15430 + "loss": 361.8947, + "grad_norm": 34.685150146484375, + "learning_rate": 1.5105348896522486e-06, + "epoch": 0.9641873278236914, + "step": 1075 }, { - "loss": 2.0831, - "grad_norm": 1.433281421661377, - "learning_rate": 1.6743318786486966e-06, - "epoch": 0.9883460331689825, - "step": 15435 + "loss": 357.5904, + "grad_norm": 34.1632080078125, + "learning_rate": 1.4341378803737204e-06, + "epoch": 0.9650842462681786, + "step": 1076 }, { - "loss": 2.0827, - "grad_norm": 1.4774094820022583, - "learning_rate": 1.5850545219287e-06, - "epoch": 0.9886661970929116, - "step": 15440 + "loss": 357.5146, + "grad_norm": 34.23555374145508, + "learning_rate": 1.3597177067181943e-06, + "epoch": 0.9659811647126658, + "step": 1077 }, { - "loss": 2.125, - "grad_norm": 1.477018117904663, - "learning_rate": 1.4981450860818757e-06, - "epoch": 0.9889863610168407, - "step": 15445 + "loss": 356.91, + "grad_norm": 32.962257385253906, + "learning_rate": 1.2872749605468137e-06, + "epoch": 0.966878083157153, + "step": 1078 }, { - "loss": 2.076, - "grad_norm": 1.5008845329284668, - "learning_rate": 1.4136123601385998e-06, - "epoch": 0.9893065249407696, - "step": 15450 + "loss": 351.4866, + "grad_norm": 34.07936096191406, + "learning_rate": 1.2168102179941076e-06, + "epoch": 0.9677750016016401, + "step": 1079 }, { - "loss": 2.0926, - "grad_norm": 1.450234055519104, - "learning_rate": 1.3314648927758966e-06, - "epoch": 0.9896266888646987, - "step": 15455 + "loss": 355.5893, + "grad_norm": 33.35137939453125, + "learning_rate": 1.1483240394637717e-06, + "epoch": 0.9686719200461272, + "step": 1080 }, { - "loss": 2.0658, - "grad_norm": 1.4952715635299683, - "learning_rate": 1.2517109914528841e-06, - "epoch": 0.9899468527886278, - "step": 15460 + "loss": 355.4586, + "grad_norm": 34.09134292602539, + "learning_rate": 1.0818169696239776e-06, + "epoch": 0.9695688384906144, + "step": 1081 }, { - "loss": 2.1135, - "grad_norm": 1.434744954109192, - "learning_rate": 1.1743587215707007e-06, - "epoch": 0.9902670167125568, - "step": 15465 + "loss": 354.5378, + "grad_norm": 32.67642593383789, + "learning_rate": 1.0172895374031265e-06, + "epoch": 0.9704657569351015, + "step": 1082 }, { - "loss": 2.0885, - "grad_norm": 1.4307448863983154, - "learning_rate": 1.0994159056568198e-06, - "epoch": 0.9905871806364859, - "step": 15470 + "loss": 354.3784, + "grad_norm": 32.6947021484375, + "learning_rate": 9.5474225598563e-07, + "epoch": 0.9713626753795886, + "step": 1083 }, { - "loss": 2.0971, - "grad_norm": 1.4121356010437012, - "learning_rate": 1.026890122573998e-06, - "epoch": 0.990907344560415, - "step": 15475 + "loss": 355.8788, + "grad_norm": 33.51148986816406, + "learning_rate": 8.941756228078579e-07, + "epoch": 0.9722595938240759, + "step": 1084 }, { - "loss": 2.1023, - "grad_norm": 1.4584524631500244, - "learning_rate": 9.56788706753814e-07, - "epoch": 0.9912275084843439, - "step": 15480 + "loss": 353.8372, + "grad_norm": 33.57039260864258, + "learning_rate": 8.35590119554086e-07, + "epoch": 0.973156512268563, + "step": 1085 }, { - "loss": 2.1039, - "grad_norm": 1.4847089052200317, - "learning_rate": 8.891187474549617e-07, - "epoch": 0.991547672408273, - "step": 15485 + "loss": 353.2452, + "grad_norm": 33.60462188720703, + "learning_rate": 7.789862121528324e-07, + "epoch": 0.9740534307130502, + "step": 1086 }, { - "loss": 2.0808, - "grad_norm": 1.496368408203125, - "learning_rate": 8.238870880463157e-07, - "epoch": 0.9918678363322021, - "step": 15490 + "loss": 357.0675, + "grad_norm": 33.704349517822266, + "learning_rate": 7.243643507729436e-07, + "epoch": 0.9749503491575373, + "step": 1087 }, { - "loss": 2.0938, - "grad_norm": 1.4407403469085693, - "learning_rate": 7.611003253148757e-07, - "epoch": 0.9921880002561312, - "step": 15495 + "loss": 354.5553, + "grad_norm": 34.90256881713867, + "learning_rate": 6.717249698202088e-07, + "epoch": 0.9758472676020244, + "step": 1088 }, { - "loss": 2.0824, - "grad_norm": 1.4371376037597656, - "learning_rate": 7.007648087986374e-07, - "epoch": 0.9925081641800602, - "step": 15500 + "loss": 349.4813, + "grad_norm": 34.148128509521484, + "learning_rate": 6.210684879337513e-07, + "epoch": 0.9767441860465116, + "step": 1089 }, { - "loss": 2.0959, - "grad_norm": 1.4718334674835205, - "learning_rate": 6.428866401444799e-07, - "epoch": 0.9928283281039892, - "step": 15505 + "loss": 357.7331, + "grad_norm": 34.612762451171875, + "learning_rate": 5.72395307982837e-07, + "epoch": 0.9776411044909988, + "step": 1090 }, { - "loss": 2.1124, - "grad_norm": 1.5052322149276733, - "learning_rate": 5.874716724911078e-07, - "epoch": 0.9931484920279183, - "step": 15510 + "loss": 358.809, + "grad_norm": 32.881195068359375, + "learning_rate": 5.257058170635709e-07, + "epoch": 0.978538022935486, + "step": 1091 }, { - "loss": 2.0862, - "grad_norm": 1.485822319984436, - "learning_rate": 5.345255098771346e-07, - "epoch": 0.9934686559518473, - "step": 15515 + "loss": 356.2231, + "grad_norm": 32.4294319152832, + "learning_rate": 4.810003864958168e-07, + "epoch": 0.9794349413799731, + "step": 1092 }, { - "loss": 2.0994, - "grad_norm": 1.4460045099258423, - "learning_rate": 4.840535066743506e-07, - "epoch": 0.9937888198757764, - "step": 15520 + "loss": 354.6883, + "grad_norm": 35.39781951904297, + "learning_rate": 4.3827937182033815e-07, + "epoch": 0.9803318598244602, + "step": 1093 }, { - "loss": 2.093, - "grad_norm": 1.4056727886199951, - "learning_rate": 4.360607670462591e-07, - "epoch": 0.9941089837997055, - "step": 15525 + "loss": 352.7607, + "grad_norm": 34.17608642578125, + "learning_rate": 3.9754311279582844e-07, + "epoch": 0.9812287782689474, + "step": 1094 }, { - "loss": 2.094, - "grad_norm": 1.4093832969665527, - "learning_rate": 3.905521444318605e-07, - "epoch": 0.9944291477236346, - "step": 15530 + "loss": 353.8497, + "grad_norm": 31.340768814086914, + "learning_rate": 3.587919333963574e-07, + "epoch": 0.9821256967134345, + "step": 1095 }, { - "loss": 2.1117, - "grad_norm": 1.445207953453064, - "learning_rate": 3.4753224105488204e-07, - "epoch": 0.9947493116475635, - "step": 15535 + "loss": 357.9939, + "grad_norm": 33.75115966796875, + "learning_rate": 3.2202614180870673e-07, + "epoch": 0.9830226151579217, + "step": 1096 }, { - "loss": 2.1148, - "grad_norm": 1.4611190557479858, - "learning_rate": 3.070054074583162e-07, - "epoch": 0.9950694755714926, - "step": 15540 + "loss": 356.0656, + "grad_norm": 32.56006622314453, + "learning_rate": 2.872460304299274e-07, + "epoch": 0.9839195336024089, + "step": 1097 }, { - "loss": 2.1264, - "grad_norm": 1.5278282165527344, - "learning_rate": 2.689757420644951e-07, - "epoch": 0.9953896394954217, - "step": 15545 + "loss": 353.62, + "grad_norm": 34.134193420410156, + "learning_rate": 2.5445187586503603e-07, + "epoch": 0.984816452046896, + "step": 1098 }, { - "loss": 2.1253, - "grad_norm": 1.4608923196792603, - "learning_rate": 2.334470907605829e-07, - "epoch": 0.9957098034193507, - "step": 15550 + "loss": 355.838, + "grad_norm": 34.15678024291992, + "learning_rate": 2.2364393892479462e-07, + "epoch": 0.9857133704913832, + "step": 1099 }, { - "loss": 2.0896, - "grad_norm": 1.4511940479278564, - "learning_rate": 2.004230465096818e-07, - "epoch": 0.9960299673432798, - "step": 15555 + "loss": 358.3669, + "grad_norm": 32.837039947509766, + "learning_rate": 1.9482246462365626e-07, + "epoch": 0.9866102889358703, + "step": 1100 }, { - "loss": 2.0991, - "grad_norm": 1.4206980466842651, - "learning_rate": 1.699069489874583e-07, - "epoch": 0.9963501312672088, - "step": 15560 + "eval_loss": 1.5716547966003418, + "eval_runtime": 18.217, + "eval_samples_per_second": 112.422, + "eval_steps_per_second": 14.053, + "epoch": 0.9866102889358703, + "step": 1100 }, { - "loss": 2.0942, - "grad_norm": 1.4339975118637085, - "learning_rate": 1.419018842444164e-07, - "epoch": 0.9966702951911378, - "step": 15565 + "loss": 356.8408, + "grad_norm": 33.33000183105469, + "learning_rate": 1.6798768217776706e-07, + "epoch": 0.9875072073803575, + "step": 1101 }, { - "loss": 2.1004, - "grad_norm": 1.4129743576049805, - "learning_rate": 1.1641068439380842e-07, - "epoch": 0.9969904591150669, - "step": 15570 + "loss": 356.4636, + "grad_norm": 34.879573822021484, + "learning_rate": 1.4313980500327283e-07, + "epoch": 0.9884041258248446, + "step": 1102 }, { - "loss": 2.1244, - "grad_norm": 1.4470487833023071, - "learning_rate": 9.343592732521944e-08, - "epoch": 0.997310623038996, - "step": 15575 + "loss": 356.378, + "grad_norm": 33.825469970703125, + "learning_rate": 1.2027903071440415e-07, + "epoch": 0.9893010442693317, + "step": 1103 }, { - "loss": 2.0782, - "grad_norm": 1.457018256187439, - "learning_rate": 7.29799364438899e-08, - "epoch": 0.9976307869629251, - "step": 15580 + "loss": 359.4078, + "grad_norm": 34.18437957763672, + "learning_rate": 9.94055411221717e-08, + "epoch": 0.990197962713819, + "step": 1104 }, { - "loss": 2.1337, - "grad_norm": 1.4190618991851807, - "learning_rate": 5.5044780435722923e-08, - "epoch": 0.997950950886854, - "step": 15585 + "loss": 356.8303, + "grad_norm": 35.02104187011719, + "learning_rate": 8.051950223267323e-08, + "epoch": 0.9910948811583061, + "step": 1105 }, { - "loss": 2.1195, - "grad_norm": 1.4926364421844482, - "learning_rate": 3.963227305810724e-08, - "epoch": 0.9982711148107831, - "step": 15590 + "loss": 351.9132, + "grad_norm": 33.7501220703125, + "learning_rate": 6.362106424590009e-08, + "epoch": 0.9919917996027933, + "step": 1106 }, { - "loss": 2.0792, - "grad_norm": 1.4838751554489136, - "learning_rate": 2.6743972956475016e-08, - "epoch": 0.9985912787347122, - "step": 15595 + "loss": 356.2349, + "grad_norm": 34.74052810668945, + "learning_rate": 4.871036155454367e-08, + "epoch": 0.9928887180472804, + "step": 1107 }, { - "loss": 2.0793, - "grad_norm": 1.4116333723068237, - "learning_rate": 1.6381183506697374e-08, - "epoch": 0.9989114426586412, - "step": 15600 + "loss": 357.3864, + "grad_norm": 33.26545715332031, + "learning_rate": 3.578751274294079e-08, + "epoch": 0.9937856364917675, + "step": 1108 }, { - "eval_loss": 1.9771896600723267, - "eval_runtime": 12.6984, - "eval_samples_per_second": 161.281, - "eval_steps_per_second": 20.16, - "epoch": 0.9989114426586412, - "step": 15600 + "loss": 358.4432, + "grad_norm": 33.61418914794922, + "learning_rate": 2.4852620586046647e-08, + "epoch": 0.9946825549362547, + "step": 1109 }, { - "loss": 2.1191, - "grad_norm": 1.5354454517364502, - "learning_rate": 8.544952683253726e-09, - "epoch": 0.9992316065825703, - "step": 15605 + "loss": 356.3781, + "grad_norm": 33.90690612792969, + "learning_rate": 1.5905772048629975e-08, + "epoch": 0.9955794733807419, + "step": 1110 }, { - "loss": 2.097, - "grad_norm": 1.3956998586654663, - "learning_rate": 3.2360729532776578e-09, - "epoch": 0.9995517705064993, - "step": 15610 + "loss": 355.2562, + "grad_norm": 36.185489654541016, + "learning_rate": 8.947038284717879e-09, + "epoch": 0.9964763918252291, + "step": 1111 }, { - "loss": 2.0965, - "grad_norm": 1.407570481300354, - "learning_rate": 4.550811963849322e-10, - "epoch": 0.9998719344304284, - "step": 15615 + "loss": 353.4495, + "grad_norm": 35.645416259765625, + "learning_rate": 3.976474636874228e-09, + "epoch": 0.9973733102697162, + "step": 1112 }, { - "train_runtime": 71.4624, - "train_samples_per_second": 55944.819, - "train_steps_per_second": 218.534, - "total_flos": 5.2748789856731136e+17, - "train_loss": 0.0022892785559331377, - "epoch": 1.0, - "step": 15617 + "loss": 358.9317, + "grad_norm": 34.38767623901367, + "learning_rate": 9.941206357555465e-10, + "epoch": 0.9982702287142033, + "step": 1113 }, { - "eval_loss": 1.9724633693695068, - "eval_runtime": 9.3608, - "eval_samples_per_second": 218.784, - "eval_steps_per_second": 27.348, - "epoch": 1.0, - "step": 15617 + "loss": 355.1901, + "grad_norm": 33.96023941040039, + "learning_rate": 0.0, + "epoch": 0.9991671471586905, + "step": 1114 }, { - "train_runtime": 0.0087, - "train_samples_per_second": 461932896.212, - "train_steps_per_second": 1804425.376, - "total_flos": 5.2748789856731136e+17, - "train_loss": 0.0, - "epoch": 1.0, - "step": 15617 + "train_runtime": 10703.3349, + "train_samples_per_second": 186.666, + "train_steps_per_second": 0.104, + "total_flos": 6.811715592467251e+17, + "train_loss": 100.33408414611269, + "epoch": 0.9991671471586905, + "step": 1114 }, { - "eval_loss": 1.9763054847717285, - "eval_runtime": 13.0331, - "eval_samples_per_second": 157.138, - "eval_steps_per_second": 19.642, - "epoch": 1.0, - "step": 15617 + "eval_loss": 1.585738182067871, + "eval_runtime": 19.5932, + "eval_samples_per_second": 104.526, + "eval_steps_per_second": 13.066, + "epoch": 0.9991671471586905, + "step": 1114 } ], "best_metric": null, @@ -22540,9 +8012,9 @@ "stateful_callbacks": { "TrainerControl": { "args": { - "should_training_stop": false, + "should_training_stop": true, "should_epoch_stop": false, - "should_save": false, + "should_save": true, "should_evaluate": false, "should_log": false },