{ "best_metric": 0.21242891252040863, "best_model_checkpoint": "./0.4b_finetuned_results/checkpoint-7500", "epoch": 11.22754491017964, "eval_steps": 500, "global_step": 7500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.014970059880239521, "grad_norm": 4.375, "learning_rate": 6.666666666666667e-05, "loss": 3.8882, "step": 10 }, { "epoch": 0.029940119760479042, "grad_norm": 6.0625, "learning_rate": 0.00013333333333333334, "loss": 3.2257, "step": 20 }, { "epoch": 0.04491017964071856, "grad_norm": 6.28125, "learning_rate": 0.0002, "loss": 2.92, "step": 30 }, { "epoch": 0.059880239520958084, "grad_norm": 1.9765625, "learning_rate": 0.00019999938668382333, "loss": 2.3984, "step": 40 }, { "epoch": 0.0748502994011976, "grad_norm": 1.484375, "learning_rate": 0.00019999754674281632, "loss": 2.1626, "step": 50 }, { "epoch": 0.08982035928143713, "grad_norm": 3.375, "learning_rate": 0.0001999944801995484, "loss": 2.0388, "step": 60 }, { "epoch": 0.10479041916167664, "grad_norm": 1.7890625, "learning_rate": 0.0001999901870916347, "loss": 2.0121, "step": 70 }, { "epoch": 0.11976047904191617, "grad_norm": 1.609375, "learning_rate": 0.00019998466747173592, "loss": 1.8579, "step": 80 }, { "epoch": 0.1347305389221557, "grad_norm": 0.81640625, "learning_rate": 0.00019997792140755746, "loss": 1.8254, "step": 90 }, { "epoch": 0.1497005988023952, "grad_norm": 1.515625, "learning_rate": 0.0001999699489818488, "loss": 1.7037, "step": 100 }, { "epoch": 0.16467065868263472, "grad_norm": 0.94140625, "learning_rate": 0.00019996075029240219, "loss": 1.6647, "step": 110 }, { "epoch": 0.17964071856287425, "grad_norm": 0.61328125, "learning_rate": 0.0001999503254520518, "loss": 1.5988, "step": 120 }, { "epoch": 0.19461077844311378, "grad_norm": 0.337890625, "learning_rate": 0.00019993867458867207, "loss": 1.6197, "step": 130 }, { "epoch": 0.20958083832335328, "grad_norm": 0.47265625, "learning_rate": 0.00019992579784517626, "loss": 1.5954, "step": 140 }, { "epoch": 0.2245508982035928, "grad_norm": 0.33203125, "learning_rate": 0.00019991169537951468, "loss": 1.5666, "step": 150 }, { "epoch": 0.23952095808383234, "grad_norm": 0.52734375, "learning_rate": 0.00019989636736467278, "loss": 1.5227, "step": 160 }, { "epoch": 0.25449101796407186, "grad_norm": 0.34375, "learning_rate": 0.00019987981398866887, "loss": 1.5048, "step": 170 }, { "epoch": 0.2694610778443114, "grad_norm": 0.46875, "learning_rate": 0.00019986203545455203, "loss": 1.4755, "step": 180 }, { "epoch": 0.2844311377245509, "grad_norm": 0.51953125, "learning_rate": 0.0001998430319803996, "loss": 1.4505, "step": 190 }, { "epoch": 0.2994011976047904, "grad_norm": 0.38671875, "learning_rate": 0.00019982280379931422, "loss": 1.4295, "step": 200 }, { "epoch": 0.3143712574850299, "grad_norm": 0.34765625, "learning_rate": 0.00019980135115942136, "loss": 1.4683, "step": 210 }, { "epoch": 0.32934131736526945, "grad_norm": 0.306640625, "learning_rate": 0.00019977867432386604, "loss": 1.4427, "step": 220 }, { "epoch": 0.344311377245509, "grad_norm": 0.357421875, "learning_rate": 0.00019975477357080966, "loss": 1.3852, "step": 230 }, { "epoch": 0.3592814371257485, "grad_norm": 0.361328125, "learning_rate": 0.00019972964919342663, "loss": 1.427, "step": 240 }, { "epoch": 0.37425149700598803, "grad_norm": 0.306640625, "learning_rate": 0.00019970330149990062, "loss": 1.3759, "step": 250 }, { "epoch": 0.38922155688622756, "grad_norm": 0.3515625, "learning_rate": 0.00019967573081342103, "loss": 1.3559, "step": 260 }, { "epoch": 0.4041916167664671, "grad_norm": 0.28515625, "learning_rate": 0.00019964693747217874, "loss": 1.3715, "step": 270 }, { "epoch": 0.41916167664670656, "grad_norm": 0.30859375, "learning_rate": 0.00019961692182936225, "loss": 1.2932, "step": 280 }, { "epoch": 0.4341317365269461, "grad_norm": 0.306640625, "learning_rate": 0.00019958568425315314, "loss": 1.3086, "step": 290 }, { "epoch": 0.4491017964071856, "grad_norm": 0.291015625, "learning_rate": 0.00019955322512672162, "loss": 1.3091, "step": 300 }, { "epoch": 0.46407185628742514, "grad_norm": 0.248046875, "learning_rate": 0.00019951954484822182, "loss": 1.3196, "step": 310 }, { "epoch": 0.47904191616766467, "grad_norm": 0.267578125, "learning_rate": 0.00019948464383078696, "loss": 1.2944, "step": 320 }, { "epoch": 0.4940119760479042, "grad_norm": 0.375, "learning_rate": 0.00019944852250252418, "loss": 1.3461, "step": 330 }, { "epoch": 0.5089820359281437, "grad_norm": 0.275390625, "learning_rate": 0.00019941118130650942, "loss": 1.3221, "step": 340 }, { "epoch": 0.5239520958083832, "grad_norm": 0.23828125, "learning_rate": 0.00019937262070078183, "loss": 1.3111, "step": 350 }, { "epoch": 0.5389221556886228, "grad_norm": 0.2578125, "learning_rate": 0.0001993328411583383, "loss": 1.3128, "step": 360 }, { "epoch": 0.5538922155688623, "grad_norm": 0.2578125, "learning_rate": 0.00019929184316712758, "loss": 1.2618, "step": 370 }, { "epoch": 0.5688622754491018, "grad_norm": 0.29296875, "learning_rate": 0.00019924962723004425, "loss": 1.2893, "step": 380 }, { "epoch": 0.5838323353293413, "grad_norm": 0.30859375, "learning_rate": 0.0001992061938649227, "loss": 1.2727, "step": 390 }, { "epoch": 0.5988023952095808, "grad_norm": 0.3359375, "learning_rate": 0.0001991615436045306, "loss": 1.293, "step": 400 }, { "epoch": 0.6137724550898204, "grad_norm": 0.314453125, "learning_rate": 0.0001991156769965625, "loss": 1.2692, "step": 410 }, { "epoch": 0.6287425149700598, "grad_norm": 0.326171875, "learning_rate": 0.00019906859460363307, "loss": 1.2588, "step": 420 }, { "epoch": 0.6437125748502994, "grad_norm": 0.26953125, "learning_rate": 0.00019902029700327018, "loss": 1.2576, "step": 430 }, { "epoch": 0.6586826347305389, "grad_norm": 0.2890625, "learning_rate": 0.0001989707847879078, "loss": 1.2595, "step": 440 }, { "epoch": 0.6736526946107785, "grad_norm": 0.337890625, "learning_rate": 0.00019892005856487878, "loss": 1.2331, "step": 450 }, { "epoch": 0.688622754491018, "grad_norm": 0.28515625, "learning_rate": 0.0001988681189564074, "loss": 1.2161, "step": 460 }, { "epoch": 0.7035928143712575, "grad_norm": 0.25390625, "learning_rate": 0.0001988149665996017, "loss": 1.2675, "step": 470 }, { "epoch": 0.718562874251497, "grad_norm": 0.26953125, "learning_rate": 0.00019876060214644566, "loss": 1.269, "step": 480 }, { "epoch": 0.7335329341317365, "grad_norm": 0.40625, "learning_rate": 0.00019870502626379127, "loss": 1.2342, "step": 490 }, { "epoch": 0.7485029940119761, "grad_norm": 0.298828125, "learning_rate": 0.00019864823963335033, "loss": 1.2351, "step": 500 }, { "epoch": 0.7485029940119761, "eval_loss": 1.1021808385849, "eval_runtime": 109.4058, "eval_samples_per_second": 9.14, "eval_steps_per_second": 1.143, "step": 500 }, { "epoch": 0.7634730538922155, "grad_norm": 0.3671875, "learning_rate": 0.00019859024295168593, "loss": 1.2235, "step": 510 }, { "epoch": 0.7784431137724551, "grad_norm": 0.267578125, "learning_rate": 0.0001985310369302042, "loss": 1.2353, "step": 520 }, { "epoch": 0.7934131736526946, "grad_norm": 0.24609375, "learning_rate": 0.00019847062229514533, "loss": 1.2445, "step": 530 }, { "epoch": 0.8083832335329342, "grad_norm": 0.265625, "learning_rate": 0.00019840899978757485, "loss": 1.2687, "step": 540 }, { "epoch": 0.8233532934131736, "grad_norm": 0.2138671875, "learning_rate": 0.0001983461701633742, "loss": 1.2026, "step": 550 }, { "epoch": 0.8383233532934131, "grad_norm": 0.283203125, "learning_rate": 0.00019828213419323208, "loss": 1.2304, "step": 560 }, { "epoch": 0.8532934131736527, "grad_norm": 0.26953125, "learning_rate": 0.00019821689266263427, "loss": 1.1961, "step": 570 }, { "epoch": 0.8682634730538922, "grad_norm": 0.251953125, "learning_rate": 0.00019815044637185456, "loss": 1.158, "step": 580 }, { "epoch": 0.8832335329341318, "grad_norm": 0.251953125, "learning_rate": 0.00019808279613594464, "loss": 1.1804, "step": 590 }, { "epoch": 0.8982035928143712, "grad_norm": 0.25390625, "learning_rate": 0.00019801394278472418, "loss": 1.1705, "step": 600 }, { "epoch": 0.9131736526946108, "grad_norm": 0.259765625, "learning_rate": 0.0001979438871627707, "loss": 1.1816, "step": 610 }, { "epoch": 0.9281437125748503, "grad_norm": 0.3515625, "learning_rate": 0.00019787263012940905, "loss": 1.2516, "step": 620 }, { "epoch": 0.9431137724550899, "grad_norm": 0.2734375, "learning_rate": 0.00019780017255870114, "loss": 1.2214, "step": 630 }, { "epoch": 0.9580838323353293, "grad_norm": 0.27734375, "learning_rate": 0.00019772651533943493, "loss": 1.1855, "step": 640 }, { "epoch": 0.9730538922155688, "grad_norm": 0.345703125, "learning_rate": 0.0001976516593751137, "loss": 1.1784, "step": 650 }, { "epoch": 0.9880239520958084, "grad_norm": 0.32421875, "learning_rate": 0.00019757560558394493, "loss": 1.194, "step": 660 }, { "epoch": 1.0029940119760479, "grad_norm": 0.283203125, "learning_rate": 0.00019749835489882905, "loss": 1.198, "step": 670 }, { "epoch": 1.0179640718562875, "grad_norm": 0.255859375, "learning_rate": 0.00019741990826734794, "loss": 1.0274, "step": 680 }, { "epoch": 1.032934131736527, "grad_norm": 0.30078125, "learning_rate": 0.00019734026665175334, "loss": 0.9878, "step": 690 }, { "epoch": 1.0479041916167664, "grad_norm": 0.32421875, "learning_rate": 0.0001972594310289551, "loss": 1.0292, "step": 700 }, { "epoch": 1.062874251497006, "grad_norm": 0.2890625, "learning_rate": 0.00019717740239050914, "loss": 1.0265, "step": 710 }, { "epoch": 1.0778443113772456, "grad_norm": 0.314453125, "learning_rate": 0.0001970941817426052, "loss": 0.9696, "step": 720 }, { "epoch": 1.092814371257485, "grad_norm": 0.4453125, "learning_rate": 0.0001970097701060548, "loss": 0.9735, "step": 730 }, { "epoch": 1.1077844311377245, "grad_norm": 0.33984375, "learning_rate": 0.00019692416851627826, "loss": 1.0029, "step": 740 }, { "epoch": 1.122754491017964, "grad_norm": 0.26171875, "learning_rate": 0.00019683737802329244, "loss": 1.0072, "step": 750 }, { "epoch": 1.1377245508982037, "grad_norm": 0.271484375, "learning_rate": 0.0001967493996916976, "loss": 1.0173, "step": 760 }, { "epoch": 1.152694610778443, "grad_norm": 0.267578125, "learning_rate": 0.00019666023460066442, "loss": 0.9945, "step": 770 }, { "epoch": 1.1676646706586826, "grad_norm": 0.287109375, "learning_rate": 0.00019656988384392075, "loss": 0.9927, "step": 780 }, { "epoch": 1.1826347305389222, "grad_norm": 0.283203125, "learning_rate": 0.00019647834852973818, "loss": 0.9995, "step": 790 }, { "epoch": 1.1976047904191618, "grad_norm": 0.29296875, "learning_rate": 0.00019638562978091853, "loss": 0.9957, "step": 800 }, { "epoch": 1.2125748502994012, "grad_norm": 0.2490234375, "learning_rate": 0.00019629172873477995, "loss": 0.9653, "step": 810 }, { "epoch": 1.2275449101796407, "grad_norm": 0.2451171875, "learning_rate": 0.00019619664654314302, "loss": 0.9714, "step": 820 }, { "epoch": 1.2425149700598803, "grad_norm": 0.302734375, "learning_rate": 0.0001961003843723167, "loss": 1.0226, "step": 830 }, { "epoch": 1.2574850299401197, "grad_norm": 0.283203125, "learning_rate": 0.00019600294340308398, "loss": 1.0417, "step": 840 }, { "epoch": 1.2724550898203593, "grad_norm": 0.287109375, "learning_rate": 0.00019590432483068722, "loss": 0.9593, "step": 850 }, { "epoch": 1.2874251497005988, "grad_norm": 0.27734375, "learning_rate": 0.00019580452986481378, "loss": 1.0255, "step": 860 }, { "epoch": 1.3023952095808382, "grad_norm": 0.341796875, "learning_rate": 0.00019570355972958097, "loss": 0.9971, "step": 870 }, { "epoch": 1.3173652694610778, "grad_norm": 0.322265625, "learning_rate": 0.00019560141566352115, "loss": 0.9914, "step": 880 }, { "epoch": 1.3323353293413174, "grad_norm": 0.2255859375, "learning_rate": 0.0001954980989195665, "loss": 0.9699, "step": 890 }, { "epoch": 1.347305389221557, "grad_norm": 0.25390625, "learning_rate": 0.0001953936107650336, "loss": 0.9667, "step": 900 }, { "epoch": 1.3622754491017965, "grad_norm": 0.28515625, "learning_rate": 0.00019528795248160795, "loss": 0.9813, "step": 910 }, { "epoch": 1.377245508982036, "grad_norm": 0.23828125, "learning_rate": 0.0001951811253653283, "loss": 0.9861, "step": 920 }, { "epoch": 1.3922155688622755, "grad_norm": 0.26953125, "learning_rate": 0.00019507313072657055, "loss": 0.9772, "step": 930 }, { "epoch": 1.407185628742515, "grad_norm": 0.259765625, "learning_rate": 0.00019496396989003193, "loss": 1.0045, "step": 940 }, { "epoch": 1.4221556886227544, "grad_norm": 0.302734375, "learning_rate": 0.00019485364419471454, "loss": 0.9919, "step": 950 }, { "epoch": 1.437125748502994, "grad_norm": 0.279296875, "learning_rate": 0.00019474215499390912, "loss": 0.9796, "step": 960 }, { "epoch": 1.4520958083832336, "grad_norm": 0.26171875, "learning_rate": 0.00019462950365517817, "loss": 0.9821, "step": 970 }, { "epoch": 1.467065868263473, "grad_norm": 0.3203125, "learning_rate": 0.00019451569156033954, "loss": 1.0337, "step": 980 }, { "epoch": 1.4820359281437125, "grad_norm": 0.310546875, "learning_rate": 0.00019440072010544918, "loss": 0.9987, "step": 990 }, { "epoch": 1.4970059880239521, "grad_norm": 0.27734375, "learning_rate": 0.00019428459070078416, "loss": 1.004, "step": 1000 }, { "epoch": 1.4970059880239521, "eval_loss": 0.9072233438491821, "eval_runtime": 109.5318, "eval_samples_per_second": 9.13, "eval_steps_per_second": 1.141, "step": 1000 }, { "epoch": 1.5119760479041915, "grad_norm": 0.2890625, "learning_rate": 0.00019416730477082533, "loss": 0.9444, "step": 1010 }, { "epoch": 1.5269461077844313, "grad_norm": 0.296875, "learning_rate": 0.00019404886375423984, "loss": 0.9829, "step": 1020 }, { "epoch": 1.5419161676646707, "grad_norm": 0.27734375, "learning_rate": 0.00019392926910386353, "loss": 0.9532, "step": 1030 }, { "epoch": 1.55688622754491, "grad_norm": 0.265625, "learning_rate": 0.00019380852228668304, "loss": 0.9769, "step": 1040 }, { "epoch": 1.5718562874251498, "grad_norm": 0.275390625, "learning_rate": 0.00019368662478381799, "loss": 0.9783, "step": 1050 }, { "epoch": 1.5868263473053892, "grad_norm": 0.2890625, "learning_rate": 0.00019356357809050247, "loss": 0.9881, "step": 1060 }, { "epoch": 1.6017964071856288, "grad_norm": 0.232421875, "learning_rate": 0.00019343938371606712, "loss": 0.9883, "step": 1070 }, { "epoch": 1.6167664670658684, "grad_norm": 0.2734375, "learning_rate": 0.00019331404318392027, "loss": 0.9893, "step": 1080 }, { "epoch": 1.6317365269461077, "grad_norm": 0.248046875, "learning_rate": 0.00019318755803152945, "loss": 0.9851, "step": 1090 }, { "epoch": 1.6467065868263473, "grad_norm": 0.28515625, "learning_rate": 0.00019305992981040246, "loss": 0.9531, "step": 1100 }, { "epoch": 1.6616766467065869, "grad_norm": 0.275390625, "learning_rate": 0.00019293116008606837, "loss": 0.9717, "step": 1110 }, { "epoch": 1.6766467065868262, "grad_norm": 0.232421875, "learning_rate": 0.00019280125043805824, "loss": 0.9699, "step": 1120 }, { "epoch": 1.6916167664670658, "grad_norm": 0.2294921875, "learning_rate": 0.00019267020245988592, "loss": 0.9407, "step": 1130 }, { "epoch": 1.7065868263473054, "grad_norm": 0.251953125, "learning_rate": 0.00019253801775902824, "loss": 0.977, "step": 1140 }, { "epoch": 1.7215568862275448, "grad_norm": 0.287109375, "learning_rate": 0.0001924046979569055, "loss": 0.9549, "step": 1150 }, { "epoch": 1.7365269461077846, "grad_norm": 0.248046875, "learning_rate": 0.00019227024468886157, "loss": 0.9824, "step": 1160 }, { "epoch": 1.751497005988024, "grad_norm": 0.259765625, "learning_rate": 0.00019213465960414368, "loss": 0.9936, "step": 1170 }, { "epoch": 1.7664670658682635, "grad_norm": 0.29296875, "learning_rate": 0.00019199794436588243, "loss": 1.0042, "step": 1180 }, { "epoch": 1.781437125748503, "grad_norm": 0.23828125, "learning_rate": 0.0001918601006510711, "loss": 0.9629, "step": 1190 }, { "epoch": 1.7964071856287425, "grad_norm": 0.2890625, "learning_rate": 0.00019172113015054532, "loss": 0.9522, "step": 1200 }, { "epoch": 1.811377245508982, "grad_norm": 0.26953125, "learning_rate": 0.0001915810345689622, "loss": 0.9806, "step": 1210 }, { "epoch": 1.8263473053892216, "grad_norm": 0.263671875, "learning_rate": 0.00019143981562477947, "loss": 0.9736, "step": 1220 }, { "epoch": 1.841317365269461, "grad_norm": 0.2578125, "learning_rate": 0.00019129747505023436, "loss": 0.9701, "step": 1230 }, { "epoch": 1.8562874251497006, "grad_norm": 0.22265625, "learning_rate": 0.00019115401459132247, "loss": 0.9494, "step": 1240 }, { "epoch": 1.8712574850299402, "grad_norm": 0.251953125, "learning_rate": 0.00019100943600777615, "loss": 0.9922, "step": 1250 }, { "epoch": 1.8862275449101795, "grad_norm": 0.2490234375, "learning_rate": 0.00019086374107304312, "loss": 0.9711, "step": 1260 }, { "epoch": 1.9011976047904193, "grad_norm": 0.25390625, "learning_rate": 0.00019071693157426457, "loss": 0.9664, "step": 1270 }, { "epoch": 1.9161676646706587, "grad_norm": 0.26171875, "learning_rate": 0.00019056900931225333, "loss": 0.9591, "step": 1280 }, { "epoch": 1.931137724550898, "grad_norm": 0.380859375, "learning_rate": 0.00019041997610147167, "loss": 0.9942, "step": 1290 }, { "epoch": 1.9461077844311379, "grad_norm": 0.2294921875, "learning_rate": 0.0001902698337700092, "loss": 0.9391, "step": 1300 }, { "epoch": 1.9610778443113772, "grad_norm": 0.298828125, "learning_rate": 0.00019011858415956038, "loss": 0.9993, "step": 1310 }, { "epoch": 1.9760479041916168, "grad_norm": 0.29296875, "learning_rate": 0.0001899662291254018, "loss": 0.9571, "step": 1320 }, { "epoch": 1.9910179640718564, "grad_norm": 0.283203125, "learning_rate": 0.0001898127705363696, "loss": 0.9835, "step": 1330 }, { "epoch": 2.0059880239520957, "grad_norm": 0.27734375, "learning_rate": 0.00018965821027483654, "loss": 0.9305, "step": 1340 }, { "epoch": 2.020958083832335, "grad_norm": 0.26171875, "learning_rate": 0.00018950255023668876, "loss": 0.8295, "step": 1350 }, { "epoch": 2.035928143712575, "grad_norm": 0.240234375, "learning_rate": 0.00018934579233130267, "loss": 0.7653, "step": 1360 }, { "epoch": 2.0508982035928143, "grad_norm": 0.29296875, "learning_rate": 0.00018918793848152142, "loss": 0.7581, "step": 1370 }, { "epoch": 2.065868263473054, "grad_norm": 0.29296875, "learning_rate": 0.00018902899062363143, "loss": 0.7983, "step": 1380 }, { "epoch": 2.0808383233532934, "grad_norm": 0.279296875, "learning_rate": 0.0001888689507073385, "loss": 0.8187, "step": 1390 }, { "epoch": 2.095808383233533, "grad_norm": 0.29296875, "learning_rate": 0.0001887078206957441, "loss": 0.7879, "step": 1400 }, { "epoch": 2.1107784431137726, "grad_norm": 0.2734375, "learning_rate": 0.000188545602565321, "loss": 0.8083, "step": 1410 }, { "epoch": 2.125748502994012, "grad_norm": 0.263671875, "learning_rate": 0.00018838229830588934, "loss": 0.8057, "step": 1420 }, { "epoch": 2.1407185628742513, "grad_norm": 0.244140625, "learning_rate": 0.00018821790992059196, "loss": 0.8194, "step": 1430 }, { "epoch": 2.155688622754491, "grad_norm": 0.2451171875, "learning_rate": 0.00018805243942587, "loss": 0.7958, "step": 1440 }, { "epoch": 2.1706586826347305, "grad_norm": 0.248046875, "learning_rate": 0.00018788588885143808, "loss": 0.8169, "step": 1450 }, { "epoch": 2.18562874251497, "grad_norm": 0.265625, "learning_rate": 0.00018771826024025946, "loss": 0.7722, "step": 1460 }, { "epoch": 2.2005988023952097, "grad_norm": 0.265625, "learning_rate": 0.0001875495556485208, "loss": 0.7934, "step": 1470 }, { "epoch": 2.215568862275449, "grad_norm": 0.244140625, "learning_rate": 0.00018737977714560738, "loss": 0.7915, "step": 1480 }, { "epoch": 2.230538922155689, "grad_norm": 0.310546875, "learning_rate": 0.00018720892681407708, "loss": 0.8021, "step": 1490 }, { "epoch": 2.245508982035928, "grad_norm": 0.298828125, "learning_rate": 0.00018703700674963547, "loss": 0.7987, "step": 1500 }, { "epoch": 2.245508982035928, "eval_loss": 0.7798940539360046, "eval_runtime": 109.62, "eval_samples_per_second": 9.122, "eval_steps_per_second": 1.14, "step": 1500 }, { "epoch": 2.2604790419161676, "grad_norm": 0.259765625, "learning_rate": 0.00018686401906110964, "loss": 0.7979, "step": 1510 }, { "epoch": 2.2754491017964074, "grad_norm": 0.265625, "learning_rate": 0.00018668996587042252, "loss": 0.8255, "step": 1520 }, { "epoch": 2.2904191616766467, "grad_norm": 0.2470703125, "learning_rate": 0.00018651484931256685, "loss": 0.8252, "step": 1530 }, { "epoch": 2.305389221556886, "grad_norm": 0.25, "learning_rate": 0.00018633867153557905, "loss": 0.8455, "step": 1540 }, { "epoch": 2.320359281437126, "grad_norm": 0.302734375, "learning_rate": 0.00018616143470051263, "loss": 0.8118, "step": 1550 }, { "epoch": 2.3353293413173652, "grad_norm": 0.283203125, "learning_rate": 0.00018598314098141206, "loss": 0.8122, "step": 1560 }, { "epoch": 2.3502994011976046, "grad_norm": 0.24609375, "learning_rate": 0.00018580379256528576, "loss": 0.7965, "step": 1570 }, { "epoch": 2.3652694610778444, "grad_norm": 0.279296875, "learning_rate": 0.00018562339165207936, "loss": 0.8309, "step": 1580 }, { "epoch": 2.3802395209580838, "grad_norm": 0.31640625, "learning_rate": 0.00018544194045464886, "loss": 0.8046, "step": 1590 }, { "epoch": 2.3952095808383236, "grad_norm": 0.2734375, "learning_rate": 0.0001852594411987334, "loss": 0.8467, "step": 1600 }, { "epoch": 2.410179640718563, "grad_norm": 0.275390625, "learning_rate": 0.00018507589612292783, "loss": 0.8566, "step": 1610 }, { "epoch": 2.4251497005988023, "grad_norm": 0.30859375, "learning_rate": 0.00018489130747865548, "loss": 0.8297, "step": 1620 }, { "epoch": 2.440119760479042, "grad_norm": 0.359375, "learning_rate": 0.00018470567753014035, "loss": 0.7823, "step": 1630 }, { "epoch": 2.4550898203592815, "grad_norm": 0.29296875, "learning_rate": 0.0001845190085543795, "loss": 0.8558, "step": 1640 }, { "epoch": 2.470059880239521, "grad_norm": 0.3046875, "learning_rate": 0.0001843313028411149, "loss": 0.8262, "step": 1650 }, { "epoch": 2.4850299401197606, "grad_norm": 0.2578125, "learning_rate": 0.00018414256269280564, "loss": 0.7982, "step": 1660 }, { "epoch": 2.5, "grad_norm": 0.314453125, "learning_rate": 0.00018395279042459937, "loss": 0.8182, "step": 1670 }, { "epoch": 2.5149700598802394, "grad_norm": 0.328125, "learning_rate": 0.00018376198836430417, "loss": 0.8275, "step": 1680 }, { "epoch": 2.529940119760479, "grad_norm": 0.271484375, "learning_rate": 0.00018357015885235982, "loss": 0.8102, "step": 1690 }, { "epoch": 2.5449101796407185, "grad_norm": 0.3125, "learning_rate": 0.0001833773042418092, "loss": 0.8145, "step": 1700 }, { "epoch": 2.5598802395209583, "grad_norm": 1.40625, "learning_rate": 0.00018318342689826938, "loss": 0.8279, "step": 1710 }, { "epoch": 2.5748502994011977, "grad_norm": 0.275390625, "learning_rate": 0.00018298852919990252, "loss": 0.8354, "step": 1720 }, { "epoch": 2.589820359281437, "grad_norm": 0.3125, "learning_rate": 0.0001827926135373869, "loss": 0.8106, "step": 1730 }, { "epoch": 2.6047904191616764, "grad_norm": 0.267578125, "learning_rate": 0.00018259568231388738, "loss": 0.7983, "step": 1740 }, { "epoch": 2.6197604790419162, "grad_norm": 0.251953125, "learning_rate": 0.00018239773794502607, "loss": 0.8183, "step": 1750 }, { "epoch": 2.6347305389221556, "grad_norm": 0.294921875, "learning_rate": 0.00018219878285885267, "loss": 0.8462, "step": 1760 }, { "epoch": 2.6497005988023954, "grad_norm": 0.283203125, "learning_rate": 0.0001819988194958146, "loss": 0.8227, "step": 1770 }, { "epoch": 2.6646706586826348, "grad_norm": 0.2431640625, "learning_rate": 0.0001817978503087272, "loss": 0.8104, "step": 1780 }, { "epoch": 2.679640718562874, "grad_norm": 0.271484375, "learning_rate": 0.0001815958777627435, "loss": 0.7972, "step": 1790 }, { "epoch": 2.694610778443114, "grad_norm": 0.283203125, "learning_rate": 0.00018139290433532416, "loss": 0.8339, "step": 1800 }, { "epoch": 2.7095808383233533, "grad_norm": 0.298828125, "learning_rate": 0.00018118893251620682, "loss": 0.8723, "step": 1810 }, { "epoch": 2.724550898203593, "grad_norm": 0.3203125, "learning_rate": 0.00018098396480737585, "loss": 0.8544, "step": 1820 }, { "epoch": 2.7395209580838324, "grad_norm": 0.265625, "learning_rate": 0.0001807780037230315, "loss": 0.8557, "step": 1830 }, { "epoch": 2.754491017964072, "grad_norm": 0.279296875, "learning_rate": 0.00018057105178955905, "loss": 0.8283, "step": 1840 }, { "epoch": 2.769461077844311, "grad_norm": 0.3828125, "learning_rate": 0.00018036311154549784, "loss": 0.7906, "step": 1850 }, { "epoch": 2.784431137724551, "grad_norm": 0.2490234375, "learning_rate": 0.0001801541855415102, "loss": 0.8036, "step": 1860 }, { "epoch": 2.7994011976047903, "grad_norm": 0.298828125, "learning_rate": 0.00017994427634035015, "loss": 0.7828, "step": 1870 }, { "epoch": 2.81437125748503, "grad_norm": 0.306640625, "learning_rate": 0.00017973338651683176, "loss": 0.7915, "step": 1880 }, { "epoch": 2.8293413173652695, "grad_norm": 0.228515625, "learning_rate": 0.00017952151865779792, "loss": 0.8141, "step": 1890 }, { "epoch": 2.844311377245509, "grad_norm": 0.349609375, "learning_rate": 0.00017930867536208826, "loss": 0.8155, "step": 1900 }, { "epoch": 2.8592814371257482, "grad_norm": 0.310546875, "learning_rate": 0.00017909485924050758, "loss": 0.8004, "step": 1910 }, { "epoch": 2.874251497005988, "grad_norm": 0.2470703125, "learning_rate": 0.00017888007291579357, "loss": 0.803, "step": 1920 }, { "epoch": 2.8892215568862274, "grad_norm": 0.275390625, "learning_rate": 0.00017866431902258478, "loss": 0.804, "step": 1930 }, { "epoch": 2.904191616766467, "grad_norm": 0.310546875, "learning_rate": 0.00017844760020738827, "loss": 0.8154, "step": 1940 }, { "epoch": 2.9191616766467066, "grad_norm": 0.251953125, "learning_rate": 0.00017822991912854713, "loss": 0.8257, "step": 1950 }, { "epoch": 2.934131736526946, "grad_norm": 0.3125, "learning_rate": 0.00017801127845620793, "loss": 0.8386, "step": 1960 }, { "epoch": 2.9491017964071857, "grad_norm": 0.275390625, "learning_rate": 0.0001777916808722879, "loss": 0.8003, "step": 1970 }, { "epoch": 2.964071856287425, "grad_norm": 0.28515625, "learning_rate": 0.000177571129070442, "loss": 0.8055, "step": 1980 }, { "epoch": 2.979041916167665, "grad_norm": 0.271484375, "learning_rate": 0.00017734962575603, "loss": 0.8233, "step": 1990 }, { "epoch": 2.9940119760479043, "grad_norm": 0.255859375, "learning_rate": 0.00017712717364608328, "loss": 0.8106, "step": 2000 }, { "epoch": 2.9940119760479043, "eval_loss": 0.6911507844924927, "eval_runtime": 109.6711, "eval_samples_per_second": 9.118, "eval_steps_per_second": 1.14, "step": 2000 }, { "epoch": 3.0089820359281436, "grad_norm": 0.37890625, "learning_rate": 0.00017690377546927133, "loss": 0.7276, "step": 2010 }, { "epoch": 3.0239520958083834, "grad_norm": 0.27734375, "learning_rate": 0.00017667943396586848, "loss": 0.6882, "step": 2020 }, { "epoch": 3.038922155688623, "grad_norm": 0.291015625, "learning_rate": 0.0001764541518877202, "loss": 0.6786, "step": 2030 }, { "epoch": 3.053892215568862, "grad_norm": 0.31640625, "learning_rate": 0.00017622793199820934, "loss": 0.6352, "step": 2040 }, { "epoch": 3.068862275449102, "grad_norm": 0.294921875, "learning_rate": 0.00017600077707222224, "loss": 0.6648, "step": 2050 }, { "epoch": 3.0838323353293413, "grad_norm": 0.294921875, "learning_rate": 0.00017577268989611472, "loss": 0.6601, "step": 2060 }, { "epoch": 3.0988023952095807, "grad_norm": 0.361328125, "learning_rate": 0.00017554367326767792, "loss": 0.6645, "step": 2070 }, { "epoch": 3.1137724550898205, "grad_norm": 0.251953125, "learning_rate": 0.00017531372999610384, "loss": 0.6631, "step": 2080 }, { "epoch": 3.12874251497006, "grad_norm": 0.30078125, "learning_rate": 0.0001750828629019511, "loss": 0.6451, "step": 2090 }, { "epoch": 3.143712574850299, "grad_norm": 0.365234375, "learning_rate": 0.00017485107481711012, "loss": 0.6703, "step": 2100 }, { "epoch": 3.158682634730539, "grad_norm": 0.341796875, "learning_rate": 0.00017461836858476856, "loss": 0.6598, "step": 2110 }, { "epoch": 3.1736526946107784, "grad_norm": 0.287109375, "learning_rate": 0.00017438474705937639, "loss": 0.6689, "step": 2120 }, { "epoch": 3.1886227544910177, "grad_norm": 0.3125, "learning_rate": 0.0001741502131066107, "loss": 0.6741, "step": 2130 }, { "epoch": 3.2035928143712575, "grad_norm": 0.31640625, "learning_rate": 0.000173914769603341, "loss": 0.6691, "step": 2140 }, { "epoch": 3.218562874251497, "grad_norm": 0.322265625, "learning_rate": 0.00017367841943759338, "loss": 0.6702, "step": 2150 }, { "epoch": 3.2335329341317367, "grad_norm": 0.265625, "learning_rate": 0.00017344116550851543, "loss": 0.6451, "step": 2160 }, { "epoch": 3.248502994011976, "grad_norm": 0.328125, "learning_rate": 0.00017320301072634066, "loss": 0.6507, "step": 2170 }, { "epoch": 3.2634730538922154, "grad_norm": 0.32421875, "learning_rate": 0.00017296395801235265, "loss": 0.695, "step": 2180 }, { "epoch": 3.2784431137724552, "grad_norm": 0.31640625, "learning_rate": 0.00017272401029884933, "loss": 0.6798, "step": 2190 }, { "epoch": 3.2934131736526946, "grad_norm": 0.330078125, "learning_rate": 0.000172483170529107, "loss": 0.6526, "step": 2200 }, { "epoch": 3.308383233532934, "grad_norm": 0.35546875, "learning_rate": 0.00017224144165734417, "loss": 0.6538, "step": 2210 }, { "epoch": 3.3233532934131738, "grad_norm": 0.291015625, "learning_rate": 0.00017199882664868538, "loss": 0.6777, "step": 2220 }, { "epoch": 3.338323353293413, "grad_norm": 0.28515625, "learning_rate": 0.00017175532847912487, "loss": 0.6762, "step": 2230 }, { "epoch": 3.3532934131736525, "grad_norm": 0.3203125, "learning_rate": 0.00017151095013548994, "loss": 0.673, "step": 2240 }, { "epoch": 3.3682634730538923, "grad_norm": 0.318359375, "learning_rate": 0.00017126569461540443, "loss": 0.6757, "step": 2250 }, { "epoch": 3.3832335329341316, "grad_norm": 0.3046875, "learning_rate": 0.00017101956492725185, "loss": 0.6563, "step": 2260 }, { "epoch": 3.3982035928143715, "grad_norm": 0.267578125, "learning_rate": 0.00017077256409013866, "loss": 0.6877, "step": 2270 }, { "epoch": 3.413173652694611, "grad_norm": 0.328125, "learning_rate": 0.000170524695133857, "loss": 0.67, "step": 2280 }, { "epoch": 3.42814371257485, "grad_norm": 0.326171875, "learning_rate": 0.00017027596109884768, "loss": 0.6808, "step": 2290 }, { "epoch": 3.44311377245509, "grad_norm": 0.318359375, "learning_rate": 0.00017002636503616282, "loss": 0.6941, "step": 2300 }, { "epoch": 3.4580838323353293, "grad_norm": 0.279296875, "learning_rate": 0.00016977591000742854, "loss": 0.6798, "step": 2310 }, { "epoch": 3.4730538922155687, "grad_norm": 0.287109375, "learning_rate": 0.0001695245990848072, "loss": 0.6718, "step": 2320 }, { "epoch": 3.4880239520958085, "grad_norm": 0.30859375, "learning_rate": 0.00016927243535095997, "loss": 0.6483, "step": 2330 }, { "epoch": 3.502994011976048, "grad_norm": 0.333984375, "learning_rate": 0.00016901942189900867, "loss": 0.7177, "step": 2340 }, { "epoch": 3.5179640718562872, "grad_norm": 0.34765625, "learning_rate": 0.00016876556183249822, "loss": 0.6833, "step": 2350 }, { "epoch": 3.532934131736527, "grad_norm": 0.365234375, "learning_rate": 0.00016851085826535838, "loss": 0.6826, "step": 2360 }, { "epoch": 3.5479041916167664, "grad_norm": 0.396484375, "learning_rate": 0.00016825531432186543, "loss": 0.6835, "step": 2370 }, { "epoch": 3.562874251497006, "grad_norm": 0.359375, "learning_rate": 0.00016799893313660408, "loss": 0.6791, "step": 2380 }, { "epoch": 3.5778443113772456, "grad_norm": 0.291015625, "learning_rate": 0.0001677417178544289, "loss": 0.6787, "step": 2390 }, { "epoch": 3.592814371257485, "grad_norm": 0.296875, "learning_rate": 0.00016748367163042576, "loss": 0.6542, "step": 2400 }, { "epoch": 3.6077844311377243, "grad_norm": 0.30078125, "learning_rate": 0.00016722479762987317, "loss": 0.6805, "step": 2410 }, { "epoch": 3.622754491017964, "grad_norm": 0.314453125, "learning_rate": 0.0001669650990282033, "loss": 0.6859, "step": 2420 }, { "epoch": 3.6377245508982035, "grad_norm": 0.32421875, "learning_rate": 0.00016670457901096328, "loss": 0.6633, "step": 2430 }, { "epoch": 3.6526946107784433, "grad_norm": 0.32421875, "learning_rate": 0.00016644324077377592, "loss": 0.6958, "step": 2440 }, { "epoch": 3.6676646706586826, "grad_norm": 0.3125, "learning_rate": 0.00016618108752230052, "loss": 0.6965, "step": 2450 }, { "epoch": 3.682634730538922, "grad_norm": 0.3046875, "learning_rate": 0.00016591812247219377, "loss": 0.6851, "step": 2460 }, { "epoch": 3.697604790419162, "grad_norm": 0.400390625, "learning_rate": 0.00016565434884907002, "loss": 0.6669, "step": 2470 }, { "epoch": 3.712574850299401, "grad_norm": 0.306640625, "learning_rate": 0.0001653897698884619, "loss": 0.6672, "step": 2480 }, { "epoch": 3.727544910179641, "grad_norm": 0.30078125, "learning_rate": 0.00016512438883578044, "loss": 0.7049, "step": 2490 }, { "epoch": 3.7425149700598803, "grad_norm": 0.30859375, "learning_rate": 0.0001648582089462756, "loss": 0.6821, "step": 2500 }, { "epoch": 3.7425149700598803, "eval_loss": 0.5838193893432617, "eval_runtime": 109.8377, "eval_samples_per_second": 9.104, "eval_steps_per_second": 1.138, "step": 2500 }, { "epoch": 3.7574850299401197, "grad_norm": 0.302734375, "learning_rate": 0.000164591233484996, "loss": 0.6775, "step": 2510 }, { "epoch": 3.772455089820359, "grad_norm": 0.328125, "learning_rate": 0.00016432346572674896, "loss": 0.6718, "step": 2520 }, { "epoch": 3.787425149700599, "grad_norm": 0.337890625, "learning_rate": 0.00016405490895606052, "loss": 0.6989, "step": 2530 }, { "epoch": 3.802395209580838, "grad_norm": 0.2890625, "learning_rate": 0.00016378556646713484, "loss": 0.6681, "step": 2540 }, { "epoch": 3.817365269461078, "grad_norm": 0.298828125, "learning_rate": 0.00016351544156381414, "loss": 0.6687, "step": 2550 }, { "epoch": 3.8323353293413174, "grad_norm": 0.318359375, "learning_rate": 0.00016324453755953773, "loss": 0.6771, "step": 2560 }, { "epoch": 3.8473053892215567, "grad_norm": 0.3359375, "learning_rate": 0.0001629728577773019, "loss": 0.668, "step": 2570 }, { "epoch": 3.8622754491017965, "grad_norm": 0.345703125, "learning_rate": 0.00016270040554961868, "loss": 0.7025, "step": 2580 }, { "epoch": 3.877245508982036, "grad_norm": 0.3046875, "learning_rate": 0.00016242718421847528, "loss": 0.6885, "step": 2590 }, { "epoch": 3.8922155688622757, "grad_norm": 0.349609375, "learning_rate": 0.00016215319713529293, "loss": 0.6989, "step": 2600 }, { "epoch": 3.907185628742515, "grad_norm": 0.314453125, "learning_rate": 0.00016187844766088586, "loss": 0.6555, "step": 2610 }, { "epoch": 3.9221556886227544, "grad_norm": 0.349609375, "learning_rate": 0.00016160293916541997, "loss": 0.6777, "step": 2620 }, { "epoch": 3.937125748502994, "grad_norm": 0.3046875, "learning_rate": 0.00016132667502837165, "loss": 0.6907, "step": 2630 }, { "epoch": 3.9520958083832336, "grad_norm": 0.3515625, "learning_rate": 0.00016104965863848617, "loss": 0.6832, "step": 2640 }, { "epoch": 3.967065868263473, "grad_norm": 0.3828125, "learning_rate": 0.00016077189339373614, "loss": 0.6816, "step": 2650 }, { "epoch": 3.9820359281437128, "grad_norm": 0.287109375, "learning_rate": 0.00016049338270127998, "loss": 0.6616, "step": 2660 }, { "epoch": 3.997005988023952, "grad_norm": 0.31640625, "learning_rate": 0.00016021412997741993, "loss": 0.6717, "step": 2670 }, { "epoch": 4.0119760479041915, "grad_norm": 0.34375, "learning_rate": 0.0001599341386475603, "loss": 0.5491, "step": 2680 }, { "epoch": 4.026946107784431, "grad_norm": 0.33984375, "learning_rate": 0.00015965341214616523, "loss": 0.539, "step": 2690 }, { "epoch": 4.04191616766467, "grad_norm": 0.373046875, "learning_rate": 0.0001593719539167169, "loss": 0.5273, "step": 2700 }, { "epoch": 4.0568862275449105, "grad_norm": 0.349609375, "learning_rate": 0.00015908976741167295, "loss": 0.5344, "step": 2710 }, { "epoch": 4.07185628742515, "grad_norm": 0.275390625, "learning_rate": 0.00015880685609242438, "loss": 0.5228, "step": 2720 }, { "epoch": 4.086826347305389, "grad_norm": 0.330078125, "learning_rate": 0.00015852322342925295, "loss": 0.4878, "step": 2730 }, { "epoch": 4.1017964071856285, "grad_norm": 0.361328125, "learning_rate": 0.00015823887290128868, "loss": 0.5228, "step": 2740 }, { "epoch": 4.116766467065868, "grad_norm": 0.3984375, "learning_rate": 0.00015795380799646716, "loss": 0.5453, "step": 2750 }, { "epoch": 4.131736526946108, "grad_norm": 0.326171875, "learning_rate": 0.00015766803221148673, "loss": 0.533, "step": 2760 }, { "epoch": 4.1467065868263475, "grad_norm": 0.296875, "learning_rate": 0.00015738154905176562, "loss": 0.5336, "step": 2770 }, { "epoch": 4.161676646706587, "grad_norm": 0.390625, "learning_rate": 0.00015709436203139893, "loss": 0.5573, "step": 2780 }, { "epoch": 4.176646706586826, "grad_norm": 0.31640625, "learning_rate": 0.00015680647467311557, "loss": 0.5451, "step": 2790 }, { "epoch": 4.191616766467066, "grad_norm": 0.318359375, "learning_rate": 0.00015651789050823508, "loss": 0.5308, "step": 2800 }, { "epoch": 4.206586826347305, "grad_norm": 0.375, "learning_rate": 0.00015622861307662414, "loss": 0.5224, "step": 2810 }, { "epoch": 4.221556886227545, "grad_norm": 0.376953125, "learning_rate": 0.00015593864592665333, "loss": 0.555, "step": 2820 }, { "epoch": 4.236526946107785, "grad_norm": 0.3125, "learning_rate": 0.00015564799261515356, "loss": 0.5702, "step": 2830 }, { "epoch": 4.251497005988024, "grad_norm": 0.34375, "learning_rate": 0.0001553566567073723, "loss": 0.5246, "step": 2840 }, { "epoch": 4.266467065868263, "grad_norm": 0.337890625, "learning_rate": 0.0001550646417769301, "loss": 0.5411, "step": 2850 }, { "epoch": 4.281437125748503, "grad_norm": 0.474609375, "learning_rate": 0.00015477195140577662, "loss": 0.5637, "step": 2860 }, { "epoch": 4.296407185628743, "grad_norm": 0.318359375, "learning_rate": 0.00015447858918414656, "loss": 0.5507, "step": 2870 }, { "epoch": 4.311377245508982, "grad_norm": 0.341796875, "learning_rate": 0.00015418455871051592, "loss": 0.5451, "step": 2880 }, { "epoch": 4.326347305389222, "grad_norm": 0.328125, "learning_rate": 0.00015388986359155758, "loss": 0.5485, "step": 2890 }, { "epoch": 4.341317365269461, "grad_norm": 0.3203125, "learning_rate": 0.0001535945074420972, "loss": 0.5347, "step": 2900 }, { "epoch": 4.3562874251497, "grad_norm": 0.3203125, "learning_rate": 0.00015329849388506886, "loss": 0.5242, "step": 2910 }, { "epoch": 4.37125748502994, "grad_norm": 0.3359375, "learning_rate": 0.00015300182655147068, "loss": 0.5513, "step": 2920 }, { "epoch": 4.38622754491018, "grad_norm": 0.345703125, "learning_rate": 0.00015270450908032012, "loss": 0.5713, "step": 2930 }, { "epoch": 4.401197604790419, "grad_norm": 0.37109375, "learning_rate": 0.0001524065451186095, "loss": 0.5441, "step": 2940 }, { "epoch": 4.416167664670659, "grad_norm": 0.3984375, "learning_rate": 0.00015210793832126112, "loss": 0.548, "step": 2950 }, { "epoch": 4.431137724550898, "grad_norm": 0.419921875, "learning_rate": 0.00015180869235108263, "loss": 0.5245, "step": 2960 }, { "epoch": 4.446107784431137, "grad_norm": 0.341796875, "learning_rate": 0.00015150881087872185, "loss": 0.5238, "step": 2970 }, { "epoch": 4.461077844311378, "grad_norm": 0.37109375, "learning_rate": 0.000151208297582622, "loss": 0.5681, "step": 2980 }, { "epoch": 4.476047904191617, "grad_norm": 0.455078125, "learning_rate": 0.00015090715614897633, "loss": 0.5413, "step": 2990 }, { "epoch": 4.491017964071856, "grad_norm": 0.361328125, "learning_rate": 0.00015060539027168316, "loss": 0.563, "step": 3000 }, { "epoch": 4.491017964071856, "eval_loss": 0.49006855487823486, "eval_runtime": 109.8413, "eval_samples_per_second": 9.104, "eval_steps_per_second": 1.138, "step": 3000 }, { "epoch": 4.505988023952096, "grad_norm": 0.3515625, "learning_rate": 0.00015030300365230037, "loss": 0.5549, "step": 3010 }, { "epoch": 4.520958083832335, "grad_norm": 0.36328125, "learning_rate": 0.00015000000000000001, "loss": 0.5628, "step": 3020 }, { "epoch": 4.5359281437125745, "grad_norm": 0.390625, "learning_rate": 0.00014969638303152295, "loss": 0.5382, "step": 3030 }, { "epoch": 4.550898203592815, "grad_norm": 0.39453125, "learning_rate": 0.00014939215647113318, "loss": 0.5595, "step": 3040 }, { "epoch": 4.565868263473054, "grad_norm": 0.318359375, "learning_rate": 0.00014908732405057208, "loss": 0.5244, "step": 3050 }, { "epoch": 4.580838323353293, "grad_norm": 0.353515625, "learning_rate": 0.00014878188950901276, "loss": 0.5385, "step": 3060 }, { "epoch": 4.595808383233533, "grad_norm": 0.400390625, "learning_rate": 0.0001484758565930141, "loss": 0.5468, "step": 3070 }, { "epoch": 4.610778443113772, "grad_norm": 0.36328125, "learning_rate": 0.00014816922905647485, "loss": 0.5334, "step": 3080 }, { "epoch": 4.625748502994012, "grad_norm": 0.380859375, "learning_rate": 0.00014786201066058766, "loss": 0.5373, "step": 3090 }, { "epoch": 4.640718562874252, "grad_norm": 0.328125, "learning_rate": 0.0001475542051737927, "loss": 0.572, "step": 3100 }, { "epoch": 4.655688622754491, "grad_norm": 0.421875, "learning_rate": 0.00014724581637173164, "loss": 0.5445, "step": 3110 }, { "epoch": 4.6706586826347305, "grad_norm": 0.380859375, "learning_rate": 0.00014693684803720138, "loss": 0.541, "step": 3120 }, { "epoch": 4.68562874251497, "grad_norm": 0.34765625, "learning_rate": 0.0001466273039601074, "loss": 0.5388, "step": 3130 }, { "epoch": 4.700598802395209, "grad_norm": 0.416015625, "learning_rate": 0.00014631718793741753, "loss": 0.5485, "step": 3140 }, { "epoch": 4.7155688622754495, "grad_norm": 0.302734375, "learning_rate": 0.00014600650377311522, "loss": 0.5594, "step": 3150 }, { "epoch": 4.730538922155689, "grad_norm": 0.46484375, "learning_rate": 0.00014569525527815297, "loss": 0.5648, "step": 3160 }, { "epoch": 4.745508982035928, "grad_norm": 0.32421875, "learning_rate": 0.0001453834462704055, "loss": 0.5642, "step": 3170 }, { "epoch": 4.7604790419161676, "grad_norm": 0.37109375, "learning_rate": 0.00014507108057462296, "loss": 0.5443, "step": 3180 }, { "epoch": 4.775449101796407, "grad_norm": 0.3671875, "learning_rate": 0.00014475816202238413, "loss": 0.5249, "step": 3190 }, { "epoch": 4.790419161676647, "grad_norm": 0.390625, "learning_rate": 0.00014444469445204906, "loss": 0.5496, "step": 3200 }, { "epoch": 4.8053892215568865, "grad_norm": 0.357421875, "learning_rate": 0.0001441306817087125, "loss": 0.5723, "step": 3210 }, { "epoch": 4.820359281437126, "grad_norm": 0.3359375, "learning_rate": 0.00014381612764415635, "loss": 0.5551, "step": 3220 }, { "epoch": 4.835329341317365, "grad_norm": 0.345703125, "learning_rate": 0.00014350103611680248, "loss": 0.5748, "step": 3230 }, { "epoch": 4.850299401197605, "grad_norm": 0.34375, "learning_rate": 0.00014318541099166555, "loss": 0.5515, "step": 3240 }, { "epoch": 4.865269461077844, "grad_norm": 0.3984375, "learning_rate": 0.00014286925614030542, "loss": 0.5261, "step": 3250 }, { "epoch": 4.880239520958084, "grad_norm": 0.310546875, "learning_rate": 0.0001425525754407798, "loss": 0.5381, "step": 3260 }, { "epoch": 4.895209580838324, "grad_norm": 0.333984375, "learning_rate": 0.00014223537277759666, "loss": 0.5496, "step": 3270 }, { "epoch": 4.910179640718563, "grad_norm": 0.451171875, "learning_rate": 0.00014191765204166643, "loss": 0.5407, "step": 3280 }, { "epoch": 4.925149700598802, "grad_norm": 0.3671875, "learning_rate": 0.00014159941713025447, "loss": 0.5703, "step": 3290 }, { "epoch": 4.940119760479042, "grad_norm": 0.349609375, "learning_rate": 0.00014128067194693316, "loss": 0.5523, "step": 3300 }, { "epoch": 4.955089820359282, "grad_norm": 0.361328125, "learning_rate": 0.000140961420401534, "loss": 0.5366, "step": 3310 }, { "epoch": 4.970059880239521, "grad_norm": 0.357421875, "learning_rate": 0.00014064166641009982, "loss": 0.5777, "step": 3320 }, { "epoch": 4.985029940119761, "grad_norm": 0.318359375, "learning_rate": 0.00014032141389483648, "loss": 0.5649, "step": 3330 }, { "epoch": 5.0, "grad_norm": 0.56640625, "learning_rate": 0.000140000666784065, "loss": 0.5502, "step": 3340 }, { "epoch": 5.014970059880239, "grad_norm": 0.373046875, "learning_rate": 0.0001396794290121731, "loss": 0.436, "step": 3350 }, { "epoch": 5.029940119760479, "grad_norm": 0.34765625, "learning_rate": 0.0001393577045195673, "loss": 0.4013, "step": 3360 }, { "epoch": 5.044910179640719, "grad_norm": 0.376953125, "learning_rate": 0.00013903549725262433, "loss": 0.4434, "step": 3370 }, { "epoch": 5.059880239520958, "grad_norm": 0.384765625, "learning_rate": 0.0001387128111636427, "loss": 0.4261, "step": 3380 }, { "epoch": 5.074850299401198, "grad_norm": 0.400390625, "learning_rate": 0.00013838965021079446, "loss": 0.416, "step": 3390 }, { "epoch": 5.089820359281437, "grad_norm": 0.400390625, "learning_rate": 0.00013806601835807637, "loss": 0.4205, "step": 3400 }, { "epoch": 5.104790419161676, "grad_norm": 0.421875, "learning_rate": 0.00013774191957526143, "loss": 0.4286, "step": 3410 }, { "epoch": 5.119760479041916, "grad_norm": 0.38671875, "learning_rate": 0.0001374173578378502, "loss": 0.4198, "step": 3420 }, { "epoch": 5.134730538922156, "grad_norm": 0.373046875, "learning_rate": 0.00013709233712702197, "loss": 0.4341, "step": 3430 }, { "epoch": 5.149700598802395, "grad_norm": 0.40234375, "learning_rate": 0.00013676686142958586, "loss": 0.4431, "step": 3440 }, { "epoch": 5.164670658682635, "grad_norm": 0.375, "learning_rate": 0.00013644093473793215, "loss": 0.4194, "step": 3450 }, { "epoch": 5.179640718562874, "grad_norm": 0.41796875, "learning_rate": 0.00013611456104998303, "loss": 0.417, "step": 3460 }, { "epoch": 5.1946107784431135, "grad_norm": 0.357421875, "learning_rate": 0.0001357877443691438, "loss": 0.4467, "step": 3470 }, { "epoch": 5.209580838323353, "grad_norm": 0.359375, "learning_rate": 0.00013546048870425356, "loss": 0.4206, "step": 3480 }, { "epoch": 5.224550898203593, "grad_norm": 0.369140625, "learning_rate": 0.00013513279806953623, "loss": 0.4148, "step": 3490 }, { "epoch": 5.2395209580838324, "grad_norm": 0.36328125, "learning_rate": 0.0001348046764845511, "loss": 0.415, "step": 3500 }, { "epoch": 5.2395209580838324, "eval_loss": 0.4033271074295044, "eval_runtime": 109.9729, "eval_samples_per_second": 9.093, "eval_steps_per_second": 1.137, "step": 3500 }, { "epoch": 5.254491017964072, "grad_norm": 0.38671875, "learning_rate": 0.0001344761279741437, "loss": 0.4233, "step": 3510 }, { "epoch": 5.269461077844311, "grad_norm": 0.3359375, "learning_rate": 0.0001341471565683964, "loss": 0.4348, "step": 3520 }, { "epoch": 5.2844311377245505, "grad_norm": 0.32421875, "learning_rate": 0.00013381776630257884, "loss": 0.4357, "step": 3530 }, { "epoch": 5.299401197604791, "grad_norm": 0.337890625, "learning_rate": 0.00013348796121709862, "loss": 0.4274, "step": 3540 }, { "epoch": 5.31437125748503, "grad_norm": 0.380859375, "learning_rate": 0.00013315774535745163, "loss": 0.4228, "step": 3550 }, { "epoch": 5.3293413173652695, "grad_norm": 0.349609375, "learning_rate": 0.0001328271227741724, "loss": 0.4214, "step": 3560 }, { "epoch": 5.344311377245509, "grad_norm": 0.40625, "learning_rate": 0.00013249609752278454, "loss": 0.4452, "step": 3570 }, { "epoch": 5.359281437125748, "grad_norm": 0.341796875, "learning_rate": 0.00013216467366375086, "loss": 0.4291, "step": 3580 }, { "epoch": 5.374251497005988, "grad_norm": 0.3828125, "learning_rate": 0.00013183285526242365, "loss": 0.4411, "step": 3590 }, { "epoch": 5.389221556886228, "grad_norm": 0.4609375, "learning_rate": 0.0001315006463889948, "loss": 0.426, "step": 3600 }, { "epoch": 5.404191616766467, "grad_norm": 0.4140625, "learning_rate": 0.0001311680511184458, "loss": 0.4095, "step": 3610 }, { "epoch": 5.419161676646707, "grad_norm": 0.349609375, "learning_rate": 0.00013083507353049782, "loss": 0.4452, "step": 3620 }, { "epoch": 5.434131736526946, "grad_norm": 0.400390625, "learning_rate": 0.00013050171770956177, "loss": 0.4311, "step": 3630 }, { "epoch": 5.449101796407185, "grad_norm": 0.35546875, "learning_rate": 0.00013016798774468792, "loss": 0.4287, "step": 3640 }, { "epoch": 5.4640718562874255, "grad_norm": 0.373046875, "learning_rate": 0.00012983388772951602, "loss": 0.4361, "step": 3650 }, { "epoch": 5.479041916167665, "grad_norm": 0.3984375, "learning_rate": 0.00012949942176222496, "loss": 0.4288, "step": 3660 }, { "epoch": 5.494011976047904, "grad_norm": 0.34765625, "learning_rate": 0.0001291645939454825, "loss": 0.4157, "step": 3670 }, { "epoch": 5.508982035928144, "grad_norm": 0.41796875, "learning_rate": 0.00012882940838639497, "loss": 0.4314, "step": 3680 }, { "epoch": 5.523952095808383, "grad_norm": 0.396484375, "learning_rate": 0.00012849386919645684, "loss": 0.4514, "step": 3690 }, { "epoch": 5.538922155688622, "grad_norm": 0.44921875, "learning_rate": 0.00012815798049150046, "loss": 0.4292, "step": 3700 }, { "epoch": 5.553892215568863, "grad_norm": 0.384765625, "learning_rate": 0.0001278217463916453, "loss": 0.4398, "step": 3710 }, { "epoch": 5.568862275449102, "grad_norm": 0.423828125, "learning_rate": 0.00012748517102124755, "loss": 0.4523, "step": 3720 }, { "epoch": 5.583832335329341, "grad_norm": 0.333984375, "learning_rate": 0.00012714825850884974, "loss": 0.4341, "step": 3730 }, { "epoch": 5.598802395209581, "grad_norm": 0.37890625, "learning_rate": 0.00012681101298712963, "loss": 0.4402, "step": 3740 }, { "epoch": 5.61377245508982, "grad_norm": 0.404296875, "learning_rate": 0.00012647343859284997, "loss": 0.4358, "step": 3750 }, { "epoch": 5.62874251497006, "grad_norm": 0.359375, "learning_rate": 0.00012613553946680742, "loss": 0.4114, "step": 3760 }, { "epoch": 5.6437125748503, "grad_norm": 0.41015625, "learning_rate": 0.0001257973197537821, "loss": 0.4586, "step": 3770 }, { "epoch": 5.658682634730539, "grad_norm": 0.443359375, "learning_rate": 0.00012545878360248634, "loss": 0.4236, "step": 3780 }, { "epoch": 5.673652694610778, "grad_norm": 0.46875, "learning_rate": 0.0001251199351655141, "loss": 0.4618, "step": 3790 }, { "epoch": 5.688622754491018, "grad_norm": 0.388671875, "learning_rate": 0.00012478077859929, "loss": 0.4236, "step": 3800 }, { "epoch": 5.703592814371257, "grad_norm": 0.40234375, "learning_rate": 0.00012444131806401817, "loss": 0.4187, "step": 3810 }, { "epoch": 5.718562874251497, "grad_norm": 0.353515625, "learning_rate": 0.00012410155772363135, "loss": 0.4534, "step": 3820 }, { "epoch": 5.733532934131737, "grad_norm": 0.37890625, "learning_rate": 0.00012376150174573986, "loss": 0.4492, "step": 3830 }, { "epoch": 5.748502994011976, "grad_norm": 0.39453125, "learning_rate": 0.00012342115430158024, "loss": 0.4622, "step": 3840 }, { "epoch": 5.763473053892215, "grad_norm": 0.380859375, "learning_rate": 0.00012308051956596444, "loss": 0.446, "step": 3850 }, { "epoch": 5.778443113772455, "grad_norm": 0.421875, "learning_rate": 0.00012273960171722834, "loss": 0.424, "step": 3860 }, { "epoch": 5.793413173652695, "grad_norm": 0.337890625, "learning_rate": 0.00012239840493718048, "loss": 0.4462, "step": 3870 }, { "epoch": 5.808383233532934, "grad_norm": 0.435546875, "learning_rate": 0.00012205693341105107, "loss": 0.4528, "step": 3880 }, { "epoch": 5.823353293413174, "grad_norm": 0.44140625, "learning_rate": 0.00012171519132744024, "loss": 0.4429, "step": 3890 }, { "epoch": 5.838323353293413, "grad_norm": 0.4375, "learning_rate": 0.00012137318287826698, "loss": 0.4185, "step": 3900 }, { "epoch": 5.8532934131736525, "grad_norm": 0.36328125, "learning_rate": 0.0001210309122587176, "loss": 0.4404, "step": 3910 }, { "epoch": 5.868263473053892, "grad_norm": 0.3515625, "learning_rate": 0.0001206883836671942, "loss": 0.4531, "step": 3920 }, { "epoch": 5.883233532934132, "grad_norm": 0.361328125, "learning_rate": 0.0001203456013052634, "loss": 0.4278, "step": 3930 }, { "epoch": 5.8982035928143715, "grad_norm": 0.34765625, "learning_rate": 0.00012000256937760445, "loss": 0.4076, "step": 3940 }, { "epoch": 5.913173652694611, "grad_norm": 0.392578125, "learning_rate": 0.000119659292091958, "loss": 0.4488, "step": 3950 }, { "epoch": 5.92814371257485, "grad_norm": 0.40625, "learning_rate": 0.00011931577365907434, "loss": 0.4167, "step": 3960 }, { "epoch": 5.9431137724550895, "grad_norm": 0.416015625, "learning_rate": 0.00011897201829266163, "loss": 0.4402, "step": 3970 }, { "epoch": 5.95808383233533, "grad_norm": 0.388671875, "learning_rate": 0.00011862803020933447, "loss": 0.4524, "step": 3980 }, { "epoch": 5.973053892215569, "grad_norm": 0.435546875, "learning_rate": 0.00011828381362856195, "loss": 0.4264, "step": 3990 }, { "epoch": 5.9880239520958085, "grad_norm": 0.337890625, "learning_rate": 0.000117939372772616, "loss": 0.4449, "step": 4000 }, { "epoch": 5.9880239520958085, "eval_loss": 0.34823065996170044, "eval_runtime": 110.0553, "eval_samples_per_second": 9.086, "eval_steps_per_second": 1.136, "step": 4000 }, { "epoch": 6.002994011976048, "grad_norm": 0.48046875, "learning_rate": 0.00011759471186651967, "loss": 0.4313, "step": 4010 }, { "epoch": 6.017964071856287, "grad_norm": 0.4375, "learning_rate": 0.00011724983513799506, "loss": 0.3427, "step": 4020 }, { "epoch": 6.032934131736527, "grad_norm": 0.3671875, "learning_rate": 0.00011690474681741178, "loss": 0.3255, "step": 4030 }, { "epoch": 6.047904191616767, "grad_norm": 0.3828125, "learning_rate": 0.00011655945113773472, "loss": 0.3332, "step": 4040 }, { "epoch": 6.062874251497006, "grad_norm": 0.3671875, "learning_rate": 0.00011621395233447248, "loss": 0.3187, "step": 4050 }, { "epoch": 6.077844311377246, "grad_norm": 0.345703125, "learning_rate": 0.00011586825464562514, "loss": 0.357, "step": 4060 }, { "epoch": 6.092814371257485, "grad_norm": 0.34375, "learning_rate": 0.00011552236231163238, "loss": 0.346, "step": 4070 }, { "epoch": 6.107784431137724, "grad_norm": 0.373046875, "learning_rate": 0.00011517627957532154, "loss": 0.3284, "step": 4080 }, { "epoch": 6.122754491017964, "grad_norm": 0.400390625, "learning_rate": 0.00011483001068185535, "loss": 0.3422, "step": 4090 }, { "epoch": 6.137724550898204, "grad_norm": 0.44140625, "learning_rate": 0.00011448355987868011, "loss": 0.3276, "step": 4100 }, { "epoch": 6.152694610778443, "grad_norm": 0.35546875, "learning_rate": 0.00011413693141547352, "loss": 0.3294, "step": 4110 }, { "epoch": 6.167664670658683, "grad_norm": 0.41015625, "learning_rate": 0.00011379012954409237, "loss": 0.3412, "step": 4120 }, { "epoch": 6.182634730538922, "grad_norm": 0.375, "learning_rate": 0.00011344315851852063, "loss": 0.3311, "step": 4130 }, { "epoch": 6.197604790419161, "grad_norm": 0.38671875, "learning_rate": 0.00011309602259481727, "loss": 0.3216, "step": 4140 }, { "epoch": 6.212574850299402, "grad_norm": 0.44140625, "learning_rate": 0.00011274872603106368, "loss": 0.359, "step": 4150 }, { "epoch": 6.227544910179641, "grad_norm": 0.37890625, "learning_rate": 0.00011240127308731197, "loss": 0.3459, "step": 4160 }, { "epoch": 6.24251497005988, "grad_norm": 0.46875, "learning_rate": 0.0001120536680255323, "loss": 0.338, "step": 4170 }, { "epoch": 6.25748502994012, "grad_norm": 0.416015625, "learning_rate": 0.0001117059151095609, "loss": 0.331, "step": 4180 }, { "epoch": 6.272455089820359, "grad_norm": 0.36328125, "learning_rate": 0.00011135801860504749, "loss": 0.3404, "step": 4190 }, { "epoch": 6.287425149700598, "grad_norm": 0.404296875, "learning_rate": 0.00011100998277940315, "loss": 0.3356, "step": 4200 }, { "epoch": 6.302395209580839, "grad_norm": 0.390625, "learning_rate": 0.00011066181190174798, "loss": 0.337, "step": 4210 }, { "epoch": 6.317365269461078, "grad_norm": 0.46484375, "learning_rate": 0.00011031351024285852, "loss": 0.3234, "step": 4220 }, { "epoch": 6.332335329341317, "grad_norm": 0.375, "learning_rate": 0.00010996508207511565, "loss": 0.3368, "step": 4230 }, { "epoch": 6.347305389221557, "grad_norm": 0.3984375, "learning_rate": 0.00010961653167245202, "loss": 0.3501, "step": 4240 }, { "epoch": 6.362275449101796, "grad_norm": 0.37890625, "learning_rate": 0.00010926786331029961, "loss": 0.3361, "step": 4250 }, { "epoch": 6.3772455089820355, "grad_norm": 0.435546875, "learning_rate": 0.00010891908126553738, "loss": 0.3491, "step": 4260 }, { "epoch": 6.392215568862276, "grad_norm": 0.375, "learning_rate": 0.00010857018981643872, "loss": 0.337, "step": 4270 }, { "epoch": 6.407185628742515, "grad_norm": 0.396484375, "learning_rate": 0.00010822119324261899, "loss": 0.3475, "step": 4280 }, { "epoch": 6.422155688622754, "grad_norm": 0.373046875, "learning_rate": 0.00010787209582498315, "loss": 0.3409, "step": 4290 }, { "epoch": 6.437125748502994, "grad_norm": 0.37890625, "learning_rate": 0.00010752290184567302, "loss": 0.3471, "step": 4300 }, { "epoch": 6.452095808383233, "grad_norm": 0.359375, "learning_rate": 0.00010717361558801491, "loss": 0.3335, "step": 4310 }, { "epoch": 6.467065868263473, "grad_norm": 0.375, "learning_rate": 0.0001068242413364671, "loss": 0.3463, "step": 4320 }, { "epoch": 6.482035928143713, "grad_norm": 0.376953125, "learning_rate": 0.0001064747833765672, "loss": 0.3467, "step": 4330 }, { "epoch": 6.497005988023952, "grad_norm": 0.44921875, "learning_rate": 0.00010612524599487958, "loss": 0.3353, "step": 4340 }, { "epoch": 6.5119760479041915, "grad_norm": 0.44921875, "learning_rate": 0.00010577563347894285, "loss": 0.3679, "step": 4350 }, { "epoch": 6.526946107784431, "grad_norm": 0.357421875, "learning_rate": 0.00010542595011721727, "loss": 0.3533, "step": 4360 }, { "epoch": 6.54191616766467, "grad_norm": 0.34765625, "learning_rate": 0.00010507620019903206, "loss": 0.3397, "step": 4370 }, { "epoch": 6.5568862275449105, "grad_norm": 0.3359375, "learning_rate": 0.00010472638801453287, "loss": 0.3342, "step": 4380 }, { "epoch": 6.57185628742515, "grad_norm": 0.451171875, "learning_rate": 0.0001043765178546292, "loss": 0.3395, "step": 4390 }, { "epoch": 6.586826347305389, "grad_norm": 0.431640625, "learning_rate": 0.00010402659401094152, "loss": 0.3463, "step": 4400 }, { "epoch": 6.6017964071856285, "grad_norm": 0.326171875, "learning_rate": 0.00010367662077574898, "loss": 0.3387, "step": 4410 }, { "epoch": 6.616766467065868, "grad_norm": 0.37109375, "learning_rate": 0.00010332660244193649, "loss": 0.3404, "step": 4420 }, { "epoch": 6.631736526946108, "grad_norm": 0.3671875, "learning_rate": 0.0001029765433029422, "loss": 0.3309, "step": 4430 }, { "epoch": 6.6467065868263475, "grad_norm": 0.38671875, "learning_rate": 0.00010262644765270473, "loss": 0.3507, "step": 4440 }, { "epoch": 6.661676646706587, "grad_norm": 0.408203125, "learning_rate": 0.00010227631978561056, "loss": 0.3472, "step": 4450 }, { "epoch": 6.676646706586826, "grad_norm": 0.455078125, "learning_rate": 0.00010192616399644148, "loss": 0.3369, "step": 4460 }, { "epoch": 6.691616766467066, "grad_norm": 0.451171875, "learning_rate": 0.00010157598458032165, "loss": 0.3678, "step": 4470 }, { "epoch": 6.706586826347305, "grad_norm": 0.3828125, "learning_rate": 0.00010122578583266505, "loss": 0.3383, "step": 4480 }, { "epoch": 6.721556886227545, "grad_norm": 0.396484375, "learning_rate": 0.00010087557204912283, "loss": 0.3296, "step": 4490 }, { "epoch": 6.736526946107785, "grad_norm": 0.435546875, "learning_rate": 0.00010052534752553063, "loss": 0.3438, "step": 4500 }, { "epoch": 6.736526946107785, "eval_loss": 0.29338502883911133, "eval_runtime": 110.1167, "eval_samples_per_second": 9.081, "eval_steps_per_second": 1.135, "step": 4500 }, { "epoch": 6.751497005988024, "grad_norm": 0.35546875, "learning_rate": 0.00010017511655785565, "loss": 0.3763, "step": 4510 }, { "epoch": 6.766467065868263, "grad_norm": 0.373046875, "learning_rate": 9.982488344214435e-05, "loss": 0.376, "step": 4520 }, { "epoch": 6.781437125748503, "grad_norm": 0.421875, "learning_rate": 9.947465247446941e-05, "loss": 0.3511, "step": 4530 }, { "epoch": 6.796407185628743, "grad_norm": 0.4609375, "learning_rate": 9.912442795087718e-05, "loss": 0.3549, "step": 4540 }, { "epoch": 6.811377245508982, "grad_norm": 0.369140625, "learning_rate": 9.877421416733496e-05, "loss": 0.3442, "step": 4550 }, { "epoch": 6.826347305389222, "grad_norm": 0.45703125, "learning_rate": 9.842401541967837e-05, "loss": 0.355, "step": 4560 }, { "epoch": 6.841317365269461, "grad_norm": 0.384765625, "learning_rate": 9.807383600355853e-05, "loss": 0.3545, "step": 4570 }, { "epoch": 6.8562874251497, "grad_norm": 0.44921875, "learning_rate": 9.772368021438943e-05, "loss": 0.3463, "step": 4580 }, { "epoch": 6.87125748502994, "grad_norm": 0.390625, "learning_rate": 9.737355234729531e-05, "loss": 0.3433, "step": 4590 }, { "epoch": 6.88622754491018, "grad_norm": 0.400390625, "learning_rate": 9.702345669705785e-05, "loss": 0.3557, "step": 4600 }, { "epoch": 6.901197604790419, "grad_norm": 0.365234375, "learning_rate": 9.667339755806351e-05, "loss": 0.3272, "step": 4610 }, { "epoch": 6.916167664670659, "grad_norm": 0.416015625, "learning_rate": 9.632337922425105e-05, "loss": 0.3412, "step": 4620 }, { "epoch": 6.931137724550898, "grad_norm": 0.46484375, "learning_rate": 9.597340598905852e-05, "loss": 0.3368, "step": 4630 }, { "epoch": 6.946107784431137, "grad_norm": 0.4375, "learning_rate": 9.562348214537082e-05, "loss": 0.3542, "step": 4640 }, { "epoch": 6.961077844311378, "grad_norm": 0.376953125, "learning_rate": 9.527361198546714e-05, "loss": 0.3481, "step": 4650 }, { "epoch": 6.976047904191617, "grad_norm": 0.318359375, "learning_rate": 9.492379980096797e-05, "loss": 0.3556, "step": 4660 }, { "epoch": 6.991017964071856, "grad_norm": 0.373046875, "learning_rate": 9.457404988278275e-05, "loss": 0.3576, "step": 4670 }, { "epoch": 7.005988023952096, "grad_norm": 0.326171875, "learning_rate": 9.422436652105717e-05, "loss": 0.3085, "step": 4680 }, { "epoch": 7.020958083832335, "grad_norm": 0.337890625, "learning_rate": 9.387475400512046e-05, "loss": 0.2891, "step": 4690 }, { "epoch": 7.0359281437125745, "grad_norm": 0.42578125, "learning_rate": 9.352521662343282e-05, "loss": 0.2845, "step": 4700 }, { "epoch": 7.050898203592815, "grad_norm": 0.36328125, "learning_rate": 9.317575866353292e-05, "loss": 0.276, "step": 4710 }, { "epoch": 7.065868263473054, "grad_norm": 0.3984375, "learning_rate": 9.282638441198512e-05, "loss": 0.266, "step": 4720 }, { "epoch": 7.080838323353293, "grad_norm": 0.423828125, "learning_rate": 9.247709815432701e-05, "loss": 0.2701, "step": 4730 }, { "epoch": 7.095808383233533, "grad_norm": 0.4140625, "learning_rate": 9.212790417501688e-05, "loss": 0.277, "step": 4740 }, { "epoch": 7.110778443113772, "grad_norm": 0.43359375, "learning_rate": 9.177880675738103e-05, "loss": 0.2695, "step": 4750 }, { "epoch": 7.125748502994012, "grad_norm": 0.3671875, "learning_rate": 9.142981018356131e-05, "loss": 0.288, "step": 4760 }, { "epoch": 7.140718562874252, "grad_norm": 0.361328125, "learning_rate": 9.108091873446264e-05, "loss": 0.2744, "step": 4770 }, { "epoch": 7.155688622754491, "grad_norm": 0.337890625, "learning_rate": 9.073213668970038e-05, "loss": 0.2696, "step": 4780 }, { "epoch": 7.1706586826347305, "grad_norm": 0.40234375, "learning_rate": 9.038346832754798e-05, "loss": 0.2794, "step": 4790 }, { "epoch": 7.18562874251497, "grad_norm": 0.421875, "learning_rate": 9.003491792488438e-05, "loss": 0.2878, "step": 4800 }, { "epoch": 7.200598802395209, "grad_norm": 0.32421875, "learning_rate": 8.968648975714148e-05, "loss": 0.2728, "step": 4810 }, { "epoch": 7.2155688622754495, "grad_norm": 0.392578125, "learning_rate": 8.933818809825204e-05, "loss": 0.2842, "step": 4820 }, { "epoch": 7.230538922155689, "grad_norm": 0.396484375, "learning_rate": 8.899001722059687e-05, "loss": 0.2806, "step": 4830 }, { "epoch": 7.245508982035928, "grad_norm": 0.376953125, "learning_rate": 8.86419813949525e-05, "loss": 0.2692, "step": 4840 }, { "epoch": 7.2604790419161676, "grad_norm": 0.4375, "learning_rate": 8.829408489043913e-05, "loss": 0.2852, "step": 4850 }, { "epoch": 7.275449101796407, "grad_norm": 0.361328125, "learning_rate": 8.79463319744677e-05, "loss": 0.2685, "step": 4860 }, { "epoch": 7.290419161676647, "grad_norm": 0.365234375, "learning_rate": 8.759872691268805e-05, "loss": 0.2677, "step": 4870 }, { "epoch": 7.3053892215568865, "grad_norm": 0.3515625, "learning_rate": 8.725127396893636e-05, "loss": 0.277, "step": 4880 }, { "epoch": 7.320359281437126, "grad_norm": 0.361328125, "learning_rate": 8.69039774051828e-05, "loss": 0.2811, "step": 4890 }, { "epoch": 7.335329341317365, "grad_norm": 0.39453125, "learning_rate": 8.655684148147936e-05, "loss": 0.292, "step": 4900 }, { "epoch": 7.350299401197605, "grad_norm": 0.396484375, "learning_rate": 8.620987045590766e-05, "loss": 0.298, "step": 4910 }, { "epoch": 7.365269461077844, "grad_norm": 0.365234375, "learning_rate": 8.586306858452653e-05, "loss": 0.2755, "step": 4920 }, { "epoch": 7.380239520958084, "grad_norm": 0.384765625, "learning_rate": 8.551644012131988e-05, "loss": 0.2755, "step": 4930 }, { "epoch": 7.395209580838324, "grad_norm": 0.40625, "learning_rate": 8.516998931814467e-05, "loss": 0.281, "step": 4940 }, { "epoch": 7.410179640718563, "grad_norm": 0.44921875, "learning_rate": 8.48237204246785e-05, "loss": 0.2886, "step": 4950 }, { "epoch": 7.425149700598802, "grad_norm": 0.388671875, "learning_rate": 8.44776376883676e-05, "loss": 0.2861, "step": 4960 }, { "epoch": 7.440119760479042, "grad_norm": 0.365234375, "learning_rate": 8.413174535437487e-05, "loss": 0.2663, "step": 4970 }, { "epoch": 7.455089820359281, "grad_norm": 0.365234375, "learning_rate": 8.378604766552757e-05, "loss": 0.2839, "step": 4980 }, { "epoch": 7.470059880239521, "grad_norm": 0.38671875, "learning_rate": 8.344054886226529e-05, "loss": 0.2827, "step": 4990 }, { "epoch": 7.485029940119761, "grad_norm": 0.384765625, "learning_rate": 8.309525318258826e-05, "loss": 0.2796, "step": 5000 }, { "epoch": 7.485029940119761, "eval_loss": 0.2577977478504181, "eval_runtime": 110.2158, "eval_samples_per_second": 9.073, "eval_steps_per_second": 1.134, "step": 5000 }, { "epoch": 7.5, "grad_norm": 0.4296875, "learning_rate": 8.275016486200498e-05, "loss": 0.2861, "step": 5010 }, { "epoch": 7.514970059880239, "grad_norm": 0.369140625, "learning_rate": 8.240528813348034e-05, "loss": 0.2675, "step": 5020 }, { "epoch": 7.529940119760479, "grad_norm": 0.40625, "learning_rate": 8.206062722738401e-05, "loss": 0.2632, "step": 5030 }, { "epoch": 7.544910179640718, "grad_norm": 0.369140625, "learning_rate": 8.17161863714381e-05, "loss": 0.2731, "step": 5040 }, { "epoch": 7.559880239520958, "grad_norm": 0.373046875, "learning_rate": 8.137196979066556e-05, "loss": 0.2756, "step": 5050 }, { "epoch": 7.574850299401198, "grad_norm": 0.4296875, "learning_rate": 8.10279817073384e-05, "loss": 0.2816, "step": 5060 }, { "epoch": 7.589820359281437, "grad_norm": 0.431640625, "learning_rate": 8.06842263409257e-05, "loss": 0.2917, "step": 5070 }, { "epoch": 7.604790419161676, "grad_norm": 0.34765625, "learning_rate": 8.034070790804201e-05, "loss": 0.2791, "step": 5080 }, { "epoch": 7.619760479041916, "grad_norm": 0.412109375, "learning_rate": 7.999743062239557e-05, "loss": 0.2927, "step": 5090 }, { "epoch": 7.634730538922156, "grad_norm": 0.416015625, "learning_rate": 7.965439869473664e-05, "loss": 0.276, "step": 5100 }, { "epoch": 7.649700598802395, "grad_norm": 0.34375, "learning_rate": 7.93116163328058e-05, "loss": 0.2737, "step": 5110 }, { "epoch": 7.664670658682635, "grad_norm": 0.380859375, "learning_rate": 7.896908774128243e-05, "loss": 0.2762, "step": 5120 }, { "epoch": 7.679640718562874, "grad_norm": 0.353515625, "learning_rate": 7.862681712173304e-05, "loss": 0.2902, "step": 5130 }, { "epoch": 7.6946107784431135, "grad_norm": 0.3828125, "learning_rate": 7.828480867255978e-05, "loss": 0.2702, "step": 5140 }, { "epoch": 7.709580838323353, "grad_norm": 0.373046875, "learning_rate": 7.794306658894896e-05, "loss": 0.289, "step": 5150 }, { "epoch": 7.724550898203593, "grad_norm": 0.345703125, "learning_rate": 7.760159506281955e-05, "loss": 0.2832, "step": 5160 }, { "epoch": 7.7395209580838324, "grad_norm": 0.369140625, "learning_rate": 7.72603982827717e-05, "loss": 0.2886, "step": 5170 }, { "epoch": 7.754491017964072, "grad_norm": 0.423828125, "learning_rate": 7.691948043403557e-05, "loss": 0.2865, "step": 5180 }, { "epoch": 7.769461077844311, "grad_norm": 0.427734375, "learning_rate": 7.657884569841979e-05, "loss": 0.2771, "step": 5190 }, { "epoch": 7.7844311377245505, "grad_norm": 0.369140625, "learning_rate": 7.623849825426017e-05, "loss": 0.2893, "step": 5200 }, { "epoch": 7.799401197604791, "grad_norm": 0.412109375, "learning_rate": 7.589844227636867e-05, "loss": 0.2847, "step": 5210 }, { "epoch": 7.81437125748503, "grad_norm": 0.390625, "learning_rate": 7.555868193598188e-05, "loss": 0.2742, "step": 5220 }, { "epoch": 7.8293413173652695, "grad_norm": 0.36328125, "learning_rate": 7.521922140071002e-05, "loss": 0.2809, "step": 5230 }, { "epoch": 7.844311377245509, "grad_norm": 0.388671875, "learning_rate": 7.488006483448591e-05, "loss": 0.2975, "step": 5240 }, { "epoch": 7.859281437125748, "grad_norm": 0.373046875, "learning_rate": 7.454121639751371e-05, "loss": 0.292, "step": 5250 }, { "epoch": 7.874251497005988, "grad_norm": 0.3984375, "learning_rate": 7.420268024621792e-05, "loss": 0.2878, "step": 5260 }, { "epoch": 7.889221556886228, "grad_norm": 0.435546875, "learning_rate": 7.386446053319258e-05, "loss": 0.2896, "step": 5270 }, { "epoch": 7.904191616766467, "grad_norm": 0.396484375, "learning_rate": 7.352656140715006e-05, "loss": 0.2723, "step": 5280 }, { "epoch": 7.919161676646707, "grad_norm": 0.447265625, "learning_rate": 7.318898701287038e-05, "loss": 0.2997, "step": 5290 }, { "epoch": 7.934131736526946, "grad_norm": 0.373046875, "learning_rate": 7.28517414911503e-05, "loss": 0.2916, "step": 5300 }, { "epoch": 7.949101796407185, "grad_norm": 0.419921875, "learning_rate": 7.251482897875244e-05, "loss": 0.2929, "step": 5310 }, { "epoch": 7.9640718562874255, "grad_norm": 0.359375, "learning_rate": 7.217825360835473e-05, "loss": 0.2704, "step": 5320 }, { "epoch": 7.979041916167665, "grad_norm": 0.34765625, "learning_rate": 7.184201950849956e-05, "loss": 0.2937, "step": 5330 }, { "epoch": 7.994011976047904, "grad_norm": 0.353515625, "learning_rate": 7.150613080354314e-05, "loss": 0.2787, "step": 5340 }, { "epoch": 8.008982035928144, "grad_norm": 0.40625, "learning_rate": 7.117059161360504e-05, "loss": 0.2628, "step": 5350 }, { "epoch": 8.023952095808383, "grad_norm": 0.333984375, "learning_rate": 7.08354060545175e-05, "loss": 0.2473, "step": 5360 }, { "epoch": 8.038922155688622, "grad_norm": 0.37109375, "learning_rate": 7.050057823777502e-05, "loss": 0.2305, "step": 5370 }, { "epoch": 8.053892215568862, "grad_norm": 0.416015625, "learning_rate": 7.016611227048397e-05, "loss": 0.2484, "step": 5380 }, { "epoch": 8.068862275449101, "grad_norm": 0.427734375, "learning_rate": 6.983201225531211e-05, "loss": 0.2441, "step": 5390 }, { "epoch": 8.08383233532934, "grad_norm": 0.34375, "learning_rate": 6.949828229043824e-05, "loss": 0.2497, "step": 5400 }, { "epoch": 8.098802395209582, "grad_norm": 0.373046875, "learning_rate": 6.91649264695022e-05, "loss": 0.2438, "step": 5410 }, { "epoch": 8.113772455089821, "grad_norm": 0.337890625, "learning_rate": 6.883194888155424e-05, "loss": 0.2473, "step": 5420 }, { "epoch": 8.12874251497006, "grad_norm": 0.462890625, "learning_rate": 6.849935361100522e-05, "loss": 0.2436, "step": 5430 }, { "epoch": 8.1437125748503, "grad_norm": 0.36328125, "learning_rate": 6.816714473757638e-05, "loss": 0.2313, "step": 5440 }, { "epoch": 8.158682634730539, "grad_norm": 0.384765625, "learning_rate": 6.783532633624918e-05, "loss": 0.2584, "step": 5450 }, { "epoch": 8.173652694610778, "grad_norm": 0.3125, "learning_rate": 6.750390247721548e-05, "loss": 0.245, "step": 5460 }, { "epoch": 8.188622754491018, "grad_norm": 0.37109375, "learning_rate": 6.717287722582762e-05, "loss": 0.2495, "step": 5470 }, { "epoch": 8.203592814371257, "grad_norm": 0.3984375, "learning_rate": 6.68422546425484e-05, "loss": 0.2479, "step": 5480 }, { "epoch": 8.218562874251496, "grad_norm": 0.337890625, "learning_rate": 6.651203878290139e-05, "loss": 0.2407, "step": 5490 }, { "epoch": 8.233532934131736, "grad_norm": 0.384765625, "learning_rate": 6.618223369742117e-05, "loss": 0.2309, "step": 5500 }, { "epoch": 8.233532934131736, "eval_loss": 0.23407956957817078, "eval_runtime": 110.3134, "eval_samples_per_second": 9.065, "eval_steps_per_second": 1.133, "step": 5500 }, { "epoch": 8.248502994011975, "grad_norm": 0.34375, "learning_rate": 6.585284343160363e-05, "loss": 0.2421, "step": 5510 }, { "epoch": 8.263473053892216, "grad_norm": 0.314453125, "learning_rate": 6.55238720258563e-05, "loss": 0.2336, "step": 5520 }, { "epoch": 8.278443113772456, "grad_norm": 0.341796875, "learning_rate": 6.519532351544892e-05, "loss": 0.2476, "step": 5530 }, { "epoch": 8.293413173652695, "grad_norm": 0.345703125, "learning_rate": 6.486720193046379e-05, "loss": 0.2441, "step": 5540 }, { "epoch": 8.308383233532934, "grad_norm": 0.4296875, "learning_rate": 6.453951129574644e-05, "loss": 0.2415, "step": 5550 }, { "epoch": 8.323353293413174, "grad_norm": 0.328125, "learning_rate": 6.421225563085622e-05, "loss": 0.2356, "step": 5560 }, { "epoch": 8.338323353293413, "grad_norm": 0.34765625, "learning_rate": 6.3885438950017e-05, "loss": 0.2478, "step": 5570 }, { "epoch": 8.353293413173652, "grad_norm": 0.375, "learning_rate": 6.355906526206788e-05, "loss": 0.2386, "step": 5580 }, { "epoch": 8.368263473053892, "grad_norm": 0.3359375, "learning_rate": 6.323313857041417e-05, "loss": 0.2351, "step": 5590 }, { "epoch": 8.383233532934131, "grad_norm": 0.447265625, "learning_rate": 6.290766287297806e-05, "loss": 0.2399, "step": 5600 }, { "epoch": 8.39820359281437, "grad_norm": 0.359375, "learning_rate": 6.258264216214977e-05, "loss": 0.2465, "step": 5610 }, { "epoch": 8.41317365269461, "grad_norm": 0.404296875, "learning_rate": 6.225808042473858e-05, "loss": 0.2545, "step": 5620 }, { "epoch": 8.428143712574851, "grad_norm": 0.357421875, "learning_rate": 6.193398164192365e-05, "loss": 0.2365, "step": 5630 }, { "epoch": 8.44311377245509, "grad_norm": 0.40234375, "learning_rate": 6.161034978920555e-05, "loss": 0.2385, "step": 5640 }, { "epoch": 8.45808383233533, "grad_norm": 0.328125, "learning_rate": 6.128718883635732e-05, "loss": 0.2365, "step": 5650 }, { "epoch": 8.47305389221557, "grad_norm": 0.3671875, "learning_rate": 6.096450274737572e-05, "loss": 0.2553, "step": 5660 }, { "epoch": 8.488023952095809, "grad_norm": 0.361328125, "learning_rate": 6.064229548043272e-05, "loss": 0.2487, "step": 5670 }, { "epoch": 8.502994011976048, "grad_norm": 0.345703125, "learning_rate": 6.032057098782694e-05, "loss": 0.2449, "step": 5680 }, { "epoch": 8.517964071856287, "grad_norm": 0.396484375, "learning_rate": 5.9999333215935075e-05, "loss": 0.2611, "step": 5690 }, { "epoch": 8.532934131736527, "grad_norm": 0.361328125, "learning_rate": 5.9678586105163535e-05, "loss": 0.235, "step": 5700 }, { "epoch": 8.547904191616766, "grad_norm": 0.32421875, "learning_rate": 5.935833358990019e-05, "loss": 0.2515, "step": 5710 }, { "epoch": 8.562874251497005, "grad_norm": 0.396484375, "learning_rate": 5.9038579598466016e-05, "loss": 0.2465, "step": 5720 }, { "epoch": 8.577844311377245, "grad_norm": 0.357421875, "learning_rate": 5.871932805306688e-05, "loss": 0.2375, "step": 5730 }, { "epoch": 8.592814371257486, "grad_norm": 0.373046875, "learning_rate": 5.840058286974557e-05, "loss": 0.2315, "step": 5740 }, { "epoch": 8.607784431137725, "grad_norm": 0.34765625, "learning_rate": 5.8082347958333625e-05, "loss": 0.2517, "step": 5750 }, { "epoch": 8.622754491017965, "grad_norm": 0.412109375, "learning_rate": 5.776462722240337e-05, "loss": 0.2367, "step": 5760 }, { "epoch": 8.637724550898204, "grad_norm": 0.353515625, "learning_rate": 5.7447424559220185e-05, "loss": 0.2403, "step": 5770 }, { "epoch": 8.652694610778443, "grad_norm": 0.375, "learning_rate": 5.713074385969457e-05, "loss": 0.2406, "step": 5780 }, { "epoch": 8.667664670658683, "grad_norm": 0.390625, "learning_rate": 5.681458900833447e-05, "loss": 0.247, "step": 5790 }, { "epoch": 8.682634730538922, "grad_norm": 0.369140625, "learning_rate": 5.649896388319752e-05, "loss": 0.2594, "step": 5800 }, { "epoch": 8.697604790419161, "grad_norm": 0.376953125, "learning_rate": 5.6183872355843636e-05, "loss": 0.2472, "step": 5810 }, { "epoch": 8.7125748502994, "grad_norm": 0.384765625, "learning_rate": 5.5869318291287495e-05, "loss": 0.258, "step": 5820 }, { "epoch": 8.72754491017964, "grad_norm": 0.400390625, "learning_rate": 5.555530554795093e-05, "loss": 0.2392, "step": 5830 }, { "epoch": 8.74251497005988, "grad_norm": 0.3671875, "learning_rate": 5.524183797761588e-05, "loss": 0.2574, "step": 5840 }, { "epoch": 8.75748502994012, "grad_norm": 0.34375, "learning_rate": 5.492891942537703e-05, "loss": 0.2439, "step": 5850 }, { "epoch": 8.77245508982036, "grad_norm": 0.322265625, "learning_rate": 5.461655372959455e-05, "loss": 0.2459, "step": 5860 }, { "epoch": 8.7874251497006, "grad_norm": 0.376953125, "learning_rate": 5.430474472184702e-05, "loss": 0.2524, "step": 5870 }, { "epoch": 8.802395209580839, "grad_norm": 0.39453125, "learning_rate": 5.399349622688479e-05, "loss": 0.239, "step": 5880 }, { "epoch": 8.817365269461078, "grad_norm": 0.400390625, "learning_rate": 5.368281206258252e-05, "loss": 0.2421, "step": 5890 }, { "epoch": 8.832335329341317, "grad_norm": 0.3671875, "learning_rate": 5.337269603989259e-05, "loss": 0.2386, "step": 5900 }, { "epoch": 8.847305389221557, "grad_norm": 0.36328125, "learning_rate": 5.306315196279864e-05, "loss": 0.2426, "step": 5910 }, { "epoch": 8.862275449101796, "grad_norm": 0.359375, "learning_rate": 5.275418362826838e-05, "loss": 0.255, "step": 5920 }, { "epoch": 8.877245508982035, "grad_norm": 0.40625, "learning_rate": 5.2445794826207337e-05, "loss": 0.2524, "step": 5930 }, { "epoch": 8.892215568862275, "grad_norm": 0.37109375, "learning_rate": 5.213798933941236e-05, "loss": 0.2465, "step": 5940 }, { "epoch": 8.907185628742514, "grad_norm": 0.392578125, "learning_rate": 5.1830770943525175e-05, "loss": 0.2703, "step": 5950 }, { "epoch": 8.922155688622755, "grad_norm": 0.34375, "learning_rate": 5.152414340698595e-05, "loss": 0.2459, "step": 5960 }, { "epoch": 8.937125748502995, "grad_norm": 0.34375, "learning_rate": 5.121811049098728e-05, "loss": 0.2308, "step": 5970 }, { "epoch": 8.952095808383234, "grad_norm": 0.35546875, "learning_rate": 5.091267594942797e-05, "loss": 0.2421, "step": 5980 }, { "epoch": 8.967065868263473, "grad_norm": 0.35546875, "learning_rate": 5.060784352886685e-05, "loss": 0.227, "step": 5990 }, { "epoch": 8.982035928143713, "grad_norm": 0.32421875, "learning_rate": 5.030361696847705e-05, "loss": 0.2472, "step": 6000 }, { "epoch": 8.982035928143713, "eval_loss": 0.22224491834640503, "eval_runtime": 110.3137, "eval_samples_per_second": 9.065, "eval_steps_per_second": 1.133, "step": 6000 }, { "epoch": 8.997005988023952, "grad_norm": 0.396484375, "learning_rate": 5.000000000000002e-05, "loss": 0.2286, "step": 6010 }, { "epoch": 9.011976047904191, "grad_norm": 0.345703125, "learning_rate": 4.9696996347699664e-05, "loss": 0.2288, "step": 6020 }, { "epoch": 9.02694610778443, "grad_norm": 0.32421875, "learning_rate": 4.939460972831684e-05, "loss": 0.218, "step": 6030 }, { "epoch": 9.04191616766467, "grad_norm": 0.361328125, "learning_rate": 4.909284385102371e-05, "loss": 0.222, "step": 6040 }, { "epoch": 9.05688622754491, "grad_norm": 0.333984375, "learning_rate": 4.879170241737805e-05, "loss": 0.2235, "step": 6050 }, { "epoch": 9.071856287425149, "grad_norm": 0.36328125, "learning_rate": 4.849118912127817e-05, "loss": 0.2209, "step": 6060 }, { "epoch": 9.08682634730539, "grad_norm": 0.333984375, "learning_rate": 4.8191307648917416e-05, "loss": 0.2352, "step": 6070 }, { "epoch": 9.10179640718563, "grad_norm": 0.427734375, "learning_rate": 4.78920616787389e-05, "loss": 0.2218, "step": 6080 }, { "epoch": 9.116766467065869, "grad_norm": 0.353515625, "learning_rate": 4.759345488139054e-05, "loss": 0.2191, "step": 6090 }, { "epoch": 9.131736526946108, "grad_norm": 0.3828125, "learning_rate": 4.729549091967989e-05, "loss": 0.2204, "step": 6100 }, { "epoch": 9.146706586826348, "grad_norm": 0.384765625, "learning_rate": 4.6998173448529315e-05, "loss": 0.2179, "step": 6110 }, { "epoch": 9.161676646706587, "grad_norm": 0.35546875, "learning_rate": 4.670150611493116e-05, "loss": 0.228, "step": 6120 }, { "epoch": 9.176646706586826, "grad_norm": 0.35546875, "learning_rate": 4.640549255790284e-05, "loss": 0.2147, "step": 6130 }, { "epoch": 9.191616766467066, "grad_norm": 0.337890625, "learning_rate": 4.611013640844245e-05, "loss": 0.2388, "step": 6140 }, { "epoch": 9.206586826347305, "grad_norm": 0.375, "learning_rate": 4.581544128948413e-05, "loss": 0.2168, "step": 6150 }, { "epoch": 9.221556886227544, "grad_norm": 0.35546875, "learning_rate": 4.552141081585346e-05, "loss": 0.2154, "step": 6160 }, { "epoch": 9.236526946107784, "grad_norm": 0.373046875, "learning_rate": 4.522804859422341e-05, "loss": 0.2306, "step": 6170 }, { "epoch": 9.251497005988025, "grad_norm": 0.3515625, "learning_rate": 4.493535822306992e-05, "loss": 0.2188, "step": 6180 }, { "epoch": 9.266467065868264, "grad_norm": 0.31640625, "learning_rate": 4.464334329262773e-05, "loss": 0.2063, "step": 6190 }, { "epoch": 9.281437125748504, "grad_norm": 0.390625, "learning_rate": 4.435200738484646e-05, "loss": 0.2327, "step": 6200 }, { "epoch": 9.296407185628743, "grad_norm": 0.353515625, "learning_rate": 4.406135407334668e-05, "loss": 0.2291, "step": 6210 }, { "epoch": 9.311377245508982, "grad_norm": 0.330078125, "learning_rate": 4.37713869233759e-05, "loss": 0.2161, "step": 6220 }, { "epoch": 9.326347305389222, "grad_norm": 0.330078125, "learning_rate": 4.348210949176491e-05, "loss": 0.2255, "step": 6230 }, { "epoch": 9.341317365269461, "grad_norm": 0.36328125, "learning_rate": 4.3193525326884435e-05, "loss": 0.2231, "step": 6240 }, { "epoch": 9.3562874251497, "grad_norm": 0.375, "learning_rate": 4.2905637968601134e-05, "loss": 0.2338, "step": 6250 }, { "epoch": 9.37125748502994, "grad_norm": 0.365234375, "learning_rate": 4.26184509482344e-05, "loss": 0.2295, "step": 6260 }, { "epoch": 9.386227544910179, "grad_norm": 0.33203125, "learning_rate": 4.2331967788513295e-05, "loss": 0.2293, "step": 6270 }, { "epoch": 9.401197604790418, "grad_norm": 0.40234375, "learning_rate": 4.2046192003532834e-05, "loss": 0.2384, "step": 6280 }, { "epoch": 9.41616766467066, "grad_norm": 0.3671875, "learning_rate": 4.176112709871134e-05, "loss": 0.2361, "step": 6290 }, { "epoch": 9.431137724550899, "grad_norm": 0.361328125, "learning_rate": 4.1476776570747066e-05, "loss": 0.229, "step": 6300 }, { "epoch": 9.446107784431138, "grad_norm": 0.486328125, "learning_rate": 4.119314390757564e-05, "loss": 0.23, "step": 6310 }, { "epoch": 9.461077844311378, "grad_norm": 0.4140625, "learning_rate": 4.091023258832709e-05, "loss": 0.2367, "step": 6320 }, { "epoch": 9.476047904191617, "grad_norm": 0.3203125, "learning_rate": 4.0628046083283136e-05, "loss": 0.2207, "step": 6330 }, { "epoch": 9.491017964071856, "grad_norm": 0.365234375, "learning_rate": 4.034658785383477e-05, "loss": 0.2054, "step": 6340 }, { "epoch": 9.505988023952096, "grad_norm": 0.31640625, "learning_rate": 4.006586135243975e-05, "loss": 0.2193, "step": 6350 }, { "epoch": 9.520958083832335, "grad_norm": 0.3828125, "learning_rate": 3.9785870022580075e-05, "loss": 0.2336, "step": 6360 }, { "epoch": 9.535928143712574, "grad_norm": 0.328125, "learning_rate": 3.950661729872002e-05, "loss": 0.2282, "step": 6370 }, { "epoch": 9.550898203592814, "grad_norm": 0.31640625, "learning_rate": 3.9228106606263894e-05, "loss": 0.2301, "step": 6380 }, { "epoch": 9.565868263473053, "grad_norm": 0.330078125, "learning_rate": 3.8950341361513876e-05, "loss": 0.2215, "step": 6390 }, { "epoch": 9.580838323353294, "grad_norm": 0.369140625, "learning_rate": 3.8673324971628357e-05, "loss": 0.2334, "step": 6400 }, { "epoch": 9.595808383233534, "grad_norm": 0.330078125, "learning_rate": 3.839706083458005e-05, "loss": 0.2214, "step": 6410 }, { "epoch": 9.610778443113773, "grad_norm": 0.353515625, "learning_rate": 3.812155233911417e-05, "loss": 0.2239, "step": 6420 }, { "epoch": 9.625748502994012, "grad_norm": 0.3359375, "learning_rate": 3.7846802864707066e-05, "loss": 0.2263, "step": 6430 }, { "epoch": 9.640718562874252, "grad_norm": 0.3203125, "learning_rate": 3.757281578152474e-05, "loss": 0.2171, "step": 6440 }, { "epoch": 9.655688622754491, "grad_norm": 0.357421875, "learning_rate": 3.729959445038136e-05, "loss": 0.2304, "step": 6450 }, { "epoch": 9.67065868263473, "grad_norm": 0.37109375, "learning_rate": 3.7027142222698106e-05, "loss": 0.2134, "step": 6460 }, { "epoch": 9.68562874251497, "grad_norm": 0.44140625, "learning_rate": 3.675546244046228e-05, "loss": 0.2345, "step": 6470 }, { "epoch": 9.70059880239521, "grad_norm": 0.35546875, "learning_rate": 3.6484558436185936e-05, "loss": 0.2173, "step": 6480 }, { "epoch": 9.715568862275449, "grad_norm": 0.419921875, "learning_rate": 3.6214433532865134e-05, "loss": 0.2308, "step": 6490 }, { "epoch": 9.730538922155688, "grad_norm": 0.34375, "learning_rate": 3.594509104393951e-05, "loss": 0.2159, "step": 6500 }, { "epoch": 9.730538922155688, "eval_loss": 0.2160806953907013, "eval_runtime": 110.3889, "eval_samples_per_second": 9.059, "eval_steps_per_second": 1.132, "step": 6500 }, { "epoch": 9.745508982035929, "grad_norm": 0.3203125, "learning_rate": 3.567653427325107e-05, "loss": 0.2173, "step": 6510 }, { "epoch": 9.760479041916168, "grad_norm": 0.365234375, "learning_rate": 3.540876651500404e-05, "loss": 0.2223, "step": 6520 }, { "epoch": 9.775449101796408, "grad_norm": 0.38671875, "learning_rate": 3.5141791053724405e-05, "loss": 0.2494, "step": 6530 }, { "epoch": 9.790419161676647, "grad_norm": 0.392578125, "learning_rate": 3.487561116421958e-05, "loss": 0.2444, "step": 6540 }, { "epoch": 9.805389221556887, "grad_norm": 0.328125, "learning_rate": 3.4610230111538144e-05, "loss": 0.2414, "step": 6550 }, { "epoch": 9.820359281437126, "grad_norm": 0.44140625, "learning_rate": 3.434565115092998e-05, "loss": 0.2376, "step": 6560 }, { "epoch": 9.835329341317365, "grad_norm": 0.431640625, "learning_rate": 3.408187752780624e-05, "loss": 0.242, "step": 6570 }, { "epoch": 9.850299401197605, "grad_norm": 0.373046875, "learning_rate": 3.3818912477699486e-05, "loss": 0.2328, "step": 6580 }, { "epoch": 9.865269461077844, "grad_norm": 0.349609375, "learning_rate": 3.355675922622411e-05, "loss": 0.2434, "step": 6590 }, { "epoch": 9.880239520958083, "grad_norm": 0.3359375, "learning_rate": 3.329542098903674e-05, "loss": 0.2237, "step": 6600 }, { "epoch": 9.895209580838323, "grad_norm": 0.412109375, "learning_rate": 3.303490097179671e-05, "loss": 0.2274, "step": 6610 }, { "epoch": 9.910179640718562, "grad_norm": 0.361328125, "learning_rate": 3.2775202370126845e-05, "loss": 0.2246, "step": 6620 }, { "epoch": 9.925149700598803, "grad_norm": 0.365234375, "learning_rate": 3.2516328369574247e-05, "loss": 0.2181, "step": 6630 }, { "epoch": 9.940119760479043, "grad_norm": 0.384765625, "learning_rate": 3.225828214557111e-05, "loss": 0.2063, "step": 6640 }, { "epoch": 9.955089820359282, "grad_norm": 0.41015625, "learning_rate": 3.200106686339596e-05, "loss": 0.2449, "step": 6650 }, { "epoch": 9.970059880239521, "grad_norm": 0.3125, "learning_rate": 3.174468567813461e-05, "loss": 0.2178, "step": 6660 }, { "epoch": 9.98502994011976, "grad_norm": 0.408203125, "learning_rate": 3.1489141734641645e-05, "loss": 0.2173, "step": 6670 }, { "epoch": 10.0, "grad_norm": 0.671875, "learning_rate": 3.12344381675018e-05, "loss": 0.2331, "step": 6680 }, { "epoch": 10.01497005988024, "grad_norm": 0.416015625, "learning_rate": 3.098057810099135e-05, "loss": 0.2217, "step": 6690 }, { "epoch": 10.029940119760479, "grad_norm": 0.31640625, "learning_rate": 3.072756464904006e-05, "loss": 0.2091, "step": 6700 }, { "epoch": 10.044910179640718, "grad_norm": 0.365234375, "learning_rate": 3.0475400915192807e-05, "loss": 0.226, "step": 6710 }, { "epoch": 10.059880239520957, "grad_norm": 0.357421875, "learning_rate": 3.022408999257148e-05, "loss": 0.2081, "step": 6720 }, { "epoch": 10.074850299401197, "grad_norm": 0.353515625, "learning_rate": 2.997363496383718e-05, "loss": 0.2089, "step": 6730 }, { "epoch": 10.089820359281438, "grad_norm": 0.408203125, "learning_rate": 2.9724038901152372e-05, "loss": 0.2172, "step": 6740 }, { "epoch": 10.104790419161677, "grad_norm": 0.37890625, "learning_rate": 2.9475304866143027e-05, "loss": 0.2338, "step": 6750 }, { "epoch": 10.119760479041917, "grad_norm": 0.337890625, "learning_rate": 2.9227435909861345e-05, "loss": 0.2038, "step": 6760 }, { "epoch": 10.134730538922156, "grad_norm": 0.384765625, "learning_rate": 2.8980435072748157e-05, "loss": 0.2112, "step": 6770 }, { "epoch": 10.149700598802395, "grad_norm": 0.42578125, "learning_rate": 2.8734305384595595e-05, "loss": 0.2099, "step": 6780 }, { "epoch": 10.164670658682635, "grad_norm": 0.3359375, "learning_rate": 2.8489049864510054e-05, "loss": 0.211, "step": 6790 }, { "epoch": 10.179640718562874, "grad_norm": 0.357421875, "learning_rate": 2.8244671520875143e-05, "loss": 0.2113, "step": 6800 }, { "epoch": 10.194610778443113, "grad_norm": 0.369140625, "learning_rate": 2.8001173351314625e-05, "loss": 0.2111, "step": 6810 }, { "epoch": 10.209580838323353, "grad_norm": 0.361328125, "learning_rate": 2.775855834265584e-05, "loss": 0.2207, "step": 6820 }, { "epoch": 10.224550898203592, "grad_norm": 0.380859375, "learning_rate": 2.7516829470893035e-05, "loss": 0.209, "step": 6830 }, { "epoch": 10.239520958083832, "grad_norm": 0.34765625, "learning_rate": 2.7275989701150683e-05, "loss": 0.2263, "step": 6840 }, { "epoch": 10.254491017964073, "grad_norm": 0.318359375, "learning_rate": 2.7036041987647354e-05, "loss": 0.2224, "step": 6850 }, { "epoch": 10.269461077844312, "grad_norm": 0.341796875, "learning_rate": 2.6796989273659355e-05, "loss": 0.2131, "step": 6860 }, { "epoch": 10.284431137724551, "grad_norm": 0.353515625, "learning_rate": 2.6558834491484573e-05, "loss": 0.2056, "step": 6870 }, { "epoch": 10.29940119760479, "grad_norm": 0.37890625, "learning_rate": 2.6321580562406656e-05, "loss": 0.2203, "step": 6880 }, { "epoch": 10.31437125748503, "grad_norm": 0.34765625, "learning_rate": 2.608523039665902e-05, "loss": 0.225, "step": 6890 }, { "epoch": 10.32934131736527, "grad_norm": 0.32421875, "learning_rate": 2.5849786893389295e-05, "loss": 0.2182, "step": 6900 }, { "epoch": 10.344311377245509, "grad_norm": 0.408203125, "learning_rate": 2.5615252940623668e-05, "loss": 0.2186, "step": 6910 }, { "epoch": 10.359281437125748, "grad_norm": 0.3984375, "learning_rate": 2.5381631415231454e-05, "loss": 0.2159, "step": 6920 }, { "epoch": 10.374251497005988, "grad_norm": 0.421875, "learning_rate": 2.514892518288988e-05, "loss": 0.2302, "step": 6930 }, { "epoch": 10.389221556886227, "grad_norm": 0.34375, "learning_rate": 2.4917137098048926e-05, "loss": 0.2169, "step": 6940 }, { "epoch": 10.404191616766466, "grad_norm": 0.341796875, "learning_rate": 2.468627000389616e-05, "loss": 0.2079, "step": 6950 }, { "epoch": 10.419161676646706, "grad_norm": 0.419921875, "learning_rate": 2.4456326732322077e-05, "loss": 0.2211, "step": 6960 }, { "epoch": 10.434131736526947, "grad_norm": 0.33203125, "learning_rate": 2.4227310103885293e-05, "loss": 0.2297, "step": 6970 }, { "epoch": 10.449101796407186, "grad_norm": 0.35546875, "learning_rate": 2.3999222927777775e-05, "loss": 0.2166, "step": 6980 }, { "epoch": 10.464071856287426, "grad_norm": 0.3515625, "learning_rate": 2.377206800179068e-05, "loss": 0.2134, "step": 6990 }, { "epoch": 10.479041916167665, "grad_norm": 0.31640625, "learning_rate": 2.3545848112279833e-05, "loss": 0.2241, "step": 7000 }, { "epoch": 10.479041916167665, "eval_loss": 0.2133660614490509, "eval_runtime": 110.5822, "eval_samples_per_second": 9.043, "eval_steps_per_second": 1.13, "step": 7000 }, { "epoch": 10.494011976047904, "grad_norm": 0.43359375, "learning_rate": 2.332056603413154e-05, "loss": 0.2227, "step": 7010 }, { "epoch": 10.508982035928144, "grad_norm": 0.37890625, "learning_rate": 2.3096224530728673e-05, "loss": 0.2152, "step": 7020 }, { "epoch": 10.523952095808383, "grad_norm": 0.380859375, "learning_rate": 2.2872826353916742e-05, "loss": 0.2137, "step": 7030 }, { "epoch": 10.538922155688622, "grad_norm": 0.392578125, "learning_rate": 2.2650374243970017e-05, "loss": 0.2196, "step": 7040 }, { "epoch": 10.553892215568862, "grad_norm": 0.30859375, "learning_rate": 2.242887092955801e-05, "loss": 0.2269, "step": 7050 }, { "epoch": 10.568862275449101, "grad_norm": 0.333984375, "learning_rate": 2.220831912771213e-05, "loss": 0.2234, "step": 7060 }, { "epoch": 10.58383233532934, "grad_norm": 0.333984375, "learning_rate": 2.1988721543792102e-05, "loss": 0.2244, "step": 7070 }, { "epoch": 10.598802395209582, "grad_norm": 0.380859375, "learning_rate": 2.177008087145286e-05, "loss": 0.2149, "step": 7080 }, { "epoch": 10.613772455089821, "grad_norm": 0.328125, "learning_rate": 2.1552399792611756e-05, "loss": 0.2087, "step": 7090 }, { "epoch": 10.62874251497006, "grad_norm": 0.388671875, "learning_rate": 2.1335680977415263e-05, "loss": 0.2278, "step": 7100 }, { "epoch": 10.6437125748503, "grad_norm": 0.390625, "learning_rate": 2.111992708420646e-05, "loss": 0.2178, "step": 7110 }, { "epoch": 10.658682634730539, "grad_norm": 0.326171875, "learning_rate": 2.0905140759492438e-05, "loss": 0.2131, "step": 7120 }, { "epoch": 10.673652694610778, "grad_norm": 0.380859375, "learning_rate": 2.069132463791177e-05, "loss": 0.2238, "step": 7130 }, { "epoch": 10.688622754491018, "grad_norm": 0.35546875, "learning_rate": 2.0478481342202126e-05, "loss": 0.2069, "step": 7140 }, { "epoch": 10.703592814371257, "grad_norm": 0.455078125, "learning_rate": 2.0266613483168263e-05, "loss": 0.2364, "step": 7150 }, { "epoch": 10.718562874251496, "grad_norm": 0.36328125, "learning_rate": 2.0055723659649904e-05, "loss": 0.2214, "step": 7160 }, { "epoch": 10.733532934131736, "grad_norm": 0.38671875, "learning_rate": 1.9845814458489807e-05, "loss": 0.2198, "step": 7170 }, { "epoch": 10.748502994011975, "grad_norm": 0.408203125, "learning_rate": 1.9636888454502178e-05, "loss": 0.2159, "step": 7180 }, { "epoch": 10.763473053892216, "grad_norm": 0.384765625, "learning_rate": 1.9428948210441e-05, "loss": 0.2141, "step": 7190 }, { "epoch": 10.778443113772456, "grad_norm": 0.30859375, "learning_rate": 1.9221996276968524e-05, "loss": 0.2321, "step": 7200 }, { "epoch": 10.793413173652695, "grad_norm": 0.341796875, "learning_rate": 1.901603519262416e-05, "loss": 0.2159, "step": 7210 }, { "epoch": 10.808383233532934, "grad_norm": 0.33984375, "learning_rate": 1.881106748379321e-05, "loss": 0.2085, "step": 7220 }, { "epoch": 10.823353293413174, "grad_norm": 0.341796875, "learning_rate": 1.8607095664675868e-05, "loss": 0.2159, "step": 7230 }, { "epoch": 10.838323353293413, "grad_norm": 0.328125, "learning_rate": 1.8404122237256516e-05, "loss": 0.2159, "step": 7240 }, { "epoch": 10.853293413173652, "grad_norm": 0.4140625, "learning_rate": 1.8202149691272818e-05, "loss": 0.2055, "step": 7250 }, { "epoch": 10.868263473053892, "grad_norm": 0.375, "learning_rate": 1.80011805041854e-05, "loss": 0.2217, "step": 7260 }, { "epoch": 10.883233532934131, "grad_norm": 0.3203125, "learning_rate": 1.7801217141147353e-05, "loss": 0.2178, "step": 7270 }, { "epoch": 10.89820359281437, "grad_norm": 0.435546875, "learning_rate": 1.7602262054973917e-05, "loss": 0.2193, "step": 7280 }, { "epoch": 10.91317365269461, "grad_norm": 0.400390625, "learning_rate": 1.7404317686112637e-05, "loss": 0.2107, "step": 7290 }, { "epoch": 10.928143712574851, "grad_norm": 0.41015625, "learning_rate": 1.720738646261314e-05, "loss": 0.2292, "step": 7300 }, { "epoch": 10.94311377245509, "grad_norm": 0.404296875, "learning_rate": 1.7011470800097496e-05, "loss": 0.2315, "step": 7310 }, { "epoch": 10.95808383233533, "grad_norm": 0.353515625, "learning_rate": 1.6816573101730636e-05, "loss": 0.219, "step": 7320 }, { "epoch": 10.97305389221557, "grad_norm": 0.337890625, "learning_rate": 1.6622695758190808e-05, "loss": 0.2083, "step": 7330 }, { "epoch": 10.988023952095809, "grad_norm": 0.34375, "learning_rate": 1.642984114764019e-05, "loss": 0.2032, "step": 7340 }, { "epoch": 11.002994011976048, "grad_norm": 0.3828125, "learning_rate": 1.6238011635695848e-05, "loss": 0.2239, "step": 7350 }, { "epoch": 11.017964071856287, "grad_norm": 0.333984375, "learning_rate": 1.6047209575400668e-05, "loss": 0.2273, "step": 7360 }, { "epoch": 11.032934131736527, "grad_norm": 0.333984375, "learning_rate": 1.58574373071944e-05, "loss": 0.2133, "step": 7370 }, { "epoch": 11.047904191616766, "grad_norm": 0.341796875, "learning_rate": 1.5668697158885105e-05, "loss": 0.2117, "step": 7380 }, { "epoch": 11.062874251497005, "grad_norm": 0.349609375, "learning_rate": 1.5480991445620542e-05, "loss": 0.2163, "step": 7390 }, { "epoch": 11.077844311377245, "grad_norm": 0.38671875, "learning_rate": 1.5294322469859656e-05, "loss": 0.2195, "step": 7400 }, { "epoch": 11.092814371257486, "grad_norm": 0.337890625, "learning_rate": 1.5108692521344525e-05, "loss": 0.21, "step": 7410 }, { "epoch": 11.107784431137725, "grad_norm": 0.34765625, "learning_rate": 1.4924103877072193e-05, "loss": 0.2192, "step": 7420 }, { "epoch": 11.122754491017965, "grad_norm": 0.326171875, "learning_rate": 1.4740558801266625e-05, "loss": 0.229, "step": 7430 }, { "epoch": 11.137724550898204, "grad_norm": 0.365234375, "learning_rate": 1.4558059545351143e-05, "loss": 0.2116, "step": 7440 }, { "epoch": 11.152694610778443, "grad_norm": 0.365234375, "learning_rate": 1.4376608347920673e-05, "loss": 0.2195, "step": 7450 }, { "epoch": 11.167664670658683, "grad_norm": 0.318359375, "learning_rate": 1.419620743471427e-05, "loss": 0.206, "step": 7460 }, { "epoch": 11.182634730538922, "grad_norm": 0.353515625, "learning_rate": 1.4016859018587958e-05, "loss": 0.2157, "step": 7470 }, { "epoch": 11.197604790419161, "grad_norm": 0.359375, "learning_rate": 1.3838565299487371e-05, "loss": 0.2057, "step": 7480 }, { "epoch": 11.2125748502994, "grad_norm": 0.4296875, "learning_rate": 1.3661328464420974e-05, "loss": 0.2202, "step": 7490 }, { "epoch": 11.22754491017964, "grad_norm": 0.330078125, "learning_rate": 1.3485150687433167e-05, "loss": 0.2108, "step": 7500 }, { "epoch": 11.22754491017964, "eval_loss": 0.21242891252040863, "eval_runtime": 110.6298, "eval_samples_per_second": 9.039, "eval_steps_per_second": 1.13, "step": 7500 } ], "logging_steps": 10, "max_steps": 9000, "num_input_tokens_seen": 0, "num_train_epochs": 14, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5.213677275788083e+17, "train_batch_size": 16, "trial_name": null, "trial_params": null }