{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.15600928255231186, "eval_steps": 500, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00015600928255231187, "grad_norm": 0.5937075614929199, "learning_rate": 2.666666666666667e-06, "loss": 3.7681, "step": 1 }, { "epoch": 0.00031201856510462375, "grad_norm": 0.5836828947067261, "learning_rate": 5.333333333333334e-06, "loss": 3.8601, "step": 2 }, { "epoch": 0.00046802784765693557, "grad_norm": 0.6111788749694824, "learning_rate": 8.000000000000001e-06, "loss": 3.643, "step": 3 }, { "epoch": 0.0006240371302092475, "grad_norm": 0.5807424783706665, "learning_rate": 1.0666666666666667e-05, "loss": 3.724, "step": 4 }, { "epoch": 0.0007800464127615594, "grad_norm": 0.5708947777748108, "learning_rate": 1.3333333333333333e-05, "loss": 3.9727, "step": 5 }, { "epoch": 0.0009360556953138711, "grad_norm": 0.5662252902984619, "learning_rate": 1.6000000000000003e-05, "loss": 3.6801, "step": 6 }, { "epoch": 0.001092064977866183, "grad_norm": 0.5653729438781738, "learning_rate": 1.866666666666667e-05, "loss": 3.6898, "step": 7 }, { "epoch": 0.001248074260418495, "grad_norm": 0.5451233983039856, "learning_rate": 2.1333333333333335e-05, "loss": 3.5484, "step": 8 }, { "epoch": 0.0014040835429708068, "grad_norm": 0.5682435035705566, "learning_rate": 2.4e-05, "loss": 3.538, "step": 9 }, { "epoch": 0.0015600928255231187, "grad_norm": 0.6180667877197266, "learning_rate": 2.6666666666666667e-05, "loss": 3.6757, "step": 10 }, { "epoch": 0.0017161021080754305, "grad_norm": 0.6358373165130615, "learning_rate": 2.9333333333333336e-05, "loss": 3.6489, "step": 11 }, { "epoch": 0.0018721113906277423, "grad_norm": 0.6643233895301819, "learning_rate": 3.2000000000000005e-05, "loss": 3.4313, "step": 12 }, { "epoch": 0.0020281206731800542, "grad_norm": 0.6591399908065796, "learning_rate": 3.466666666666667e-05, "loss": 3.472, "step": 13 }, { "epoch": 0.002184129955732366, "grad_norm": 0.8929205536842346, "learning_rate": 3.733333333333334e-05, "loss": 3.3531, "step": 14 }, { "epoch": 0.0023401392382846778, "grad_norm": 1.2845464944839478, "learning_rate": 4e-05, "loss": 3.143, "step": 15 }, { "epoch": 0.00249614852083699, "grad_norm": 1.216373085975647, "learning_rate": 4.266666666666667e-05, "loss": 3.1297, "step": 16 }, { "epoch": 0.0026521578033893017, "grad_norm": 0.9192391633987427, "learning_rate": 4.5333333333333335e-05, "loss": 2.9826, "step": 17 }, { "epoch": 0.0028081670859416135, "grad_norm": 0.8917486667633057, "learning_rate": 4.8e-05, "loss": 2.9068, "step": 18 }, { "epoch": 0.0029641763684939253, "grad_norm": 0.7141512632369995, "learning_rate": 5.0666666666666674e-05, "loss": 2.7797, "step": 19 }, { "epoch": 0.0031201856510462375, "grad_norm": 0.8795380592346191, "learning_rate": 5.333333333333333e-05, "loss": 2.2265, "step": 20 }, { "epoch": 0.0032761949335985492, "grad_norm": 1.047784447669983, "learning_rate": 5.6000000000000006e-05, "loss": 2.6089, "step": 21 }, { "epoch": 0.003432204216150861, "grad_norm": 1.0959978103637695, "learning_rate": 5.866666666666667e-05, "loss": 2.3416, "step": 22 }, { "epoch": 0.0035882134987031728, "grad_norm": 1.283445954322815, "learning_rate": 6.133333333333334e-05, "loss": 2.0565, "step": 23 }, { "epoch": 0.0037442227812554845, "grad_norm": 1.655569314956665, "learning_rate": 6.400000000000001e-05, "loss": 1.6648, "step": 24 }, { "epoch": 0.0039002320638077967, "grad_norm": 1.4048818349838257, "learning_rate": 6.666666666666667e-05, "loss": 1.7566, "step": 25 }, { "epoch": 0.0040562413463601085, "grad_norm": 1.0755441188812256, "learning_rate": 6.933333333333334e-05, "loss": 1.6591, "step": 26 }, { "epoch": 0.00421225062891242, "grad_norm": 0.7240940928459167, "learning_rate": 7.2e-05, "loss": 2.21, "step": 27 }, { "epoch": 0.004368259911464732, "grad_norm": 0.48980680108070374, "learning_rate": 7.466666666666667e-05, "loss": 1.8157, "step": 28 }, { "epoch": 0.004524269194017044, "grad_norm": 0.4145239591598511, "learning_rate": 7.733333333333333e-05, "loss": 1.8679, "step": 29 }, { "epoch": 0.0046802784765693555, "grad_norm": 0.3905705213546753, "learning_rate": 8e-05, "loss": 1.5733, "step": 30 }, { "epoch": 0.004836287759121667, "grad_norm": 0.40969792008399963, "learning_rate": 8.266666666666667e-05, "loss": 1.531, "step": 31 }, { "epoch": 0.00499229704167398, "grad_norm": 0.4269125759601593, "learning_rate": 8.533333333333334e-05, "loss": 1.3705, "step": 32 }, { "epoch": 0.005148306324226292, "grad_norm": 0.5876020789146423, "learning_rate": 8.800000000000001e-05, "loss": 1.4055, "step": 33 }, { "epoch": 0.0053043156067786035, "grad_norm": 0.39753037691116333, "learning_rate": 9.066666666666667e-05, "loss": 1.5403, "step": 34 }, { "epoch": 0.005460324889330915, "grad_norm": 0.4157419800758362, "learning_rate": 9.333333333333334e-05, "loss": 1.5698, "step": 35 }, { "epoch": 0.005616334171883227, "grad_norm": 0.4430864155292511, "learning_rate": 9.6e-05, "loss": 1.5466, "step": 36 }, { "epoch": 0.005772343454435539, "grad_norm": 0.5259338021278381, "learning_rate": 9.866666666666668e-05, "loss": 1.5878, "step": 37 }, { "epoch": 0.0059283527369878505, "grad_norm": 0.4409235417842865, "learning_rate": 0.00010133333333333335, "loss": 1.4203, "step": 38 }, { "epoch": 0.006084362019540162, "grad_norm": 0.5432307124137878, "learning_rate": 0.00010400000000000001, "loss": 1.3843, "step": 39 }, { "epoch": 0.006240371302092475, "grad_norm": 0.5789123177528381, "learning_rate": 0.00010666666666666667, "loss": 1.4649, "step": 40 }, { "epoch": 0.006396380584644787, "grad_norm": 0.5596875548362732, "learning_rate": 0.00010933333333333333, "loss": 1.3569, "step": 41 }, { "epoch": 0.0065523898671970985, "grad_norm": 0.6517161726951599, "learning_rate": 0.00011200000000000001, "loss": 1.4306, "step": 42 }, { "epoch": 0.00670839914974941, "grad_norm": 0.7665486335754395, "learning_rate": 0.00011466666666666667, "loss": 1.5369, "step": 43 }, { "epoch": 0.006864408432301722, "grad_norm": 0.8421632647514343, "learning_rate": 0.00011733333333333334, "loss": 1.5651, "step": 44 }, { "epoch": 0.007020417714854034, "grad_norm": 0.8437005877494812, "learning_rate": 0.00012, "loss": 1.33, "step": 45 }, { "epoch": 0.0071764269974063455, "grad_norm": 0.3544560670852661, "learning_rate": 0.00012266666666666668, "loss": 1.5436, "step": 46 }, { "epoch": 0.007332436279958657, "grad_norm": 0.35725343227386475, "learning_rate": 0.00012533333333333334, "loss": 1.5792, "step": 47 }, { "epoch": 0.007488445562510969, "grad_norm": 0.7203790545463562, "learning_rate": 0.00012800000000000002, "loss": 1.6707, "step": 48 }, { "epoch": 0.007644454845063282, "grad_norm": 0.352791428565979, "learning_rate": 0.00013066666666666668, "loss": 1.5251, "step": 49 }, { "epoch": 0.0078004641276155934, "grad_norm": 0.49014368653297424, "learning_rate": 0.00013333333333333334, "loss": 1.3478, "step": 50 }, { "epoch": 0.007956473410167905, "grad_norm": 0.29890525341033936, "learning_rate": 0.00013600000000000003, "loss": 1.3164, "step": 51 }, { "epoch": 0.008112482692720217, "grad_norm": 0.34632885456085205, "learning_rate": 0.00013866666666666669, "loss": 1.3785, "step": 52 }, { "epoch": 0.008268491975272529, "grad_norm": 0.3631187677383423, "learning_rate": 0.00014133333333333334, "loss": 1.7873, "step": 53 }, { "epoch": 0.00842450125782484, "grad_norm": 0.290487140417099, "learning_rate": 0.000144, "loss": 1.1681, "step": 54 }, { "epoch": 0.008580510540377152, "grad_norm": 0.3136501610279083, "learning_rate": 0.00014666666666666666, "loss": 1.1332, "step": 55 }, { "epoch": 0.008736519822929464, "grad_norm": 0.3708946406841278, "learning_rate": 0.00014933333333333335, "loss": 1.4316, "step": 56 }, { "epoch": 0.008892529105481776, "grad_norm": 0.3645316958427429, "learning_rate": 0.000152, "loss": 1.3522, "step": 57 }, { "epoch": 0.009048538388034088, "grad_norm": 0.4074520170688629, "learning_rate": 0.00015466666666666667, "loss": 1.5344, "step": 58 }, { "epoch": 0.0092045476705864, "grad_norm": 0.3106740713119507, "learning_rate": 0.00015733333333333333, "loss": 1.2959, "step": 59 }, { "epoch": 0.009360556953138711, "grad_norm": 0.32623976469039917, "learning_rate": 0.00016, "loss": 1.6472, "step": 60 }, { "epoch": 0.009516566235691023, "grad_norm": 0.35396724939346313, "learning_rate": 0.00016266666666666667, "loss": 1.2655, "step": 61 }, { "epoch": 0.009672575518243335, "grad_norm": 0.3455830216407776, "learning_rate": 0.00016533333333333333, "loss": 1.2153, "step": 62 }, { "epoch": 0.009828584800795648, "grad_norm": 0.3116808235645294, "learning_rate": 0.000168, "loss": 1.0851, "step": 63 }, { "epoch": 0.00998459408334796, "grad_norm": 0.3416989743709564, "learning_rate": 0.00017066666666666668, "loss": 1.5828, "step": 64 }, { "epoch": 0.010140603365900272, "grad_norm": 0.3509654104709625, "learning_rate": 0.00017333333333333334, "loss": 1.4832, "step": 65 }, { "epoch": 0.010296612648452583, "grad_norm": 0.3034147322177887, "learning_rate": 0.00017600000000000002, "loss": 1.4326, "step": 66 }, { "epoch": 0.010452621931004895, "grad_norm": 0.3084355890750885, "learning_rate": 0.00017866666666666668, "loss": 1.2452, "step": 67 }, { "epoch": 0.010608631213557207, "grad_norm": 0.3001956343650818, "learning_rate": 0.00018133333333333334, "loss": 1.2484, "step": 68 }, { "epoch": 0.010764640496109519, "grad_norm": 0.30605360865592957, "learning_rate": 0.00018400000000000003, "loss": 1.2137, "step": 69 }, { "epoch": 0.01092064977866183, "grad_norm": 0.32967764139175415, "learning_rate": 0.0001866666666666667, "loss": 1.4, "step": 70 }, { "epoch": 0.011076659061214142, "grad_norm": 0.3161776661872864, "learning_rate": 0.00018933333333333335, "loss": 1.3203, "step": 71 }, { "epoch": 0.011232668343766454, "grad_norm": 0.28808867931365967, "learning_rate": 0.000192, "loss": 1.3034, "step": 72 }, { "epoch": 0.011388677626318766, "grad_norm": 0.2804367244243622, "learning_rate": 0.0001946666666666667, "loss": 1.2753, "step": 73 }, { "epoch": 0.011544686908871078, "grad_norm": 0.30980467796325684, "learning_rate": 0.00019733333333333335, "loss": 1.3733, "step": 74 }, { "epoch": 0.01170069619142339, "grad_norm": 0.31240588426589966, "learning_rate": 0.0002, "loss": 1.1602, "step": 75 }, { "epoch": 0.011856705473975701, "grad_norm": 0.28906041383743286, "learning_rate": 0.00019999991608372393, "loss": 1.3243, "step": 76 }, { "epoch": 0.012012714756528013, "grad_norm": 0.2740985155105591, "learning_rate": 0.00019999966433503652, "loss": 1.1853, "step": 77 }, { "epoch": 0.012168724039080325, "grad_norm": 0.30425482988357544, "learning_rate": 0.0001999992447543603, "loss": 1.3282, "step": 78 }, { "epoch": 0.012324733321632636, "grad_norm": 0.3216018080711365, "learning_rate": 0.00019999865734239946, "loss": 1.3696, "step": 79 }, { "epoch": 0.01248074260418495, "grad_norm": 0.34770438075065613, "learning_rate": 0.00019999790210013988, "loss": 1.261, "step": 80 }, { "epoch": 0.012636751886737262, "grad_norm": 0.3883892297744751, "learning_rate": 0.0001999969790288491, "loss": 1.5873, "step": 81 }, { "epoch": 0.012792761169289573, "grad_norm": 0.3061410188674927, "learning_rate": 0.00019999588813007633, "loss": 1.5559, "step": 82 }, { "epoch": 0.012948770451841885, "grad_norm": 0.3044775128364563, "learning_rate": 0.00019999462940565243, "loss": 1.2439, "step": 83 }, { "epoch": 0.013104779734394197, "grad_norm": 0.3562803864479065, "learning_rate": 0.00019999320285769, "loss": 1.4121, "step": 84 }, { "epoch": 0.013260789016946509, "grad_norm": 0.3367731273174286, "learning_rate": 0.0001999916084885832, "loss": 1.1937, "step": 85 }, { "epoch": 0.01341679829949882, "grad_norm": 0.3613661527633667, "learning_rate": 0.00019998984630100792, "loss": 1.4267, "step": 86 }, { "epoch": 0.013572807582051132, "grad_norm": 0.30924999713897705, "learning_rate": 0.0001999879162979217, "loss": 1.3358, "step": 87 }, { "epoch": 0.013728816864603444, "grad_norm": 0.34925562143325806, "learning_rate": 0.0001999858184825637, "loss": 1.3758, "step": 88 }, { "epoch": 0.013884826147155756, "grad_norm": 0.3182036280632019, "learning_rate": 0.00019998355285845475, "loss": 1.3151, "step": 89 }, { "epoch": 0.014040835429708068, "grad_norm": 0.6028950810432434, "learning_rate": 0.0001999811194293973, "loss": 1.3797, "step": 90 }, { "epoch": 0.01419684471226038, "grad_norm": 0.3221015930175781, "learning_rate": 0.00019997851819947537, "loss": 1.3293, "step": 91 }, { "epoch": 0.014352853994812691, "grad_norm": 0.3003532290458679, "learning_rate": 0.00019997574917305478, "loss": 1.5671, "step": 92 }, { "epoch": 0.014508863277365003, "grad_norm": 0.32144418358802795, "learning_rate": 0.00019997281235478278, "loss": 1.3733, "step": 93 }, { "epoch": 0.014664872559917315, "grad_norm": 1.3427015542984009, "learning_rate": 0.00019996970774958836, "loss": 1.246, "step": 94 }, { "epoch": 0.014820881842469626, "grad_norm": 0.3254302144050598, "learning_rate": 0.00019996643536268204, "loss": 1.3829, "step": 95 }, { "epoch": 0.014976891125021938, "grad_norm": 0.2829325795173645, "learning_rate": 0.0001999629951995559, "loss": 1.2176, "step": 96 }, { "epoch": 0.01513290040757425, "grad_norm": 0.2943004071712494, "learning_rate": 0.00019995938726598373, "loss": 1.4021, "step": 97 }, { "epoch": 0.015288909690126563, "grad_norm": 0.2698727548122406, "learning_rate": 0.00019995561156802079, "loss": 1.2897, "step": 98 }, { "epoch": 0.015444918972678875, "grad_norm": 0.32416194677352905, "learning_rate": 0.0001999516681120039, "loss": 1.218, "step": 99 }, { "epoch": 0.015600928255231187, "grad_norm": 0.3309131860733032, "learning_rate": 0.00019994755690455152, "loss": 1.4658, "step": 100 }, { "epoch": 0.015756937537783497, "grad_norm": 0.31126394867897034, "learning_rate": 0.0001999432779525635, "loss": 1.5518, "step": 101 }, { "epoch": 0.01591294682033581, "grad_norm": 0.28427934646606445, "learning_rate": 0.0001999388312632214, "loss": 1.1435, "step": 102 }, { "epoch": 0.01606895610288812, "grad_norm": 0.28065958619117737, "learning_rate": 0.00019993421684398824, "loss": 1.3537, "step": 103 }, { "epoch": 0.016224965385440434, "grad_norm": 0.3787417411804199, "learning_rate": 0.00019992943470260844, "loss": 1.2151, "step": 104 }, { "epoch": 0.016380974667992744, "grad_norm": 0.32704487442970276, "learning_rate": 0.00019992448484710797, "loss": 1.1383, "step": 105 }, { "epoch": 0.016536983950545057, "grad_norm": 0.34436190128326416, "learning_rate": 0.00019991936728579437, "loss": 1.3949, "step": 106 }, { "epoch": 0.01669299323309737, "grad_norm": 0.29938092827796936, "learning_rate": 0.00019991408202725655, "loss": 1.2821, "step": 107 }, { "epoch": 0.01684900251564968, "grad_norm": 0.3192508816719055, "learning_rate": 0.0001999086290803649, "loss": 1.3655, "step": 108 }, { "epoch": 0.017005011798201995, "grad_norm": 0.2626635730266571, "learning_rate": 0.00019990300845427125, "loss": 1.2366, "step": 109 }, { "epoch": 0.017161021080754305, "grad_norm": 0.288725882768631, "learning_rate": 0.0001998972201584088, "loss": 1.0589, "step": 110 }, { "epoch": 0.017317030363306618, "grad_norm": 0.3340204358100891, "learning_rate": 0.00019989126420249221, "loss": 1.7077, "step": 111 }, { "epoch": 0.017473039645858928, "grad_norm": 0.27165043354034424, "learning_rate": 0.00019988514059651752, "loss": 1.3596, "step": 112 }, { "epoch": 0.01762904892841124, "grad_norm": 0.2751217186450958, "learning_rate": 0.00019987884935076213, "loss": 1.281, "step": 113 }, { "epoch": 0.01778505821096355, "grad_norm": 0.2712443172931671, "learning_rate": 0.00019987239047578482, "loss": 1.2686, "step": 114 }, { "epoch": 0.017941067493515865, "grad_norm": 0.2898474931716919, "learning_rate": 0.00019986576398242566, "loss": 1.2425, "step": 115 }, { "epoch": 0.018097076776068175, "grad_norm": 0.29883307218551636, "learning_rate": 0.00019985896988180605, "loss": 1.6326, "step": 116 }, { "epoch": 0.01825308605862049, "grad_norm": 0.2548903524875641, "learning_rate": 0.00019985200818532875, "loss": 1.317, "step": 117 }, { "epoch": 0.0184090953411728, "grad_norm": 0.260768860578537, "learning_rate": 0.0001998448789046777, "loss": 1.4137, "step": 118 }, { "epoch": 0.018565104623725112, "grad_norm": 0.27813923358917236, "learning_rate": 0.00019983758205181822, "loss": 1.1758, "step": 119 }, { "epoch": 0.018721113906277422, "grad_norm": 0.29539602994918823, "learning_rate": 0.00019983011763899673, "loss": 1.2805, "step": 120 }, { "epoch": 0.018877123188829736, "grad_norm": 0.2691763937473297, "learning_rate": 0.00019982248567874098, "loss": 1.3098, "step": 121 }, { "epoch": 0.019033132471382046, "grad_norm": 0.2895521819591522, "learning_rate": 0.00019981468618385988, "loss": 1.1475, "step": 122 }, { "epoch": 0.01918914175393436, "grad_norm": 0.24555402994155884, "learning_rate": 0.00019980671916744352, "loss": 1.075, "step": 123 }, { "epoch": 0.01934515103648667, "grad_norm": 0.29935726523399353, "learning_rate": 0.00019979858464286317, "loss": 1.278, "step": 124 }, { "epoch": 0.019501160319038983, "grad_norm": 0.3469449579715729, "learning_rate": 0.00019979028262377118, "loss": 1.602, "step": 125 }, { "epoch": 0.019657169601591296, "grad_norm": 0.2707567811012268, "learning_rate": 0.00019978181312410104, "loss": 1.3181, "step": 126 }, { "epoch": 0.019813178884143606, "grad_norm": 0.32349273562431335, "learning_rate": 0.00019977317615806737, "loss": 1.4862, "step": 127 }, { "epoch": 0.01996918816669592, "grad_norm": 0.24527911841869354, "learning_rate": 0.00019976437174016573, "loss": 1.169, "step": 128 }, { "epoch": 0.02012519744924823, "grad_norm": 0.2882062494754791, "learning_rate": 0.00019975539988517288, "loss": 1.275, "step": 129 }, { "epoch": 0.020281206731800543, "grad_norm": 0.3206437826156616, "learning_rate": 0.00019974626060814647, "loss": 1.682, "step": 130 }, { "epoch": 0.020437216014352853, "grad_norm": 0.3423447012901306, "learning_rate": 0.0001997369539244252, "loss": 1.2018, "step": 131 }, { "epoch": 0.020593225296905167, "grad_norm": 0.29081955552101135, "learning_rate": 0.0001997274798496287, "loss": 1.5849, "step": 132 }, { "epoch": 0.020749234579457477, "grad_norm": 0.2659798860549927, "learning_rate": 0.00019971783839965756, "loss": 1.1371, "step": 133 }, { "epoch": 0.02090524386200979, "grad_norm": 0.3395417034626007, "learning_rate": 0.00019970802959069328, "loss": 1.5046, "step": 134 }, { "epoch": 0.0210612531445621, "grad_norm": 0.22527103126049042, "learning_rate": 0.00019969805343919821, "loss": 1.0543, "step": 135 }, { "epoch": 0.021217262427114414, "grad_norm": 0.30680522322654724, "learning_rate": 0.0001996879099619156, "loss": 1.5067, "step": 136 }, { "epoch": 0.021373271709666724, "grad_norm": 0.22828875482082367, "learning_rate": 0.00019967759917586953, "loss": 1.1201, "step": 137 }, { "epoch": 0.021529280992219037, "grad_norm": 0.2578384280204773, "learning_rate": 0.00019966712109836476, "loss": 1.104, "step": 138 }, { "epoch": 0.021685290274771347, "grad_norm": 0.23175813257694244, "learning_rate": 0.000199656475746987, "loss": 0.9706, "step": 139 }, { "epoch": 0.02184129955732366, "grad_norm": 0.29308339953422546, "learning_rate": 0.00019964566313960264, "loss": 1.4769, "step": 140 }, { "epoch": 0.02199730883987597, "grad_norm": 0.3059382438659668, "learning_rate": 0.0001996346832943587, "loss": 1.4555, "step": 141 }, { "epoch": 0.022153318122428284, "grad_norm": 0.2929370701313019, "learning_rate": 0.00019962353622968295, "loss": 1.4051, "step": 142 }, { "epoch": 0.022309327404980598, "grad_norm": 0.24365079402923584, "learning_rate": 0.00019961222196428378, "loss": 1.189, "step": 143 }, { "epoch": 0.022465336687532908, "grad_norm": 0.27418485283851624, "learning_rate": 0.0001996007405171502, "loss": 1.206, "step": 144 }, { "epoch": 0.02262134597008522, "grad_norm": 0.2554856836795807, "learning_rate": 0.00019958909190755187, "loss": 1.4053, "step": 145 }, { "epoch": 0.02277735525263753, "grad_norm": 0.2674770951271057, "learning_rate": 0.00019957727615503888, "loss": 1.2412, "step": 146 }, { "epoch": 0.022933364535189845, "grad_norm": 0.3177204728126526, "learning_rate": 0.00019956529327944198, "loss": 1.4231, "step": 147 }, { "epoch": 0.023089373817742155, "grad_norm": 0.2678688168525696, "learning_rate": 0.00019955314330087225, "loss": 1.2494, "step": 148 }, { "epoch": 0.02324538310029447, "grad_norm": 0.28164568543434143, "learning_rate": 0.00019954082623972142, "loss": 1.2008, "step": 149 }, { "epoch": 0.02340139238284678, "grad_norm": 0.2897564172744751, "learning_rate": 0.0001995283421166614, "loss": 1.463, "step": 150 }, { "epoch": 0.023557401665399092, "grad_norm": 0.276509165763855, "learning_rate": 0.00019951569095264473, "loss": 1.4891, "step": 151 }, { "epoch": 0.023713410947951402, "grad_norm": 0.2585453689098358, "learning_rate": 0.0001995028727689041, "loss": 1.1551, "step": 152 }, { "epoch": 0.023869420230503716, "grad_norm": 0.25659292936325073, "learning_rate": 0.00019948988758695263, "loss": 1.1622, "step": 153 }, { "epoch": 0.024025429513056026, "grad_norm": 0.27132928371429443, "learning_rate": 0.00019947673542858367, "loss": 1.2015, "step": 154 }, { "epoch": 0.02418143879560834, "grad_norm": 0.2951599955558777, "learning_rate": 0.00019946341631587087, "loss": 1.1842, "step": 155 }, { "epoch": 0.02433744807816065, "grad_norm": 0.3114786148071289, "learning_rate": 0.00019944993027116797, "loss": 1.4509, "step": 156 }, { "epoch": 0.024493457360712963, "grad_norm": 0.25183674693107605, "learning_rate": 0.00019943627731710897, "loss": 1.1474, "step": 157 }, { "epoch": 0.024649466643265273, "grad_norm": 0.2717629075050354, "learning_rate": 0.00019942245747660796, "loss": 1.2899, "step": 158 }, { "epoch": 0.024805475925817586, "grad_norm": 0.2532605826854706, "learning_rate": 0.00019940847077285916, "loss": 1.0811, "step": 159 }, { "epoch": 0.0249614852083699, "grad_norm": 0.2951716482639313, "learning_rate": 0.0001993943172293368, "loss": 1.6252, "step": 160 }, { "epoch": 0.02511749449092221, "grad_norm": 0.29894542694091797, "learning_rate": 0.0001993799968697951, "loss": 1.3754, "step": 161 }, { "epoch": 0.025273503773474523, "grad_norm": 0.28648853302001953, "learning_rate": 0.00019936550971826834, "loss": 1.2769, "step": 162 }, { "epoch": 0.025429513056026833, "grad_norm": 0.2540144920349121, "learning_rate": 0.00019935085579907063, "loss": 1.281, "step": 163 }, { "epoch": 0.025585522338579147, "grad_norm": 0.30044910311698914, "learning_rate": 0.00019933603513679605, "loss": 1.1689, "step": 164 }, { "epoch": 0.025741531621131457, "grad_norm": 0.31799909472465515, "learning_rate": 0.00019932104775631846, "loss": 1.287, "step": 165 }, { "epoch": 0.02589754090368377, "grad_norm": 0.290565550327301, "learning_rate": 0.0001993058936827916, "loss": 1.4751, "step": 166 }, { "epoch": 0.02605355018623608, "grad_norm": 0.28967443108558655, "learning_rate": 0.00019929057294164893, "loss": 1.2459, "step": 167 }, { "epoch": 0.026209559468788394, "grad_norm": 0.25141966342926025, "learning_rate": 0.0001992750855586036, "loss": 1.1215, "step": 168 }, { "epoch": 0.026365568751340704, "grad_norm": 0.2819644808769226, "learning_rate": 0.00019925943155964856, "loss": 1.5238, "step": 169 }, { "epoch": 0.026521578033893017, "grad_norm": 0.2336016446352005, "learning_rate": 0.00019924361097105623, "loss": 1.2218, "step": 170 }, { "epoch": 0.026677587316445327, "grad_norm": 0.23773479461669922, "learning_rate": 0.00019922762381937878, "loss": 1.0842, "step": 171 }, { "epoch": 0.02683359659899764, "grad_norm": 0.266222208738327, "learning_rate": 0.0001992114701314478, "loss": 1.2076, "step": 172 }, { "epoch": 0.02698960588154995, "grad_norm": 0.29275181889533997, "learning_rate": 0.00019919514993437445, "loss": 1.3901, "step": 173 }, { "epoch": 0.027145615164102264, "grad_norm": 0.2334383726119995, "learning_rate": 0.00019917866325554938, "loss": 1.2012, "step": 174 }, { "epoch": 0.027301624446654574, "grad_norm": 0.293888121843338, "learning_rate": 0.00019916201012264254, "loss": 1.6131, "step": 175 }, { "epoch": 0.027457633729206888, "grad_norm": 0.3042750954627991, "learning_rate": 0.0001991451905636033, "loss": 1.3144, "step": 176 }, { "epoch": 0.027613643011759198, "grad_norm": 0.2652626633644104, "learning_rate": 0.00019912820460666044, "loss": 1.4368, "step": 177 }, { "epoch": 0.02776965229431151, "grad_norm": 0.28741374611854553, "learning_rate": 0.00019911105228032186, "loss": 1.4643, "step": 178 }, { "epoch": 0.027925661576863825, "grad_norm": 0.2808038890361786, "learning_rate": 0.00019909373361337476, "loss": 1.3013, "step": 179 }, { "epoch": 0.028081670859416135, "grad_norm": 0.22930848598480225, "learning_rate": 0.0001990762486348855, "loss": 1.0587, "step": 180 }, { "epoch": 0.02823768014196845, "grad_norm": 0.24289073050022125, "learning_rate": 0.00019905859737419956, "loss": 1.1174, "step": 181 }, { "epoch": 0.02839368942452076, "grad_norm": 0.2626672685146332, "learning_rate": 0.00019904077986094152, "loss": 1.1746, "step": 182 }, { "epoch": 0.028549698707073072, "grad_norm": 0.3174870014190674, "learning_rate": 0.00019902279612501493, "loss": 1.4464, "step": 183 }, { "epoch": 0.028705707989625382, "grad_norm": 0.2851637303829193, "learning_rate": 0.0001990046461966024, "loss": 1.3527, "step": 184 }, { "epoch": 0.028861717272177696, "grad_norm": 0.2576538622379303, "learning_rate": 0.00019898633010616542, "loss": 1.2546, "step": 185 }, { "epoch": 0.029017726554730006, "grad_norm": 0.2922312319278717, "learning_rate": 0.0001989678478844443, "loss": 1.1445, "step": 186 }, { "epoch": 0.02917373583728232, "grad_norm": 0.25312724709510803, "learning_rate": 0.00019894919956245824, "loss": 1.0533, "step": 187 }, { "epoch": 0.02932974511983463, "grad_norm": 0.3193413019180298, "learning_rate": 0.00019893038517150525, "loss": 1.655, "step": 188 }, { "epoch": 0.029485754402386943, "grad_norm": 0.26104092597961426, "learning_rate": 0.00019891140474316194, "loss": 1.5094, "step": 189 }, { "epoch": 0.029641763684939253, "grad_norm": 0.2679871916770935, "learning_rate": 0.00019889225830928365, "loss": 1.3535, "step": 190 }, { "epoch": 0.029797772967491566, "grad_norm": 0.2835332751274109, "learning_rate": 0.00019887294590200435, "loss": 1.647, "step": 191 }, { "epoch": 0.029953782250043876, "grad_norm": 0.2309991866350174, "learning_rate": 0.00019885346755373656, "loss": 1.1869, "step": 192 }, { "epoch": 0.03010979153259619, "grad_norm": 0.28801408410072327, "learning_rate": 0.00019883382329717128, "loss": 1.4037, "step": 193 }, { "epoch": 0.0302658008151485, "grad_norm": 0.309851735830307, "learning_rate": 0.00019881401316527793, "loss": 1.2832, "step": 194 }, { "epoch": 0.030421810097700813, "grad_norm": 0.27529048919677734, "learning_rate": 0.0001987940371913044, "loss": 1.5466, "step": 195 }, { "epoch": 0.030577819380253127, "grad_norm": 0.25759854912757874, "learning_rate": 0.00019877389540877687, "loss": 1.2432, "step": 196 }, { "epoch": 0.030733828662805437, "grad_norm": 0.27557173371315, "learning_rate": 0.0001987535878514998, "loss": 1.5681, "step": 197 }, { "epoch": 0.03088983794535775, "grad_norm": 0.25760918855667114, "learning_rate": 0.0001987331145535559, "loss": 1.3067, "step": 198 }, { "epoch": 0.03104584722791006, "grad_norm": 0.299180269241333, "learning_rate": 0.000198712475549306, "loss": 1.4642, "step": 199 }, { "epoch": 0.031201856510462374, "grad_norm": 0.2398681640625, "learning_rate": 0.00019869167087338907, "loss": 1.0748, "step": 200 }, { "epoch": 0.03135786579301469, "grad_norm": 0.2560211420059204, "learning_rate": 0.00019867070056072214, "loss": 1.2508, "step": 201 }, { "epoch": 0.031513875075566994, "grad_norm": 0.25509408116340637, "learning_rate": 0.00019864956464650025, "loss": 1.4073, "step": 202 }, { "epoch": 0.03166988435811931, "grad_norm": 0.27500587701797485, "learning_rate": 0.00019862826316619628, "loss": 1.3473, "step": 203 }, { "epoch": 0.03182589364067162, "grad_norm": 0.2923906445503235, "learning_rate": 0.0001986067961555611, "loss": 1.4293, "step": 204 }, { "epoch": 0.031981902923223934, "grad_norm": 0.24456267058849335, "learning_rate": 0.00019858516365062334, "loss": 1.2196, "step": 205 }, { "epoch": 0.03213791220577624, "grad_norm": 0.3021962344646454, "learning_rate": 0.00019856336568768935, "loss": 1.5066, "step": 206 }, { "epoch": 0.032293921488328554, "grad_norm": 0.2485729455947876, "learning_rate": 0.00019854140230334322, "loss": 1.2002, "step": 207 }, { "epoch": 0.03244993077088087, "grad_norm": 0.26055216789245605, "learning_rate": 0.0001985192735344467, "loss": 1.3207, "step": 208 }, { "epoch": 0.03260594005343318, "grad_norm": 0.2658592760562897, "learning_rate": 0.00019849697941813898, "loss": 0.9025, "step": 209 }, { "epoch": 0.03276194933598549, "grad_norm": 0.30481112003326416, "learning_rate": 0.00019847451999183694, "loss": 1.5238, "step": 210 }, { "epoch": 0.0329179586185378, "grad_norm": 0.28382736444473267, "learning_rate": 0.00019845189529323475, "loss": 1.3224, "step": 211 }, { "epoch": 0.033073967901090115, "grad_norm": 0.2757686972618103, "learning_rate": 0.00019842910536030403, "loss": 1.3672, "step": 212 }, { "epoch": 0.03322997718364243, "grad_norm": 0.2743508219718933, "learning_rate": 0.00019840615023129372, "loss": 1.3628, "step": 213 }, { "epoch": 0.03338598646619474, "grad_norm": 0.26412197947502136, "learning_rate": 0.00019838302994472997, "loss": 1.141, "step": 214 }, { "epoch": 0.03354199574874705, "grad_norm": 0.2859683632850647, "learning_rate": 0.0001983597445394162, "loss": 1.1566, "step": 215 }, { "epoch": 0.03369800503129936, "grad_norm": 0.24881964921951294, "learning_rate": 0.00019833629405443284, "loss": 1.2038, "step": 216 }, { "epoch": 0.033854014313851676, "grad_norm": 0.25597479939460754, "learning_rate": 0.0001983126785291375, "loss": 0.9913, "step": 217 }, { "epoch": 0.03401002359640399, "grad_norm": 0.26771095395088196, "learning_rate": 0.00019828889800316466, "loss": 1.5417, "step": 218 }, { "epoch": 0.034166032878956296, "grad_norm": 0.2678371071815491, "learning_rate": 0.00019826495251642578, "loss": 1.208, "step": 219 }, { "epoch": 0.03432204216150861, "grad_norm": 0.2947763204574585, "learning_rate": 0.00019824084210910925, "loss": 1.3908, "step": 220 }, { "epoch": 0.03447805144406092, "grad_norm": 0.2821643650531769, "learning_rate": 0.00019821656682168012, "loss": 1.6573, "step": 221 }, { "epoch": 0.034634060726613236, "grad_norm": 0.24507346749305725, "learning_rate": 0.00019819212669488026, "loss": 1.0647, "step": 222 }, { "epoch": 0.03479007000916554, "grad_norm": 0.2718466520309448, "learning_rate": 0.00019816752176972813, "loss": 1.3013, "step": 223 }, { "epoch": 0.034946079291717856, "grad_norm": 0.2902746796607971, "learning_rate": 0.0001981427520875188, "loss": 1.2212, "step": 224 }, { "epoch": 0.03510208857427017, "grad_norm": 0.25822389125823975, "learning_rate": 0.0001981178176898239, "loss": 1.4543, "step": 225 }, { "epoch": 0.03525809785682248, "grad_norm": 0.3506292700767517, "learning_rate": 0.00019809271861849145, "loss": 1.8549, "step": 226 }, { "epoch": 0.03541410713937479, "grad_norm": 0.2610777020454407, "learning_rate": 0.00019806745491564586, "loss": 1.3161, "step": 227 }, { "epoch": 0.0355701164219271, "grad_norm": 0.29803603887557983, "learning_rate": 0.0001980420266236878, "loss": 1.2983, "step": 228 }, { "epoch": 0.03572612570447942, "grad_norm": 0.24572676420211792, "learning_rate": 0.0001980164337852943, "loss": 1.291, "step": 229 }, { "epoch": 0.03588213498703173, "grad_norm": 0.25573092699050903, "learning_rate": 0.00019799067644341844, "loss": 1.3207, "step": 230 }, { "epoch": 0.036038144269584044, "grad_norm": 0.28766271471977234, "learning_rate": 0.00019796475464128942, "loss": 1.4527, "step": 231 }, { "epoch": 0.03619415355213635, "grad_norm": 0.2636454701423645, "learning_rate": 0.00019793866842241243, "loss": 1.3899, "step": 232 }, { "epoch": 0.036350162834688664, "grad_norm": 0.3094368577003479, "learning_rate": 0.00019791241783056874, "loss": 1.2935, "step": 233 }, { "epoch": 0.03650617211724098, "grad_norm": 0.2588469088077545, "learning_rate": 0.00019788600290981525, "loss": 1.2457, "step": 234 }, { "epoch": 0.03666218139979329, "grad_norm": 0.26457706093788147, "learning_rate": 0.0001978594237044849, "loss": 1.1753, "step": 235 }, { "epoch": 0.0368181906823456, "grad_norm": 0.2559141516685486, "learning_rate": 0.0001978326802591862, "loss": 1.2004, "step": 236 }, { "epoch": 0.03697419996489791, "grad_norm": 0.2815738320350647, "learning_rate": 0.00019780577261880336, "loss": 1.3706, "step": 237 }, { "epoch": 0.037130209247450224, "grad_norm": 0.2584588825702667, "learning_rate": 0.0001977787008284962, "loss": 1.4192, "step": 238 }, { "epoch": 0.03728621853000254, "grad_norm": 0.290865421295166, "learning_rate": 0.00019775146493369994, "loss": 1.2308, "step": 239 }, { "epoch": 0.037442227812554844, "grad_norm": 0.2788088023662567, "learning_rate": 0.0001977240649801253, "loss": 1.2095, "step": 240 }, { "epoch": 0.03759823709510716, "grad_norm": 0.28903988003730774, "learning_rate": 0.00019769650101375837, "loss": 1.5138, "step": 241 }, { "epoch": 0.03775424637765947, "grad_norm": 0.29985305666923523, "learning_rate": 0.00019766877308086036, "loss": 1.4594, "step": 242 }, { "epoch": 0.037910255660211785, "grad_norm": 0.3033303916454315, "learning_rate": 0.00019764088122796783, "loss": 1.6108, "step": 243 }, { "epoch": 0.03806626494276409, "grad_norm": 0.2854767143726349, "learning_rate": 0.0001976128255018924, "loss": 1.377, "step": 244 }, { "epoch": 0.038222274225316405, "grad_norm": 0.30725011229515076, "learning_rate": 0.00019758460594972068, "loss": 1.2651, "step": 245 }, { "epoch": 0.03837828350786872, "grad_norm": 0.28218191862106323, "learning_rate": 0.00019755622261881427, "loss": 1.4354, "step": 246 }, { "epoch": 0.03853429279042103, "grad_norm": 0.2794611155986786, "learning_rate": 0.00019752767555680968, "loss": 1.4666, "step": 247 }, { "epoch": 0.03869030207297334, "grad_norm": 0.2824796736240387, "learning_rate": 0.00019749896481161808, "loss": 1.3645, "step": 248 }, { "epoch": 0.03884631135552565, "grad_norm": 0.26165372133255005, "learning_rate": 0.00019747009043142555, "loss": 1.3445, "step": 249 }, { "epoch": 0.039002320638077966, "grad_norm": 0.29985979199409485, "learning_rate": 0.00019744105246469263, "loss": 1.4558, "step": 250 }, { "epoch": 0.03915832992063028, "grad_norm": 0.25439903140068054, "learning_rate": 0.00019741185096015448, "loss": 1.1075, "step": 251 }, { "epoch": 0.03931433920318259, "grad_norm": 0.2533755898475647, "learning_rate": 0.00019738248596682078, "loss": 1.0891, "step": 252 }, { "epoch": 0.0394703484857349, "grad_norm": 0.27487608790397644, "learning_rate": 0.0001973529575339755, "loss": 1.3128, "step": 253 }, { "epoch": 0.03962635776828721, "grad_norm": 0.27824172377586365, "learning_rate": 0.00019732326571117703, "loss": 1.4045, "step": 254 }, { "epoch": 0.039782367050839526, "grad_norm": 0.27959418296813965, "learning_rate": 0.00019729341054825782, "loss": 1.2169, "step": 255 }, { "epoch": 0.03993837633339184, "grad_norm": 0.3103275001049042, "learning_rate": 0.00019726339209532462, "loss": 1.3043, "step": 256 }, { "epoch": 0.040094385615944146, "grad_norm": 0.2712806463241577, "learning_rate": 0.00019723321040275815, "loss": 1.1747, "step": 257 }, { "epoch": 0.04025039489849646, "grad_norm": 0.2961602210998535, "learning_rate": 0.0001972028655212131, "loss": 1.5744, "step": 258 }, { "epoch": 0.04040640418104877, "grad_norm": 0.2686194181442261, "learning_rate": 0.00019717235750161806, "loss": 1.2442, "step": 259 }, { "epoch": 0.04056241346360109, "grad_norm": 0.2742723822593689, "learning_rate": 0.00019714168639517544, "loss": 1.3225, "step": 260 }, { "epoch": 0.04071842274615339, "grad_norm": 0.28742754459381104, "learning_rate": 0.00019711085225336132, "loss": 1.3711, "step": 261 }, { "epoch": 0.04087443202870571, "grad_norm": 0.30374589562416077, "learning_rate": 0.00019707985512792543, "loss": 1.215, "step": 262 }, { "epoch": 0.04103044131125802, "grad_norm": 0.2738686800003052, "learning_rate": 0.00019704869507089105, "loss": 1.4628, "step": 263 }, { "epoch": 0.041186450593810334, "grad_norm": 0.2695278823375702, "learning_rate": 0.0001970173721345549, "loss": 1.4632, "step": 264 }, { "epoch": 0.04134245987636264, "grad_norm": 0.2954547107219696, "learning_rate": 0.00019698588637148703, "loss": 1.2785, "step": 265 }, { "epoch": 0.041498469158914954, "grad_norm": 0.2756305932998657, "learning_rate": 0.00019695423783453088, "loss": 1.4258, "step": 266 }, { "epoch": 0.04165447844146727, "grad_norm": 0.2642769515514374, "learning_rate": 0.00019692242657680286, "loss": 1.3034, "step": 267 }, { "epoch": 0.04181048772401958, "grad_norm": 0.2760365307331085, "learning_rate": 0.00019689045265169273, "loss": 1.5845, "step": 268 }, { "epoch": 0.041966497006571894, "grad_norm": 0.23845522105693817, "learning_rate": 0.0001968583161128631, "loss": 1.113, "step": 269 }, { "epoch": 0.0421225062891242, "grad_norm": 0.2855961322784424, "learning_rate": 0.0001968260170142496, "loss": 1.4019, "step": 270 }, { "epoch": 0.042278515571676514, "grad_norm": 0.26462671160697937, "learning_rate": 0.00019679355541006054, "loss": 1.2425, "step": 271 }, { "epoch": 0.04243452485422883, "grad_norm": 0.28468820452690125, "learning_rate": 0.00019676093135477713, "loss": 1.6525, "step": 272 }, { "epoch": 0.04259053413678114, "grad_norm": 0.3233076333999634, "learning_rate": 0.0001967281449031531, "loss": 1.2168, "step": 273 }, { "epoch": 0.04274654341933345, "grad_norm": 0.2688952684402466, "learning_rate": 0.00019669519611021486, "loss": 1.3948, "step": 274 }, { "epoch": 0.04290255270188576, "grad_norm": 0.25911059975624084, "learning_rate": 0.00019666208503126112, "loss": 1.2875, "step": 275 }, { "epoch": 0.043058561984438075, "grad_norm": 0.2789272964000702, "learning_rate": 0.00019662881172186313, "loss": 1.257, "step": 276 }, { "epoch": 0.04321457126699039, "grad_norm": 0.26854726672172546, "learning_rate": 0.00019659537623786428, "loss": 1.4554, "step": 277 }, { "epoch": 0.043370580549542695, "grad_norm": 0.31813284754753113, "learning_rate": 0.00019656177863538026, "loss": 1.667, "step": 278 }, { "epoch": 0.04352658983209501, "grad_norm": 0.2801772356033325, "learning_rate": 0.00019652801897079869, "loss": 1.4555, "step": 279 }, { "epoch": 0.04368259911464732, "grad_norm": 0.30256757140159607, "learning_rate": 0.00019649409730077935, "loss": 1.2628, "step": 280 }, { "epoch": 0.043838608397199635, "grad_norm": 0.2807087302207947, "learning_rate": 0.00019646001368225382, "loss": 1.5143, "step": 281 }, { "epoch": 0.04399461767975194, "grad_norm": 0.27217531204223633, "learning_rate": 0.0001964257681724255, "loss": 1.5372, "step": 282 }, { "epoch": 0.044150626962304255, "grad_norm": 0.2996511459350586, "learning_rate": 0.00019639136082876953, "loss": 1.2692, "step": 283 }, { "epoch": 0.04430663624485657, "grad_norm": 0.263231098651886, "learning_rate": 0.00019635679170903258, "loss": 1.2328, "step": 284 }, { "epoch": 0.04446264552740888, "grad_norm": 0.3060413897037506, "learning_rate": 0.00019632206087123296, "loss": 1.5173, "step": 285 }, { "epoch": 0.044618654809961196, "grad_norm": 0.25136467814445496, "learning_rate": 0.00019628716837366027, "loss": 1.1781, "step": 286 }, { "epoch": 0.0447746640925135, "grad_norm": 0.27105534076690674, "learning_rate": 0.00019625211427487548, "loss": 1.4542, "step": 287 }, { "epoch": 0.044930673375065816, "grad_norm": 0.27552956342697144, "learning_rate": 0.00019621689863371083, "loss": 1.3352, "step": 288 }, { "epoch": 0.04508668265761813, "grad_norm": 0.26462072134017944, "learning_rate": 0.00019618152150926955, "loss": 1.2531, "step": 289 }, { "epoch": 0.04524269194017044, "grad_norm": 0.2736480236053467, "learning_rate": 0.000196145982960926, "loss": 1.402, "step": 290 }, { "epoch": 0.04539870122272275, "grad_norm": 0.2739974856376648, "learning_rate": 0.00019611028304832546, "loss": 1.4881, "step": 291 }, { "epoch": 0.04555471050527506, "grad_norm": 0.25353673100471497, "learning_rate": 0.000196074421831384, "loss": 1.3935, "step": 292 }, { "epoch": 0.04571071978782738, "grad_norm": 0.2595098614692688, "learning_rate": 0.00019603839937028838, "loss": 1.3306, "step": 293 }, { "epoch": 0.04586672907037969, "grad_norm": 0.27779051661491394, "learning_rate": 0.00019600221572549606, "loss": 1.5111, "step": 294 }, { "epoch": 0.046022738352932, "grad_norm": 0.26458942890167236, "learning_rate": 0.00019596587095773495, "loss": 1.1354, "step": 295 }, { "epoch": 0.04617874763548431, "grad_norm": 0.3711000084877014, "learning_rate": 0.00019592936512800342, "loss": 1.387, "step": 296 }, { "epoch": 0.046334756918036624, "grad_norm": 0.26172423362731934, "learning_rate": 0.00019589269829757008, "loss": 1.1995, "step": 297 }, { "epoch": 0.04649076620058894, "grad_norm": 0.30684447288513184, "learning_rate": 0.00019585587052797389, "loss": 1.2853, "step": 298 }, { "epoch": 0.046646775483141244, "grad_norm": 0.27383920550346375, "learning_rate": 0.00019581888188102375, "loss": 1.1397, "step": 299 }, { "epoch": 0.04680278476569356, "grad_norm": 0.28926682472229004, "learning_rate": 0.00019578173241879872, "loss": 1.2977, "step": 300 }, { "epoch": 0.04695879404824587, "grad_norm": 0.2573678195476532, "learning_rate": 0.00019574442220364767, "loss": 1.315, "step": 301 }, { "epoch": 0.047114803330798184, "grad_norm": 0.286785751581192, "learning_rate": 0.00019570695129818926, "loss": 1.196, "step": 302 }, { "epoch": 0.0472708126133505, "grad_norm": 0.26392433047294617, "learning_rate": 0.0001956693197653119, "loss": 1.067, "step": 303 }, { "epoch": 0.047426821895902804, "grad_norm": 0.29351645708084106, "learning_rate": 0.00019563152766817354, "loss": 1.2977, "step": 304 }, { "epoch": 0.04758283117845512, "grad_norm": 0.3556276857852936, "learning_rate": 0.00019559357507020162, "loss": 1.1268, "step": 305 }, { "epoch": 0.04773884046100743, "grad_norm": 0.3044413924217224, "learning_rate": 0.00019555546203509297, "loss": 1.3528, "step": 306 }, { "epoch": 0.047894849743559745, "grad_norm": 0.25455671548843384, "learning_rate": 0.00019551718862681364, "loss": 1.2099, "step": 307 }, { "epoch": 0.04805085902611205, "grad_norm": 0.2863021492958069, "learning_rate": 0.00019547875490959885, "loss": 1.514, "step": 308 }, { "epoch": 0.048206868308664365, "grad_norm": 0.2713131010532379, "learning_rate": 0.00019544016094795295, "loss": 1.2479, "step": 309 }, { "epoch": 0.04836287759121668, "grad_norm": 0.28673309087753296, "learning_rate": 0.00019540140680664913, "loss": 1.4822, "step": 310 }, { "epoch": 0.04851888687376899, "grad_norm": 0.28506314754486084, "learning_rate": 0.00019536249255072948, "loss": 1.1714, "step": 311 }, { "epoch": 0.0486748961563213, "grad_norm": 0.2814370393753052, "learning_rate": 0.00019532341824550479, "loss": 1.3045, "step": 312 }, { "epoch": 0.04883090543887361, "grad_norm": 0.2505611181259155, "learning_rate": 0.0001952841839565544, "loss": 1.1565, "step": 313 }, { "epoch": 0.048986914721425925, "grad_norm": 0.27159830927848816, "learning_rate": 0.0001952447897497263, "loss": 1.0939, "step": 314 }, { "epoch": 0.04914292400397824, "grad_norm": 0.27552008628845215, "learning_rate": 0.00019520523569113677, "loss": 1.4382, "step": 315 }, { "epoch": 0.049298933286530545, "grad_norm": 0.2567708492279053, "learning_rate": 0.00019516552184717037, "loss": 1.2241, "step": 316 }, { "epoch": 0.04945494256908286, "grad_norm": 0.27663713693618774, "learning_rate": 0.00019512564828447988, "loss": 1.2449, "step": 317 }, { "epoch": 0.04961095185163517, "grad_norm": 0.2683660089969635, "learning_rate": 0.0001950856150699861, "loss": 1.1652, "step": 318 }, { "epoch": 0.049766961134187486, "grad_norm": 0.25226572155952454, "learning_rate": 0.0001950454222708778, "loss": 1.1307, "step": 319 }, { "epoch": 0.0499229704167398, "grad_norm": 0.23380513489246368, "learning_rate": 0.0001950050699546116, "loss": 1.1257, "step": 320 }, { "epoch": 0.050078979699292106, "grad_norm": 0.2385280281305313, "learning_rate": 0.0001949645581889118, "loss": 0.9917, "step": 321 }, { "epoch": 0.05023498898184442, "grad_norm": 0.23746567964553833, "learning_rate": 0.00019492388704177036, "loss": 1.1364, "step": 322 }, { "epoch": 0.05039099826439673, "grad_norm": 0.27820831537246704, "learning_rate": 0.00019488305658144667, "loss": 1.3707, "step": 323 }, { "epoch": 0.050547007546949047, "grad_norm": 0.2663419544696808, "learning_rate": 0.00019484206687646753, "loss": 1.3662, "step": 324 }, { "epoch": 0.05070301682950135, "grad_norm": 0.27196773886680603, "learning_rate": 0.00019480091799562704, "loss": 1.2766, "step": 325 }, { "epoch": 0.05085902611205367, "grad_norm": 0.296779602766037, "learning_rate": 0.00019475961000798645, "loss": 1.5789, "step": 326 }, { "epoch": 0.05101503539460598, "grad_norm": 0.3267677128314972, "learning_rate": 0.0001947181429828739, "loss": 1.2782, "step": 327 }, { "epoch": 0.051171044677158294, "grad_norm": 0.2852894067764282, "learning_rate": 0.00019467651698988462, "loss": 1.1466, "step": 328 }, { "epoch": 0.0513270539597106, "grad_norm": 0.2959722876548767, "learning_rate": 0.0001946347320988806, "loss": 1.1929, "step": 329 }, { "epoch": 0.051483063242262914, "grad_norm": 0.25998443365097046, "learning_rate": 0.00019459278837999046, "loss": 1.4104, "step": 330 }, { "epoch": 0.05163907252481523, "grad_norm": 0.27319809794425964, "learning_rate": 0.00019455068590360942, "loss": 1.417, "step": 331 }, { "epoch": 0.05179508180736754, "grad_norm": 0.22395959496498108, "learning_rate": 0.00019450842474039913, "loss": 1.2159, "step": 332 }, { "epoch": 0.05195109108991985, "grad_norm": 0.24947980046272278, "learning_rate": 0.00019446600496128758, "loss": 1.1063, "step": 333 }, { "epoch": 0.05210710037247216, "grad_norm": 0.235429584980011, "learning_rate": 0.00019442342663746902, "loss": 1.2234, "step": 334 }, { "epoch": 0.052263109655024474, "grad_norm": 0.27443963289260864, "learning_rate": 0.00019438068984040365, "loss": 1.2038, "step": 335 }, { "epoch": 0.05241911893757679, "grad_norm": 0.26688772439956665, "learning_rate": 0.00019433779464181778, "loss": 1.2956, "step": 336 }, { "epoch": 0.052575128220129094, "grad_norm": 0.23804551362991333, "learning_rate": 0.00019429474111370352, "loss": 0.9525, "step": 337 }, { "epoch": 0.05273113750268141, "grad_norm": 0.262890487909317, "learning_rate": 0.0001942515293283187, "loss": 1.2713, "step": 338 }, { "epoch": 0.05288714678523372, "grad_norm": 0.29796820878982544, "learning_rate": 0.00019420815935818672, "loss": 1.5058, "step": 339 }, { "epoch": 0.053043156067786035, "grad_norm": 0.275143563747406, "learning_rate": 0.00019416463127609656, "loss": 1.2604, "step": 340 }, { "epoch": 0.05319916535033835, "grad_norm": 0.27801284193992615, "learning_rate": 0.00019412094515510248, "loss": 1.2588, "step": 341 }, { "epoch": 0.053355174632890655, "grad_norm": 0.2604374885559082, "learning_rate": 0.00019407710106852404, "loss": 1.1432, "step": 342 }, { "epoch": 0.05351118391544297, "grad_norm": 0.2863079011440277, "learning_rate": 0.00019403309908994586, "loss": 1.4854, "step": 343 }, { "epoch": 0.05366719319799528, "grad_norm": 0.2515758275985718, "learning_rate": 0.00019398893929321761, "loss": 1.1682, "step": 344 }, { "epoch": 0.053823202480547595, "grad_norm": 0.27037686109542847, "learning_rate": 0.00019394462175245381, "loss": 1.3679, "step": 345 }, { "epoch": 0.0539792117630999, "grad_norm": 0.2368054836988449, "learning_rate": 0.00019390014654203369, "loss": 1.1406, "step": 346 }, { "epoch": 0.054135221045652215, "grad_norm": 0.27759966254234314, "learning_rate": 0.0001938555137366011, "loss": 1.1669, "step": 347 }, { "epoch": 0.05429123032820453, "grad_norm": 0.3004835546016693, "learning_rate": 0.00019381072341106452, "loss": 1.4811, "step": 348 }, { "epoch": 0.05444723961075684, "grad_norm": 0.30656251311302185, "learning_rate": 0.0001937657756405966, "loss": 1.515, "step": 349 }, { "epoch": 0.05460324889330915, "grad_norm": 0.31442925333976746, "learning_rate": 0.00019372067050063438, "loss": 1.4848, "step": 350 }, { "epoch": 0.05475925817586146, "grad_norm": 0.2230207473039627, "learning_rate": 0.00019367540806687893, "loss": 0.9535, "step": 351 }, { "epoch": 0.054915267458413776, "grad_norm": 0.2552795708179474, "learning_rate": 0.0001936299884152954, "loss": 1.2254, "step": 352 }, { "epoch": 0.05507127674096609, "grad_norm": 0.29775241017341614, "learning_rate": 0.0001935844116221127, "loss": 1.3821, "step": 353 }, { "epoch": 0.055227286023518396, "grad_norm": 0.24480530619621277, "learning_rate": 0.00019353867776382354, "loss": 1.1073, "step": 354 }, { "epoch": 0.05538329530607071, "grad_norm": 0.2612270414829254, "learning_rate": 0.00019349278691718427, "loss": 1.3114, "step": 355 }, { "epoch": 0.05553930458862302, "grad_norm": 0.307085245847702, "learning_rate": 0.0001934467391592146, "loss": 1.3602, "step": 356 }, { "epoch": 0.055695313871175336, "grad_norm": 0.2688599228858948, "learning_rate": 0.00019340053456719768, "loss": 1.4347, "step": 357 }, { "epoch": 0.05585132315372765, "grad_norm": 0.25372791290283203, "learning_rate": 0.00019335417321867987, "loss": 1.3468, "step": 358 }, { "epoch": 0.05600733243627996, "grad_norm": 0.2706502377986908, "learning_rate": 0.0001933076551914706, "loss": 1.4489, "step": 359 }, { "epoch": 0.05616334171883227, "grad_norm": 0.22997525334358215, "learning_rate": 0.00019326098056364222, "loss": 1.1305, "step": 360 }, { "epoch": 0.056319351001384584, "grad_norm": 0.30573347210884094, "learning_rate": 0.00019321414941353003, "loss": 1.4231, "step": 361 }, { "epoch": 0.0564753602839369, "grad_norm": 0.30873847007751465, "learning_rate": 0.00019316716181973188, "loss": 1.3478, "step": 362 }, { "epoch": 0.056631369566489204, "grad_norm": 0.2514902651309967, "learning_rate": 0.00019312001786110828, "loss": 1.2094, "step": 363 }, { "epoch": 0.05678737884904152, "grad_norm": 0.26067742705345154, "learning_rate": 0.00019307271761678213, "loss": 1.5841, "step": 364 }, { "epoch": 0.05694338813159383, "grad_norm": 0.23508694767951965, "learning_rate": 0.00019302526116613864, "loss": 1.103, "step": 365 }, { "epoch": 0.057099397414146144, "grad_norm": 0.24878567457199097, "learning_rate": 0.00019297764858882514, "loss": 1.0968, "step": 366 }, { "epoch": 0.05725540669669845, "grad_norm": 0.23707476258277893, "learning_rate": 0.00019292987996475113, "loss": 1.0831, "step": 367 }, { "epoch": 0.057411415979250764, "grad_norm": 0.2691617012023926, "learning_rate": 0.0001928819553740878, "loss": 1.2254, "step": 368 }, { "epoch": 0.05756742526180308, "grad_norm": 0.26831138134002686, "learning_rate": 0.00019283387489726827, "loss": 1.3084, "step": 369 }, { "epoch": 0.05772343454435539, "grad_norm": 0.281770259141922, "learning_rate": 0.00019278563861498723, "loss": 1.3377, "step": 370 }, { "epoch": 0.0578794438269077, "grad_norm": 0.2634589970111847, "learning_rate": 0.00019273724660820088, "loss": 1.2648, "step": 371 }, { "epoch": 0.05803545310946001, "grad_norm": 0.27592259645462036, "learning_rate": 0.00019268869895812672, "loss": 1.2751, "step": 372 }, { "epoch": 0.058191462392012325, "grad_norm": 0.23107245564460754, "learning_rate": 0.00019263999574624355, "loss": 1.2651, "step": 373 }, { "epoch": 0.05834747167456464, "grad_norm": 0.2582552134990692, "learning_rate": 0.0001925911370542912, "loss": 1.4914, "step": 374 }, { "epoch": 0.05850348095711695, "grad_norm": 0.27152058482170105, "learning_rate": 0.00019254212296427044, "loss": 1.2227, "step": 375 }, { "epoch": 0.05865949023966926, "grad_norm": 0.23554329574108124, "learning_rate": 0.00019249295355844285, "loss": 1.4113, "step": 376 }, { "epoch": 0.05881549952222157, "grad_norm": 0.2793971300125122, "learning_rate": 0.00019244362891933077, "loss": 1.3325, "step": 377 }, { "epoch": 0.058971508804773885, "grad_norm": 0.2800885736942291, "learning_rate": 0.00019239414912971696, "loss": 1.358, "step": 378 }, { "epoch": 0.0591275180873262, "grad_norm": 0.27139201760292053, "learning_rate": 0.0001923445142726446, "loss": 1.2269, "step": 379 }, { "epoch": 0.059283527369878505, "grad_norm": 0.276579886674881, "learning_rate": 0.0001922947244314172, "loss": 1.1521, "step": 380 }, { "epoch": 0.05943953665243082, "grad_norm": 0.28917452692985535, "learning_rate": 0.0001922447796895982, "loss": 1.2803, "step": 381 }, { "epoch": 0.05959554593498313, "grad_norm": 0.28668197989463806, "learning_rate": 0.00019219468013101124, "loss": 1.4025, "step": 382 }, { "epoch": 0.059751555217535446, "grad_norm": 0.2973851263523102, "learning_rate": 0.00019214442583973966, "loss": 1.5472, "step": 383 }, { "epoch": 0.05990756450008775, "grad_norm": 0.25934460759162903, "learning_rate": 0.00019209401690012653, "loss": 1.2496, "step": 384 }, { "epoch": 0.060063573782640066, "grad_norm": 0.22885724902153015, "learning_rate": 0.00019204345339677442, "loss": 1.2088, "step": 385 }, { "epoch": 0.06021958306519238, "grad_norm": 0.28346025943756104, "learning_rate": 0.00019199273541454538, "loss": 1.1561, "step": 386 }, { "epoch": 0.06037559234774469, "grad_norm": 0.2574789822101593, "learning_rate": 0.00019194186303856067, "loss": 1.3209, "step": 387 }, { "epoch": 0.060531601630297, "grad_norm": 0.26535728573799133, "learning_rate": 0.00019189083635420075, "loss": 1.3022, "step": 388 }, { "epoch": 0.06068761091284931, "grad_norm": 0.2844642698764801, "learning_rate": 0.00019183965544710495, "loss": 1.3881, "step": 389 }, { "epoch": 0.060843620195401626, "grad_norm": 0.24562187492847443, "learning_rate": 0.00019178832040317155, "loss": 1.159, "step": 390 }, { "epoch": 0.06099962947795394, "grad_norm": 0.25778669118881226, "learning_rate": 0.0001917368313085574, "loss": 1.5154, "step": 391 }, { "epoch": 0.061155638760506253, "grad_norm": 0.22877171635627747, "learning_rate": 0.00019168518824967795, "loss": 1.201, "step": 392 }, { "epoch": 0.06131164804305856, "grad_norm": 0.2764502465724945, "learning_rate": 0.00019163339131320718, "loss": 1.4165, "step": 393 }, { "epoch": 0.061467657325610874, "grad_norm": 0.23493847250938416, "learning_rate": 0.00019158144058607708, "loss": 1.1334, "step": 394 }, { "epoch": 0.06162366660816319, "grad_norm": 0.2605098783969879, "learning_rate": 0.00019152933615547798, "loss": 1.1613, "step": 395 }, { "epoch": 0.0617796758907155, "grad_norm": 0.23720701038837433, "learning_rate": 0.000191477078108858, "loss": 1.1966, "step": 396 }, { "epoch": 0.06193568517326781, "grad_norm": 0.27043676376342773, "learning_rate": 0.00019142466653392318, "loss": 1.2793, "step": 397 }, { "epoch": 0.06209169445582012, "grad_norm": 0.27630025148391724, "learning_rate": 0.0001913721015186372, "loss": 1.3858, "step": 398 }, { "epoch": 0.062247703738372434, "grad_norm": 0.29454129934310913, "learning_rate": 0.0001913193831512213, "loss": 1.5234, "step": 399 }, { "epoch": 0.06240371302092475, "grad_norm": 0.26943233609199524, "learning_rate": 0.00019126651152015403, "loss": 1.3181, "step": 400 }, { "epoch": 0.06255972230347706, "grad_norm": 0.28831520676612854, "learning_rate": 0.0001912134867141712, "loss": 1.46, "step": 401 }, { "epoch": 0.06271573158602937, "grad_norm": 0.26342567801475525, "learning_rate": 0.0001911603088222657, "loss": 1.4073, "step": 402 }, { "epoch": 0.06287174086858167, "grad_norm": 0.2623300552368164, "learning_rate": 0.0001911069779336873, "loss": 1.3473, "step": 403 }, { "epoch": 0.06302775015113399, "grad_norm": 0.25125861167907715, "learning_rate": 0.00019105349413794272, "loss": 1.0346, "step": 404 }, { "epoch": 0.0631837594336863, "grad_norm": 0.30890092253685, "learning_rate": 0.00019099985752479506, "loss": 1.5751, "step": 405 }, { "epoch": 0.06333976871623861, "grad_norm": 0.31404733657836914, "learning_rate": 0.00019094606818426403, "loss": 1.5458, "step": 406 }, { "epoch": 0.06349577799879093, "grad_norm": 0.2684463858604431, "learning_rate": 0.00019089212620662568, "loss": 1.2342, "step": 407 }, { "epoch": 0.06365178728134324, "grad_norm": 0.2748461365699768, "learning_rate": 0.00019083803168241223, "loss": 1.3353, "step": 408 }, { "epoch": 0.06380779656389556, "grad_norm": 0.3061840832233429, "learning_rate": 0.00019078378470241183, "loss": 1.3197, "step": 409 }, { "epoch": 0.06396380584644787, "grad_norm": 0.25601011514663696, "learning_rate": 0.00019072938535766865, "loss": 1.3904, "step": 410 }, { "epoch": 0.06411981512900018, "grad_norm": 0.2844060957431793, "learning_rate": 0.00019067483373948243, "loss": 1.42, "step": 411 }, { "epoch": 0.06427582441155248, "grad_norm": 0.2969295382499695, "learning_rate": 0.00019062012993940859, "loss": 1.4255, "step": 412 }, { "epoch": 0.0644318336941048, "grad_norm": 0.2655050456523895, "learning_rate": 0.00019056527404925789, "loss": 1.1618, "step": 413 }, { "epoch": 0.06458784297665711, "grad_norm": 0.2571544349193573, "learning_rate": 0.00019051026616109638, "loss": 1.2064, "step": 414 }, { "epoch": 0.06474385225920942, "grad_norm": 0.29847028851509094, "learning_rate": 0.0001904551063672452, "loss": 1.2847, "step": 415 }, { "epoch": 0.06489986154176174, "grad_norm": 0.24265627562999725, "learning_rate": 0.00019039979476028043, "loss": 1.2745, "step": 416 }, { "epoch": 0.06505587082431405, "grad_norm": 0.24038730561733246, "learning_rate": 0.000190344331433033, "loss": 1.2761, "step": 417 }, { "epoch": 0.06521188010686636, "grad_norm": 0.26194193959236145, "learning_rate": 0.00019028871647858834, "loss": 1.5021, "step": 418 }, { "epoch": 0.06536788938941868, "grad_norm": 0.2636980712413788, "learning_rate": 0.00019023294999028653, "loss": 1.5029, "step": 419 }, { "epoch": 0.06552389867197098, "grad_norm": 0.26995277404785156, "learning_rate": 0.00019017703206172185, "loss": 1.3068, "step": 420 }, { "epoch": 0.06567990795452329, "grad_norm": 0.26835623383522034, "learning_rate": 0.0001901209627867428, "loss": 1.2868, "step": 421 }, { "epoch": 0.0658359172370756, "grad_norm": 0.24785400927066803, "learning_rate": 0.0001900647422594519, "loss": 1.1875, "step": 422 }, { "epoch": 0.06599192651962792, "grad_norm": 0.3184250593185425, "learning_rate": 0.0001900083705742054, "loss": 1.3802, "step": 423 }, { "epoch": 0.06614793580218023, "grad_norm": 0.2850029766559601, "learning_rate": 0.00018995184782561345, "loss": 1.3043, "step": 424 }, { "epoch": 0.06630394508473254, "grad_norm": 0.2940841317176819, "learning_rate": 0.00018989517410853955, "loss": 1.287, "step": 425 }, { "epoch": 0.06645995436728486, "grad_norm": 0.2668844163417816, "learning_rate": 0.0001898383495181007, "loss": 1.3723, "step": 426 }, { "epoch": 0.06661596364983717, "grad_norm": 0.2814147472381592, "learning_rate": 0.00018978137414966698, "loss": 1.2339, "step": 427 }, { "epoch": 0.06677197293238948, "grad_norm": 0.3722403049468994, "learning_rate": 0.0001897242480988617, "loss": 1.2755, "step": 428 }, { "epoch": 0.06692798221494178, "grad_norm": 0.2689428925514221, "learning_rate": 0.00018966697146156092, "loss": 1.4238, "step": 429 }, { "epoch": 0.0670839914974941, "grad_norm": 0.29616808891296387, "learning_rate": 0.00018960954433389345, "loss": 1.3167, "step": 430 }, { "epoch": 0.06724000078004641, "grad_norm": 0.2477925419807434, "learning_rate": 0.0001895519668122408, "loss": 1.1773, "step": 431 }, { "epoch": 0.06739601006259872, "grad_norm": 0.23961544036865234, "learning_rate": 0.0001894942389932367, "loss": 1.1387, "step": 432 }, { "epoch": 0.06755201934515104, "grad_norm": 0.26128751039505005, "learning_rate": 0.00018943636097376726, "loss": 1.0468, "step": 433 }, { "epoch": 0.06770802862770335, "grad_norm": 0.33279022574424744, "learning_rate": 0.00018937833285097066, "loss": 1.8791, "step": 434 }, { "epoch": 0.06786403791025566, "grad_norm": 0.2876769006252289, "learning_rate": 0.00018932015472223693, "loss": 1.3633, "step": 435 }, { "epoch": 0.06802004719280798, "grad_norm": 0.24108922481536865, "learning_rate": 0.00018926182668520792, "loss": 1.2012, "step": 436 }, { "epoch": 0.06817605647536028, "grad_norm": 0.29062169790267944, "learning_rate": 0.0001892033488377771, "loss": 1.3658, "step": 437 }, { "epoch": 0.06833206575791259, "grad_norm": 0.26536259055137634, "learning_rate": 0.0001891447212780893, "loss": 1.2464, "step": 438 }, { "epoch": 0.0684880750404649, "grad_norm": 0.2940811514854431, "learning_rate": 0.0001890859441045407, "loss": 1.4609, "step": 439 }, { "epoch": 0.06864408432301722, "grad_norm": 0.27625903487205505, "learning_rate": 0.0001890270174157784, "loss": 1.4098, "step": 440 }, { "epoch": 0.06880009360556953, "grad_norm": 0.2586573362350464, "learning_rate": 0.00018896794131070073, "loss": 1.3857, "step": 441 }, { "epoch": 0.06895610288812185, "grad_norm": 0.28287774324417114, "learning_rate": 0.0001889087158884565, "loss": 1.2967, "step": 442 }, { "epoch": 0.06911211217067416, "grad_norm": 0.2692122459411621, "learning_rate": 0.00018884934124844532, "loss": 1.5216, "step": 443 }, { "epoch": 0.06926812145322647, "grad_norm": 0.3004090189933777, "learning_rate": 0.00018878981749031716, "loss": 1.1913, "step": 444 }, { "epoch": 0.06942413073577879, "grad_norm": 0.253542423248291, "learning_rate": 0.00018873014471397224, "loss": 1.1299, "step": 445 }, { "epoch": 0.06958014001833109, "grad_norm": 0.3034575283527374, "learning_rate": 0.00018867032301956088, "loss": 1.3577, "step": 446 }, { "epoch": 0.0697361493008834, "grad_norm": 0.31302767992019653, "learning_rate": 0.00018861035250748343, "loss": 1.6029, "step": 447 }, { "epoch": 0.06989215858343571, "grad_norm": 0.26993393898010254, "learning_rate": 0.00018855023327838983, "loss": 1.2035, "step": 448 }, { "epoch": 0.07004816786598803, "grad_norm": 0.27148422598838806, "learning_rate": 0.00018848996543317982, "loss": 1.5843, "step": 449 }, { "epoch": 0.07020417714854034, "grad_norm": 0.2631765305995941, "learning_rate": 0.00018842954907300236, "loss": 1.2641, "step": 450 }, { "epoch": 0.07036018643109265, "grad_norm": 0.2621013820171356, "learning_rate": 0.00018836898429925585, "loss": 1.2167, "step": 451 }, { "epoch": 0.07051619571364497, "grad_norm": 0.25064215064048767, "learning_rate": 0.0001883082712135877, "loss": 1.2631, "step": 452 }, { "epoch": 0.07067220499619728, "grad_norm": 0.2558056712150574, "learning_rate": 0.00018824740991789415, "loss": 0.9964, "step": 453 }, { "epoch": 0.07082821427874958, "grad_norm": 0.2675093412399292, "learning_rate": 0.00018818640051432035, "loss": 1.4953, "step": 454 }, { "epoch": 0.07098422356130189, "grad_norm": 0.2550821006298065, "learning_rate": 0.0001881252431052599, "loss": 1.1283, "step": 455 }, { "epoch": 0.0711402328438542, "grad_norm": 0.24893717467784882, "learning_rate": 0.00018806393779335483, "loss": 1.1725, "step": 456 }, { "epoch": 0.07129624212640652, "grad_norm": 0.24471914768218994, "learning_rate": 0.00018800248468149543, "loss": 1.19, "step": 457 }, { "epoch": 0.07145225140895883, "grad_norm": 0.27745166420936584, "learning_rate": 0.00018794088387282, "loss": 1.6347, "step": 458 }, { "epoch": 0.07160826069151115, "grad_norm": 0.2930917739868164, "learning_rate": 0.00018787913547071484, "loss": 1.5139, "step": 459 }, { "epoch": 0.07176426997406346, "grad_norm": 0.2656380534172058, "learning_rate": 0.00018781723957881372, "loss": 1.1726, "step": 460 }, { "epoch": 0.07192027925661577, "grad_norm": 0.27983731031417847, "learning_rate": 0.0001877551963009982, "loss": 1.3818, "step": 461 }, { "epoch": 0.07207628853916809, "grad_norm": 0.2744976580142975, "learning_rate": 0.0001876930057413971, "loss": 1.2756, "step": 462 }, { "epoch": 0.07223229782172039, "grad_norm": 0.2684760093688965, "learning_rate": 0.00018763066800438636, "loss": 1.2302, "step": 463 }, { "epoch": 0.0723883071042727, "grad_norm": 0.25079357624053955, "learning_rate": 0.00018756818319458907, "loss": 1.1575, "step": 464 }, { "epoch": 0.07254431638682501, "grad_norm": 0.2802796959877014, "learning_rate": 0.000187505551416875, "loss": 1.3711, "step": 465 }, { "epoch": 0.07270032566937733, "grad_norm": 0.7640414237976074, "learning_rate": 0.0001874427727763607, "loss": 1.3431, "step": 466 }, { "epoch": 0.07285633495192964, "grad_norm": 0.265717089176178, "learning_rate": 0.0001873798473784092, "loss": 1.1778, "step": 467 }, { "epoch": 0.07301234423448195, "grad_norm": 0.23273074626922607, "learning_rate": 0.00018731677532862976, "loss": 1.02, "step": 468 }, { "epoch": 0.07316835351703427, "grad_norm": 0.248812735080719, "learning_rate": 0.00018725355673287778, "loss": 1.1423, "step": 469 }, { "epoch": 0.07332436279958658, "grad_norm": 0.24919858574867249, "learning_rate": 0.00018719019169725472, "loss": 1.2377, "step": 470 }, { "epoch": 0.07348037208213888, "grad_norm": 0.25503799319267273, "learning_rate": 0.00018712668032810768, "loss": 1.3236, "step": 471 }, { "epoch": 0.0736363813646912, "grad_norm": 0.28893566131591797, "learning_rate": 0.00018706302273202943, "loss": 1.4662, "step": 472 }, { "epoch": 0.07379239064724351, "grad_norm": 0.2384706735610962, "learning_rate": 0.00018699921901585813, "loss": 1.2817, "step": 473 }, { "epoch": 0.07394839992979582, "grad_norm": 0.2527397572994232, "learning_rate": 0.0001869352692866772, "loss": 1.1766, "step": 474 }, { "epoch": 0.07410440921234814, "grad_norm": 0.25340378284454346, "learning_rate": 0.00018687117365181512, "loss": 1.1876, "step": 475 }, { "epoch": 0.07426041849490045, "grad_norm": 0.2570219039916992, "learning_rate": 0.00018680693221884517, "loss": 1.3472, "step": 476 }, { "epoch": 0.07441642777745276, "grad_norm": 0.25267085433006287, "learning_rate": 0.00018674254509558544, "loss": 1.5048, "step": 477 }, { "epoch": 0.07457243706000508, "grad_norm": 0.24603790044784546, "learning_rate": 0.00018667801239009846, "loss": 1.276, "step": 478 }, { "epoch": 0.07472844634255738, "grad_norm": 0.2434520423412323, "learning_rate": 0.00018661333421069113, "loss": 1.3999, "step": 479 }, { "epoch": 0.07488445562510969, "grad_norm": 0.27032792568206787, "learning_rate": 0.00018654851066591448, "loss": 1.3909, "step": 480 }, { "epoch": 0.075040464907662, "grad_norm": 0.26559844613075256, "learning_rate": 0.00018648354186456348, "loss": 1.2931, "step": 481 }, { "epoch": 0.07519647419021432, "grad_norm": 0.2563202679157257, "learning_rate": 0.000186418427915677, "loss": 1.2773, "step": 482 }, { "epoch": 0.07535248347276663, "grad_norm": 0.2463751882314682, "learning_rate": 0.00018635316892853741, "loss": 1.4017, "step": 483 }, { "epoch": 0.07550849275531894, "grad_norm": 0.26452189683914185, "learning_rate": 0.00018628776501267052, "loss": 1.2236, "step": 484 }, { "epoch": 0.07566450203787126, "grad_norm": 0.48540955781936646, "learning_rate": 0.0001862222162778454, "loss": 1.1676, "step": 485 }, { "epoch": 0.07582051132042357, "grad_norm": 0.2931404411792755, "learning_rate": 0.0001861565228340742, "loss": 1.3877, "step": 486 }, { "epoch": 0.07597652060297588, "grad_norm": 0.2707270383834839, "learning_rate": 0.00018609068479161182, "loss": 1.2828, "step": 487 }, { "epoch": 0.07613252988552818, "grad_norm": 0.25902295112609863, "learning_rate": 0.00018602470226095603, "loss": 1.2393, "step": 488 }, { "epoch": 0.0762885391680805, "grad_norm": 0.27907291054725647, "learning_rate": 0.00018595857535284692, "loss": 1.1944, "step": 489 }, { "epoch": 0.07644454845063281, "grad_norm": 0.3079850375652313, "learning_rate": 0.00018589230417826697, "loss": 1.3686, "step": 490 }, { "epoch": 0.07660055773318512, "grad_norm": 0.250303715467453, "learning_rate": 0.00018582588884844084, "loss": 1.2497, "step": 491 }, { "epoch": 0.07675656701573744, "grad_norm": 0.260257750749588, "learning_rate": 0.00018575932947483502, "loss": 1.4186, "step": 492 }, { "epoch": 0.07691257629828975, "grad_norm": 0.2537723481655121, "learning_rate": 0.00018569262616915784, "loss": 1.28, "step": 493 }, { "epoch": 0.07706858558084206, "grad_norm": 0.21861004829406738, "learning_rate": 0.00018562577904335912, "loss": 0.9705, "step": 494 }, { "epoch": 0.07722459486339438, "grad_norm": 0.322566956281662, "learning_rate": 0.00018555878820963013, "loss": 1.4941, "step": 495 }, { "epoch": 0.07738060414594668, "grad_norm": 0.24904873967170715, "learning_rate": 0.00018549165378040327, "loss": 1.2277, "step": 496 }, { "epoch": 0.07753661342849899, "grad_norm": 0.2692057490348816, "learning_rate": 0.00018542437586835202, "loss": 1.3786, "step": 497 }, { "epoch": 0.0776926227110513, "grad_norm": 0.27876508235931396, "learning_rate": 0.00018535695458639056, "loss": 1.3822, "step": 498 }, { "epoch": 0.07784863199360362, "grad_norm": 0.2497495859861374, "learning_rate": 0.00018528939004767376, "loss": 1.1872, "step": 499 }, { "epoch": 0.07800464127615593, "grad_norm": 0.28155678510665894, "learning_rate": 0.00018522168236559695, "loss": 1.2253, "step": 500 }, { "epoch": 0.07800464127615593, "eval_loss": 1.3168833255767822, "eval_runtime": 110.9584, "eval_samples_per_second": 38.51, "eval_steps_per_second": 4.822, "step": 500 }, { "epoch": 0.07816065055870824, "grad_norm": 0.25162461400032043, "learning_rate": 0.0001851538316537956, "loss": 1.2308, "step": 501 }, { "epoch": 0.07831665984126056, "grad_norm": 0.33541133999824524, "learning_rate": 0.0001850858380261453, "loss": 1.2788, "step": 502 }, { "epoch": 0.07847266912381287, "grad_norm": 0.29069721698760986, "learning_rate": 0.00018501770159676156, "loss": 1.4186, "step": 503 }, { "epoch": 0.07862867840636519, "grad_norm": 0.24337412416934967, "learning_rate": 0.0001849494224799994, "loss": 1.2268, "step": 504 }, { "epoch": 0.07878468768891748, "grad_norm": 0.2503622770309448, "learning_rate": 0.00018488100079045344, "loss": 1.1121, "step": 505 }, { "epoch": 0.0789406969714698, "grad_norm": 0.3061240017414093, "learning_rate": 0.0001848124366429576, "loss": 1.4207, "step": 506 }, { "epoch": 0.07909670625402211, "grad_norm": 0.3209320902824402, "learning_rate": 0.00018474373015258473, "loss": 1.3531, "step": 507 }, { "epoch": 0.07925271553657443, "grad_norm": 0.26510298252105713, "learning_rate": 0.0001846748814346468, "loss": 1.1614, "step": 508 }, { "epoch": 0.07940872481912674, "grad_norm": 0.24753335118293762, "learning_rate": 0.00018460589060469425, "loss": 1.2711, "step": 509 }, { "epoch": 0.07956473410167905, "grad_norm": 0.2837298512458801, "learning_rate": 0.00018453675777851627, "loss": 1.2325, "step": 510 }, { "epoch": 0.07972074338423137, "grad_norm": 0.30447372794151306, "learning_rate": 0.00018446748307214019, "loss": 1.2425, "step": 511 }, { "epoch": 0.07987675266678368, "grad_norm": 0.27281391620635986, "learning_rate": 0.0001843980666018315, "loss": 1.3095, "step": 512 }, { "epoch": 0.08003276194933598, "grad_norm": 0.27750325202941895, "learning_rate": 0.00018432850848409363, "loss": 1.5124, "step": 513 }, { "epoch": 0.08018877123188829, "grad_norm": 0.32551145553588867, "learning_rate": 0.00018425880883566782, "loss": 1.5727, "step": 514 }, { "epoch": 0.0803447805144406, "grad_norm": 0.29455453157424927, "learning_rate": 0.0001841889677735327, "loss": 1.1937, "step": 515 }, { "epoch": 0.08050078979699292, "grad_norm": 0.271435022354126, "learning_rate": 0.00018411898541490434, "loss": 1.3523, "step": 516 }, { "epoch": 0.08065679907954523, "grad_norm": 0.28192776441574097, "learning_rate": 0.0001840488618772359, "loss": 1.4196, "step": 517 }, { "epoch": 0.08081280836209755, "grad_norm": 0.32622769474983215, "learning_rate": 0.00018397859727821748, "loss": 1.3939, "step": 518 }, { "epoch": 0.08096881764464986, "grad_norm": 0.26916465163230896, "learning_rate": 0.00018390819173577598, "loss": 1.315, "step": 519 }, { "epoch": 0.08112482692720217, "grad_norm": 0.2807716429233551, "learning_rate": 0.00018383764536807485, "loss": 1.4009, "step": 520 }, { "epoch": 0.08128083620975449, "grad_norm": 0.2609405517578125, "learning_rate": 0.00018376695829351377, "loss": 0.9599, "step": 521 }, { "epoch": 0.08143684549230679, "grad_norm": 0.27300071716308594, "learning_rate": 0.00018369613063072874, "loss": 1.2349, "step": 522 }, { "epoch": 0.0815928547748591, "grad_norm": 0.26670917868614197, "learning_rate": 0.00018362516249859163, "loss": 1.2895, "step": 523 }, { "epoch": 0.08174886405741141, "grad_norm": 0.2805304527282715, "learning_rate": 0.00018355405401621001, "loss": 1.3661, "step": 524 }, { "epoch": 0.08190487333996373, "grad_norm": 0.25124502182006836, "learning_rate": 0.00018348280530292713, "loss": 1.2215, "step": 525 }, { "epoch": 0.08206088262251604, "grad_norm": 0.2374117225408554, "learning_rate": 0.00018341141647832147, "loss": 1.1662, "step": 526 }, { "epoch": 0.08221689190506835, "grad_norm": 0.2681942582130432, "learning_rate": 0.00018333988766220676, "loss": 1.3256, "step": 527 }, { "epoch": 0.08237290118762067, "grad_norm": 0.26264506578445435, "learning_rate": 0.0001832682189746316, "loss": 1.1417, "step": 528 }, { "epoch": 0.08252891047017298, "grad_norm": 0.2661115527153015, "learning_rate": 0.00018319641053587938, "loss": 1.2202, "step": 529 }, { "epoch": 0.08268491975272528, "grad_norm": 0.23459146916866302, "learning_rate": 0.0001831244624664681, "loss": 1.0511, "step": 530 }, { "epoch": 0.0828409290352776, "grad_norm": 0.31903690099716187, "learning_rate": 0.00018305237488714995, "loss": 1.565, "step": 531 }, { "epoch": 0.08299693831782991, "grad_norm": 0.28528186678886414, "learning_rate": 0.00018298014791891137, "loss": 1.5023, "step": 532 }, { "epoch": 0.08315294760038222, "grad_norm": 0.2572003901004791, "learning_rate": 0.00018290778168297277, "loss": 1.1518, "step": 533 }, { "epoch": 0.08330895688293453, "grad_norm": 0.27797260880470276, "learning_rate": 0.00018283527630078825, "loss": 1.344, "step": 534 }, { "epoch": 0.08346496616548685, "grad_norm": 0.3142591416835785, "learning_rate": 0.0001827626318940454, "loss": 1.4126, "step": 535 }, { "epoch": 0.08362097544803916, "grad_norm": 0.2703491151332855, "learning_rate": 0.00018268984858466522, "loss": 1.2156, "step": 536 }, { "epoch": 0.08377698473059147, "grad_norm": 0.29505112767219543, "learning_rate": 0.00018261692649480175, "loss": 1.421, "step": 537 }, { "epoch": 0.08393299401314379, "grad_norm": 0.2756875157356262, "learning_rate": 0.00018254386574684204, "loss": 1.4858, "step": 538 }, { "epoch": 0.08408900329569609, "grad_norm": 0.2744990885257721, "learning_rate": 0.0001824706664634058, "loss": 1.3441, "step": 539 }, { "epoch": 0.0842450125782484, "grad_norm": 0.2834165096282959, "learning_rate": 0.00018239732876734527, "loss": 1.4142, "step": 540 }, { "epoch": 0.08440102186080072, "grad_norm": 0.2717669904232025, "learning_rate": 0.0001823238527817449, "loss": 1.3199, "step": 541 }, { "epoch": 0.08455703114335303, "grad_norm": 0.26433441042900085, "learning_rate": 0.00018225023862992142, "loss": 1.3197, "step": 542 }, { "epoch": 0.08471304042590534, "grad_norm": 0.27460265159606934, "learning_rate": 0.00018217648643542323, "loss": 1.216, "step": 543 }, { "epoch": 0.08486904970845766, "grad_norm": 0.26642194390296936, "learning_rate": 0.0001821025963220306, "loss": 1.1716, "step": 544 }, { "epoch": 0.08502505899100997, "grad_norm": 0.2999640703201294, "learning_rate": 0.00018202856841375518, "loss": 1.394, "step": 545 }, { "epoch": 0.08518106827356228, "grad_norm": 0.2676008641719818, "learning_rate": 0.00018195440283483988, "loss": 1.2725, "step": 546 }, { "epoch": 0.08533707755611458, "grad_norm": 0.26116111874580383, "learning_rate": 0.0001818800997097587, "loss": 1.329, "step": 547 }, { "epoch": 0.0854930868386669, "grad_norm": 0.26923874020576477, "learning_rate": 0.00018180565916321647, "loss": 1.2228, "step": 548 }, { "epoch": 0.08564909612121921, "grad_norm": 0.2784603536128998, "learning_rate": 0.0001817310813201486, "loss": 1.1249, "step": 549 }, { "epoch": 0.08580510540377152, "grad_norm": 0.27981552481651306, "learning_rate": 0.0001816563663057211, "loss": 1.2778, "step": 550 }, { "epoch": 0.08596111468632384, "grad_norm": 0.2464422732591629, "learning_rate": 0.00018158151424533002, "loss": 1.0316, "step": 551 }, { "epoch": 0.08611712396887615, "grad_norm": 0.23159442842006683, "learning_rate": 0.00018150652526460146, "loss": 0.9794, "step": 552 }, { "epoch": 0.08627313325142846, "grad_norm": 0.28374752402305603, "learning_rate": 0.00018143139948939137, "loss": 1.0572, "step": 553 }, { "epoch": 0.08642914253398078, "grad_norm": 0.28464943170547485, "learning_rate": 0.00018135613704578526, "loss": 1.024, "step": 554 }, { "epoch": 0.08658515181653309, "grad_norm": 0.23248714208602905, "learning_rate": 0.000181280738060098, "loss": 0.9151, "step": 555 }, { "epoch": 0.08674116109908539, "grad_norm": 0.2613517940044403, "learning_rate": 0.00018120520265887363, "loss": 1.2155, "step": 556 }, { "epoch": 0.0868971703816377, "grad_norm": 0.2925867438316345, "learning_rate": 0.00018112953096888516, "loss": 1.2136, "step": 557 }, { "epoch": 0.08705317966419002, "grad_norm": 0.3145943582057953, "learning_rate": 0.00018105372311713432, "loss": 1.4368, "step": 558 }, { "epoch": 0.08720918894674233, "grad_norm": 0.29513052105903625, "learning_rate": 0.0001809777792308513, "loss": 1.4516, "step": 559 }, { "epoch": 0.08736519822929464, "grad_norm": 0.22099293768405914, "learning_rate": 0.00018090169943749476, "loss": 1.0234, "step": 560 }, { "epoch": 0.08752120751184696, "grad_norm": 0.24346297979354858, "learning_rate": 0.0001808254838647513, "loss": 1.3492, "step": 561 }, { "epoch": 0.08767721679439927, "grad_norm": 0.2770818769931793, "learning_rate": 0.00018074913264053545, "loss": 1.4692, "step": 562 }, { "epoch": 0.08783322607695158, "grad_norm": 0.2789641320705414, "learning_rate": 0.00018067264589298945, "loss": 1.3942, "step": 563 }, { "epoch": 0.08798923535950388, "grad_norm": 0.2892186939716339, "learning_rate": 0.00018059602375048293, "loss": 1.3621, "step": 564 }, { "epoch": 0.0881452446420562, "grad_norm": 0.28431588411331177, "learning_rate": 0.00018051926634161282, "loss": 1.3073, "step": 565 }, { "epoch": 0.08830125392460851, "grad_norm": 0.3204723000526428, "learning_rate": 0.00018044237379520305, "loss": 1.8154, "step": 566 }, { "epoch": 0.08845726320716082, "grad_norm": 0.2658674716949463, "learning_rate": 0.0001803653462403043, "loss": 1.1807, "step": 567 }, { "epoch": 0.08861327248971314, "grad_norm": 0.2409079521894455, "learning_rate": 0.0001802881838061939, "loss": 1.2165, "step": 568 }, { "epoch": 0.08876928177226545, "grad_norm": 0.25896573066711426, "learning_rate": 0.00018021088662237552, "loss": 1.1993, "step": 569 }, { "epoch": 0.08892529105481776, "grad_norm": 0.27663204073905945, "learning_rate": 0.00018013345481857903, "loss": 1.1241, "step": 570 }, { "epoch": 0.08908130033737008, "grad_norm": 0.2892790734767914, "learning_rate": 0.00018005588852476015, "loss": 1.6163, "step": 571 }, { "epoch": 0.08923730961992239, "grad_norm": 0.30898550152778625, "learning_rate": 0.00017997818787110042, "loss": 1.2483, "step": 572 }, { "epoch": 0.08939331890247469, "grad_norm": 0.23732271790504456, "learning_rate": 0.0001799003529880068, "loss": 1.1204, "step": 573 }, { "epoch": 0.089549328185027, "grad_norm": 0.2597337067127228, "learning_rate": 0.0001798223840061116, "loss": 1.258, "step": 574 }, { "epoch": 0.08970533746757932, "grad_norm": 0.31342512369155884, "learning_rate": 0.00017974428105627208, "loss": 1.4074, "step": 575 }, { "epoch": 0.08986134675013163, "grad_norm": 0.30252331495285034, "learning_rate": 0.00017966604426957047, "loss": 1.2059, "step": 576 }, { "epoch": 0.09001735603268395, "grad_norm": 0.29326415061950684, "learning_rate": 0.00017958767377731358, "loss": 1.4294, "step": 577 }, { "epoch": 0.09017336531523626, "grad_norm": 0.2915484607219696, "learning_rate": 0.00017950916971103259, "loss": 1.3728, "step": 578 }, { "epoch": 0.09032937459778857, "grad_norm": 0.2966526746749878, "learning_rate": 0.00017943053220248283, "loss": 1.5332, "step": 579 }, { "epoch": 0.09048538388034089, "grad_norm": 0.24311012029647827, "learning_rate": 0.0001793517613836437, "loss": 1.1254, "step": 580 }, { "epoch": 0.09064139316289319, "grad_norm": 0.2950594127178192, "learning_rate": 0.00017927285738671825, "loss": 1.7255, "step": 581 }, { "epoch": 0.0907974024454455, "grad_norm": 0.24679097533226013, "learning_rate": 0.00017919382034413305, "loss": 1.2781, "step": 582 }, { "epoch": 0.09095341172799781, "grad_norm": 0.2747292220592499, "learning_rate": 0.00017911465038853805, "loss": 1.3434, "step": 583 }, { "epoch": 0.09110942101055013, "grad_norm": 0.30099523067474365, "learning_rate": 0.00017903534765280614, "loss": 1.4518, "step": 584 }, { "epoch": 0.09126543029310244, "grad_norm": 0.2866073548793793, "learning_rate": 0.00017895591227003315, "loss": 1.1706, "step": 585 }, { "epoch": 0.09142143957565475, "grad_norm": 0.28832805156707764, "learning_rate": 0.00017887634437353754, "loss": 1.2271, "step": 586 }, { "epoch": 0.09157744885820707, "grad_norm": 0.3714962601661682, "learning_rate": 0.00017879664409686008, "loss": 1.4474, "step": 587 }, { "epoch": 0.09173345814075938, "grad_norm": 0.30591243505477905, "learning_rate": 0.00017871681157376383, "loss": 1.0327, "step": 588 }, { "epoch": 0.0918894674233117, "grad_norm": 0.3032775819301605, "learning_rate": 0.00017863684693823374, "loss": 1.6824, "step": 589 }, { "epoch": 0.092045476705864, "grad_norm": 0.26961666345596313, "learning_rate": 0.00017855675032447648, "loss": 1.1249, "step": 590 }, { "epoch": 0.0922014859884163, "grad_norm": 0.2679152488708496, "learning_rate": 0.00017847652186692026, "loss": 1.2182, "step": 591 }, { "epoch": 0.09235749527096862, "grad_norm": 0.24089114367961884, "learning_rate": 0.00017839616170021452, "loss": 1.1095, "step": 592 }, { "epoch": 0.09251350455352093, "grad_norm": 0.25100457668304443, "learning_rate": 0.00017831566995922985, "loss": 1.1441, "step": 593 }, { "epoch": 0.09266951383607325, "grad_norm": 0.2766099274158478, "learning_rate": 0.0001782350467790575, "loss": 1.1893, "step": 594 }, { "epoch": 0.09282552311862556, "grad_norm": 0.2666013240814209, "learning_rate": 0.00017815429229500946, "loss": 1.1802, "step": 595 }, { "epoch": 0.09298153240117787, "grad_norm": 0.28148403763771057, "learning_rate": 0.00017807340664261802, "loss": 1.3232, "step": 596 }, { "epoch": 0.09313754168373019, "grad_norm": 0.23684674501419067, "learning_rate": 0.00017799238995763568, "loss": 1.1869, "step": 597 }, { "epoch": 0.09329355096628249, "grad_norm": 0.2614571154117584, "learning_rate": 0.00017791124237603477, "loss": 1.4023, "step": 598 }, { "epoch": 0.0934495602488348, "grad_norm": 0.3051559329032898, "learning_rate": 0.00017782996403400736, "loss": 1.407, "step": 599 }, { "epoch": 0.09360556953138711, "grad_norm": 0.2745681405067444, "learning_rate": 0.00017774855506796496, "loss": 1.3265, "step": 600 }, { "epoch": 0.09376157881393943, "grad_norm": 0.2689257860183716, "learning_rate": 0.0001776670156145383, "loss": 1.3046, "step": 601 }, { "epoch": 0.09391758809649174, "grad_norm": 0.29333195090293884, "learning_rate": 0.00017758534581057718, "loss": 1.2624, "step": 602 }, { "epoch": 0.09407359737904405, "grad_norm": 0.30287420749664307, "learning_rate": 0.00017750354579315004, "loss": 1.28, "step": 603 }, { "epoch": 0.09422960666159637, "grad_norm": 0.27796801924705505, "learning_rate": 0.00017742161569954398, "loss": 1.3305, "step": 604 }, { "epoch": 0.09438561594414868, "grad_norm": 0.2703540325164795, "learning_rate": 0.0001773395556672644, "loss": 1.4356, "step": 605 }, { "epoch": 0.094541625226701, "grad_norm": 0.26395589113235474, "learning_rate": 0.0001772573658340347, "loss": 1.1984, "step": 606 }, { "epoch": 0.0946976345092533, "grad_norm": 0.2784560024738312, "learning_rate": 0.0001771750463377962, "loss": 1.3625, "step": 607 }, { "epoch": 0.09485364379180561, "grad_norm": 0.31962451338768005, "learning_rate": 0.00017709259731670774, "loss": 1.3956, "step": 608 }, { "epoch": 0.09500965307435792, "grad_norm": 0.274460107088089, "learning_rate": 0.00017701001890914572, "loss": 1.3071, "step": 609 }, { "epoch": 0.09516566235691024, "grad_norm": 0.25924167037010193, "learning_rate": 0.00017692731125370354, "loss": 1.034, "step": 610 }, { "epoch": 0.09532167163946255, "grad_norm": 0.3091680705547333, "learning_rate": 0.00017684447448919154, "loss": 1.4134, "step": 611 }, { "epoch": 0.09547768092201486, "grad_norm": 0.25753480195999146, "learning_rate": 0.00017676150875463686, "loss": 1.2074, "step": 612 }, { "epoch": 0.09563369020456718, "grad_norm": 0.27256032824516296, "learning_rate": 0.0001766784141892829, "loss": 1.3758, "step": 613 }, { "epoch": 0.09578969948711949, "grad_norm": 0.24764277040958405, "learning_rate": 0.0001765951909325895, "loss": 1.0436, "step": 614 }, { "epoch": 0.09594570876967179, "grad_norm": 0.2722652554512024, "learning_rate": 0.00017651183912423228, "loss": 1.3623, "step": 615 }, { "epoch": 0.0961017180522241, "grad_norm": 0.27056217193603516, "learning_rate": 0.0001764283589041028, "loss": 1.2525, "step": 616 }, { "epoch": 0.09625772733477642, "grad_norm": 0.27987945079803467, "learning_rate": 0.00017634475041230797, "loss": 1.5075, "step": 617 }, { "epoch": 0.09641373661732873, "grad_norm": 0.29397958517074585, "learning_rate": 0.00017626101378917004, "loss": 1.3681, "step": 618 }, { "epoch": 0.09656974589988104, "grad_norm": 0.2876337766647339, "learning_rate": 0.0001761771491752264, "loss": 1.5848, "step": 619 }, { "epoch": 0.09672575518243336, "grad_norm": 0.237448051571846, "learning_rate": 0.0001760931567112291, "loss": 1.0918, "step": 620 }, { "epoch": 0.09688176446498567, "grad_norm": 0.29513096809387207, "learning_rate": 0.0001760090365381449, "loss": 1.3236, "step": 621 }, { "epoch": 0.09703777374753798, "grad_norm": 0.263920396566391, "learning_rate": 0.0001759247887971548, "loss": 1.4573, "step": 622 }, { "epoch": 0.0971937830300903, "grad_norm": 0.31876271963119507, "learning_rate": 0.00017584041362965396, "loss": 1.3874, "step": 623 }, { "epoch": 0.0973497923126426, "grad_norm": 0.30635690689086914, "learning_rate": 0.0001757559111772513, "loss": 1.2355, "step": 624 }, { "epoch": 0.09750580159519491, "grad_norm": 0.25926241278648376, "learning_rate": 0.00017567128158176953, "loss": 1.2641, "step": 625 }, { "epoch": 0.09766181087774722, "grad_norm": 0.2862091660499573, "learning_rate": 0.0001755865249852446, "loss": 1.3818, "step": 626 }, { "epoch": 0.09781782016029954, "grad_norm": 0.2540535628795624, "learning_rate": 0.00017550164152992573, "loss": 1.3807, "step": 627 }, { "epoch": 0.09797382944285185, "grad_norm": 0.30917900800704956, "learning_rate": 0.00017541663135827492, "loss": 1.1053, "step": 628 }, { "epoch": 0.09812983872540416, "grad_norm": 0.30465036630630493, "learning_rate": 0.000175331494612967, "loss": 1.4489, "step": 629 }, { "epoch": 0.09828584800795648, "grad_norm": 0.3043782711029053, "learning_rate": 0.00017524623143688902, "loss": 1.4544, "step": 630 }, { "epoch": 0.09844185729050879, "grad_norm": 0.2681322991847992, "learning_rate": 0.00017516084197314046, "loss": 1.1926, "step": 631 }, { "epoch": 0.09859786657306109, "grad_norm": 0.33450305461883545, "learning_rate": 0.00017507532636503256, "loss": 1.4383, "step": 632 }, { "epoch": 0.0987538758556134, "grad_norm": 0.2626807987689972, "learning_rate": 0.00017498968475608838, "loss": 1.1565, "step": 633 }, { "epoch": 0.09890988513816572, "grad_norm": 0.2553156912326813, "learning_rate": 0.00017490391729004244, "loss": 1.1327, "step": 634 }, { "epoch": 0.09906589442071803, "grad_norm": 0.23390045762062073, "learning_rate": 0.00017481802411084042, "loss": 0.9856, "step": 635 }, { "epoch": 0.09922190370327034, "grad_norm": 0.29881760478019714, "learning_rate": 0.00017473200536263905, "loss": 1.362, "step": 636 }, { "epoch": 0.09937791298582266, "grad_norm": 0.2904150187969208, "learning_rate": 0.0001746458611898058, "loss": 1.242, "step": 637 }, { "epoch": 0.09953392226837497, "grad_norm": 0.24842409789562225, "learning_rate": 0.00017455959173691863, "loss": 1.2694, "step": 638 }, { "epoch": 0.09968993155092729, "grad_norm": 0.3337212800979614, "learning_rate": 0.00017447319714876579, "loss": 1.2554, "step": 639 }, { "epoch": 0.0998459408334796, "grad_norm": 0.24105407297611237, "learning_rate": 0.00017438667757034546, "loss": 1.0582, "step": 640 }, { "epoch": 0.1000019501160319, "grad_norm": 0.24266989529132843, "learning_rate": 0.00017430003314686569, "loss": 1.2125, "step": 641 }, { "epoch": 0.10015795939858421, "grad_norm": 0.2654808461666107, "learning_rate": 0.00017421326402374405, "loss": 1.3229, "step": 642 }, { "epoch": 0.10031396868113653, "grad_norm": 0.21931445598602295, "learning_rate": 0.00017412637034660734, "loss": 1.1168, "step": 643 }, { "epoch": 0.10046997796368884, "grad_norm": 0.28860512375831604, "learning_rate": 0.0001740393522612915, "loss": 1.3681, "step": 644 }, { "epoch": 0.10062598724624115, "grad_norm": 0.2736460566520691, "learning_rate": 0.0001739522099138411, "loss": 1.4054, "step": 645 }, { "epoch": 0.10078199652879347, "grad_norm": 0.23222267627716064, "learning_rate": 0.00017386494345050942, "loss": 1.0973, "step": 646 }, { "epoch": 0.10093800581134578, "grad_norm": 0.2684474587440491, "learning_rate": 0.000173777553017758, "loss": 1.0637, "step": 647 }, { "epoch": 0.10109401509389809, "grad_norm": 0.2648880183696747, "learning_rate": 0.00017369003876225642, "loss": 1.5162, "step": 648 }, { "epoch": 0.10125002437645039, "grad_norm": 0.26263687014579773, "learning_rate": 0.00017360240083088213, "loss": 1.3613, "step": 649 }, { "epoch": 0.1014060336590027, "grad_norm": 0.2455459088087082, "learning_rate": 0.00017351463937072004, "loss": 1.3927, "step": 650 }, { "epoch": 0.10156204294155502, "grad_norm": 0.273078590631485, "learning_rate": 0.00017342675452906248, "loss": 1.2485, "step": 651 }, { "epoch": 0.10171805222410733, "grad_norm": 0.24480541050434113, "learning_rate": 0.00017333874645340884, "loss": 1.0656, "step": 652 }, { "epoch": 0.10187406150665965, "grad_norm": 0.24994470179080963, "learning_rate": 0.0001732506152914653, "loss": 1.3653, "step": 653 }, { "epoch": 0.10203007078921196, "grad_norm": 0.26110485196113586, "learning_rate": 0.00017316236119114463, "loss": 1.392, "step": 654 }, { "epoch": 0.10218608007176427, "grad_norm": 0.30197709798812866, "learning_rate": 0.00017307398430056593, "loss": 1.5184, "step": 655 }, { "epoch": 0.10234208935431659, "grad_norm": 0.26577743887901306, "learning_rate": 0.00017298548476805446, "loss": 1.4611, "step": 656 }, { "epoch": 0.10249809863686889, "grad_norm": 0.2677333950996399, "learning_rate": 0.00017289686274214118, "loss": 1.3282, "step": 657 }, { "epoch": 0.1026541079194212, "grad_norm": 0.2508523762226105, "learning_rate": 0.00017280811837156268, "loss": 1.1331, "step": 658 }, { "epoch": 0.10281011720197351, "grad_norm": 0.24873429536819458, "learning_rate": 0.00017271925180526094, "loss": 1.1351, "step": 659 }, { "epoch": 0.10296612648452583, "grad_norm": 0.2559413015842438, "learning_rate": 0.00017263026319238301, "loss": 1.245, "step": 660 }, { "epoch": 0.10312213576707814, "grad_norm": 0.29988738894462585, "learning_rate": 0.0001725411526822807, "loss": 1.4004, "step": 661 }, { "epoch": 0.10327814504963045, "grad_norm": 0.29719191789627075, "learning_rate": 0.0001724519204245105, "loss": 1.5687, "step": 662 }, { "epoch": 0.10343415433218277, "grad_norm": 0.30810216069221497, "learning_rate": 0.0001723625665688331, "loss": 1.3712, "step": 663 }, { "epoch": 0.10359016361473508, "grad_norm": 0.2754259407520294, "learning_rate": 0.00017227309126521348, "loss": 1.2083, "step": 664 }, { "epoch": 0.1037461728972874, "grad_norm": 0.26548734307289124, "learning_rate": 0.00017218349466382023, "loss": 1.2657, "step": 665 }, { "epoch": 0.1039021821798397, "grad_norm": 0.26369354128837585, "learning_rate": 0.00017209377691502565, "loss": 1.3359, "step": 666 }, { "epoch": 0.10405819146239201, "grad_norm": 0.2526211440563202, "learning_rate": 0.0001720039381694053, "loss": 1.0633, "step": 667 }, { "epoch": 0.10421420074494432, "grad_norm": 0.2874252498149872, "learning_rate": 0.00017191397857773788, "loss": 1.2833, "step": 668 }, { "epoch": 0.10437021002749663, "grad_norm": 0.26982390880584717, "learning_rate": 0.00017182389829100485, "loss": 1.1843, "step": 669 }, { "epoch": 0.10452621931004895, "grad_norm": 0.29615074396133423, "learning_rate": 0.00017173369746039025, "loss": 1.2992, "step": 670 }, { "epoch": 0.10468222859260126, "grad_norm": 0.29073938727378845, "learning_rate": 0.00017164337623728045, "loss": 1.5432, "step": 671 }, { "epoch": 0.10483823787515358, "grad_norm": 0.2858506143093109, "learning_rate": 0.00017155293477326384, "loss": 1.4446, "step": 672 }, { "epoch": 0.10499424715770589, "grad_norm": 0.2399512678384781, "learning_rate": 0.00017146237322013068, "loss": 1.1643, "step": 673 }, { "epoch": 0.10515025644025819, "grad_norm": 0.2796498239040375, "learning_rate": 0.00017137169172987268, "loss": 1.3158, "step": 674 }, { "epoch": 0.1053062657228105, "grad_norm": 0.26859599351882935, "learning_rate": 0.00017128089045468294, "loss": 1.1761, "step": 675 }, { "epoch": 0.10546227500536282, "grad_norm": 0.2749616503715515, "learning_rate": 0.00017118996954695553, "loss": 1.0586, "step": 676 }, { "epoch": 0.10561828428791513, "grad_norm": 0.27312207221984863, "learning_rate": 0.00017109892915928535, "loss": 1.1367, "step": 677 }, { "epoch": 0.10577429357046744, "grad_norm": 0.29626578092575073, "learning_rate": 0.00017100776944446781, "loss": 1.4223, "step": 678 }, { "epoch": 0.10593030285301976, "grad_norm": 0.24335867166519165, "learning_rate": 0.00017091649055549855, "loss": 1.1041, "step": 679 }, { "epoch": 0.10608631213557207, "grad_norm": 0.3017411530017853, "learning_rate": 0.0001708250926455733, "loss": 1.2854, "step": 680 }, { "epoch": 0.10624232141812438, "grad_norm": 0.2864495515823364, "learning_rate": 0.00017073357586808752, "loss": 1.2539, "step": 681 }, { "epoch": 0.1063983307006767, "grad_norm": 0.27407294511795044, "learning_rate": 0.0001706419403766361, "loss": 1.3136, "step": 682 }, { "epoch": 0.106554339983229, "grad_norm": 0.3100734055042267, "learning_rate": 0.00017055018632501325, "loss": 1.3231, "step": 683 }, { "epoch": 0.10671034926578131, "grad_norm": 0.3091520071029663, "learning_rate": 0.00017045831386721213, "loss": 1.3513, "step": 684 }, { "epoch": 0.10686635854833362, "grad_norm": 0.2930145561695099, "learning_rate": 0.00017036632315742462, "loss": 1.3292, "step": 685 }, { "epoch": 0.10702236783088594, "grad_norm": 0.30808883905410767, "learning_rate": 0.00017027421435004112, "loss": 1.6094, "step": 686 }, { "epoch": 0.10717837711343825, "grad_norm": 0.2715398073196411, "learning_rate": 0.00017018198759965016, "loss": 1.3641, "step": 687 }, { "epoch": 0.10733438639599056, "grad_norm": 0.2844456732273102, "learning_rate": 0.00017008964306103823, "loss": 1.3933, "step": 688 }, { "epoch": 0.10749039567854288, "grad_norm": 0.258504718542099, "learning_rate": 0.00016999718088918955, "loss": 1.0621, "step": 689 }, { "epoch": 0.10764640496109519, "grad_norm": 0.28674831986427307, "learning_rate": 0.00016990460123928575, "loss": 1.2759, "step": 690 }, { "epoch": 0.10780241424364749, "grad_norm": 0.3062899708747864, "learning_rate": 0.0001698119042667056, "loss": 1.1537, "step": 691 }, { "epoch": 0.1079584235261998, "grad_norm": 0.2539708614349365, "learning_rate": 0.00016971909012702483, "loss": 1.1463, "step": 692 }, { "epoch": 0.10811443280875212, "grad_norm": 0.30207210779190063, "learning_rate": 0.00016962615897601573, "loss": 1.4219, "step": 693 }, { "epoch": 0.10827044209130443, "grad_norm": 0.28675806522369385, "learning_rate": 0.00016953311096964705, "loss": 1.1476, "step": 694 }, { "epoch": 0.10842645137385674, "grad_norm": 0.33274316787719727, "learning_rate": 0.00016943994626408363, "loss": 1.3351, "step": 695 }, { "epoch": 0.10858246065640906, "grad_norm": 0.2725004553794861, "learning_rate": 0.00016934666501568617, "loss": 1.1795, "step": 696 }, { "epoch": 0.10873846993896137, "grad_norm": 0.29064077138900757, "learning_rate": 0.00016925326738101098, "loss": 1.4255, "step": 697 }, { "epoch": 0.10889447922151368, "grad_norm": 0.3007811903953552, "learning_rate": 0.00016915975351680968, "loss": 1.1951, "step": 698 }, { "epoch": 0.109050488504066, "grad_norm": 0.26098549365997314, "learning_rate": 0.000169066123580029, "loss": 1.0585, "step": 699 }, { "epoch": 0.1092064977866183, "grad_norm": 0.36355966329574585, "learning_rate": 0.00016897237772781044, "loss": 1.2911, "step": 700 }, { "epoch": 0.10936250706917061, "grad_norm": 0.2830749750137329, "learning_rate": 0.00016887851611749005, "loss": 1.4469, "step": 701 }, { "epoch": 0.10951851635172292, "grad_norm": 0.3175537884235382, "learning_rate": 0.00016878453890659814, "loss": 1.4589, "step": 702 }, { "epoch": 0.10967452563427524, "grad_norm": 0.2898159623146057, "learning_rate": 0.0001686904462528591, "loss": 1.4318, "step": 703 }, { "epoch": 0.10983053491682755, "grad_norm": 0.28991106152534485, "learning_rate": 0.000168596238314191, "loss": 1.3293, "step": 704 }, { "epoch": 0.10998654419937987, "grad_norm": 0.27654772996902466, "learning_rate": 0.00016850191524870546, "loss": 1.4909, "step": 705 }, { "epoch": 0.11014255348193218, "grad_norm": 0.29537513852119446, "learning_rate": 0.00016840747721470731, "loss": 1.4512, "step": 706 }, { "epoch": 0.11029856276448449, "grad_norm": 0.2656291723251343, "learning_rate": 0.00016831292437069427, "loss": 1.0375, "step": 707 }, { "epoch": 0.11045457204703679, "grad_norm": 0.3286688029766083, "learning_rate": 0.00016821825687535674, "loss": 1.3478, "step": 708 }, { "epoch": 0.1106105813295891, "grad_norm": 0.2618601322174072, "learning_rate": 0.00016812347488757772, "loss": 1.3448, "step": 709 }, { "epoch": 0.11076659061214142, "grad_norm": 0.29108762741088867, "learning_rate": 0.00016802857856643215, "loss": 1.3479, "step": 710 }, { "epoch": 0.11092259989469373, "grad_norm": 0.3029685914516449, "learning_rate": 0.00016793356807118695, "loss": 1.2162, "step": 711 }, { "epoch": 0.11107860917724605, "grad_norm": 0.2573980689048767, "learning_rate": 0.00016783844356130071, "loss": 1.0927, "step": 712 }, { "epoch": 0.11123461845979836, "grad_norm": 0.2836451828479767, "learning_rate": 0.0001677432051964233, "loss": 1.2136, "step": 713 }, { "epoch": 0.11139062774235067, "grad_norm": 0.2437037229537964, "learning_rate": 0.0001676478531363957, "loss": 1.0671, "step": 714 }, { "epoch": 0.11154663702490299, "grad_norm": 0.2603608965873718, "learning_rate": 0.00016755238754124965, "loss": 1.2128, "step": 715 }, { "epoch": 0.1117026463074553, "grad_norm": 0.2617943286895752, "learning_rate": 0.00016745680857120757, "loss": 1.3305, "step": 716 }, { "epoch": 0.1118586555900076, "grad_norm": 0.27264609932899475, "learning_rate": 0.00016736111638668204, "loss": 1.3456, "step": 717 }, { "epoch": 0.11201466487255991, "grad_norm": 0.33472567796707153, "learning_rate": 0.00016726531114827573, "loss": 1.2517, "step": 718 }, { "epoch": 0.11217067415511223, "grad_norm": 0.2825791835784912, "learning_rate": 0.00016716939301678098, "loss": 1.3156, "step": 719 }, { "epoch": 0.11232668343766454, "grad_norm": 0.2815983295440674, "learning_rate": 0.00016707336215317968, "loss": 1.2376, "step": 720 }, { "epoch": 0.11248269272021685, "grad_norm": 0.3158409595489502, "learning_rate": 0.00016697721871864284, "loss": 1.5252, "step": 721 }, { "epoch": 0.11263870200276917, "grad_norm": 0.27121129631996155, "learning_rate": 0.00016688096287453046, "loss": 1.3603, "step": 722 }, { "epoch": 0.11279471128532148, "grad_norm": 0.2568758428096771, "learning_rate": 0.00016678459478239118, "loss": 1.1337, "step": 723 }, { "epoch": 0.1129507205678738, "grad_norm": 0.26672929525375366, "learning_rate": 0.00016668811460396202, "loss": 1.1728, "step": 724 }, { "epoch": 0.1131067298504261, "grad_norm": 0.2683919370174408, "learning_rate": 0.00016659152250116812, "loss": 1.2833, "step": 725 }, { "epoch": 0.11326273913297841, "grad_norm": 0.2757527232170105, "learning_rate": 0.00016649481863612248, "loss": 1.0544, "step": 726 }, { "epoch": 0.11341874841553072, "grad_norm": 0.2571371793746948, "learning_rate": 0.0001663980031711257, "loss": 1.1212, "step": 727 }, { "epoch": 0.11357475769808303, "grad_norm": 0.2757047116756439, "learning_rate": 0.00016630107626866558, "loss": 1.1771, "step": 728 }, { "epoch": 0.11373076698063535, "grad_norm": 0.262979120016098, "learning_rate": 0.00016620403809141705, "loss": 0.9962, "step": 729 }, { "epoch": 0.11388677626318766, "grad_norm": 0.26567909121513367, "learning_rate": 0.00016610688880224178, "loss": 1.3037, "step": 730 }, { "epoch": 0.11404278554573997, "grad_norm": 0.27931660413742065, "learning_rate": 0.00016600962856418782, "loss": 1.1863, "step": 731 }, { "epoch": 0.11419879482829229, "grad_norm": 0.25071558356285095, "learning_rate": 0.00016591225754048963, "loss": 1.1437, "step": 732 }, { "epoch": 0.1143548041108446, "grad_norm": 0.2775113880634308, "learning_rate": 0.00016581477589456734, "loss": 1.2152, "step": 733 }, { "epoch": 0.1145108133933969, "grad_norm": 0.25055718421936035, "learning_rate": 0.00016571718379002705, "loss": 1.1479, "step": 734 }, { "epoch": 0.11466682267594921, "grad_norm": 0.25468993186950684, "learning_rate": 0.00016561948139065996, "loss": 1.148, "step": 735 }, { "epoch": 0.11482283195850153, "grad_norm": 0.26385918259620667, "learning_rate": 0.00016552166886044253, "loss": 1.3473, "step": 736 }, { "epoch": 0.11497884124105384, "grad_norm": 0.27051180601119995, "learning_rate": 0.00016542374636353604, "loss": 1.196, "step": 737 }, { "epoch": 0.11513485052360616, "grad_norm": 0.32731276750564575, "learning_rate": 0.0001653257140642863, "loss": 1.4514, "step": 738 }, { "epoch": 0.11529085980615847, "grad_norm": 0.26046180725097656, "learning_rate": 0.00016522757212722344, "loss": 1.2186, "step": 739 }, { "epoch": 0.11544686908871078, "grad_norm": 0.2661746144294739, "learning_rate": 0.00016512932071706152, "loss": 1.123, "step": 740 }, { "epoch": 0.1156028783712631, "grad_norm": 0.25739923119544983, "learning_rate": 0.0001650309599986985, "loss": 1.1832, "step": 741 }, { "epoch": 0.1157588876538154, "grad_norm": 0.30230990052223206, "learning_rate": 0.00016493249013721558, "loss": 1.5064, "step": 742 }, { "epoch": 0.11591489693636771, "grad_norm": 0.25831449031829834, "learning_rate": 0.00016483391129787727, "loss": 1.1212, "step": 743 }, { "epoch": 0.11607090621892002, "grad_norm": 0.24019654095172882, "learning_rate": 0.000164735223646131, "loss": 1.1555, "step": 744 }, { "epoch": 0.11622691550147234, "grad_norm": 0.28396427631378174, "learning_rate": 0.0001646364273476067, "loss": 1.4754, "step": 745 }, { "epoch": 0.11638292478402465, "grad_norm": 0.28211066126823425, "learning_rate": 0.00016453752256811674, "loss": 1.526, "step": 746 }, { "epoch": 0.11653893406657696, "grad_norm": 0.2596474289894104, "learning_rate": 0.00016443850947365558, "loss": 1.2072, "step": 747 }, { "epoch": 0.11669494334912928, "grad_norm": 0.25947293639183044, "learning_rate": 0.0001643393882303994, "loss": 1.3467, "step": 748 }, { "epoch": 0.11685095263168159, "grad_norm": 0.30946600437164307, "learning_rate": 0.00016424015900470587, "loss": 1.3948, "step": 749 }, { "epoch": 0.1170069619142339, "grad_norm": 0.3172161281108856, "learning_rate": 0.000164140821963114, "loss": 1.745, "step": 750 }, { "epoch": 0.1171629711967862, "grad_norm": 0.26674196124076843, "learning_rate": 0.00016404137727234365, "loss": 1.5021, "step": 751 }, { "epoch": 0.11731898047933852, "grad_norm": 0.26941999793052673, "learning_rate": 0.00016394182509929536, "loss": 1.2651, "step": 752 }, { "epoch": 0.11747498976189083, "grad_norm": 0.29353249073028564, "learning_rate": 0.00016384216561105014, "loss": 1.2397, "step": 753 }, { "epoch": 0.11763099904444314, "grad_norm": 0.2547638416290283, "learning_rate": 0.000163742398974869, "loss": 1.1032, "step": 754 }, { "epoch": 0.11778700832699546, "grad_norm": 0.25621354579925537, "learning_rate": 0.00016364252535819282, "loss": 1.0842, "step": 755 }, { "epoch": 0.11794301760954777, "grad_norm": 0.25465261936187744, "learning_rate": 0.00016354254492864211, "loss": 0.9941, "step": 756 }, { "epoch": 0.11809902689210008, "grad_norm": 0.25726544857025146, "learning_rate": 0.00016344245785401653, "loss": 1.2613, "step": 757 }, { "epoch": 0.1182550361746524, "grad_norm": 0.2696760594844818, "learning_rate": 0.00016334226430229475, "loss": 1.1349, "step": 758 }, { "epoch": 0.1184110454572047, "grad_norm": 0.29465997219085693, "learning_rate": 0.00016324196444163423, "loss": 1.3099, "step": 759 }, { "epoch": 0.11856705473975701, "grad_norm": 0.2854841351509094, "learning_rate": 0.00016314155844037074, "loss": 1.1648, "step": 760 }, { "epoch": 0.11872306402230932, "grad_norm": 0.28557366132736206, "learning_rate": 0.0001630410464670182, "loss": 1.4045, "step": 761 }, { "epoch": 0.11887907330486164, "grad_norm": 0.337882936000824, "learning_rate": 0.00016294042869026851, "loss": 1.4391, "step": 762 }, { "epoch": 0.11903508258741395, "grad_norm": 0.25410857796669006, "learning_rate": 0.000162839705278991, "loss": 1.025, "step": 763 }, { "epoch": 0.11919109186996626, "grad_norm": 0.2944369614124298, "learning_rate": 0.0001627388764022323, "loss": 1.3339, "step": 764 }, { "epoch": 0.11934710115251858, "grad_norm": 0.30941835045814514, "learning_rate": 0.0001626379422292162, "loss": 1.5238, "step": 765 }, { "epoch": 0.11950311043507089, "grad_norm": 0.2796765863895416, "learning_rate": 0.000162536902929343, "loss": 1.1711, "step": 766 }, { "epoch": 0.1196591197176232, "grad_norm": 0.2882195711135864, "learning_rate": 0.00016243575867218958, "loss": 1.2852, "step": 767 }, { "epoch": 0.1198151290001755, "grad_norm": 0.29050207138061523, "learning_rate": 0.00016233450962750893, "loss": 1.2789, "step": 768 }, { "epoch": 0.11997113828272782, "grad_norm": 0.2745670974254608, "learning_rate": 0.00016223315596522987, "loss": 1.2741, "step": 769 }, { "epoch": 0.12012714756528013, "grad_norm": 0.29764166474342346, "learning_rate": 0.0001621316978554569, "loss": 1.3636, "step": 770 }, { "epoch": 0.12028315684783245, "grad_norm": 0.29131025075912476, "learning_rate": 0.00016203013546846966, "loss": 1.5137, "step": 771 }, { "epoch": 0.12043916613038476, "grad_norm": 0.3370944857597351, "learning_rate": 0.00016192846897472297, "loss": 1.5541, "step": 772 }, { "epoch": 0.12059517541293707, "grad_norm": 0.2678642272949219, "learning_rate": 0.0001618266985448463, "loss": 1.2024, "step": 773 }, { "epoch": 0.12075118469548939, "grad_norm": 0.27655884623527527, "learning_rate": 0.00016172482434964353, "loss": 1.1084, "step": 774 }, { "epoch": 0.1209071939780417, "grad_norm": 0.23235641419887543, "learning_rate": 0.00016162284656009274, "loss": 0.8548, "step": 775 }, { "epoch": 0.121063203260594, "grad_norm": 0.2860414683818817, "learning_rate": 0.00016152076534734584, "loss": 1.5026, "step": 776 }, { "epoch": 0.12121921254314631, "grad_norm": 0.2980406582355499, "learning_rate": 0.00016141858088272837, "loss": 1.3692, "step": 777 }, { "epoch": 0.12137522182569863, "grad_norm": 0.29564347863197327, "learning_rate": 0.00016131629333773908, "loss": 1.6193, "step": 778 }, { "epoch": 0.12153123110825094, "grad_norm": 0.250028520822525, "learning_rate": 0.0001612139028840498, "loss": 1.3295, "step": 779 }, { "epoch": 0.12168724039080325, "grad_norm": 0.25812971591949463, "learning_rate": 0.00016111140969350503, "loss": 1.1061, "step": 780 }, { "epoch": 0.12184324967335557, "grad_norm": 0.2702666223049164, "learning_rate": 0.0001610088139381217, "loss": 1.2846, "step": 781 }, { "epoch": 0.12199925895590788, "grad_norm": 0.24256417155265808, "learning_rate": 0.00016090611579008888, "loss": 1.081, "step": 782 }, { "epoch": 0.1221552682384602, "grad_norm": 0.3177904784679413, "learning_rate": 0.00016080331542176753, "loss": 1.5862, "step": 783 }, { "epoch": 0.12231127752101251, "grad_norm": 0.25483664870262146, "learning_rate": 0.00016070041300569012, "loss": 1.1939, "step": 784 }, { "epoch": 0.1224672868035648, "grad_norm": 0.23578673601150513, "learning_rate": 0.00016059740871456036, "loss": 1.0371, "step": 785 }, { "epoch": 0.12262329608611712, "grad_norm": 0.28674736618995667, "learning_rate": 0.000160494302721253, "loss": 1.4739, "step": 786 }, { "epoch": 0.12277930536866943, "grad_norm": 0.29090616106987, "learning_rate": 0.0001603910951988135, "loss": 1.3862, "step": 787 }, { "epoch": 0.12293531465122175, "grad_norm": 0.2792899012565613, "learning_rate": 0.00016028778632045762, "loss": 1.3731, "step": 788 }, { "epoch": 0.12309132393377406, "grad_norm": 0.2683924436569214, "learning_rate": 0.00016018437625957133, "loss": 1.4514, "step": 789 }, { "epoch": 0.12324733321632637, "grad_norm": 0.331752747297287, "learning_rate": 0.00016008086518971037, "loss": 1.0936, "step": 790 }, { "epoch": 0.12340334249887869, "grad_norm": 0.32185712456703186, "learning_rate": 0.0001599772532846, "loss": 1.7093, "step": 791 }, { "epoch": 0.123559351781431, "grad_norm": 0.28801560401916504, "learning_rate": 0.0001598735407181347, "loss": 1.2923, "step": 792 }, { "epoch": 0.1237153610639833, "grad_norm": 0.2626672387123108, "learning_rate": 0.00015976972766437795, "loss": 1.196, "step": 793 }, { "epoch": 0.12387137034653561, "grad_norm": 0.30561795830726624, "learning_rate": 0.00015966581429756183, "loss": 1.5151, "step": 794 }, { "epoch": 0.12402737962908793, "grad_norm": 0.2764839828014374, "learning_rate": 0.00015956180079208682, "loss": 1.231, "step": 795 }, { "epoch": 0.12418338891164024, "grad_norm": 0.2506803870201111, "learning_rate": 0.00015945768732252144, "loss": 1.0394, "step": 796 }, { "epoch": 0.12433939819419255, "grad_norm": 0.28655874729156494, "learning_rate": 0.00015935347406360192, "loss": 1.4689, "step": 797 }, { "epoch": 0.12449540747674487, "grad_norm": 0.26048576831817627, "learning_rate": 0.00015924916119023212, "loss": 1.218, "step": 798 }, { "epoch": 0.12465141675929718, "grad_norm": 0.26712656021118164, "learning_rate": 0.00015914474887748295, "loss": 1.232, "step": 799 }, { "epoch": 0.1248074260418495, "grad_norm": 0.2652023434638977, "learning_rate": 0.00015904023730059228, "loss": 1.0205, "step": 800 }, { "epoch": 0.12496343532440181, "grad_norm": 0.3364275097846985, "learning_rate": 0.0001589356266349645, "loss": 1.4919, "step": 801 }, { "epoch": 0.12511944460695412, "grad_norm": 0.218467116355896, "learning_rate": 0.00015883091705617045, "loss": 0.8939, "step": 802 }, { "epoch": 0.12527545388950642, "grad_norm": 0.2554807960987091, "learning_rate": 0.00015872610873994685, "loss": 1.2568, "step": 803 }, { "epoch": 0.12543146317205875, "grad_norm": 0.2742806673049927, "learning_rate": 0.00015862120186219613, "loss": 1.0565, "step": 804 }, { "epoch": 0.12558747245461105, "grad_norm": 0.23994481563568115, "learning_rate": 0.00015851619659898623, "loss": 0.9631, "step": 805 }, { "epoch": 0.12574348173716335, "grad_norm": 0.29549404978752136, "learning_rate": 0.00015841109312655016, "loss": 1.2073, "step": 806 }, { "epoch": 0.12589949101971568, "grad_norm": 0.27470991015434265, "learning_rate": 0.00015830589162128572, "loss": 1.2345, "step": 807 }, { "epoch": 0.12605550030226798, "grad_norm": 0.27652519941329956, "learning_rate": 0.00015820059225975531, "loss": 1.2456, "step": 808 }, { "epoch": 0.1262115095848203, "grad_norm": 0.2571077346801758, "learning_rate": 0.0001580951952186856, "loss": 1.0009, "step": 809 }, { "epoch": 0.1263675188673726, "grad_norm": 0.27721402049064636, "learning_rate": 0.000157989700674967, "loss": 1.2101, "step": 810 }, { "epoch": 0.12652352814992493, "grad_norm": 0.29823631048202515, "learning_rate": 0.00015788410880565379, "loss": 1.3992, "step": 811 }, { "epoch": 0.12667953743247723, "grad_norm": 0.28366366028785706, "learning_rate": 0.00015777841978796347, "loss": 1.005, "step": 812 }, { "epoch": 0.12683554671502956, "grad_norm": 0.3597376048564911, "learning_rate": 0.0001576726337992766, "loss": 1.6046, "step": 813 }, { "epoch": 0.12699155599758186, "grad_norm": 0.27407100796699524, "learning_rate": 0.00015756675101713657, "loss": 1.0167, "step": 814 }, { "epoch": 0.12714756528013416, "grad_norm": 0.3212680220603943, "learning_rate": 0.00015746077161924905, "loss": 1.4425, "step": 815 }, { "epoch": 0.12730357456268648, "grad_norm": 0.25150859355926514, "learning_rate": 0.00015735469578348208, "loss": 1.2482, "step": 816 }, { "epoch": 0.12745958384523878, "grad_norm": 0.2753000855445862, "learning_rate": 0.00015724852368786537, "loss": 1.3006, "step": 817 }, { "epoch": 0.1276155931277911, "grad_norm": 0.27500027418136597, "learning_rate": 0.0001571422555105903, "loss": 1.2095, "step": 818 }, { "epoch": 0.1277716024103434, "grad_norm": 0.2696485221385956, "learning_rate": 0.0001570358914300094, "loss": 1.1708, "step": 819 }, { "epoch": 0.12792761169289574, "grad_norm": 0.2486962080001831, "learning_rate": 0.00015692943162463628, "loss": 1.0531, "step": 820 }, { "epoch": 0.12808362097544804, "grad_norm": 0.265824556350708, "learning_rate": 0.00015682287627314515, "loss": 1.0712, "step": 821 }, { "epoch": 0.12823963025800036, "grad_norm": 0.2963060140609741, "learning_rate": 0.00015671622555437053, "loss": 1.3806, "step": 822 }, { "epoch": 0.12839563954055266, "grad_norm": 0.2849713861942291, "learning_rate": 0.00015660947964730708, "loss": 1.2242, "step": 823 }, { "epoch": 0.12855164882310496, "grad_norm": 0.25108298659324646, "learning_rate": 0.0001565026387311092, "loss": 1.1128, "step": 824 }, { "epoch": 0.1287076581056573, "grad_norm": 0.27622735500335693, "learning_rate": 0.00015639570298509064, "loss": 1.3599, "step": 825 }, { "epoch": 0.1288636673882096, "grad_norm": 0.29195183515548706, "learning_rate": 0.0001562886725887245, "loss": 1.2931, "step": 826 }, { "epoch": 0.12901967667076192, "grad_norm": 0.2943118214607239, "learning_rate": 0.00015618154772164256, "loss": 1.5802, "step": 827 }, { "epoch": 0.12917568595331422, "grad_norm": 0.26325714588165283, "learning_rate": 0.00015607432856363525, "loss": 1.2455, "step": 828 }, { "epoch": 0.12933169523586655, "grad_norm": 0.286743700504303, "learning_rate": 0.00015596701529465117, "loss": 1.3008, "step": 829 }, { "epoch": 0.12948770451841884, "grad_norm": 0.2844702899456024, "learning_rate": 0.00015585960809479696, "loss": 1.3737, "step": 830 }, { "epoch": 0.12964371380097114, "grad_norm": 0.25531789660453796, "learning_rate": 0.00015575210714433686, "loss": 1.1425, "step": 831 }, { "epoch": 0.12979972308352347, "grad_norm": 0.26921185851097107, "learning_rate": 0.00015564451262369247, "loss": 1.106, "step": 832 }, { "epoch": 0.12995573236607577, "grad_norm": 0.28271836042404175, "learning_rate": 0.00015553682471344238, "loss": 1.3681, "step": 833 }, { "epoch": 0.1301117416486281, "grad_norm": 0.26876282691955566, "learning_rate": 0.00015542904359432198, "loss": 1.112, "step": 834 }, { "epoch": 0.1302677509311804, "grad_norm": 0.2895980179309845, "learning_rate": 0.00015532116944722308, "loss": 1.1285, "step": 835 }, { "epoch": 0.13042376021373273, "grad_norm": 0.2612462639808655, "learning_rate": 0.00015521320245319363, "loss": 1.2669, "step": 836 }, { "epoch": 0.13057976949628503, "grad_norm": 0.30689284205436707, "learning_rate": 0.00015510514279343734, "loss": 1.3512, "step": 837 }, { "epoch": 0.13073577877883735, "grad_norm": 0.2981073558330536, "learning_rate": 0.00015499699064931355, "loss": 1.1284, "step": 838 }, { "epoch": 0.13089178806138965, "grad_norm": 0.2637684643268585, "learning_rate": 0.00015488874620233674, "loss": 1.0698, "step": 839 }, { "epoch": 0.13104779734394195, "grad_norm": 0.3048469126224518, "learning_rate": 0.0001547804096341763, "loss": 1.5209, "step": 840 }, { "epoch": 0.13120380662649428, "grad_norm": 0.2396387904882431, "learning_rate": 0.00015467198112665632, "loss": 0.9584, "step": 841 }, { "epoch": 0.13135981590904658, "grad_norm": 0.27103736996650696, "learning_rate": 0.0001545634608617551, "loss": 1.2846, "step": 842 }, { "epoch": 0.1315158251915989, "grad_norm": 0.2971721589565277, "learning_rate": 0.00015445484902160491, "loss": 1.6074, "step": 843 }, { "epoch": 0.1316718344741512, "grad_norm": 0.2440243512392044, "learning_rate": 0.00015434614578849188, "loss": 1.045, "step": 844 }, { "epoch": 0.13182784375670353, "grad_norm": 0.30210787057876587, "learning_rate": 0.00015423735134485536, "loss": 1.2948, "step": 845 }, { "epoch": 0.13198385303925583, "grad_norm": 0.25344711542129517, "learning_rate": 0.00015412846587328782, "loss": 1.2089, "step": 846 }, { "epoch": 0.13213986232180816, "grad_norm": 0.2884974479675293, "learning_rate": 0.0001540194895565346, "loss": 1.1123, "step": 847 }, { "epoch": 0.13229587160436046, "grad_norm": 0.28012582659721375, "learning_rate": 0.00015391042257749336, "loss": 1.2269, "step": 848 }, { "epoch": 0.13245188088691276, "grad_norm": 0.26394879817962646, "learning_rate": 0.00015380126511921403, "loss": 1.4469, "step": 849 }, { "epoch": 0.1326078901694651, "grad_norm": 0.2717582583427429, "learning_rate": 0.0001536920173648984, "loss": 1.1494, "step": 850 }, { "epoch": 0.1327638994520174, "grad_norm": 0.2968549132347107, "learning_rate": 0.00015358267949789966, "loss": 1.1903, "step": 851 }, { "epoch": 0.13291990873456971, "grad_norm": 0.2570381164550781, "learning_rate": 0.00015347325170172245, "loss": 1.1035, "step": 852 }, { "epoch": 0.133075918017122, "grad_norm": 0.3070929944515228, "learning_rate": 0.0001533637341600221, "loss": 1.4062, "step": 853 }, { "epoch": 0.13323192729967434, "grad_norm": 0.2886407971382141, "learning_rate": 0.0001532541270566049, "loss": 1.3491, "step": 854 }, { "epoch": 0.13338793658222664, "grad_norm": 0.2572009861469269, "learning_rate": 0.00015314443057542703, "loss": 1.2643, "step": 855 }, { "epoch": 0.13354394586477897, "grad_norm": 0.2768828272819519, "learning_rate": 0.00015303464490059506, "loss": 1.1444, "step": 856 }, { "epoch": 0.13369995514733127, "grad_norm": 0.3006720542907715, "learning_rate": 0.00015292477021636497, "loss": 1.2172, "step": 857 }, { "epoch": 0.13385596442988357, "grad_norm": 0.24407751858234406, "learning_rate": 0.0001528148067071423, "loss": 0.9457, "step": 858 }, { "epoch": 0.1340119737124359, "grad_norm": 0.25638723373413086, "learning_rate": 0.00015270475455748166, "loss": 1.1478, "step": 859 }, { "epoch": 0.1341679829949882, "grad_norm": 0.24834637343883514, "learning_rate": 0.00015259461395208628, "loss": 0.9835, "step": 860 }, { "epoch": 0.13432399227754052, "grad_norm": 0.2611735463142395, "learning_rate": 0.00015248438507580806, "loss": 1.125, "step": 861 }, { "epoch": 0.13448000156009282, "grad_norm": 0.3239066004753113, "learning_rate": 0.00015237406811364682, "loss": 1.1973, "step": 862 }, { "epoch": 0.13463601084264515, "grad_norm": 0.2662723958492279, "learning_rate": 0.0001522636632507504, "loss": 1.1115, "step": 863 }, { "epoch": 0.13479202012519745, "grad_norm": 0.26053330302238464, "learning_rate": 0.00015215317067241414, "loss": 1.0885, "step": 864 }, { "epoch": 0.13494802940774975, "grad_norm": 0.337984561920166, "learning_rate": 0.00015204259056408046, "loss": 0.8782, "step": 865 }, { "epoch": 0.13510403869030208, "grad_norm": 0.2965889871120453, "learning_rate": 0.00015193192311133884, "loss": 1.3198, "step": 866 }, { "epoch": 0.13526004797285437, "grad_norm": 0.3056474030017853, "learning_rate": 0.00015182116849992526, "loss": 1.5133, "step": 867 }, { "epoch": 0.1354160572554067, "grad_norm": 0.29193446040153503, "learning_rate": 0.00015171032691572206, "loss": 1.2365, "step": 868 }, { "epoch": 0.135572066537959, "grad_norm": 0.28123265504837036, "learning_rate": 0.00015159939854475743, "loss": 1.1654, "step": 869 }, { "epoch": 0.13572807582051133, "grad_norm": 0.3033466041088104, "learning_rate": 0.00015148838357320537, "loss": 1.5473, "step": 870 }, { "epoch": 0.13588408510306363, "grad_norm": 0.26069045066833496, "learning_rate": 0.00015137728218738502, "loss": 1.2213, "step": 871 }, { "epoch": 0.13604009438561596, "grad_norm": 0.3010377883911133, "learning_rate": 0.0001512660945737608, "loss": 1.1906, "step": 872 }, { "epoch": 0.13619610366816826, "grad_norm": 0.2615121304988861, "learning_rate": 0.00015115482091894165, "loss": 1.0807, "step": 873 }, { "epoch": 0.13635211295072056, "grad_norm": 0.27064162492752075, "learning_rate": 0.00015104346140968095, "loss": 1.3376, "step": 874 }, { "epoch": 0.13650812223327288, "grad_norm": 0.26106327772140503, "learning_rate": 0.00015093201623287631, "loss": 1.2357, "step": 875 }, { "epoch": 0.13666413151582518, "grad_norm": 0.26505109667778015, "learning_rate": 0.00015082048557556893, "loss": 1.4311, "step": 876 }, { "epoch": 0.1368201407983775, "grad_norm": 0.2965877950191498, "learning_rate": 0.00015070886962494358, "loss": 1.3246, "step": 877 }, { "epoch": 0.1369761500809298, "grad_norm": 0.3173799216747284, "learning_rate": 0.0001505971685683282, "loss": 1.4795, "step": 878 }, { "epoch": 0.13713215936348214, "grad_norm": 0.2562354505062103, "learning_rate": 0.00015048538259319346, "loss": 1.0112, "step": 879 }, { "epoch": 0.13728816864603444, "grad_norm": 0.2736887037754059, "learning_rate": 0.00015037351188715265, "loss": 1.3539, "step": 880 }, { "epoch": 0.13744417792858676, "grad_norm": 0.30376073718070984, "learning_rate": 0.00015026155663796123, "loss": 1.2837, "step": 881 }, { "epoch": 0.13760018721113906, "grad_norm": 0.3052879869937897, "learning_rate": 0.00015014951703351653, "loss": 1.3994, "step": 882 }, { "epoch": 0.13775619649369136, "grad_norm": 0.25414812564849854, "learning_rate": 0.00015003739326185751, "loss": 0.9258, "step": 883 }, { "epoch": 0.1379122057762437, "grad_norm": 0.33165043592453003, "learning_rate": 0.00014992518551116434, "loss": 1.4427, "step": 884 }, { "epoch": 0.138068215058796, "grad_norm": 0.2764113247394562, "learning_rate": 0.00014981289396975817, "loss": 1.3084, "step": 885 }, { "epoch": 0.13822422434134832, "grad_norm": 0.3221314251422882, "learning_rate": 0.0001497005188261007, "loss": 1.0262, "step": 886 }, { "epoch": 0.13838023362390062, "grad_norm": 0.24285611510276794, "learning_rate": 0.0001495880602687941, "loss": 1.1275, "step": 887 }, { "epoch": 0.13853624290645294, "grad_norm": 0.27305787801742554, "learning_rate": 0.00014947551848658034, "loss": 1.3409, "step": 888 }, { "epoch": 0.13869225218900524, "grad_norm": 0.29822468757629395, "learning_rate": 0.00014936289366834123, "loss": 1.3696, "step": 889 }, { "epoch": 0.13884826147155757, "grad_norm": 0.259112685918808, "learning_rate": 0.00014925018600309785, "loss": 1.2456, "step": 890 }, { "epoch": 0.13900427075410987, "grad_norm": 0.28749990463256836, "learning_rate": 0.00014913739568001033, "loss": 1.2809, "step": 891 }, { "epoch": 0.13916028003666217, "grad_norm": 0.24120725691318512, "learning_rate": 0.0001490245228883776, "loss": 1.1092, "step": 892 }, { "epoch": 0.1393162893192145, "grad_norm": 0.2791595160961151, "learning_rate": 0.0001489115678176369, "loss": 1.024, "step": 893 }, { "epoch": 0.1394722986017668, "grad_norm": 0.260062038898468, "learning_rate": 0.00014879853065736365, "loss": 1.1766, "step": 894 }, { "epoch": 0.13962830788431912, "grad_norm": 0.2642684280872345, "learning_rate": 0.00014868541159727096, "loss": 1.3869, "step": 895 }, { "epoch": 0.13978431716687142, "grad_norm": 0.2463667243719101, "learning_rate": 0.00014857221082720948, "loss": 1.0662, "step": 896 }, { "epoch": 0.13994032644942375, "grad_norm": 0.2916738986968994, "learning_rate": 0.0001484589285371669, "loss": 1.3209, "step": 897 }, { "epoch": 0.14009633573197605, "grad_norm": 0.27236512303352356, "learning_rate": 0.0001483455649172678, "loss": 1.1833, "step": 898 }, { "epoch": 0.14025234501452835, "grad_norm": 0.2619946002960205, "learning_rate": 0.0001482321201577733, "loss": 1.3137, "step": 899 }, { "epoch": 0.14040835429708068, "grad_norm": 0.31396883726119995, "learning_rate": 0.00014811859444908052, "loss": 1.3727, "step": 900 }, { "epoch": 0.14056436357963298, "grad_norm": 0.25572189688682556, "learning_rate": 0.0001480049879817226, "loss": 1.1046, "step": 901 }, { "epoch": 0.1407203728621853, "grad_norm": 0.2937905490398407, "learning_rate": 0.0001478913009463682, "loss": 1.3542, "step": 902 }, { "epoch": 0.1408763821447376, "grad_norm": 0.253520131111145, "learning_rate": 0.00014777753353382119, "loss": 1.2329, "step": 903 }, { "epoch": 0.14103239142728993, "grad_norm": 0.32491999864578247, "learning_rate": 0.00014766368593502026, "loss": 1.3285, "step": 904 }, { "epoch": 0.14118840070984223, "grad_norm": 0.2527139484882355, "learning_rate": 0.00014754975834103877, "loss": 1.1277, "step": 905 }, { "epoch": 0.14134440999239456, "grad_norm": 0.275272399187088, "learning_rate": 0.00014743575094308431, "loss": 1.4177, "step": 906 }, { "epoch": 0.14150041927494686, "grad_norm": 0.26013612747192383, "learning_rate": 0.0001473216639324984, "loss": 1.2476, "step": 907 }, { "epoch": 0.14165642855749916, "grad_norm": 0.28431418538093567, "learning_rate": 0.0001472074975007562, "loss": 1.3947, "step": 908 }, { "epoch": 0.1418124378400515, "grad_norm": 0.2629927396774292, "learning_rate": 0.0001470932518394661, "loss": 1.1587, "step": 909 }, { "epoch": 0.14196844712260379, "grad_norm": 0.2944284975528717, "learning_rate": 0.00014697892714036958, "loss": 1.342, "step": 910 }, { "epoch": 0.1421244564051561, "grad_norm": 0.31365662813186646, "learning_rate": 0.00014686452359534066, "loss": 1.4326, "step": 911 }, { "epoch": 0.1422804656877084, "grad_norm": 0.255875825881958, "learning_rate": 0.0001467500413963857, "loss": 1.2305, "step": 912 }, { "epoch": 0.14243647497026074, "grad_norm": 0.2717350423336029, "learning_rate": 0.00014663548073564316, "loss": 1.1965, "step": 913 }, { "epoch": 0.14259248425281304, "grad_norm": 0.28059136867523193, "learning_rate": 0.00014652084180538302, "loss": 1.3361, "step": 914 }, { "epoch": 0.14274849353536537, "grad_norm": 0.2790951430797577, "learning_rate": 0.00014640612479800686, "loss": 1.2785, "step": 915 }, { "epoch": 0.14290450281791767, "grad_norm": 0.24599488079547882, "learning_rate": 0.00014629132990604706, "loss": 1.2433, "step": 916 }, { "epoch": 0.14306051210046997, "grad_norm": 0.288792222738266, "learning_rate": 0.00014617645732216685, "loss": 1.1779, "step": 917 }, { "epoch": 0.1432165213830223, "grad_norm": 0.3035881221294403, "learning_rate": 0.00014606150723915984, "loss": 1.3885, "step": 918 }, { "epoch": 0.1433725306655746, "grad_norm": 0.28884077072143555, "learning_rate": 0.00014594647984994964, "loss": 1.3079, "step": 919 }, { "epoch": 0.14352853994812692, "grad_norm": 0.26054033637046814, "learning_rate": 0.00014583137534758967, "loss": 1.1897, "step": 920 }, { "epoch": 0.14368454923067922, "grad_norm": 0.31249237060546875, "learning_rate": 0.00014571619392526278, "loss": 1.4518, "step": 921 }, { "epoch": 0.14384055851323155, "grad_norm": 0.27947118878364563, "learning_rate": 0.0001456009357762809, "loss": 1.2305, "step": 922 }, { "epoch": 0.14399656779578385, "grad_norm": 0.2928619980812073, "learning_rate": 0.00014548560109408466, "loss": 1.3645, "step": 923 }, { "epoch": 0.14415257707833617, "grad_norm": 0.2735868990421295, "learning_rate": 0.00014537019007224324, "loss": 1.4351, "step": 924 }, { "epoch": 0.14430858636088847, "grad_norm": 0.30757883191108704, "learning_rate": 0.00014525470290445392, "loss": 1.4073, "step": 925 }, { "epoch": 0.14446459564344077, "grad_norm": 0.28719013929367065, "learning_rate": 0.00014513913978454168, "loss": 1.2918, "step": 926 }, { "epoch": 0.1446206049259931, "grad_norm": 0.2720332145690918, "learning_rate": 0.00014502350090645917, "loss": 1.2763, "step": 927 }, { "epoch": 0.1447766142085454, "grad_norm": 0.24720966815948486, "learning_rate": 0.000144907786464286, "loss": 1.0549, "step": 928 }, { "epoch": 0.14493262349109773, "grad_norm": 0.3164946138858795, "learning_rate": 0.0001447919966522287, "loss": 1.1007, "step": 929 }, { "epoch": 0.14508863277365003, "grad_norm": 0.2940044105052948, "learning_rate": 0.00014467613166462023, "loss": 1.2818, "step": 930 }, { "epoch": 0.14524464205620236, "grad_norm": 0.34050655364990234, "learning_rate": 0.00014456019169591978, "loss": 1.2618, "step": 931 }, { "epoch": 0.14540065133875466, "grad_norm": 0.24612417817115784, "learning_rate": 0.0001444441769407124, "loss": 0.991, "step": 932 }, { "epoch": 0.14555666062130695, "grad_norm": 0.2636529505252838, "learning_rate": 0.00014432808759370854, "loss": 1.4259, "step": 933 }, { "epoch": 0.14571266990385928, "grad_norm": 0.2628234624862671, "learning_rate": 0.00014421192384974396, "loss": 1.2545, "step": 934 }, { "epoch": 0.14586867918641158, "grad_norm": 0.2733708918094635, "learning_rate": 0.00014409568590377918, "loss": 1.1442, "step": 935 }, { "epoch": 0.1460246884689639, "grad_norm": 0.24912774562835693, "learning_rate": 0.0001439793739508994, "loss": 1.039, "step": 936 }, { "epoch": 0.1461806977515162, "grad_norm": 0.2927952706813812, "learning_rate": 0.00014386298818631386, "loss": 1.179, "step": 937 }, { "epoch": 0.14633670703406854, "grad_norm": 0.29066377878189087, "learning_rate": 0.0001437465288053558, "loss": 1.2024, "step": 938 }, { "epoch": 0.14649271631662084, "grad_norm": 0.2862846553325653, "learning_rate": 0.00014362999600348196, "loss": 1.1401, "step": 939 }, { "epoch": 0.14664872559917316, "grad_norm": 0.3009769022464752, "learning_rate": 0.00014351338997627234, "loss": 1.3966, "step": 940 }, { "epoch": 0.14680473488172546, "grad_norm": 0.31753668189048767, "learning_rate": 0.00014339671091942978, "loss": 1.4626, "step": 941 }, { "epoch": 0.14696074416427776, "grad_norm": 0.28623080253601074, "learning_rate": 0.0001432799590287797, "loss": 1.2841, "step": 942 }, { "epoch": 0.1471167534468301, "grad_norm": 0.3344881534576416, "learning_rate": 0.00014316313450026986, "loss": 1.5589, "step": 943 }, { "epoch": 0.1472727627293824, "grad_norm": 0.3132301867008209, "learning_rate": 0.00014304623752996973, "loss": 1.4286, "step": 944 }, { "epoch": 0.14742877201193472, "grad_norm": 0.299078106880188, "learning_rate": 0.00014292926831407061, "loss": 1.2099, "step": 945 }, { "epoch": 0.14758478129448702, "grad_norm": 0.27058905363082886, "learning_rate": 0.0001428122270488848, "loss": 1.2331, "step": 946 }, { "epoch": 0.14774079057703934, "grad_norm": 0.3202461004257202, "learning_rate": 0.00014269511393084572, "loss": 1.0677, "step": 947 }, { "epoch": 0.14789679985959164, "grad_norm": 0.3005964756011963, "learning_rate": 0.00014257792915650728, "loss": 1.3382, "step": 948 }, { "epoch": 0.14805280914214397, "grad_norm": 0.28587067127227783, "learning_rate": 0.00014246067292254366, "loss": 1.2216, "step": 949 }, { "epoch": 0.14820881842469627, "grad_norm": 0.27515730261802673, "learning_rate": 0.00014234334542574906, "loss": 1.1608, "step": 950 }, { "epoch": 0.14836482770724857, "grad_norm": 0.26588740944862366, "learning_rate": 0.00014222594686303706, "loss": 1.1547, "step": 951 }, { "epoch": 0.1485208369898009, "grad_norm": 0.3122014105319977, "learning_rate": 0.00014210847743144087, "loss": 1.3642, "step": 952 }, { "epoch": 0.1486768462723532, "grad_norm": 0.34852224588394165, "learning_rate": 0.00014199093732811225, "loss": 1.4751, "step": 953 }, { "epoch": 0.14883285555490552, "grad_norm": 0.2674144208431244, "learning_rate": 0.00014187332675032188, "loss": 1.2941, "step": 954 }, { "epoch": 0.14898886483745782, "grad_norm": 0.30863744020462036, "learning_rate": 0.00014175564589545854, "loss": 1.298, "step": 955 }, { "epoch": 0.14914487412001015, "grad_norm": 0.26412221789360046, "learning_rate": 0.00014163789496102902, "loss": 1.218, "step": 956 }, { "epoch": 0.14930088340256245, "grad_norm": 0.2920873761177063, "learning_rate": 0.0001415200741446577, "loss": 1.5198, "step": 957 }, { "epoch": 0.14945689268511475, "grad_norm": 0.29869547486305237, "learning_rate": 0.00014140218364408632, "loss": 1.3896, "step": 958 }, { "epoch": 0.14961290196766708, "grad_norm": 0.2696417570114136, "learning_rate": 0.00014128422365717347, "loss": 1.2046, "step": 959 }, { "epoch": 0.14976891125021938, "grad_norm": 0.27298402786254883, "learning_rate": 0.0001411661943818944, "loss": 1.3599, "step": 960 }, { "epoch": 0.1499249205327717, "grad_norm": 0.27962544560432434, "learning_rate": 0.0001410480960163407, "loss": 1.25, "step": 961 }, { "epoch": 0.150080929815324, "grad_norm": 0.2612510323524475, "learning_rate": 0.00014092992875871979, "loss": 1.1053, "step": 962 }, { "epoch": 0.15023693909787633, "grad_norm": 0.27618667483329773, "learning_rate": 0.00014081169280735488, "loss": 1.3871, "step": 963 }, { "epoch": 0.15039294838042863, "grad_norm": 0.24976608157157898, "learning_rate": 0.00014069338836068433, "loss": 1.2613, "step": 964 }, { "epoch": 0.15054895766298096, "grad_norm": 0.267610102891922, "learning_rate": 0.00014057501561726157, "loss": 1.0631, "step": 965 }, { "epoch": 0.15070496694553326, "grad_norm": 0.29677531123161316, "learning_rate": 0.00014045657477575448, "loss": 1.3567, "step": 966 }, { "epoch": 0.15086097622808556, "grad_norm": 0.29539185762405396, "learning_rate": 0.0001403380660349455, "loss": 1.1386, "step": 967 }, { "epoch": 0.15101698551063789, "grad_norm": 0.2691122889518738, "learning_rate": 0.00014021948959373076, "loss": 1.1089, "step": 968 }, { "epoch": 0.15117299479319019, "grad_norm": 0.24394790828227997, "learning_rate": 0.0001401008456511202, "loss": 1.1893, "step": 969 }, { "epoch": 0.1513290040757425, "grad_norm": 0.2849481403827667, "learning_rate": 0.0001399821344062369, "loss": 1.4775, "step": 970 }, { "epoch": 0.1514850133582948, "grad_norm": 0.2634568512439728, "learning_rate": 0.00013986335605831705, "loss": 1.1655, "step": 971 }, { "epoch": 0.15164102264084714, "grad_norm": 0.269879013299942, "learning_rate": 0.00013974451080670934, "loss": 1.2047, "step": 972 }, { "epoch": 0.15179703192339944, "grad_norm": 0.27636033296585083, "learning_rate": 0.0001396255988508748, "loss": 1.2987, "step": 973 }, { "epoch": 0.15195304120595177, "grad_norm": 0.2572225332260132, "learning_rate": 0.00013950662039038643, "loss": 1.3322, "step": 974 }, { "epoch": 0.15210905048850407, "grad_norm": 0.2573801279067993, "learning_rate": 0.00013938757562492873, "loss": 1.2547, "step": 975 }, { "epoch": 0.15226505977105637, "grad_norm": 0.3160158395767212, "learning_rate": 0.00013926846475429766, "loss": 1.5537, "step": 976 }, { "epoch": 0.1524210690536087, "grad_norm": 0.30125337839126587, "learning_rate": 0.00013914928797839995, "loss": 1.0853, "step": 977 }, { "epoch": 0.152577078336161, "grad_norm": 0.25772640109062195, "learning_rate": 0.0001390300454972531, "loss": 1.198, "step": 978 }, { "epoch": 0.15273308761871332, "grad_norm": 0.257586270570755, "learning_rate": 0.0001389107375109848, "loss": 1.086, "step": 979 }, { "epoch": 0.15288909690126562, "grad_norm": 0.2763863205909729, "learning_rate": 0.00013879136421983266, "loss": 1.2639, "step": 980 }, { "epoch": 0.15304510618381795, "grad_norm": 0.2751125991344452, "learning_rate": 0.00013867192582414393, "loss": 1.2473, "step": 981 }, { "epoch": 0.15320111546637025, "grad_norm": 0.3138543367385864, "learning_rate": 0.0001385524225243751, "loss": 1.3107, "step": 982 }, { "epoch": 0.15335712474892257, "grad_norm": 0.27820733189582825, "learning_rate": 0.00013843285452109166, "loss": 1.048, "step": 983 }, { "epoch": 0.15351313403147487, "grad_norm": 0.25756746530532837, "learning_rate": 0.00013831322201496757, "loss": 1.0374, "step": 984 }, { "epoch": 0.15366914331402717, "grad_norm": 0.332603394985199, "learning_rate": 0.0001381935252067852, "loss": 1.3359, "step": 985 }, { "epoch": 0.1538251525965795, "grad_norm": 0.33936744928359985, "learning_rate": 0.00013807376429743467, "loss": 1.5814, "step": 986 }, { "epoch": 0.1539811618791318, "grad_norm": 0.2748062014579773, "learning_rate": 0.00013795393948791383, "loss": 1.201, "step": 987 }, { "epoch": 0.15413717116168413, "grad_norm": 0.26038771867752075, "learning_rate": 0.0001378340509793277, "loss": 1.2087, "step": 988 }, { "epoch": 0.15429318044423643, "grad_norm": 0.24746748805046082, "learning_rate": 0.00013771409897288822, "loss": 1.0487, "step": 989 }, { "epoch": 0.15444918972678875, "grad_norm": 0.270280122756958, "learning_rate": 0.0001375940836699139, "loss": 1.1529, "step": 990 }, { "epoch": 0.15460519900934105, "grad_norm": 0.28278234601020813, "learning_rate": 0.00013747400527182953, "loss": 1.4292, "step": 991 }, { "epoch": 0.15476120829189335, "grad_norm": 0.3091171681880951, "learning_rate": 0.0001373538639801657, "loss": 1.2118, "step": 992 }, { "epoch": 0.15491721757444568, "grad_norm": 0.264275461435318, "learning_rate": 0.0001372336599965586, "loss": 1.2727, "step": 993 }, { "epoch": 0.15507322685699798, "grad_norm": 0.3125738799571991, "learning_rate": 0.00013711339352274966, "loss": 1.3389, "step": 994 }, { "epoch": 0.1552292361395503, "grad_norm": 0.2750801146030426, "learning_rate": 0.0001369930647605852, "loss": 1.1031, "step": 995 }, { "epoch": 0.1553852454221026, "grad_norm": 0.274777889251709, "learning_rate": 0.00013687267391201605, "loss": 1.4329, "step": 996 }, { "epoch": 0.15554125470465494, "grad_norm": 0.28475117683410645, "learning_rate": 0.00013675222117909717, "loss": 1.1914, "step": 997 }, { "epoch": 0.15569726398720724, "grad_norm": 0.27364879846572876, "learning_rate": 0.00013663170676398752, "loss": 1.1511, "step": 998 }, { "epoch": 0.15585327326975956, "grad_norm": 0.310995489358902, "learning_rate": 0.00013651113086894952, "loss": 1.0349, "step": 999 }, { "epoch": 0.15600928255231186, "grad_norm": 0.2910314202308655, "learning_rate": 0.00013639049369634876, "loss": 1.3302, "step": 1000 }, { "epoch": 0.15600928255231186, "eval_loss": 1.2771576642990112, "eval_runtime": 110.8263, "eval_samples_per_second": 38.556, "eval_steps_per_second": 4.827, "step": 1000 } ], "logging_steps": 1, "max_steps": 2500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.144559113202074e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }