{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 0, "global_step": 511, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0019569471624266144, "grad_norm": 1.746453046798706, "learning_rate": 9.980430528375734e-06, "loss": 1.5399, "step": 1 }, { "epoch": 0.003913894324853229, "grad_norm": 0.8929949998855591, "learning_rate": 9.960861056751468e-06, "loss": 1.4639, "step": 2 }, { "epoch": 0.005870841487279843, "grad_norm": 0.7871854305267334, "learning_rate": 9.941291585127202e-06, "loss": 1.3791, "step": 3 }, { "epoch": 0.007827788649706457, "grad_norm": 1.535941243171692, "learning_rate": 9.921722113502935e-06, "loss": 1.5319, "step": 4 }, { "epoch": 0.009784735812133072, "grad_norm": 0.8197427988052368, "learning_rate": 9.902152641878669e-06, "loss": 1.5193, "step": 5 }, { "epoch": 0.011741682974559686, "grad_norm": 0.7342958450317383, "learning_rate": 9.882583170254404e-06, "loss": 1.3804, "step": 6 }, { "epoch": 0.0136986301369863, "grad_norm": 0.6128910183906555, "learning_rate": 9.863013698630138e-06, "loss": 1.3007, "step": 7 }, { "epoch": 0.015655577299412915, "grad_norm": 0.6029248833656311, "learning_rate": 9.843444227005872e-06, "loss": 1.3657, "step": 8 }, { "epoch": 0.01761252446183953, "grad_norm": 0.5792056322097778, "learning_rate": 9.823874755381605e-06, "loss": 1.4067, "step": 9 }, { "epoch": 0.019569471624266144, "grad_norm": 0.6559755206108093, "learning_rate": 9.804305283757339e-06, "loss": 1.4272, "step": 10 }, { "epoch": 0.021526418786692758, "grad_norm": 0.5778592228889465, "learning_rate": 9.784735812133073e-06, "loss": 1.4143, "step": 11 }, { "epoch": 0.023483365949119372, "grad_norm": 0.5314830541610718, "learning_rate": 9.765166340508806e-06, "loss": 1.3531, "step": 12 }, { "epoch": 0.025440313111545987, "grad_norm": 0.49222293496131897, "learning_rate": 9.74559686888454e-06, "loss": 1.4155, "step": 13 }, { "epoch": 0.0273972602739726, "grad_norm": 0.9724328517913818, "learning_rate": 9.726027397260275e-06, "loss": 1.2311, "step": 14 }, { "epoch": 0.029354207436399216, "grad_norm": 0.488921582698822, "learning_rate": 9.706457925636007e-06, "loss": 1.3007, "step": 15 }, { "epoch": 0.03131115459882583, "grad_norm": 0.49755173921585083, "learning_rate": 9.686888454011743e-06, "loss": 1.2708, "step": 16 }, { "epoch": 0.033268101761252444, "grad_norm": 0.49553531408309937, "learning_rate": 9.667318982387476e-06, "loss": 1.3346, "step": 17 }, { "epoch": 0.03522504892367906, "grad_norm": 0.40435364842414856, "learning_rate": 9.64774951076321e-06, "loss": 1.2333, "step": 18 }, { "epoch": 0.03718199608610567, "grad_norm": 0.46682047843933105, "learning_rate": 9.628180039138944e-06, "loss": 1.2733, "step": 19 }, { "epoch": 0.03913894324853229, "grad_norm": 0.4170684218406677, "learning_rate": 9.608610567514677e-06, "loss": 1.2262, "step": 20 }, { "epoch": 0.0410958904109589, "grad_norm": 0.4080331027507782, "learning_rate": 9.589041095890411e-06, "loss": 1.1896, "step": 21 }, { "epoch": 0.043052837573385516, "grad_norm": 0.3215958774089813, "learning_rate": 9.569471624266146e-06, "loss": 1.2084, "step": 22 }, { "epoch": 0.04500978473581213, "grad_norm": 0.34072086215019226, "learning_rate": 9.549902152641878e-06, "loss": 1.2204, "step": 23 }, { "epoch": 0.046966731898238745, "grad_norm": 0.34246671199798584, "learning_rate": 9.530332681017614e-06, "loss": 1.2544, "step": 24 }, { "epoch": 0.04892367906066536, "grad_norm": 0.31154486536979675, "learning_rate": 9.510763209393347e-06, "loss": 1.2244, "step": 25 }, { "epoch": 0.050880626223091974, "grad_norm": 0.29958635568618774, "learning_rate": 9.49119373776908e-06, "loss": 1.1961, "step": 26 }, { "epoch": 0.05283757338551859, "grad_norm": 0.3322153687477112, "learning_rate": 9.471624266144814e-06, "loss": 1.1918, "step": 27 }, { "epoch": 0.0547945205479452, "grad_norm": 0.30765673518180847, "learning_rate": 9.452054794520548e-06, "loss": 1.2193, "step": 28 }, { "epoch": 0.05675146771037182, "grad_norm": 0.3587987720966339, "learning_rate": 9.432485322896282e-06, "loss": 1.2119, "step": 29 }, { "epoch": 0.05870841487279843, "grad_norm": 0.3882049024105072, "learning_rate": 9.412915851272017e-06, "loss": 1.2632, "step": 30 }, { "epoch": 0.060665362035225046, "grad_norm": 0.31729480624198914, "learning_rate": 9.393346379647749e-06, "loss": 1.1768, "step": 31 }, { "epoch": 0.06262230919765166, "grad_norm": 0.32497256994247437, "learning_rate": 9.373776908023484e-06, "loss": 1.1603, "step": 32 }, { "epoch": 0.06457925636007827, "grad_norm": 0.3473309576511383, "learning_rate": 9.354207436399218e-06, "loss": 1.2131, "step": 33 }, { "epoch": 0.06653620352250489, "grad_norm": 0.35079726576805115, "learning_rate": 9.334637964774952e-06, "loss": 1.1437, "step": 34 }, { "epoch": 0.0684931506849315, "grad_norm": 0.33893632888793945, "learning_rate": 9.315068493150685e-06, "loss": 1.1662, "step": 35 }, { "epoch": 0.07045009784735812, "grad_norm": 0.2970063090324402, "learning_rate": 9.295499021526419e-06, "loss": 1.1354, "step": 36 }, { "epoch": 0.07240704500978473, "grad_norm": 0.29775238037109375, "learning_rate": 9.275929549902153e-06, "loss": 1.1152, "step": 37 }, { "epoch": 0.07436399217221135, "grad_norm": 0.28359663486480713, "learning_rate": 9.256360078277888e-06, "loss": 1.1677, "step": 38 }, { "epoch": 0.07632093933463796, "grad_norm": 0.45086753368377686, "learning_rate": 9.23679060665362e-06, "loss": 1.1375, "step": 39 }, { "epoch": 0.07827788649706457, "grad_norm": 0.3175908029079437, "learning_rate": 9.217221135029355e-06, "loss": 1.1484, "step": 40 }, { "epoch": 0.08023483365949119, "grad_norm": 0.28813987970352173, "learning_rate": 9.197651663405089e-06, "loss": 1.1496, "step": 41 }, { "epoch": 0.0821917808219178, "grad_norm": 0.43875834345817566, "learning_rate": 9.178082191780823e-06, "loss": 1.1561, "step": 42 }, { "epoch": 0.08414872798434442, "grad_norm": 0.29168814420700073, "learning_rate": 9.158512720156556e-06, "loss": 1.123, "step": 43 }, { "epoch": 0.08610567514677103, "grad_norm": 0.2780202627182007, "learning_rate": 9.13894324853229e-06, "loss": 1.0861, "step": 44 }, { "epoch": 0.08806262230919765, "grad_norm": 0.31432074308395386, "learning_rate": 9.119373776908024e-06, "loss": 1.1191, "step": 45 }, { "epoch": 0.09001956947162426, "grad_norm": 0.31427478790283203, "learning_rate": 9.099804305283759e-06, "loss": 1.1696, "step": 46 }, { "epoch": 0.09197651663405088, "grad_norm": 0.3010156452655792, "learning_rate": 9.080234833659491e-06, "loss": 1.1092, "step": 47 }, { "epoch": 0.09393346379647749, "grad_norm": 0.3034595549106598, "learning_rate": 9.060665362035226e-06, "loss": 1.1503, "step": 48 }, { "epoch": 0.0958904109589041, "grad_norm": 0.29212486743927, "learning_rate": 9.04109589041096e-06, "loss": 1.063, "step": 49 }, { "epoch": 0.09784735812133072, "grad_norm": 0.3151186406612396, "learning_rate": 9.021526418786694e-06, "loss": 1.1331, "step": 50 }, { "epoch": 0.09980430528375733, "grad_norm": 0.31028345227241516, "learning_rate": 9.001956947162427e-06, "loss": 1.082, "step": 51 }, { "epoch": 0.10176125244618395, "grad_norm": 0.2922515869140625, "learning_rate": 8.982387475538161e-06, "loss": 1.1151, "step": 52 }, { "epoch": 0.10371819960861056, "grad_norm": 0.3567720651626587, "learning_rate": 8.962818003913895e-06, "loss": 1.075, "step": 53 }, { "epoch": 0.10567514677103718, "grad_norm": 0.2653373181819916, "learning_rate": 8.943248532289628e-06, "loss": 1.1201, "step": 54 }, { "epoch": 0.10763209393346379, "grad_norm": 0.301695317029953, "learning_rate": 8.923679060665362e-06, "loss": 1.0929, "step": 55 }, { "epoch": 0.1095890410958904, "grad_norm": 0.407412052154541, "learning_rate": 8.904109589041097e-06, "loss": 1.0487, "step": 56 }, { "epoch": 0.11154598825831702, "grad_norm": 0.2796148657798767, "learning_rate": 8.88454011741683e-06, "loss": 1.0566, "step": 57 }, { "epoch": 0.11350293542074363, "grad_norm": 0.26347002387046814, "learning_rate": 8.864970645792564e-06, "loss": 1.0474, "step": 58 }, { "epoch": 0.11545988258317025, "grad_norm": 0.40815216302871704, "learning_rate": 8.845401174168298e-06, "loss": 1.0143, "step": 59 }, { "epoch": 0.11741682974559686, "grad_norm": 0.3168679475784302, "learning_rate": 8.825831702544032e-06, "loss": 0.9874, "step": 60 }, { "epoch": 0.11937377690802348, "grad_norm": 0.3845454454421997, "learning_rate": 8.806262230919765e-06, "loss": 0.9675, "step": 61 }, { "epoch": 0.12133072407045009, "grad_norm": 0.3026748299598694, "learning_rate": 8.786692759295499e-06, "loss": 1.0715, "step": 62 }, { "epoch": 0.1232876712328767, "grad_norm": 0.3220444917678833, "learning_rate": 8.767123287671233e-06, "loss": 0.9537, "step": 63 }, { "epoch": 0.12524461839530332, "grad_norm": 0.2780131995677948, "learning_rate": 8.747553816046968e-06, "loss": 0.9989, "step": 64 }, { "epoch": 0.12720156555772993, "grad_norm": 0.2975338101387024, "learning_rate": 8.7279843444227e-06, "loss": 1.0493, "step": 65 }, { "epoch": 0.12915851272015655, "grad_norm": 0.29012811183929443, "learning_rate": 8.708414872798435e-06, "loss": 1.0393, "step": 66 }, { "epoch": 0.13111545988258316, "grad_norm": 0.25619998574256897, "learning_rate": 8.688845401174169e-06, "loss": 1.0265, "step": 67 }, { "epoch": 0.13307240704500978, "grad_norm": 0.27613916993141174, "learning_rate": 8.669275929549903e-06, "loss": 1.0921, "step": 68 }, { "epoch": 0.1350293542074364, "grad_norm": 0.28087612986564636, "learning_rate": 8.649706457925636e-06, "loss": 1.0375, "step": 69 }, { "epoch": 0.136986301369863, "grad_norm": 0.2618680000305176, "learning_rate": 8.63013698630137e-06, "loss": 1.0382, "step": 70 }, { "epoch": 0.13894324853228962, "grad_norm": 0.38419079780578613, "learning_rate": 8.610567514677104e-06, "loss": 1.0343, "step": 71 }, { "epoch": 0.14090019569471623, "grad_norm": 0.2579880654811859, "learning_rate": 8.590998043052839e-06, "loss": 1.034, "step": 72 }, { "epoch": 0.14285714285714285, "grad_norm": 0.2773086726665497, "learning_rate": 8.571428571428571e-06, "loss": 1.0823, "step": 73 }, { "epoch": 0.14481409001956946, "grad_norm": 0.2644873559474945, "learning_rate": 8.551859099804306e-06, "loss": 1.0831, "step": 74 }, { "epoch": 0.14677103718199608, "grad_norm": 0.3036477863788605, "learning_rate": 8.53228962818004e-06, "loss": 1.0366, "step": 75 }, { "epoch": 0.1487279843444227, "grad_norm": 0.25494951009750366, "learning_rate": 8.512720156555774e-06, "loss": 0.9652, "step": 76 }, { "epoch": 0.1506849315068493, "grad_norm": 0.27949514985084534, "learning_rate": 8.493150684931507e-06, "loss": 1.0384, "step": 77 }, { "epoch": 0.15264187866927592, "grad_norm": 0.2715696096420288, "learning_rate": 8.473581213307241e-06, "loss": 1.1061, "step": 78 }, { "epoch": 0.15459882583170254, "grad_norm": 0.36530429124832153, "learning_rate": 8.454011741682975e-06, "loss": 1.103, "step": 79 }, { "epoch": 0.15655577299412915, "grad_norm": 0.3417298495769501, "learning_rate": 8.43444227005871e-06, "loss": 1.0395, "step": 80 }, { "epoch": 0.15851272015655576, "grad_norm": 0.25749561190605164, "learning_rate": 8.414872798434442e-06, "loss": 1.0554, "step": 81 }, { "epoch": 0.16046966731898238, "grad_norm": 0.30251964926719666, "learning_rate": 8.395303326810177e-06, "loss": 1.0466, "step": 82 }, { "epoch": 0.162426614481409, "grad_norm": 0.27155768871307373, "learning_rate": 8.37573385518591e-06, "loss": 1.0019, "step": 83 }, { "epoch": 0.1643835616438356, "grad_norm": 0.2923905551433563, "learning_rate": 8.356164383561644e-06, "loss": 1.0335, "step": 84 }, { "epoch": 0.16634050880626222, "grad_norm": 0.2730099558830261, "learning_rate": 8.336594911937378e-06, "loss": 1.0066, "step": 85 }, { "epoch": 0.16829745596868884, "grad_norm": 0.27152329683303833, "learning_rate": 8.317025440313112e-06, "loss": 1.0408, "step": 86 }, { "epoch": 0.17025440313111545, "grad_norm": 0.2805017828941345, "learning_rate": 8.297455968688845e-06, "loss": 1.0159, "step": 87 }, { "epoch": 0.17221135029354206, "grad_norm": 0.30287447571754456, "learning_rate": 8.27788649706458e-06, "loss": 0.9943, "step": 88 }, { "epoch": 0.17416829745596868, "grad_norm": 0.4621107280254364, "learning_rate": 8.258317025440313e-06, "loss": 0.9984, "step": 89 }, { "epoch": 0.1761252446183953, "grad_norm": 0.27693963050842285, "learning_rate": 8.238747553816048e-06, "loss": 1.0471, "step": 90 }, { "epoch": 0.1780821917808219, "grad_norm": 0.2575695514678955, "learning_rate": 8.219178082191782e-06, "loss": 1.0124, "step": 91 }, { "epoch": 0.18003913894324852, "grad_norm": 0.3268100321292877, "learning_rate": 8.199608610567515e-06, "loss": 1.0201, "step": 92 }, { "epoch": 0.18199608610567514, "grad_norm": 0.2674817144870758, "learning_rate": 8.180039138943249e-06, "loss": 1.0491, "step": 93 }, { "epoch": 0.18395303326810175, "grad_norm": 0.29703083634376526, "learning_rate": 8.160469667318983e-06, "loss": 0.9988, "step": 94 }, { "epoch": 0.18590998043052837, "grad_norm": 0.3002019226551056, "learning_rate": 8.140900195694716e-06, "loss": 0.983, "step": 95 }, { "epoch": 0.18786692759295498, "grad_norm": 0.28777456283569336, "learning_rate": 8.121330724070452e-06, "loss": 1.0218, "step": 96 }, { "epoch": 0.1898238747553816, "grad_norm": 0.27293819189071655, "learning_rate": 8.101761252446184e-06, "loss": 1.0338, "step": 97 }, { "epoch": 0.1917808219178082, "grad_norm": 0.288841187953949, "learning_rate": 8.082191780821919e-06, "loss": 1.0355, "step": 98 }, { "epoch": 0.19373776908023482, "grad_norm": 0.2783367931842804, "learning_rate": 8.062622309197653e-06, "loss": 1.0328, "step": 99 }, { "epoch": 0.19569471624266144, "grad_norm": 0.3079596161842346, "learning_rate": 8.043052837573386e-06, "loss": 1.0034, "step": 100 }, { "epoch": 0.19765166340508805, "grad_norm": 0.27803629636764526, "learning_rate": 8.02348336594912e-06, "loss": 0.9606, "step": 101 }, { "epoch": 0.19960861056751467, "grad_norm": 0.2793106138706207, "learning_rate": 8.003913894324854e-06, "loss": 0.9918, "step": 102 }, { "epoch": 0.20156555772994128, "grad_norm": 0.3062870502471924, "learning_rate": 7.984344422700587e-06, "loss": 1.027, "step": 103 }, { "epoch": 0.2035225048923679, "grad_norm": 0.2591916620731354, "learning_rate": 7.964774951076321e-06, "loss": 0.9696, "step": 104 }, { "epoch": 0.2054794520547945, "grad_norm": 0.27566251158714294, "learning_rate": 7.945205479452055e-06, "loss": 0.9723, "step": 105 }, { "epoch": 0.20743639921722112, "grad_norm": 0.5589897632598877, "learning_rate": 7.92563600782779e-06, "loss": 0.8956, "step": 106 }, { "epoch": 0.20939334637964774, "grad_norm": 0.3209697902202606, "learning_rate": 7.906066536203524e-06, "loss": 1.0195, "step": 107 }, { "epoch": 0.21135029354207435, "grad_norm": 0.5480762720108032, "learning_rate": 7.886497064579257e-06, "loss": 0.8403, "step": 108 }, { "epoch": 0.21330724070450097, "grad_norm": 0.27812933921813965, "learning_rate": 7.86692759295499e-06, "loss": 0.9995, "step": 109 }, { "epoch": 0.21526418786692758, "grad_norm": 0.3054767847061157, "learning_rate": 7.847358121330724e-06, "loss": 0.9886, "step": 110 }, { "epoch": 0.2172211350293542, "grad_norm": 0.26338857412338257, "learning_rate": 7.827788649706458e-06, "loss": 1.0099, "step": 111 }, { "epoch": 0.2191780821917808, "grad_norm": 0.28542402386665344, "learning_rate": 7.808219178082192e-06, "loss": 1.0108, "step": 112 }, { "epoch": 0.22113502935420742, "grad_norm": 0.2645825147628784, "learning_rate": 7.788649706457925e-06, "loss": 0.9855, "step": 113 }, { "epoch": 0.22309197651663404, "grad_norm": 0.36923593282699585, "learning_rate": 7.76908023483366e-06, "loss": 0.874, "step": 114 }, { "epoch": 0.22504892367906065, "grad_norm": 0.2942226827144623, "learning_rate": 7.749510763209393e-06, "loss": 1.035, "step": 115 }, { "epoch": 0.22700587084148727, "grad_norm": 0.25831156969070435, "learning_rate": 7.729941291585128e-06, "loss": 0.9821, "step": 116 }, { "epoch": 0.22896281800391388, "grad_norm": 0.2656974196434021, "learning_rate": 7.710371819960862e-06, "loss": 0.9607, "step": 117 }, { "epoch": 0.2309197651663405, "grad_norm": 0.2775110602378845, "learning_rate": 7.690802348336595e-06, "loss": 0.9405, "step": 118 }, { "epoch": 0.2328767123287671, "grad_norm": 0.2815232276916504, "learning_rate": 7.671232876712329e-06, "loss": 0.9963, "step": 119 }, { "epoch": 0.23483365949119372, "grad_norm": 0.2941558063030243, "learning_rate": 7.651663405088063e-06, "loss": 1.0005, "step": 120 }, { "epoch": 0.23679060665362034, "grad_norm": 0.3432468771934509, "learning_rate": 7.632093933463796e-06, "loss": 0.9146, "step": 121 }, { "epoch": 0.23874755381604695, "grad_norm": 0.2610355615615845, "learning_rate": 7.612524461839531e-06, "loss": 1.0172, "step": 122 }, { "epoch": 0.24070450097847357, "grad_norm": 0.30524012446403503, "learning_rate": 7.5929549902152645e-06, "loss": 0.8499, "step": 123 }, { "epoch": 0.24266144814090018, "grad_norm": 0.32364916801452637, "learning_rate": 7.573385518590999e-06, "loss": 0.9872, "step": 124 }, { "epoch": 0.2446183953033268, "grad_norm": 0.3468589186668396, "learning_rate": 7.553816046966732e-06, "loss": 0.8668, "step": 125 }, { "epoch": 0.2465753424657534, "grad_norm": 0.28638043999671936, "learning_rate": 7.534246575342466e-06, "loss": 0.9535, "step": 126 }, { "epoch": 0.24853228962818003, "grad_norm": 0.4365461766719818, "learning_rate": 7.5146771037182e-06, "loss": 0.9555, "step": 127 }, { "epoch": 0.25048923679060664, "grad_norm": 0.2678782641887665, "learning_rate": 7.4951076320939344e-06, "loss": 0.9609, "step": 128 }, { "epoch": 0.25244618395303325, "grad_norm": 0.32698872685432434, "learning_rate": 7.475538160469667e-06, "loss": 0.9961, "step": 129 }, { "epoch": 0.25440313111545987, "grad_norm": 0.2704651653766632, "learning_rate": 7.455968688845402e-06, "loss": 0.9892, "step": 130 }, { "epoch": 0.2563600782778865, "grad_norm": 0.28522607684135437, "learning_rate": 7.436399217221135e-06, "loss": 0.9704, "step": 131 }, { "epoch": 0.2583170254403131, "grad_norm": 0.3018089532852173, "learning_rate": 7.41682974559687e-06, "loss": 0.9922, "step": 132 }, { "epoch": 0.2602739726027397, "grad_norm": 0.3053472638130188, "learning_rate": 7.397260273972603e-06, "loss": 0.9679, "step": 133 }, { "epoch": 0.2622309197651663, "grad_norm": 0.3184056580066681, "learning_rate": 7.377690802348337e-06, "loss": 0.993, "step": 134 }, { "epoch": 0.26418786692759294, "grad_norm": 0.2696513831615448, "learning_rate": 7.358121330724071e-06, "loss": 0.9933, "step": 135 }, { "epoch": 0.26614481409001955, "grad_norm": 0.2935352921485901, "learning_rate": 7.338551859099805e-06, "loss": 0.9124, "step": 136 }, { "epoch": 0.26810176125244617, "grad_norm": 0.29200154542922974, "learning_rate": 7.318982387475538e-06, "loss": 1.0168, "step": 137 }, { "epoch": 0.2700587084148728, "grad_norm": 0.29628440737724304, "learning_rate": 7.299412915851273e-06, "loss": 0.8931, "step": 138 }, { "epoch": 0.2720156555772994, "grad_norm": 0.2664463520050049, "learning_rate": 7.279843444227006e-06, "loss": 0.9743, "step": 139 }, { "epoch": 0.273972602739726, "grad_norm": 0.3182372748851776, "learning_rate": 7.260273972602741e-06, "loss": 0.961, "step": 140 }, { "epoch": 0.2759295499021526, "grad_norm": 0.2961776554584503, "learning_rate": 7.240704500978474e-06, "loss": 0.9807, "step": 141 }, { "epoch": 0.27788649706457924, "grad_norm": 0.2903692126274109, "learning_rate": 7.221135029354208e-06, "loss": 0.9685, "step": 142 }, { "epoch": 0.27984344422700586, "grad_norm": 0.43462836742401123, "learning_rate": 7.201565557729942e-06, "loss": 0.9757, "step": 143 }, { "epoch": 0.28180039138943247, "grad_norm": 0.4679277241230011, "learning_rate": 7.181996086105676e-06, "loss": 0.9829, "step": 144 }, { "epoch": 0.2837573385518591, "grad_norm": 0.3153940737247467, "learning_rate": 7.162426614481409e-06, "loss": 1.0502, "step": 145 }, { "epoch": 0.2857142857142857, "grad_norm": 0.2722516357898712, "learning_rate": 7.1428571428571436e-06, "loss": 0.9777, "step": 146 }, { "epoch": 0.2876712328767123, "grad_norm": 0.30617383122444153, "learning_rate": 7.123287671232877e-06, "loss": 0.8937, "step": 147 }, { "epoch": 0.2896281800391389, "grad_norm": 0.28956839442253113, "learning_rate": 7.103718199608612e-06, "loss": 0.9796, "step": 148 }, { "epoch": 0.29158512720156554, "grad_norm": 0.31176698207855225, "learning_rate": 7.0841487279843445e-06, "loss": 0.9775, "step": 149 }, { "epoch": 0.29354207436399216, "grad_norm": 0.3150478005409241, "learning_rate": 7.064579256360079e-06, "loss": 0.9624, "step": 150 }, { "epoch": 0.29549902152641877, "grad_norm": 0.26309195160865784, "learning_rate": 7.045009784735813e-06, "loss": 1.0398, "step": 151 }, { "epoch": 0.2974559686888454, "grad_norm": 0.3138732612133026, "learning_rate": 7.025440313111546e-06, "loss": 0.9977, "step": 152 }, { "epoch": 0.299412915851272, "grad_norm": 0.39994385838508606, "learning_rate": 7.00587084148728e-06, "loss": 0.8905, "step": 153 }, { "epoch": 0.3013698630136986, "grad_norm": 0.3341100811958313, "learning_rate": 6.9863013698630145e-06, "loss": 0.946, "step": 154 }, { "epoch": 0.30332681017612523, "grad_norm": 0.2890676259994507, "learning_rate": 6.966731898238748e-06, "loss": 0.9756, "step": 155 }, { "epoch": 0.30528375733855184, "grad_norm": 0.2878880202770233, "learning_rate": 6.947162426614482e-06, "loss": 1.002, "step": 156 }, { "epoch": 0.30724070450097846, "grad_norm": 0.31986042857170105, "learning_rate": 6.927592954990215e-06, "loss": 0.9563, "step": 157 }, { "epoch": 0.30919765166340507, "grad_norm": 0.3330422639846802, "learning_rate": 6.90802348336595e-06, "loss": 0.946, "step": 158 }, { "epoch": 0.3111545988258317, "grad_norm": 0.3121936321258545, "learning_rate": 6.8884540117416836e-06, "loss": 0.9553, "step": 159 }, { "epoch": 0.3131115459882583, "grad_norm": 0.32173246145248413, "learning_rate": 6.868884540117417e-06, "loss": 0.9717, "step": 160 }, { "epoch": 0.3150684931506849, "grad_norm": 0.32296982407569885, "learning_rate": 6.849315068493151e-06, "loss": 0.8586, "step": 161 }, { "epoch": 0.31702544031311153, "grad_norm": 0.319832444190979, "learning_rate": 6.829745596868885e-06, "loss": 0.9785, "step": 162 }, { "epoch": 0.31898238747553814, "grad_norm": 0.3126278817653656, "learning_rate": 6.810176125244618e-06, "loss": 0.9448, "step": 163 }, { "epoch": 0.32093933463796476, "grad_norm": 0.3096999228000641, "learning_rate": 6.790606653620353e-06, "loss": 0.971, "step": 164 }, { "epoch": 0.32289628180039137, "grad_norm": 0.3132016062736511, "learning_rate": 6.771037181996086e-06, "loss": 0.9722, "step": 165 }, { "epoch": 0.324853228962818, "grad_norm": 0.3196086585521698, "learning_rate": 6.751467710371821e-06, "loss": 0.9611, "step": 166 }, { "epoch": 0.3268101761252446, "grad_norm": 0.33392807841300964, "learning_rate": 6.731898238747554e-06, "loss": 0.9585, "step": 167 }, { "epoch": 0.3287671232876712, "grad_norm": 0.3167315125465393, "learning_rate": 6.712328767123288e-06, "loss": 0.8919, "step": 168 }, { "epoch": 0.33072407045009783, "grad_norm": 0.3052123188972473, "learning_rate": 6.692759295499022e-06, "loss": 0.944, "step": 169 }, { "epoch": 0.33268101761252444, "grad_norm": 0.32091811299324036, "learning_rate": 6.673189823874756e-06, "loss": 0.898, "step": 170 }, { "epoch": 0.33463796477495106, "grad_norm": 0.3221595287322998, "learning_rate": 6.653620352250489e-06, "loss": 0.9206, "step": 171 }, { "epoch": 0.33659491193737767, "grad_norm": 0.3247275650501251, "learning_rate": 6.634050880626224e-06, "loss": 0.9629, "step": 172 }, { "epoch": 0.3385518590998043, "grad_norm": 0.3308790624141693, "learning_rate": 6.614481409001957e-06, "loss": 0.9712, "step": 173 }, { "epoch": 0.3405088062622309, "grad_norm": 0.2884618937969208, "learning_rate": 6.594911937377692e-06, "loss": 0.9922, "step": 174 }, { "epoch": 0.3424657534246575, "grad_norm": 0.2902919054031372, "learning_rate": 6.5753424657534245e-06, "loss": 0.9865, "step": 175 }, { "epoch": 0.34442270058708413, "grad_norm": 0.3081991374492645, "learning_rate": 6.555772994129159e-06, "loss": 0.9578, "step": 176 }, { "epoch": 0.34637964774951074, "grad_norm": 0.30048370361328125, "learning_rate": 6.536203522504893e-06, "loss": 1.0135, "step": 177 }, { "epoch": 0.34833659491193736, "grad_norm": 0.30617308616638184, "learning_rate": 6.516634050880627e-06, "loss": 0.932, "step": 178 }, { "epoch": 0.350293542074364, "grad_norm": 0.32503214478492737, "learning_rate": 6.49706457925636e-06, "loss": 0.8688, "step": 179 }, { "epoch": 0.3522504892367906, "grad_norm": 0.348254531621933, "learning_rate": 6.4774951076320945e-06, "loss": 0.9353, "step": 180 }, { "epoch": 0.3542074363992172, "grad_norm": 0.3076007664203644, "learning_rate": 6.457925636007828e-06, "loss": 0.9643, "step": 181 }, { "epoch": 0.3561643835616438, "grad_norm": 0.31836771965026855, "learning_rate": 6.438356164383563e-06, "loss": 0.9717, "step": 182 }, { "epoch": 0.35812133072407043, "grad_norm": 0.3177882134914398, "learning_rate": 6.4187866927592954e-06, "loss": 0.9363, "step": 183 }, { "epoch": 0.36007827788649704, "grad_norm": 0.35349786281585693, "learning_rate": 6.39921722113503e-06, "loss": 0.9697, "step": 184 }, { "epoch": 0.36203522504892366, "grad_norm": 0.3868875801563263, "learning_rate": 6.379647749510764e-06, "loss": 0.9895, "step": 185 }, { "epoch": 0.3639921722113503, "grad_norm": 0.3449805676937103, "learning_rate": 6.360078277886498e-06, "loss": 0.9625, "step": 186 }, { "epoch": 0.3659491193737769, "grad_norm": 0.3141196370124817, "learning_rate": 6.340508806262231e-06, "loss": 0.8996, "step": 187 }, { "epoch": 0.3679060665362035, "grad_norm": 0.3363155424594879, "learning_rate": 6.320939334637965e-06, "loss": 0.9723, "step": 188 }, { "epoch": 0.3698630136986301, "grad_norm": 0.27333149313926697, "learning_rate": 6.301369863013699e-06, "loss": 0.9592, "step": 189 }, { "epoch": 0.37181996086105673, "grad_norm": 0.32245489954948425, "learning_rate": 6.2818003913894335e-06, "loss": 0.911, "step": 190 }, { "epoch": 0.37377690802348335, "grad_norm": 0.31895750761032104, "learning_rate": 6.262230919765166e-06, "loss": 0.9951, "step": 191 }, { "epoch": 0.37573385518590996, "grad_norm": 0.373411625623703, "learning_rate": 6.242661448140901e-06, "loss": 0.9124, "step": 192 }, { "epoch": 0.3776908023483366, "grad_norm": 0.30244988203048706, "learning_rate": 6.2230919765166345e-06, "loss": 0.9502, "step": 193 }, { "epoch": 0.3796477495107632, "grad_norm": 0.29507070779800415, "learning_rate": 6.203522504892369e-06, "loss": 0.9594, "step": 194 }, { "epoch": 0.3816046966731898, "grad_norm": 0.31607192754745483, "learning_rate": 6.183953033268102e-06, "loss": 0.9379, "step": 195 }, { "epoch": 0.3835616438356164, "grad_norm": 0.3330182135105133, "learning_rate": 6.164383561643836e-06, "loss": 0.954, "step": 196 }, { "epoch": 0.38551859099804303, "grad_norm": 0.33578622341156006, "learning_rate": 6.14481409001957e-06, "loss": 0.9416, "step": 197 }, { "epoch": 0.38747553816046965, "grad_norm": 0.3267570436000824, "learning_rate": 6.1252446183953044e-06, "loss": 0.9379, "step": 198 }, { "epoch": 0.38943248532289626, "grad_norm": 0.33791911602020264, "learning_rate": 6.105675146771037e-06, "loss": 0.9247, "step": 199 }, { "epoch": 0.3913894324853229, "grad_norm": 0.32018688321113586, "learning_rate": 6.086105675146772e-06, "loss": 0.8886, "step": 200 }, { "epoch": 0.3933463796477495, "grad_norm": 0.32782450318336487, "learning_rate": 6.066536203522505e-06, "loss": 0.8923, "step": 201 }, { "epoch": 0.3953033268101761, "grad_norm": 0.32713061571121216, "learning_rate": 6.046966731898239e-06, "loss": 0.9265, "step": 202 }, { "epoch": 0.3972602739726027, "grad_norm": 0.3310089409351349, "learning_rate": 6.027397260273973e-06, "loss": 0.961, "step": 203 }, { "epoch": 0.39921722113502933, "grad_norm": 0.35549378395080566, "learning_rate": 6.007827788649707e-06, "loss": 0.9342, "step": 204 }, { "epoch": 0.40117416829745595, "grad_norm": 0.29176008701324463, "learning_rate": 5.988258317025441e-06, "loss": 0.9507, "step": 205 }, { "epoch": 0.40313111545988256, "grad_norm": 0.32877659797668457, "learning_rate": 5.9686888454011745e-06, "loss": 0.9162, "step": 206 }, { "epoch": 0.4050880626223092, "grad_norm": 0.33669957518577576, "learning_rate": 5.949119373776908e-06, "loss": 0.9424, "step": 207 }, { "epoch": 0.4070450097847358, "grad_norm": 0.35781094431877136, "learning_rate": 5.929549902152643e-06, "loss": 0.9469, "step": 208 }, { "epoch": 0.4090019569471624, "grad_norm": 0.4322330355644226, "learning_rate": 5.909980430528376e-06, "loss": 0.8294, "step": 209 }, { "epoch": 0.410958904109589, "grad_norm": 0.3147006034851074, "learning_rate": 5.89041095890411e-06, "loss": 0.8941, "step": 210 }, { "epoch": 0.41291585127201563, "grad_norm": 0.3192490339279175, "learning_rate": 5.870841487279844e-06, "loss": 0.9286, "step": 211 }, { "epoch": 0.41487279843444225, "grad_norm": 0.287655234336853, "learning_rate": 5.851272015655578e-06, "loss": 0.9372, "step": 212 }, { "epoch": 0.41682974559686886, "grad_norm": 0.3179602324962616, "learning_rate": 5.831702544031311e-06, "loss": 0.9261, "step": 213 }, { "epoch": 0.4187866927592955, "grad_norm": 0.4064527750015259, "learning_rate": 5.812133072407045e-06, "loss": 0.9122, "step": 214 }, { "epoch": 0.4207436399217221, "grad_norm": 0.4332832992076874, "learning_rate": 5.792563600782779e-06, "loss": 0.9605, "step": 215 }, { "epoch": 0.4227005870841487, "grad_norm": 0.32594162225723267, "learning_rate": 5.7729941291585136e-06, "loss": 0.8909, "step": 216 }, { "epoch": 0.4246575342465753, "grad_norm": 0.30977311730384827, "learning_rate": 5.753424657534246e-06, "loss": 0.9206, "step": 217 }, { "epoch": 0.42661448140900193, "grad_norm": 0.3289760947227478, "learning_rate": 5.733855185909981e-06, "loss": 0.9241, "step": 218 }, { "epoch": 0.42857142857142855, "grad_norm": 0.3369634747505188, "learning_rate": 5.7142857142857145e-06, "loss": 0.9863, "step": 219 }, { "epoch": 0.43052837573385516, "grad_norm": 0.34902551770210266, "learning_rate": 5.694716242661449e-06, "loss": 0.9399, "step": 220 }, { "epoch": 0.4324853228962818, "grad_norm": 0.339798241853714, "learning_rate": 5.675146771037182e-06, "loss": 0.9525, "step": 221 }, { "epoch": 0.4344422700587084, "grad_norm": 0.3014651834964752, "learning_rate": 5.655577299412916e-06, "loss": 0.9911, "step": 222 }, { "epoch": 0.436399217221135, "grad_norm": 0.28443804383277893, "learning_rate": 5.63600782778865e-06, "loss": 0.9884, "step": 223 }, { "epoch": 0.4383561643835616, "grad_norm": 0.5278264284133911, "learning_rate": 5.6164383561643845e-06, "loss": 0.924, "step": 224 }, { "epoch": 0.44031311154598823, "grad_norm": 0.313249409198761, "learning_rate": 5.596868884540117e-06, "loss": 0.8896, "step": 225 }, { "epoch": 0.44227005870841485, "grad_norm": 0.3332677483558655, "learning_rate": 5.577299412915852e-06, "loss": 0.8871, "step": 226 }, { "epoch": 0.44422700587084146, "grad_norm": 0.3289450705051422, "learning_rate": 5.557729941291585e-06, "loss": 0.9482, "step": 227 }, { "epoch": 0.4461839530332681, "grad_norm": 0.340781569480896, "learning_rate": 5.53816046966732e-06, "loss": 0.9035, "step": 228 }, { "epoch": 0.4481409001956947, "grad_norm": 0.34197819232940674, "learning_rate": 5.518590998043053e-06, "loss": 0.9275, "step": 229 }, { "epoch": 0.4500978473581213, "grad_norm": 0.4397524893283844, "learning_rate": 5.499021526418787e-06, "loss": 0.9485, "step": 230 }, { "epoch": 0.4520547945205479, "grad_norm": 0.3033043444156647, "learning_rate": 5.479452054794521e-06, "loss": 0.867, "step": 231 }, { "epoch": 0.45401174168297453, "grad_norm": 0.3285888135433197, "learning_rate": 5.459882583170255e-06, "loss": 0.9199, "step": 232 }, { "epoch": 0.45596868884540115, "grad_norm": 0.33250048756599426, "learning_rate": 5.440313111545988e-06, "loss": 0.8926, "step": 233 }, { "epoch": 0.45792563600782776, "grad_norm": 0.3682827651500702, "learning_rate": 5.420743639921723e-06, "loss": 0.9012, "step": 234 }, { "epoch": 0.4598825831702544, "grad_norm": 0.31080353260040283, "learning_rate": 5.401174168297456e-06, "loss": 0.8953, "step": 235 }, { "epoch": 0.461839530332681, "grad_norm": 0.3215543329715729, "learning_rate": 5.381604696673191e-06, "loss": 0.7648, "step": 236 }, { "epoch": 0.4637964774951076, "grad_norm": 0.33108121156692505, "learning_rate": 5.362035225048924e-06, "loss": 0.9314, "step": 237 }, { "epoch": 0.4657534246575342, "grad_norm": 0.3492167294025421, "learning_rate": 5.342465753424658e-06, "loss": 0.9463, "step": 238 }, { "epoch": 0.46771037181996084, "grad_norm": 0.3727250099182129, "learning_rate": 5.322896281800392e-06, "loss": 0.9519, "step": 239 }, { "epoch": 0.46966731898238745, "grad_norm": 0.3256610929965973, "learning_rate": 5.303326810176126e-06, "loss": 0.919, "step": 240 }, { "epoch": 0.47162426614481406, "grad_norm": 0.30512261390686035, "learning_rate": 5.283757338551859e-06, "loss": 0.9372, "step": 241 }, { "epoch": 0.4735812133072407, "grad_norm": 0.3406316041946411, "learning_rate": 5.2641878669275936e-06, "loss": 0.9323, "step": 242 }, { "epoch": 0.4755381604696673, "grad_norm": 0.3489183485507965, "learning_rate": 5.244618395303327e-06, "loss": 0.9259, "step": 243 }, { "epoch": 0.4774951076320939, "grad_norm": 0.349557489156723, "learning_rate": 5.225048923679062e-06, "loss": 0.8651, "step": 244 }, { "epoch": 0.4794520547945205, "grad_norm": 0.3324158191680908, "learning_rate": 5.2054794520547945e-06, "loss": 0.8787, "step": 245 }, { "epoch": 0.48140900195694714, "grad_norm": 0.3594268560409546, "learning_rate": 5.185909980430529e-06, "loss": 0.8767, "step": 246 }, { "epoch": 0.48336594911937375, "grad_norm": 0.33352982997894287, "learning_rate": 5.166340508806263e-06, "loss": 0.8714, "step": 247 }, { "epoch": 0.48532289628180036, "grad_norm": 0.3096468150615692, "learning_rate": 5.146771037181997e-06, "loss": 0.9296, "step": 248 }, { "epoch": 0.487279843444227, "grad_norm": 0.3263510763645172, "learning_rate": 5.12720156555773e-06, "loss": 0.9597, "step": 249 }, { "epoch": 0.4892367906066536, "grad_norm": 0.3318216800689697, "learning_rate": 5.1076320939334645e-06, "loss": 0.9144, "step": 250 }, { "epoch": 0.4911937377690802, "grad_norm": 0.39225342869758606, "learning_rate": 5.088062622309198e-06, "loss": 0.934, "step": 251 }, { "epoch": 0.4931506849315068, "grad_norm": 0.3386378884315491, "learning_rate": 5.068493150684932e-06, "loss": 0.9443, "step": 252 }, { "epoch": 0.49510763209393344, "grad_norm": 0.31350958347320557, "learning_rate": 5.0489236790606654e-06, "loss": 0.9419, "step": 253 }, { "epoch": 0.49706457925636005, "grad_norm": 0.3767964839935303, "learning_rate": 5.0293542074364e-06, "loss": 0.9299, "step": 254 }, { "epoch": 0.49902152641878667, "grad_norm": 0.3285723924636841, "learning_rate": 5.009784735812134e-06, "loss": 0.9561, "step": 255 }, { "epoch": 0.5009784735812133, "grad_norm": 0.38723042607307434, "learning_rate": 4.990215264187867e-06, "loss": 0.9415, "step": 256 }, { "epoch": 0.50293542074364, "grad_norm": 0.31375616788864136, "learning_rate": 4.970645792563601e-06, "loss": 0.9507, "step": 257 }, { "epoch": 0.5048923679060665, "grad_norm": 0.33384719491004944, "learning_rate": 4.9510763209393345e-06, "loss": 0.8887, "step": 258 }, { "epoch": 0.5068493150684932, "grad_norm": 0.39128080010414124, "learning_rate": 4.931506849315069e-06, "loss": 0.9316, "step": 259 }, { "epoch": 0.5088062622309197, "grad_norm": 0.3334865868091583, "learning_rate": 4.911937377690803e-06, "loss": 0.8958, "step": 260 }, { "epoch": 0.5107632093933464, "grad_norm": 0.3332456350326538, "learning_rate": 4.892367906066536e-06, "loss": 0.8705, "step": 261 }, { "epoch": 0.512720156555773, "grad_norm": 0.42276686429977417, "learning_rate": 4.87279843444227e-06, "loss": 0.824, "step": 262 }, { "epoch": 0.5146771037181996, "grad_norm": 0.33200517296791077, "learning_rate": 4.853228962818004e-06, "loss": 0.905, "step": 263 }, { "epoch": 0.5166340508806262, "grad_norm": 0.3116356134414673, "learning_rate": 4.833659491193738e-06, "loss": 0.9098, "step": 264 }, { "epoch": 0.5185909980430529, "grad_norm": 0.33332517743110657, "learning_rate": 4.814090019569472e-06, "loss": 0.9436, "step": 265 }, { "epoch": 0.5205479452054794, "grad_norm": 0.3184143900871277, "learning_rate": 4.7945205479452054e-06, "loss": 0.9149, "step": 266 }, { "epoch": 0.5225048923679061, "grad_norm": 0.3486206829547882, "learning_rate": 4.774951076320939e-06, "loss": 0.8951, "step": 267 }, { "epoch": 0.5244618395303327, "grad_norm": 0.3263947367668152, "learning_rate": 4.755381604696674e-06, "loss": 0.9133, "step": 268 }, { "epoch": 0.5264187866927593, "grad_norm": 0.33816662430763245, "learning_rate": 4.735812133072407e-06, "loss": 0.9537, "step": 269 }, { "epoch": 0.5283757338551859, "grad_norm": 0.4058966338634491, "learning_rate": 4.716242661448141e-06, "loss": 0.7974, "step": 270 }, { "epoch": 0.5303326810176126, "grad_norm": 0.33853861689567566, "learning_rate": 4.6966731898238745e-06, "loss": 0.9175, "step": 271 }, { "epoch": 0.5322896281800391, "grad_norm": 0.3483884036540985, "learning_rate": 4.677103718199609e-06, "loss": 0.8432, "step": 272 }, { "epoch": 0.5342465753424658, "grad_norm": 0.33916252851486206, "learning_rate": 4.657534246575343e-06, "loss": 0.9084, "step": 273 }, { "epoch": 0.5362035225048923, "grad_norm": 0.3245210349559784, "learning_rate": 4.637964774951076e-06, "loss": 0.9392, "step": 274 }, { "epoch": 0.538160469667319, "grad_norm": 0.382941871881485, "learning_rate": 4.61839530332681e-06, "loss": 0.9531, "step": 275 }, { "epoch": 0.5401174168297456, "grad_norm": 0.31128600239753723, "learning_rate": 4.5988258317025445e-06, "loss": 0.9656, "step": 276 }, { "epoch": 0.5420743639921722, "grad_norm": 0.36267444491386414, "learning_rate": 4.579256360078278e-06, "loss": 0.9052, "step": 277 }, { "epoch": 0.5440313111545988, "grad_norm": 0.32378819584846497, "learning_rate": 4.559686888454012e-06, "loss": 0.9546, "step": 278 }, { "epoch": 0.5459882583170255, "grad_norm": 0.47103360295295715, "learning_rate": 4.5401174168297455e-06, "loss": 0.8386, "step": 279 }, { "epoch": 0.547945205479452, "grad_norm": 0.34283939003944397, "learning_rate": 4.52054794520548e-06, "loss": 0.8913, "step": 280 }, { "epoch": 0.5499021526418787, "grad_norm": 0.33877629041671753, "learning_rate": 4.500978473581214e-06, "loss": 0.8588, "step": 281 }, { "epoch": 0.5518590998043053, "grad_norm": 0.32226869463920593, "learning_rate": 4.481409001956947e-06, "loss": 0.9419, "step": 282 }, { "epoch": 0.5538160469667319, "grad_norm": 0.3250659704208374, "learning_rate": 4.461839530332681e-06, "loss": 0.949, "step": 283 }, { "epoch": 0.5557729941291585, "grad_norm": 0.37836357951164246, "learning_rate": 4.442270058708415e-06, "loss": 0.9173, "step": 284 }, { "epoch": 0.5577299412915852, "grad_norm": 0.3452129364013672, "learning_rate": 4.422700587084149e-06, "loss": 0.8923, "step": 285 }, { "epoch": 0.5596868884540117, "grad_norm": 0.3265805244445801, "learning_rate": 4.403131115459883e-06, "loss": 0.9209, "step": 286 }, { "epoch": 0.5616438356164384, "grad_norm": 0.32123324275016785, "learning_rate": 4.383561643835616e-06, "loss": 0.984, "step": 287 }, { "epoch": 0.5636007827788649, "grad_norm": 0.34007397294044495, "learning_rate": 4.36399217221135e-06, "loss": 0.9041, "step": 288 }, { "epoch": 0.5655577299412916, "grad_norm": 0.34763190150260925, "learning_rate": 4.3444227005870845e-06, "loss": 0.9266, "step": 289 }, { "epoch": 0.5675146771037182, "grad_norm": 0.32859864830970764, "learning_rate": 4.324853228962818e-06, "loss": 0.9358, "step": 290 }, { "epoch": 0.5694716242661448, "grad_norm": 0.3776375353336334, "learning_rate": 4.305283757338552e-06, "loss": 0.9017, "step": 291 }, { "epoch": 0.5714285714285714, "grad_norm": 0.38739728927612305, "learning_rate": 4.2857142857142855e-06, "loss": 0.9524, "step": 292 }, { "epoch": 0.5733855185909981, "grad_norm": 0.3514958322048187, "learning_rate": 4.26614481409002e-06, "loss": 0.9318, "step": 293 }, { "epoch": 0.5753424657534246, "grad_norm": 0.340966135263443, "learning_rate": 4.246575342465754e-06, "loss": 0.921, "step": 294 }, { "epoch": 0.5772994129158513, "grad_norm": 0.33006027340888977, "learning_rate": 4.227005870841487e-06, "loss": 0.9094, "step": 295 }, { "epoch": 0.5792563600782779, "grad_norm": 0.30589374899864197, "learning_rate": 4.207436399217221e-06, "loss": 0.9176, "step": 296 }, { "epoch": 0.5812133072407045, "grad_norm": 0.34072640538215637, "learning_rate": 4.187866927592955e-06, "loss": 0.968, "step": 297 }, { "epoch": 0.5831702544031311, "grad_norm": 0.34003034234046936, "learning_rate": 4.168297455968689e-06, "loss": 0.8709, "step": 298 }, { "epoch": 0.5851272015655578, "grad_norm": 0.3410165011882782, "learning_rate": 4.148727984344423e-06, "loss": 0.9126, "step": 299 }, { "epoch": 0.5870841487279843, "grad_norm": 0.3337312936782837, "learning_rate": 4.129158512720156e-06, "loss": 0.852, "step": 300 }, { "epoch": 0.589041095890411, "grad_norm": 0.48609423637390137, "learning_rate": 4.109589041095891e-06, "loss": 0.8855, "step": 301 }, { "epoch": 0.5909980430528375, "grad_norm": 0.35817044973373413, "learning_rate": 4.0900195694716245e-06, "loss": 0.954, "step": 302 }, { "epoch": 0.5929549902152642, "grad_norm": 0.37432897090911865, "learning_rate": 4.070450097847358e-06, "loss": 0.8936, "step": 303 }, { "epoch": 0.5949119373776908, "grad_norm": 0.3256794512271881, "learning_rate": 4.050880626223092e-06, "loss": 0.9175, "step": 304 }, { "epoch": 0.5968688845401174, "grad_norm": 0.3711596429347992, "learning_rate": 4.031311154598826e-06, "loss": 0.9312, "step": 305 }, { "epoch": 0.598825831702544, "grad_norm": 0.35513797402381897, "learning_rate": 4.01174168297456e-06, "loss": 0.9227, "step": 306 }, { "epoch": 0.6007827788649707, "grad_norm": 0.3059983551502228, "learning_rate": 3.992172211350294e-06, "loss": 0.9364, "step": 307 }, { "epoch": 0.6027397260273972, "grad_norm": 0.38014575839042664, "learning_rate": 3.972602739726027e-06, "loss": 0.8645, "step": 308 }, { "epoch": 0.6046966731898239, "grad_norm": 0.33558711409568787, "learning_rate": 3.953033268101762e-06, "loss": 0.9175, "step": 309 }, { "epoch": 0.6066536203522505, "grad_norm": 0.3638705015182495, "learning_rate": 3.933463796477495e-06, "loss": 1.02, "step": 310 }, { "epoch": 0.6086105675146771, "grad_norm": 0.341256707906723, "learning_rate": 3.913894324853229e-06, "loss": 0.9254, "step": 311 }, { "epoch": 0.6105675146771037, "grad_norm": 0.34499531984329224, "learning_rate": 3.894324853228963e-06, "loss": 0.9273, "step": 312 }, { "epoch": 0.6125244618395304, "grad_norm": 0.3527175784111023, "learning_rate": 3.874755381604696e-06, "loss": 0.9152, "step": 313 }, { "epoch": 0.6144814090019569, "grad_norm": 0.3410734534263611, "learning_rate": 3.855185909980431e-06, "loss": 0.9186, "step": 314 }, { "epoch": 0.6164383561643836, "grad_norm": 0.36121881008148193, "learning_rate": 3.8356164383561645e-06, "loss": 0.9857, "step": 315 }, { "epoch": 0.6183953033268101, "grad_norm": 0.6107659935951233, "learning_rate": 3.816046966731898e-06, "loss": 0.8668, "step": 316 }, { "epoch": 0.6203522504892368, "grad_norm": 0.3535270094871521, "learning_rate": 3.7964774951076322e-06, "loss": 0.8487, "step": 317 }, { "epoch": 0.6223091976516634, "grad_norm": 0.3669748604297638, "learning_rate": 3.776908023483366e-06, "loss": 0.9351, "step": 318 }, { "epoch": 0.62426614481409, "grad_norm": 0.33674487471580505, "learning_rate": 3.7573385518591e-06, "loss": 0.8533, "step": 319 }, { "epoch": 0.6262230919765166, "grad_norm": 0.3490351736545563, "learning_rate": 3.7377690802348336e-06, "loss": 0.8261, "step": 320 }, { "epoch": 0.6281800391389433, "grad_norm": 0.34486088156700134, "learning_rate": 3.7181996086105677e-06, "loss": 0.9306, "step": 321 }, { "epoch": 0.6301369863013698, "grad_norm": 0.35340040922164917, "learning_rate": 3.6986301369863014e-06, "loss": 0.8714, "step": 322 }, { "epoch": 0.6320939334637965, "grad_norm": 0.3286992609500885, "learning_rate": 3.6790606653620354e-06, "loss": 0.9321, "step": 323 }, { "epoch": 0.6340508806262231, "grad_norm": 0.3706447184085846, "learning_rate": 3.659491193737769e-06, "loss": 0.9171, "step": 324 }, { "epoch": 0.6360078277886497, "grad_norm": 0.3709685802459717, "learning_rate": 3.639921722113503e-06, "loss": 0.9456, "step": 325 }, { "epoch": 0.6379647749510763, "grad_norm": 0.33583569526672363, "learning_rate": 3.620352250489237e-06, "loss": 0.9105, "step": 326 }, { "epoch": 0.639921722113503, "grad_norm": 0.36042001843452454, "learning_rate": 3.600782778864971e-06, "loss": 0.9062, "step": 327 }, { "epoch": 0.6418786692759295, "grad_norm": 0.3614070415496826, "learning_rate": 3.5812133072407045e-06, "loss": 0.9101, "step": 328 }, { "epoch": 0.6438356164383562, "grad_norm": 0.3542083501815796, "learning_rate": 3.5616438356164386e-06, "loss": 0.935, "step": 329 }, { "epoch": 0.6457925636007827, "grad_norm": 0.3305101692676544, "learning_rate": 3.5420743639921723e-06, "loss": 0.85, "step": 330 }, { "epoch": 0.6477495107632094, "grad_norm": 0.35562682151794434, "learning_rate": 3.5225048923679063e-06, "loss": 0.9223, "step": 331 }, { "epoch": 0.649706457925636, "grad_norm": 0.3588898777961731, "learning_rate": 3.50293542074364e-06, "loss": 0.8486, "step": 332 }, { "epoch": 0.6516634050880626, "grad_norm": 0.48412322998046875, "learning_rate": 3.483365949119374e-06, "loss": 0.9895, "step": 333 }, { "epoch": 0.6536203522504892, "grad_norm": 0.3221297264099121, "learning_rate": 3.4637964774951077e-06, "loss": 0.9376, "step": 334 }, { "epoch": 0.6555772994129159, "grad_norm": 0.3534998595714569, "learning_rate": 3.4442270058708418e-06, "loss": 0.8962, "step": 335 }, { "epoch": 0.6575342465753424, "grad_norm": 0.3436375558376312, "learning_rate": 3.4246575342465754e-06, "loss": 0.9241, "step": 336 }, { "epoch": 0.6594911937377691, "grad_norm": 0.37481996417045593, "learning_rate": 3.405088062622309e-06, "loss": 0.85, "step": 337 }, { "epoch": 0.6614481409001957, "grad_norm": 0.4136059284210205, "learning_rate": 3.385518590998043e-06, "loss": 0.8787, "step": 338 }, { "epoch": 0.6634050880626223, "grad_norm": 0.3450472354888916, "learning_rate": 3.365949119373777e-06, "loss": 0.9156, "step": 339 }, { "epoch": 0.6653620352250489, "grad_norm": 0.3252355754375458, "learning_rate": 3.346379647749511e-06, "loss": 0.9292, "step": 340 }, { "epoch": 0.6673189823874756, "grad_norm": 0.36309337615966797, "learning_rate": 3.3268101761252445e-06, "loss": 0.8951, "step": 341 }, { "epoch": 0.6692759295499021, "grad_norm": 0.3402676284313202, "learning_rate": 3.3072407045009786e-06, "loss": 0.9409, "step": 342 }, { "epoch": 0.6712328767123288, "grad_norm": 0.3547208607196808, "learning_rate": 3.2876712328767123e-06, "loss": 0.8855, "step": 343 }, { "epoch": 0.6731898238747553, "grad_norm": 0.348457008600235, "learning_rate": 3.2681017612524463e-06, "loss": 0.8965, "step": 344 }, { "epoch": 0.675146771037182, "grad_norm": 0.3382589519023895, "learning_rate": 3.24853228962818e-06, "loss": 0.9357, "step": 345 }, { "epoch": 0.6771037181996086, "grad_norm": 0.35978591442108154, "learning_rate": 3.228962818003914e-06, "loss": 0.8625, "step": 346 }, { "epoch": 0.6790606653620352, "grad_norm": 0.3476986885070801, "learning_rate": 3.2093933463796477e-06, "loss": 0.8352, "step": 347 }, { "epoch": 0.6810176125244618, "grad_norm": 0.36571869254112244, "learning_rate": 3.189823874755382e-06, "loss": 0.913, "step": 348 }, { "epoch": 0.6829745596868885, "grad_norm": 0.33142149448394775, "learning_rate": 3.1702544031311154e-06, "loss": 0.8793, "step": 349 }, { "epoch": 0.684931506849315, "grad_norm": 0.37687569856643677, "learning_rate": 3.1506849315068495e-06, "loss": 0.8976, "step": 350 }, { "epoch": 0.6868884540117417, "grad_norm": 0.3633004128932953, "learning_rate": 3.131115459882583e-06, "loss": 0.8668, "step": 351 }, { "epoch": 0.6888454011741683, "grad_norm": 0.363525390625, "learning_rate": 3.1115459882583172e-06, "loss": 0.8561, "step": 352 }, { "epoch": 0.6908023483365949, "grad_norm": 0.3553753197193146, "learning_rate": 3.091976516634051e-06, "loss": 0.8881, "step": 353 }, { "epoch": 0.6927592954990215, "grad_norm": 0.36212918162345886, "learning_rate": 3.072407045009785e-06, "loss": 0.9327, "step": 354 }, { "epoch": 0.6947162426614482, "grad_norm": 0.3260986804962158, "learning_rate": 3.0528375733855186e-06, "loss": 0.8812, "step": 355 }, { "epoch": 0.6966731898238747, "grad_norm": 0.39815372228622437, "learning_rate": 3.0332681017612527e-06, "loss": 0.8762, "step": 356 }, { "epoch": 0.6986301369863014, "grad_norm": 0.34042733907699585, "learning_rate": 3.0136986301369864e-06, "loss": 0.9195, "step": 357 }, { "epoch": 0.700587084148728, "grad_norm": 0.39932090044021606, "learning_rate": 2.9941291585127204e-06, "loss": 0.8907, "step": 358 }, { "epoch": 0.7025440313111546, "grad_norm": 0.35055866837501526, "learning_rate": 2.974559686888454e-06, "loss": 0.8956, "step": 359 }, { "epoch": 0.7045009784735812, "grad_norm": 0.43682193756103516, "learning_rate": 2.954990215264188e-06, "loss": 0.9004, "step": 360 }, { "epoch": 0.7064579256360078, "grad_norm": 0.3480110466480255, "learning_rate": 2.935420743639922e-06, "loss": 0.8725, "step": 361 }, { "epoch": 0.7084148727984344, "grad_norm": 0.3565778136253357, "learning_rate": 2.9158512720156555e-06, "loss": 0.9085, "step": 362 }, { "epoch": 0.7103718199608611, "grad_norm": 0.38167497515678406, "learning_rate": 2.8962818003913895e-06, "loss": 0.9225, "step": 363 }, { "epoch": 0.7123287671232876, "grad_norm": 0.3642929494380951, "learning_rate": 2.876712328767123e-06, "loss": 0.8497, "step": 364 }, { "epoch": 0.7142857142857143, "grad_norm": 0.3579420745372772, "learning_rate": 2.8571428571428573e-06, "loss": 0.9122, "step": 365 }, { "epoch": 0.7162426614481409, "grad_norm": 0.4468456506729126, "learning_rate": 2.837573385518591e-06, "loss": 0.9301, "step": 366 }, { "epoch": 0.7181996086105675, "grad_norm": 0.4842437207698822, "learning_rate": 2.818003913894325e-06, "loss": 0.8161, "step": 367 }, { "epoch": 0.7201565557729941, "grad_norm": 0.37980690598487854, "learning_rate": 2.7984344422700586e-06, "loss": 0.8068, "step": 368 }, { "epoch": 0.7221135029354208, "grad_norm": 0.35172978043556213, "learning_rate": 2.7788649706457927e-06, "loss": 0.9114, "step": 369 }, { "epoch": 0.7240704500978473, "grad_norm": 0.34394723176956177, "learning_rate": 2.7592954990215264e-06, "loss": 0.9262, "step": 370 }, { "epoch": 0.726027397260274, "grad_norm": 0.45529139041900635, "learning_rate": 2.7397260273972604e-06, "loss": 0.8255, "step": 371 }, { "epoch": 0.7279843444227005, "grad_norm": 0.3215661644935608, "learning_rate": 2.720156555772994e-06, "loss": 0.9848, "step": 372 }, { "epoch": 0.7299412915851272, "grad_norm": 0.374117374420166, "learning_rate": 2.700587084148728e-06, "loss": 0.9345, "step": 373 }, { "epoch": 0.7318982387475538, "grad_norm": 0.3748462498188019, "learning_rate": 2.681017612524462e-06, "loss": 0.9029, "step": 374 }, { "epoch": 0.7338551859099804, "grad_norm": 0.35281816124916077, "learning_rate": 2.661448140900196e-06, "loss": 0.9081, "step": 375 }, { "epoch": 0.735812133072407, "grad_norm": 0.3568076491355896, "learning_rate": 2.6418786692759295e-06, "loss": 0.9359, "step": 376 }, { "epoch": 0.7377690802348337, "grad_norm": 0.3849165141582489, "learning_rate": 2.6223091976516636e-06, "loss": 0.8738, "step": 377 }, { "epoch": 0.7397260273972602, "grad_norm": 0.33613815903663635, "learning_rate": 2.6027397260273973e-06, "loss": 0.8893, "step": 378 }, { "epoch": 0.7416829745596869, "grad_norm": 0.3870159387588501, "learning_rate": 2.5831702544031313e-06, "loss": 0.8985, "step": 379 }, { "epoch": 0.7436399217221135, "grad_norm": 0.41747015714645386, "learning_rate": 2.563600782778865e-06, "loss": 0.9111, "step": 380 }, { "epoch": 0.7455968688845401, "grad_norm": 0.33905646204948425, "learning_rate": 2.544031311154599e-06, "loss": 0.9438, "step": 381 }, { "epoch": 0.7475538160469667, "grad_norm": 0.42772483825683594, "learning_rate": 2.5244618395303327e-06, "loss": 0.9123, "step": 382 }, { "epoch": 0.7495107632093934, "grad_norm": 0.3450902998447418, "learning_rate": 2.504892367906067e-06, "loss": 0.9051, "step": 383 }, { "epoch": 0.7514677103718199, "grad_norm": 0.3520686626434326, "learning_rate": 2.4853228962818004e-06, "loss": 0.9282, "step": 384 }, { "epoch": 0.7534246575342466, "grad_norm": 0.36060193181037903, "learning_rate": 2.4657534246575345e-06, "loss": 1.0014, "step": 385 }, { "epoch": 0.7553816046966731, "grad_norm": 0.36178913712501526, "learning_rate": 2.446183953033268e-06, "loss": 0.782, "step": 386 }, { "epoch": 0.7573385518590998, "grad_norm": 0.3532876670360565, "learning_rate": 2.426614481409002e-06, "loss": 0.9207, "step": 387 }, { "epoch": 0.7592954990215264, "grad_norm": 0.34173986315727234, "learning_rate": 2.407045009784736e-06, "loss": 0.8727, "step": 388 }, { "epoch": 0.761252446183953, "grad_norm": 0.34336763620376587, "learning_rate": 2.3874755381604695e-06, "loss": 0.8857, "step": 389 }, { "epoch": 0.7632093933463796, "grad_norm": 0.33171361684799194, "learning_rate": 2.3679060665362036e-06, "loss": 0.8882, "step": 390 }, { "epoch": 0.7651663405088063, "grad_norm": 0.3519209325313568, "learning_rate": 2.3483365949119373e-06, "loss": 0.8927, "step": 391 }, { "epoch": 0.7671232876712328, "grad_norm": 0.3307989239692688, "learning_rate": 2.3287671232876713e-06, "loss": 0.904, "step": 392 }, { "epoch": 0.7690802348336595, "grad_norm": 0.3287998139858246, "learning_rate": 2.309197651663405e-06, "loss": 0.9244, "step": 393 }, { "epoch": 0.7710371819960861, "grad_norm": 0.3633367121219635, "learning_rate": 2.289628180039139e-06, "loss": 0.8516, "step": 394 }, { "epoch": 0.7729941291585127, "grad_norm": 0.36312800645828247, "learning_rate": 2.2700587084148727e-06, "loss": 0.9231, "step": 395 }, { "epoch": 0.7749510763209393, "grad_norm": 0.3343620002269745, "learning_rate": 2.250489236790607e-06, "loss": 0.8798, "step": 396 }, { "epoch": 0.776908023483366, "grad_norm": 0.41196396946907043, "learning_rate": 2.2309197651663405e-06, "loss": 0.8786, "step": 397 }, { "epoch": 0.7788649706457925, "grad_norm": 0.36387088894844055, "learning_rate": 2.2113502935420745e-06, "loss": 0.8739, "step": 398 }, { "epoch": 0.7808219178082192, "grad_norm": 0.31284716725349426, "learning_rate": 2.191780821917808e-06, "loss": 0.9609, "step": 399 }, { "epoch": 0.7827788649706457, "grad_norm": 0.3968718647956848, "learning_rate": 2.1722113502935423e-06, "loss": 0.8576, "step": 400 }, { "epoch": 0.7847358121330724, "grad_norm": 0.346426784992218, "learning_rate": 2.152641878669276e-06, "loss": 0.8406, "step": 401 }, { "epoch": 0.786692759295499, "grad_norm": 0.4300689399242401, "learning_rate": 2.13307240704501e-06, "loss": 0.9047, "step": 402 }, { "epoch": 0.7886497064579256, "grad_norm": 0.32908865809440613, "learning_rate": 2.1135029354207436e-06, "loss": 0.9306, "step": 403 }, { "epoch": 0.7906066536203522, "grad_norm": 0.3870595693588257, "learning_rate": 2.0939334637964777e-06, "loss": 0.8387, "step": 404 }, { "epoch": 0.7925636007827789, "grad_norm": 0.32453787326812744, "learning_rate": 2.0743639921722114e-06, "loss": 0.9424, "step": 405 }, { "epoch": 0.7945205479452054, "grad_norm": 0.2953280806541443, "learning_rate": 2.0547945205479454e-06, "loss": 0.955, "step": 406 }, { "epoch": 0.7964774951076321, "grad_norm": 0.378826767206192, "learning_rate": 2.035225048923679e-06, "loss": 0.8313, "step": 407 }, { "epoch": 0.7984344422700587, "grad_norm": 0.36773788928985596, "learning_rate": 2.015655577299413e-06, "loss": 0.8854, "step": 408 }, { "epoch": 0.8003913894324853, "grad_norm": 0.3617993891239166, "learning_rate": 1.996086105675147e-06, "loss": 0.9318, "step": 409 }, { "epoch": 0.8023483365949119, "grad_norm": 0.3715813159942627, "learning_rate": 1.976516634050881e-06, "loss": 0.8805, "step": 410 }, { "epoch": 0.8043052837573386, "grad_norm": 0.3366706073284149, "learning_rate": 1.9569471624266145e-06, "loss": 0.9197, "step": 411 }, { "epoch": 0.8062622309197651, "grad_norm": 0.37290623784065247, "learning_rate": 1.937377690802348e-06, "loss": 0.8869, "step": 412 }, { "epoch": 0.8082191780821918, "grad_norm": 0.34826987981796265, "learning_rate": 1.9178082191780823e-06, "loss": 0.8826, "step": 413 }, { "epoch": 0.8101761252446184, "grad_norm": 0.35748153924942017, "learning_rate": 1.8982387475538161e-06, "loss": 0.9207, "step": 414 }, { "epoch": 0.812133072407045, "grad_norm": 0.3526861071586609, "learning_rate": 1.87866927592955e-06, "loss": 0.9348, "step": 415 }, { "epoch": 0.8140900195694716, "grad_norm": 0.3595939874649048, "learning_rate": 1.8590998043052839e-06, "loss": 0.9544, "step": 416 }, { "epoch": 0.8160469667318982, "grad_norm": 0.3745361864566803, "learning_rate": 1.8395303326810177e-06, "loss": 0.857, "step": 417 }, { "epoch": 0.8180039138943248, "grad_norm": 0.3955901563167572, "learning_rate": 1.8199608610567516e-06, "loss": 0.8932, "step": 418 }, { "epoch": 0.8199608610567515, "grad_norm": 0.3213536739349365, "learning_rate": 1.8003913894324854e-06, "loss": 0.9079, "step": 419 }, { "epoch": 0.821917808219178, "grad_norm": 0.36574825644493103, "learning_rate": 1.7808219178082193e-06, "loss": 0.9256, "step": 420 }, { "epoch": 0.8238747553816047, "grad_norm": 0.5008761286735535, "learning_rate": 1.7612524461839532e-06, "loss": 0.8376, "step": 421 }, { "epoch": 0.8258317025440313, "grad_norm": 0.312209814786911, "learning_rate": 1.741682974559687e-06, "loss": 0.9188, "step": 422 }, { "epoch": 0.8277886497064579, "grad_norm": 0.4078651964664459, "learning_rate": 1.7221135029354209e-06, "loss": 0.9135, "step": 423 }, { "epoch": 0.8297455968688845, "grad_norm": 0.42918604612350464, "learning_rate": 1.7025440313111545e-06, "loss": 0.925, "step": 424 }, { "epoch": 0.8317025440313112, "grad_norm": 0.3664219081401825, "learning_rate": 1.6829745596868884e-06, "loss": 0.9153, "step": 425 }, { "epoch": 0.8336594911937377, "grad_norm": 0.41947075724601746, "learning_rate": 1.6634050880626223e-06, "loss": 0.8403, "step": 426 }, { "epoch": 0.8356164383561644, "grad_norm": 0.5199389457702637, "learning_rate": 1.6438356164383561e-06, "loss": 0.9094, "step": 427 }, { "epoch": 0.837573385518591, "grad_norm": 0.36971110105514526, "learning_rate": 1.62426614481409e-06, "loss": 0.8776, "step": 428 }, { "epoch": 0.8395303326810176, "grad_norm": 0.3708122968673706, "learning_rate": 1.6046966731898239e-06, "loss": 0.8424, "step": 429 }, { "epoch": 0.8414872798434442, "grad_norm": 0.35816383361816406, "learning_rate": 1.5851272015655577e-06, "loss": 0.8263, "step": 430 }, { "epoch": 0.8434442270058709, "grad_norm": 0.4561832845211029, "learning_rate": 1.5655577299412916e-06, "loss": 0.9035, "step": 431 }, { "epoch": 0.8454011741682974, "grad_norm": 0.46993499994277954, "learning_rate": 1.5459882583170254e-06, "loss": 0.8494, "step": 432 }, { "epoch": 0.8473581213307241, "grad_norm": 0.3416410982608795, "learning_rate": 1.5264187866927593e-06, "loss": 0.9109, "step": 433 }, { "epoch": 0.8493150684931506, "grad_norm": 0.36532074213027954, "learning_rate": 1.5068493150684932e-06, "loss": 0.8966, "step": 434 }, { "epoch": 0.8512720156555773, "grad_norm": 0.3833313286304474, "learning_rate": 1.487279843444227e-06, "loss": 0.9056, "step": 435 }, { "epoch": 0.8532289628180039, "grad_norm": 0.39663517475128174, "learning_rate": 1.467710371819961e-06, "loss": 0.8904, "step": 436 }, { "epoch": 0.8551859099804305, "grad_norm": 0.3750026524066925, "learning_rate": 1.4481409001956948e-06, "loss": 0.895, "step": 437 }, { "epoch": 0.8571428571428571, "grad_norm": 0.3878662586212158, "learning_rate": 1.4285714285714286e-06, "loss": 0.8879, "step": 438 }, { "epoch": 0.8590998043052838, "grad_norm": 0.32945066690444946, "learning_rate": 1.4090019569471625e-06, "loss": 0.9154, "step": 439 }, { "epoch": 0.8610567514677103, "grad_norm": 0.3289746046066284, "learning_rate": 1.3894324853228964e-06, "loss": 0.9272, "step": 440 }, { "epoch": 0.863013698630137, "grad_norm": 0.3634059727191925, "learning_rate": 1.3698630136986302e-06, "loss": 0.9226, "step": 441 }, { "epoch": 0.8649706457925636, "grad_norm": 0.4308583438396454, "learning_rate": 1.350293542074364e-06, "loss": 0.9235, "step": 442 }, { "epoch": 0.8669275929549902, "grad_norm": 0.3874328136444092, "learning_rate": 1.330724070450098e-06, "loss": 0.9058, "step": 443 }, { "epoch": 0.8688845401174168, "grad_norm": 0.3811403512954712, "learning_rate": 1.3111545988258318e-06, "loss": 0.8796, "step": 444 }, { "epoch": 0.8708414872798435, "grad_norm": 0.33906376361846924, "learning_rate": 1.2915851272015657e-06, "loss": 0.9051, "step": 445 }, { "epoch": 0.87279843444227, "grad_norm": 0.3563789427280426, "learning_rate": 1.2720156555772995e-06, "loss": 0.8932, "step": 446 }, { "epoch": 0.8747553816046967, "grad_norm": 0.44213926792144775, "learning_rate": 1.2524461839530334e-06, "loss": 0.8787, "step": 447 }, { "epoch": 0.8767123287671232, "grad_norm": 0.39271780848503113, "learning_rate": 1.2328767123287673e-06, "loss": 0.8706, "step": 448 }, { "epoch": 0.8786692759295499, "grad_norm": 0.3402375280857086, "learning_rate": 1.213307240704501e-06, "loss": 0.8404, "step": 449 }, { "epoch": 0.8806262230919765, "grad_norm": 0.37391403317451477, "learning_rate": 1.1937377690802348e-06, "loss": 0.873, "step": 450 }, { "epoch": 0.8825831702544031, "grad_norm": 0.4681132137775421, "learning_rate": 1.1741682974559686e-06, "loss": 0.8922, "step": 451 }, { "epoch": 0.8845401174168297, "grad_norm": 0.37245139479637146, "learning_rate": 1.1545988258317025e-06, "loss": 0.9606, "step": 452 }, { "epoch": 0.8864970645792564, "grad_norm": 0.3633488714694977, "learning_rate": 1.1350293542074364e-06, "loss": 0.9371, "step": 453 }, { "epoch": 0.8884540117416829, "grad_norm": 0.36568257212638855, "learning_rate": 1.1154598825831702e-06, "loss": 0.889, "step": 454 }, { "epoch": 0.8904109589041096, "grad_norm": 0.37727871537208557, "learning_rate": 1.095890410958904e-06, "loss": 0.8863, "step": 455 }, { "epoch": 0.8923679060665362, "grad_norm": 0.3628275990486145, "learning_rate": 1.076320939334638e-06, "loss": 0.8753, "step": 456 }, { "epoch": 0.8943248532289628, "grad_norm": 0.5403597950935364, "learning_rate": 1.0567514677103718e-06, "loss": 0.8446, "step": 457 }, { "epoch": 0.8962818003913894, "grad_norm": 0.37633222341537476, "learning_rate": 1.0371819960861057e-06, "loss": 0.8821, "step": 458 }, { "epoch": 0.898238747553816, "grad_norm": 0.4256667494773865, "learning_rate": 1.0176125244618395e-06, "loss": 0.8967, "step": 459 }, { "epoch": 0.9001956947162426, "grad_norm": 0.37082305550575256, "learning_rate": 9.980430528375734e-07, "loss": 0.8644, "step": 460 }, { "epoch": 0.9021526418786693, "grad_norm": 0.35088518261909485, "learning_rate": 9.784735812133073e-07, "loss": 0.9007, "step": 461 }, { "epoch": 0.9041095890410958, "grad_norm": 0.37018847465515137, "learning_rate": 9.589041095890411e-07, "loss": 0.8561, "step": 462 }, { "epoch": 0.9060665362035225, "grad_norm": 0.4181114137172699, "learning_rate": 9.39334637964775e-07, "loss": 0.9222, "step": 463 }, { "epoch": 0.9080234833659491, "grad_norm": 0.3350118100643158, "learning_rate": 9.197651663405089e-07, "loss": 0.9061, "step": 464 }, { "epoch": 0.9099804305283757, "grad_norm": 0.4112285077571869, "learning_rate": 9.001956947162427e-07, "loss": 0.8684, "step": 465 }, { "epoch": 0.9119373776908023, "grad_norm": 0.3795412480831146, "learning_rate": 8.806262230919766e-07, "loss": 0.8144, "step": 466 }, { "epoch": 0.913894324853229, "grad_norm": 0.36573439836502075, "learning_rate": 8.610567514677104e-07, "loss": 0.8562, "step": 467 }, { "epoch": 0.9158512720156555, "grad_norm": 0.5129836797714233, "learning_rate": 8.414872798434442e-07, "loss": 0.8725, "step": 468 }, { "epoch": 0.9178082191780822, "grad_norm": 0.3448660373687744, "learning_rate": 8.219178082191781e-07, "loss": 0.9071, "step": 469 }, { "epoch": 0.9197651663405088, "grad_norm": 0.33694183826446533, "learning_rate": 8.023483365949119e-07, "loss": 0.8779, "step": 470 }, { "epoch": 0.9217221135029354, "grad_norm": 0.4201546609401703, "learning_rate": 7.827788649706458e-07, "loss": 0.9754, "step": 471 }, { "epoch": 0.923679060665362, "grad_norm": 0.3125755488872528, "learning_rate": 7.632093933463797e-07, "loss": 0.9204, "step": 472 }, { "epoch": 0.9256360078277887, "grad_norm": 0.41351065039634705, "learning_rate": 7.436399217221135e-07, "loss": 0.8487, "step": 473 }, { "epoch": 0.9275929549902152, "grad_norm": 0.3856956660747528, "learning_rate": 7.240704500978474e-07, "loss": 0.9087, "step": 474 }, { "epoch": 0.9295499021526419, "grad_norm": 0.4000626802444458, "learning_rate": 7.045009784735812e-07, "loss": 0.8734, "step": 475 }, { "epoch": 0.9315068493150684, "grad_norm": 0.3834664523601532, "learning_rate": 6.849315068493151e-07, "loss": 0.8928, "step": 476 }, { "epoch": 0.9334637964774951, "grad_norm": 0.36856183409690857, "learning_rate": 6.65362035225049e-07, "loss": 0.8967, "step": 477 }, { "epoch": 0.9354207436399217, "grad_norm": 0.34125497937202454, "learning_rate": 6.457925636007828e-07, "loss": 0.8501, "step": 478 }, { "epoch": 0.9373776908023483, "grad_norm": 0.3918203115463257, "learning_rate": 6.262230919765167e-07, "loss": 0.8277, "step": 479 }, { "epoch": 0.9393346379647749, "grad_norm": 0.35030046105384827, "learning_rate": 6.066536203522505e-07, "loss": 0.8883, "step": 480 }, { "epoch": 0.9412915851272016, "grad_norm": 0.3345521092414856, "learning_rate": 5.870841487279843e-07, "loss": 0.9089, "step": 481 }, { "epoch": 0.9432485322896281, "grad_norm": 0.7940770983695984, "learning_rate": 5.675146771037182e-07, "loss": 0.8447, "step": 482 }, { "epoch": 0.9452054794520548, "grad_norm": 0.4670639634132385, "learning_rate": 5.47945205479452e-07, "loss": 0.9603, "step": 483 }, { "epoch": 0.9471624266144814, "grad_norm": 0.35085639357566833, "learning_rate": 5.283757338551859e-07, "loss": 0.8936, "step": 484 }, { "epoch": 0.949119373776908, "grad_norm": 0.3659544587135315, "learning_rate": 5.088062622309198e-07, "loss": 0.8489, "step": 485 }, { "epoch": 0.9510763209393346, "grad_norm": 0.37457695603370667, "learning_rate": 4.892367906066536e-07, "loss": 0.9201, "step": 486 }, { "epoch": 0.9530332681017613, "grad_norm": 0.3387516140937805, "learning_rate": 4.696673189823875e-07, "loss": 0.8846, "step": 487 }, { "epoch": 0.9549902152641878, "grad_norm": 0.3514867424964905, "learning_rate": 4.5009784735812136e-07, "loss": 0.9238, "step": 488 }, { "epoch": 0.9569471624266145, "grad_norm": 0.3868323564529419, "learning_rate": 4.305283757338552e-07, "loss": 0.93, "step": 489 }, { "epoch": 0.958904109589041, "grad_norm": 0.38455379009246826, "learning_rate": 4.1095890410958903e-07, "loss": 0.9301, "step": 490 }, { "epoch": 0.9608610567514677, "grad_norm": 0.360344797372818, "learning_rate": 3.913894324853229e-07, "loss": 0.8141, "step": 491 }, { "epoch": 0.9628180039138943, "grad_norm": 0.3541224002838135, "learning_rate": 3.7181996086105676e-07, "loss": 0.9057, "step": 492 }, { "epoch": 0.9647749510763209, "grad_norm": 0.35285741090774536, "learning_rate": 3.522504892367906e-07, "loss": 0.8942, "step": 493 }, { "epoch": 0.9667318982387475, "grad_norm": 0.3489803969860077, "learning_rate": 3.326810176125245e-07, "loss": 0.9787, "step": 494 }, { "epoch": 0.9686888454011742, "grad_norm": 0.36583074927330017, "learning_rate": 3.1311154598825835e-07, "loss": 0.8947, "step": 495 }, { "epoch": 0.9706457925636007, "grad_norm": 0.3927527964115143, "learning_rate": 2.9354207436399216e-07, "loss": 0.936, "step": 496 }, { "epoch": 0.9726027397260274, "grad_norm": 0.37387171387672424, "learning_rate": 2.73972602739726e-07, "loss": 0.9358, "step": 497 }, { "epoch": 0.974559686888454, "grad_norm": 0.36170729994773865, "learning_rate": 2.544031311154599e-07, "loss": 0.9211, "step": 498 }, { "epoch": 0.9765166340508806, "grad_norm": 0.3695358633995056, "learning_rate": 2.3483365949119375e-07, "loss": 0.8574, "step": 499 }, { "epoch": 0.9784735812133072, "grad_norm": 0.3722043037414551, "learning_rate": 2.152641878669276e-07, "loss": 0.8627, "step": 500 }, { "epoch": 0.9804305283757339, "grad_norm": 0.3411552309989929, "learning_rate": 1.9569471624266145e-07, "loss": 0.8924, "step": 501 }, { "epoch": 0.9823874755381604, "grad_norm": 0.3667154610157013, "learning_rate": 1.761252446183953e-07, "loss": 0.9128, "step": 502 }, { "epoch": 0.9843444227005871, "grad_norm": 0.36946728825569153, "learning_rate": 1.5655577299412917e-07, "loss": 0.8716, "step": 503 }, { "epoch": 0.9863013698630136, "grad_norm": 0.3377256393432617, "learning_rate": 1.36986301369863e-07, "loss": 0.939, "step": 504 }, { "epoch": 0.9882583170254403, "grad_norm": 0.3812258541584015, "learning_rate": 1.1741682974559687e-07, "loss": 0.8871, "step": 505 }, { "epoch": 0.9902152641878669, "grad_norm": 0.41513141989707947, "learning_rate": 9.784735812133072e-08, "loss": 0.8544, "step": 506 }, { "epoch": 0.9921722113502935, "grad_norm": 0.36569666862487793, "learning_rate": 7.827788649706459e-08, "loss": 0.9033, "step": 507 }, { "epoch": 0.9941291585127201, "grad_norm": 0.36549660563468933, "learning_rate": 5.870841487279844e-08, "loss": 0.8103, "step": 508 }, { "epoch": 0.9960861056751468, "grad_norm": 0.3560737073421478, "learning_rate": 3.9138943248532294e-08, "loss": 0.8832, "step": 509 }, { "epoch": 0.9980430528375733, "grad_norm": 0.35010769963264465, "learning_rate": 1.9569471624266147e-08, "loss": 0.8588, "step": 510 }, { "epoch": 1.0, "grad_norm": 0.3699730932712555, "learning_rate": 0.0, "loss": 0.9031, "step": 511 } ], "logging_steps": 1.0, "max_steps": 511, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 0, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.8009679088687514e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }