{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9967051070840198, "eval_steps": 500, "global_step": 606, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0032948929159802307, "grad_norm": 12.442080297711913, "learning_rate": 1.639344262295082e-07, "loss": 0.383, "step": 1 }, { "epoch": 0.006589785831960461, "grad_norm": 13.440096974039598, "learning_rate": 3.278688524590164e-07, "loss": 0.3663, "step": 2 }, { "epoch": 0.009884678747940691, "grad_norm": 13.796855251219549, "learning_rate": 4.918032786885246e-07, "loss": 0.3663, "step": 3 }, { "epoch": 0.013179571663920923, "grad_norm": 12.70572090942759, "learning_rate": 6.557377049180328e-07, "loss": 0.3061, "step": 4 }, { "epoch": 0.016474464579901153, "grad_norm": 11.468076158597158, "learning_rate": 8.196721311475409e-07, "loss": 0.3387, "step": 5 }, { "epoch": 0.019769357495881382, "grad_norm": 10.302737632617346, "learning_rate": 9.836065573770493e-07, "loss": 0.2814, "step": 6 }, { "epoch": 0.023064250411861616, "grad_norm": 9.415064505565756, "learning_rate": 1.1475409836065575e-06, "loss": 0.2815, "step": 7 }, { "epoch": 0.026359143327841845, "grad_norm": 8.420756675130125, "learning_rate": 1.3114754098360657e-06, "loss": 0.2483, "step": 8 }, { "epoch": 0.029654036243822075, "grad_norm": 5.932373113068906, "learning_rate": 1.4754098360655739e-06, "loss": 0.1744, "step": 9 }, { "epoch": 0.032948929159802305, "grad_norm": 4.602925255551132, "learning_rate": 1.6393442622950819e-06, "loss": 0.1821, "step": 10 }, { "epoch": 0.036243822075782535, "grad_norm": 4.1437512962542025, "learning_rate": 1.8032786885245903e-06, "loss": 0.1557, "step": 11 }, { "epoch": 0.039538714991762765, "grad_norm": 4.1008407646409495, "learning_rate": 1.9672131147540985e-06, "loss": 0.1901, "step": 12 }, { "epoch": 0.042833607907743, "grad_norm": 3.2895218176449177, "learning_rate": 2.1311475409836067e-06, "loss": 0.1695, "step": 13 }, { "epoch": 0.04612850082372323, "grad_norm": 2.578808113814672, "learning_rate": 2.295081967213115e-06, "loss": 0.1431, "step": 14 }, { "epoch": 0.04942339373970346, "grad_norm": 2.0925168309074804, "learning_rate": 2.459016393442623e-06, "loss": 0.1459, "step": 15 }, { "epoch": 0.05271828665568369, "grad_norm": 5.776302012227753, "learning_rate": 2.6229508196721314e-06, "loss": 0.2116, "step": 16 }, { "epoch": 0.05601317957166392, "grad_norm": 4.863535909815622, "learning_rate": 2.786885245901639e-06, "loss": 0.157, "step": 17 }, { "epoch": 0.05930807248764415, "grad_norm": 2.709665323729833, "learning_rate": 2.9508196721311478e-06, "loss": 0.1463, "step": 18 }, { "epoch": 0.06260296540362438, "grad_norm": 2.911144968708877, "learning_rate": 3.114754098360656e-06, "loss": 0.1283, "step": 19 }, { "epoch": 0.06589785831960461, "grad_norm": 2.4985115549367873, "learning_rate": 3.2786885245901638e-06, "loss": 0.13, "step": 20 }, { "epoch": 0.06919275123558484, "grad_norm": 2.158826062926261, "learning_rate": 3.4426229508196724e-06, "loss": 0.1135, "step": 21 }, { "epoch": 0.07248764415156507, "grad_norm": 1.8839820680217123, "learning_rate": 3.6065573770491806e-06, "loss": 0.1196, "step": 22 }, { "epoch": 0.0757825370675453, "grad_norm": 2.208880925801888, "learning_rate": 3.7704918032786884e-06, "loss": 0.1642, "step": 23 }, { "epoch": 0.07907742998352553, "grad_norm": 1.948969438238695, "learning_rate": 3.934426229508197e-06, "loss": 0.1057, "step": 24 }, { "epoch": 0.08237232289950576, "grad_norm": 1.5855531759235064, "learning_rate": 4.098360655737705e-06, "loss": 0.0908, "step": 25 }, { "epoch": 0.085667215815486, "grad_norm": 1.7484734055235995, "learning_rate": 4.2622950819672135e-06, "loss": 0.1088, "step": 26 }, { "epoch": 0.08896210873146623, "grad_norm": 1.7763409124600038, "learning_rate": 4.426229508196722e-06, "loss": 0.1264, "step": 27 }, { "epoch": 0.09225700164744646, "grad_norm": 1.943799751354267, "learning_rate": 4.59016393442623e-06, "loss": 0.1204, "step": 28 }, { "epoch": 0.09555189456342669, "grad_norm": 2.0024112586646723, "learning_rate": 4.754098360655738e-06, "loss": 0.132, "step": 29 }, { "epoch": 0.09884678747940692, "grad_norm": 1.927718117469908, "learning_rate": 4.918032786885246e-06, "loss": 0.0988, "step": 30 }, { "epoch": 0.10214168039538715, "grad_norm": 1.8175857487104838, "learning_rate": 5.0819672131147545e-06, "loss": 0.0997, "step": 31 }, { "epoch": 0.10543657331136738, "grad_norm": 1.1821292523186457, "learning_rate": 5.245901639344263e-06, "loss": 0.0874, "step": 32 }, { "epoch": 0.10873146622734761, "grad_norm": 1.6925074350020268, "learning_rate": 5.409836065573772e-06, "loss": 0.115, "step": 33 }, { "epoch": 0.11202635914332784, "grad_norm": 1.4637368762611331, "learning_rate": 5.573770491803278e-06, "loss": 0.0916, "step": 34 }, { "epoch": 0.11532125205930807, "grad_norm": 1.4814671153620174, "learning_rate": 5.737704918032787e-06, "loss": 0.0853, "step": 35 }, { "epoch": 0.1186161449752883, "grad_norm": 1.243594463126339, "learning_rate": 5.9016393442622956e-06, "loss": 0.0903, "step": 36 }, { "epoch": 0.12191103789126853, "grad_norm": 1.2713537957193175, "learning_rate": 6.065573770491804e-06, "loss": 0.1271, "step": 37 }, { "epoch": 0.12520593080724876, "grad_norm": 2.2092835366807893, "learning_rate": 6.229508196721312e-06, "loss": 0.1137, "step": 38 }, { "epoch": 0.128500823723229, "grad_norm": 1.1846569097593065, "learning_rate": 6.393442622950821e-06, "loss": 0.0714, "step": 39 }, { "epoch": 0.13179571663920922, "grad_norm": 3.001321928490275, "learning_rate": 6.5573770491803276e-06, "loss": 0.1129, "step": 40 }, { "epoch": 0.13509060955518945, "grad_norm": 1.8758843455974739, "learning_rate": 6.721311475409837e-06, "loss": 0.1021, "step": 41 }, { "epoch": 0.13838550247116968, "grad_norm": 2.993102960488022, "learning_rate": 6.885245901639345e-06, "loss": 0.1155, "step": 42 }, { "epoch": 0.1416803953871499, "grad_norm": 1.8441651337946723, "learning_rate": 7.049180327868853e-06, "loss": 0.1004, "step": 43 }, { "epoch": 0.14497528830313014, "grad_norm": 1.495517808358825, "learning_rate": 7.213114754098361e-06, "loss": 0.0868, "step": 44 }, { "epoch": 0.14827018121911037, "grad_norm": 1.8953155374136303, "learning_rate": 7.3770491803278695e-06, "loss": 0.1371, "step": 45 }, { "epoch": 0.1515650741350906, "grad_norm": 1.702133714045992, "learning_rate": 7.540983606557377e-06, "loss": 0.1597, "step": 46 }, { "epoch": 0.15485996705107083, "grad_norm": 1.0196937438072402, "learning_rate": 7.704918032786886e-06, "loss": 0.0892, "step": 47 }, { "epoch": 0.15815485996705106, "grad_norm": 1.2171514922994324, "learning_rate": 7.868852459016394e-06, "loss": 0.0778, "step": 48 }, { "epoch": 0.1614497528830313, "grad_norm": 1.0680772396455713, "learning_rate": 8.032786885245902e-06, "loss": 0.084, "step": 49 }, { "epoch": 0.16474464579901152, "grad_norm": 2.9252478676320557, "learning_rate": 8.19672131147541e-06, "loss": 0.1275, "step": 50 }, { "epoch": 0.16803953871499178, "grad_norm": 2.422192715088673, "learning_rate": 8.360655737704919e-06, "loss": 0.1473, "step": 51 }, { "epoch": 0.171334431630972, "grad_norm": 1.8804479896519768, "learning_rate": 8.524590163934427e-06, "loss": 0.1036, "step": 52 }, { "epoch": 0.17462932454695224, "grad_norm": 1.568191135105415, "learning_rate": 8.688524590163935e-06, "loss": 0.1109, "step": 53 }, { "epoch": 0.17792421746293247, "grad_norm": 1.7822383034585774, "learning_rate": 8.852459016393443e-06, "loss": 0.1138, "step": 54 }, { "epoch": 0.1812191103789127, "grad_norm": 2.39128847516323, "learning_rate": 9.016393442622952e-06, "loss": 0.0995, "step": 55 }, { "epoch": 0.18451400329489293, "grad_norm": 2.005472201286874, "learning_rate": 9.18032786885246e-06, "loss": 0.1071, "step": 56 }, { "epoch": 0.18780889621087316, "grad_norm": 1.771963789348757, "learning_rate": 9.344262295081968e-06, "loss": 0.1194, "step": 57 }, { "epoch": 0.19110378912685339, "grad_norm": 1.9883587282426103, "learning_rate": 9.508196721311476e-06, "loss": 0.1263, "step": 58 }, { "epoch": 0.19439868204283361, "grad_norm": 1.869337708289752, "learning_rate": 9.672131147540984e-06, "loss": 0.0906, "step": 59 }, { "epoch": 0.19769357495881384, "grad_norm": 1.95284840649176, "learning_rate": 9.836065573770493e-06, "loss": 0.0941, "step": 60 }, { "epoch": 0.20098846787479407, "grad_norm": 0.9408728760921459, "learning_rate": 1e-05, "loss": 0.0892, "step": 61 }, { "epoch": 0.2042833607907743, "grad_norm": 1.4827130661563888, "learning_rate": 9.999916929744365e-06, "loss": 0.0897, "step": 62 }, { "epoch": 0.20757825370675453, "grad_norm": 1.7548705917176584, "learning_rate": 9.999667721737726e-06, "loss": 0.1052, "step": 63 }, { "epoch": 0.21087314662273476, "grad_norm": 1.5160957230364678, "learning_rate": 9.999252384260794e-06, "loss": 0.0959, "step": 64 }, { "epoch": 0.214168039538715, "grad_norm": 1.4961421846399192, "learning_rate": 9.998670931114443e-06, "loss": 0.0738, "step": 65 }, { "epoch": 0.21746293245469522, "grad_norm": 1.6241097546909007, "learning_rate": 9.997923381619257e-06, "loss": 0.1057, "step": 66 }, { "epoch": 0.22075782537067545, "grad_norm": 1.3354496262353763, "learning_rate": 9.99700976061489e-06, "loss": 0.0951, "step": 67 }, { "epoch": 0.22405271828665568, "grad_norm": 1.0847564491580965, "learning_rate": 9.99593009845923e-06, "loss": 0.0863, "step": 68 }, { "epoch": 0.2273476112026359, "grad_norm": 1.6871231023520186, "learning_rate": 9.994684431027407e-06, "loss": 0.0804, "step": 69 }, { "epoch": 0.23064250411861614, "grad_norm": 1.428280535176863, "learning_rate": 9.99327279971058e-06, "loss": 0.0865, "step": 70 }, { "epoch": 0.23393739703459637, "grad_norm": 1.1179352637786575, "learning_rate": 9.991695251414584e-06, "loss": 0.0734, "step": 71 }, { "epoch": 0.2372322899505766, "grad_norm": 1.2033588488413391, "learning_rate": 9.989951838558352e-06, "loss": 0.105, "step": 72 }, { "epoch": 0.24052718286655683, "grad_norm": 1.465169988779563, "learning_rate": 9.988042619072185e-06, "loss": 0.0729, "step": 73 }, { "epoch": 0.24382207578253706, "grad_norm": 0.8979589269956051, "learning_rate": 9.985967656395823e-06, "loss": 0.0802, "step": 74 }, { "epoch": 0.2471169686985173, "grad_norm": 1.733084591549866, "learning_rate": 9.98372701947634e-06, "loss": 0.1105, "step": 75 }, { "epoch": 0.2504118616144975, "grad_norm": 1.3264470296840676, "learning_rate": 9.981320782765847e-06, "loss": 0.0994, "step": 76 }, { "epoch": 0.25370675453047775, "grad_norm": 1.6544247928428517, "learning_rate": 9.978749026219023e-06, "loss": 0.0729, "step": 77 }, { "epoch": 0.257001647446458, "grad_norm": 1.7379037158314299, "learning_rate": 9.976011835290457e-06, "loss": 0.1084, "step": 78 }, { "epoch": 0.2602965403624382, "grad_norm": 2.1898271902712465, "learning_rate": 9.973109300931813e-06, "loss": 0.14, "step": 79 }, { "epoch": 0.26359143327841844, "grad_norm": 1.5180418313244188, "learning_rate": 9.970041519588797e-06, "loss": 0.1032, "step": 80 }, { "epoch": 0.26688632619439867, "grad_norm": 0.9058107350669203, "learning_rate": 9.966808593197959e-06, "loss": 0.0659, "step": 81 }, { "epoch": 0.2701812191103789, "grad_norm": 1.6740173453146032, "learning_rate": 9.963410629183311e-06, "loss": 0.0903, "step": 82 }, { "epoch": 0.27347611202635913, "grad_norm": 1.6927879596763102, "learning_rate": 9.959847740452746e-06, "loss": 0.1011, "step": 83 }, { "epoch": 0.27677100494233936, "grad_norm": 1.1858716785831125, "learning_rate": 9.956120045394297e-06, "loss": 0.1001, "step": 84 }, { "epoch": 0.2800658978583196, "grad_norm": 1.7472793459338325, "learning_rate": 9.952227667872197e-06, "loss": 0.0999, "step": 85 }, { "epoch": 0.2833607907742998, "grad_norm": 1.3355726739748197, "learning_rate": 9.948170737222763e-06, "loss": 0.0869, "step": 86 }, { "epoch": 0.28665568369028005, "grad_norm": 1.6554862377061288, "learning_rate": 9.943949388250102e-06, "loss": 0.0956, "step": 87 }, { "epoch": 0.2899505766062603, "grad_norm": 1.4959889750287383, "learning_rate": 9.939563761221628e-06, "loss": 0.1011, "step": 88 }, { "epoch": 0.2932454695222405, "grad_norm": 1.90960683038771, "learning_rate": 9.935014001863405e-06, "loss": 0.086, "step": 89 }, { "epoch": 0.29654036243822074, "grad_norm": 1.33889912510083, "learning_rate": 9.930300261355305e-06, "loss": 0.0884, "step": 90 }, { "epoch": 0.29983525535420097, "grad_norm": 1.8364942381661888, "learning_rate": 9.925422696325976e-06, "loss": 0.1198, "step": 91 }, { "epoch": 0.3031301482701812, "grad_norm": 0.9853906443215683, "learning_rate": 9.920381468847648e-06, "loss": 0.0805, "step": 92 }, { "epoch": 0.30642504118616143, "grad_norm": 1.802022784884664, "learning_rate": 9.915176746430746e-06, "loss": 0.1, "step": 93 }, { "epoch": 0.30971993410214166, "grad_norm": 1.6019896189177425, "learning_rate": 9.909808702018315e-06, "loss": 0.1063, "step": 94 }, { "epoch": 0.3130148270181219, "grad_norm": 1.7270522226909808, "learning_rate": 9.904277513980285e-06, "loss": 0.1009, "step": 95 }, { "epoch": 0.3163097199341021, "grad_norm": 1.355594119054743, "learning_rate": 9.898583366107539e-06, "loss": 0.0875, "step": 96 }, { "epoch": 0.31960461285008235, "grad_norm": 1.8076099955595957, "learning_rate": 9.892726447605803e-06, "loss": 0.1236, "step": 97 }, { "epoch": 0.3228995057660626, "grad_norm": 1.6099906623254512, "learning_rate": 9.886706953089364e-06, "loss": 0.0873, "step": 98 }, { "epoch": 0.3261943986820428, "grad_norm": 1.1035445355215792, "learning_rate": 9.880525082574604e-06, "loss": 0.0869, "step": 99 }, { "epoch": 0.32948929159802304, "grad_norm": 1.4670801213842743, "learning_rate": 9.874181041473344e-06, "loss": 0.1076, "step": 100 }, { "epoch": 0.33278418451400327, "grad_norm": 1.1434779708887954, "learning_rate": 9.867675040586035e-06, "loss": 0.0987, "step": 101 }, { "epoch": 0.33607907742998355, "grad_norm": 1.2329688966692105, "learning_rate": 9.861007296094736e-06, "loss": 0.0685, "step": 102 }, { "epoch": 0.3393739703459638, "grad_norm": 1.33973829264815, "learning_rate": 9.854178029555945e-06, "loss": 0.0926, "step": 103 }, { "epoch": 0.342668863261944, "grad_norm": 2.4442395017358565, "learning_rate": 9.847187467893228e-06, "loss": 0.0942, "step": 104 }, { "epoch": 0.34596375617792424, "grad_norm": 1.5545572705565789, "learning_rate": 9.840035843389684e-06, "loss": 0.0849, "step": 105 }, { "epoch": 0.34925864909390447, "grad_norm": 0.8361181058922856, "learning_rate": 9.832723393680222e-06, "loss": 0.0678, "step": 106 }, { "epoch": 0.3525535420098847, "grad_norm": 1.1875767399598742, "learning_rate": 9.825250361743667e-06, "loss": 0.0922, "step": 107 }, { "epoch": 0.35584843492586493, "grad_norm": 1.018855526493989, "learning_rate": 9.817616995894694e-06, "loss": 0.0893, "step": 108 }, { "epoch": 0.35914332784184516, "grad_norm": 0.8793948272396893, "learning_rate": 9.809823549775559e-06, "loss": 0.0816, "step": 109 }, { "epoch": 0.3624382207578254, "grad_norm": 1.1843731418180148, "learning_rate": 9.801870282347686e-06, "loss": 0.0815, "step": 110 }, { "epoch": 0.3657331136738056, "grad_norm": 1.3513059654506845, "learning_rate": 9.793757457883062e-06, "loss": 0.0838, "step": 111 }, { "epoch": 0.36902800658978585, "grad_norm": 0.9877986179117915, "learning_rate": 9.785485345955446e-06, "loss": 0.0873, "step": 112 }, { "epoch": 0.3723228995057661, "grad_norm": 0.8970818631280407, "learning_rate": 9.777054221431418e-06, "loss": 0.0611, "step": 113 }, { "epoch": 0.3756177924217463, "grad_norm": 0.9226544421763598, "learning_rate": 9.768464364461248e-06, "loss": 0.078, "step": 114 }, { "epoch": 0.37891268533772654, "grad_norm": 1.4766754423858683, "learning_rate": 9.75971606046958e-06, "loss": 0.0858, "step": 115 }, { "epoch": 0.38220757825370677, "grad_norm": 1.320312770948565, "learning_rate": 9.750809600145955e-06, "loss": 0.0872, "step": 116 }, { "epoch": 0.385502471169687, "grad_norm": 0.9205622576015271, "learning_rate": 9.741745279435144e-06, "loss": 0.0855, "step": 117 }, { "epoch": 0.38879736408566723, "grad_norm": 2.1130956623266384, "learning_rate": 9.732523399527328e-06, "loss": 0.0869, "step": 118 }, { "epoch": 0.39209225700164746, "grad_norm": 1.346773242530782, "learning_rate": 9.723144266848073e-06, "loss": 0.0891, "step": 119 }, { "epoch": 0.3953871499176277, "grad_norm": 0.954418724615336, "learning_rate": 9.713608193048156e-06, "loss": 0.0927, "step": 120 }, { "epoch": 0.3986820428336079, "grad_norm": 1.1613672266850283, "learning_rate": 9.703915494993215e-06, "loss": 0.0946, "step": 121 }, { "epoch": 0.40197693574958815, "grad_norm": 0.9668677562255413, "learning_rate": 9.694066494753211e-06, "loss": 0.0828, "step": 122 }, { "epoch": 0.4052718286655684, "grad_norm": 1.1556457477946604, "learning_rate": 9.684061519591734e-06, "loss": 0.0926, "step": 123 }, { "epoch": 0.4085667215815486, "grad_norm": 1.0348616873807939, "learning_rate": 9.673900901955118e-06, "loss": 0.0942, "step": 124 }, { "epoch": 0.41186161449752884, "grad_norm": 0.954839683053947, "learning_rate": 9.663584979461407e-06, "loss": 0.0841, "step": 125 }, { "epoch": 0.41515650741350907, "grad_norm": 0.8951987570639582, "learning_rate": 9.653114094889128e-06, "loss": 0.082, "step": 126 }, { "epoch": 0.4184514003294893, "grad_norm": 0.5582851260265275, "learning_rate": 9.642488596165903e-06, "loss": 0.0579, "step": 127 }, { "epoch": 0.42174629324546953, "grad_norm": 0.7282031291743011, "learning_rate": 9.631708836356893e-06, "loss": 0.0686, "step": 128 }, { "epoch": 0.42504118616144976, "grad_norm": 0.7341380700784487, "learning_rate": 9.620775173653055e-06, "loss": 0.0581, "step": 129 }, { "epoch": 0.42833607907743, "grad_norm": 1.0426311808706392, "learning_rate": 9.609687971359254e-06, "loss": 0.0863, "step": 130 }, { "epoch": 0.4316309719934102, "grad_norm": 0.7221932362532579, "learning_rate": 9.598447597882181e-06, "loss": 0.0904, "step": 131 }, { "epoch": 0.43492586490939045, "grad_norm": 1.057905146517346, "learning_rate": 9.587054426718117e-06, "loss": 0.087, "step": 132 }, { "epoch": 0.4382207578253707, "grad_norm": 1.0654844742045295, "learning_rate": 9.575508836440516e-06, "loss": 0.0833, "step": 133 }, { "epoch": 0.4415156507413509, "grad_norm": 0.9708392842496616, "learning_rate": 9.563811210687433e-06, "loss": 0.07, "step": 134 }, { "epoch": 0.44481054365733114, "grad_norm": 0.869842645454385, "learning_rate": 9.551961938148772e-06, "loss": 0.0798, "step": 135 }, { "epoch": 0.44810543657331137, "grad_norm": 0.9291669588809491, "learning_rate": 9.539961412553375e-06, "loss": 0.0717, "step": 136 }, { "epoch": 0.4514003294892916, "grad_norm": 0.971502019921878, "learning_rate": 9.52781003265593e-06, "loss": 0.0936, "step": 137 }, { "epoch": 0.4546952224052718, "grad_norm": 0.7797238947376094, "learning_rate": 9.515508202223735e-06, "loss": 0.0711, "step": 138 }, { "epoch": 0.45799011532125206, "grad_norm": 1.0743726394776476, "learning_rate": 9.503056330023267e-06, "loss": 0.0755, "step": 139 }, { "epoch": 0.4612850082372323, "grad_norm": 1.1954773475892864, "learning_rate": 9.490454829806609e-06, "loss": 0.1304, "step": 140 }, { "epoch": 0.4645799011532125, "grad_norm": 0.5763939890921029, "learning_rate": 9.477704120297698e-06, "loss": 0.0614, "step": 141 }, { "epoch": 0.46787479406919275, "grad_norm": 0.9472151466304063, "learning_rate": 9.464804625178414e-06, "loss": 0.0712, "step": 142 }, { "epoch": 0.471169686985173, "grad_norm": 0.886844022382936, "learning_rate": 9.4517567730745e-06, "loss": 0.0797, "step": 143 }, { "epoch": 0.4744645799011532, "grad_norm": 1.0134166748412918, "learning_rate": 9.438560997541319e-06, "loss": 0.0899, "step": 144 }, { "epoch": 0.47775947281713343, "grad_norm": 0.8070690603660651, "learning_rate": 9.425217737049452e-06, "loss": 0.0826, "step": 145 }, { "epoch": 0.48105436573311366, "grad_norm": 0.8619994488550511, "learning_rate": 9.411727434970121e-06, "loss": 0.086, "step": 146 }, { "epoch": 0.4843492586490939, "grad_norm": 0.8778816857460797, "learning_rate": 9.398090539560465e-06, "loss": 0.0854, "step": 147 }, { "epoch": 0.4876441515650741, "grad_norm": 1.1118672558694775, "learning_rate": 9.384307503948637e-06, "loss": 0.1105, "step": 148 }, { "epoch": 0.49093904448105435, "grad_norm": 1.032735807082786, "learning_rate": 9.370378786118755e-06, "loss": 0.0783, "step": 149 }, { "epoch": 0.4942339373970346, "grad_norm": 1.0495708384737894, "learning_rate": 9.356304848895676e-06, "loss": 0.0815, "step": 150 }, { "epoch": 0.4975288303130148, "grad_norm": 1.0945068947939716, "learning_rate": 9.342086159929629e-06, "loss": 0.0875, "step": 151 }, { "epoch": 0.500823723228995, "grad_norm": 1.1286751183737302, "learning_rate": 9.327723191680666e-06, "loss": 0.0545, "step": 152 }, { "epoch": 0.5041186161449753, "grad_norm": 0.9183247855730471, "learning_rate": 9.31321642140296e-06, "loss": 0.0757, "step": 153 }, { "epoch": 0.5074135090609555, "grad_norm": 1.0502035656993673, "learning_rate": 9.29856633112896e-06, "loss": 0.0809, "step": 154 }, { "epoch": 0.5107084019769358, "grad_norm": 0.8738749178267629, "learning_rate": 9.283773407653363e-06, "loss": 0.0562, "step": 155 }, { "epoch": 0.514003294892916, "grad_norm": 1.0813438666782247, "learning_rate": 9.268838142516943e-06, "loss": 0.085, "step": 156 }, { "epoch": 0.5172981878088962, "grad_norm": 1.0115574109758954, "learning_rate": 9.253761031990218e-06, "loss": 0.0749, "step": 157 }, { "epoch": 0.5205930807248764, "grad_norm": 1.375042365212545, "learning_rate": 9.238542577056957e-06, "loss": 0.078, "step": 158 }, { "epoch": 0.5238879736408567, "grad_norm": 1.59130187611513, "learning_rate": 9.223183283397538e-06, "loss": 0.1029, "step": 159 }, { "epoch": 0.5271828665568369, "grad_norm": 1.1395752837101527, "learning_rate": 9.20768366137214e-06, "loss": 0.1128, "step": 160 }, { "epoch": 0.5304777594728172, "grad_norm": 0.8079597092244131, "learning_rate": 9.19204422600379e-06, "loss": 0.0527, "step": 161 }, { "epoch": 0.5337726523887973, "grad_norm": 1.0336684369348528, "learning_rate": 9.176265496961242e-06, "loss": 0.0828, "step": 162 }, { "epoch": 0.5370675453047776, "grad_norm": 0.8656528491730779, "learning_rate": 9.160347998541722e-06, "loss": 0.0704, "step": 163 }, { "epoch": 0.5403624382207578, "grad_norm": 2.086614798853126, "learning_rate": 9.144292259653493e-06, "loss": 0.104, "step": 164 }, { "epoch": 0.5436573311367381, "grad_norm": 1.5779000987891205, "learning_rate": 9.128098813798291e-06, "loss": 0.0996, "step": 165 }, { "epoch": 0.5469522240527183, "grad_norm": 0.7283818376854391, "learning_rate": 9.111768199053588e-06, "loss": 0.0621, "step": 166 }, { "epoch": 0.5502471169686985, "grad_norm": 1.2155979859224575, "learning_rate": 9.095300958054722e-06, "loss": 0.0653, "step": 167 }, { "epoch": 0.5535420098846787, "grad_norm": 1.3669964434695867, "learning_rate": 9.078697637976861e-06, "loss": 0.1071, "step": 168 }, { "epoch": 0.556836902800659, "grad_norm": 0.659337598200629, "learning_rate": 9.061958790516821e-06, "loss": 0.101, "step": 169 }, { "epoch": 0.5601317957166392, "grad_norm": 3.064428601730586, "learning_rate": 9.045084971874738e-06, "loss": 0.0631, "step": 170 }, { "epoch": 0.5634266886326195, "grad_norm": 1.8617437169334994, "learning_rate": 9.028076742735583e-06, "loss": 0.1062, "step": 171 }, { "epoch": 0.5667215815485996, "grad_norm": 1.005975190266642, "learning_rate": 9.010934668250533e-06, "loss": 0.0706, "step": 172 }, { "epoch": 0.5700164744645799, "grad_norm": 1.2704133524125742, "learning_rate": 8.993659318018191e-06, "loss": 0.1047, "step": 173 }, { "epoch": 0.5733113673805601, "grad_norm": 1.5091840035688024, "learning_rate": 8.976251266065663e-06, "loss": 0.0915, "step": 174 }, { "epoch": 0.5766062602965404, "grad_norm": 0.9983867645520093, "learning_rate": 8.958711090829477e-06, "loss": 0.0868, "step": 175 }, { "epoch": 0.5799011532125206, "grad_norm": 0.9695191782522918, "learning_rate": 8.94103937513637e-06, "loss": 0.0782, "step": 176 }, { "epoch": 0.5831960461285008, "grad_norm": 1.485821220772562, "learning_rate": 8.923236706183923e-06, "loss": 0.088, "step": 177 }, { "epoch": 0.586490939044481, "grad_norm": 0.8778171297271887, "learning_rate": 8.905303675521031e-06, "loss": 0.0675, "step": 178 }, { "epoch": 0.5897858319604613, "grad_norm": 1.4148551976101613, "learning_rate": 8.887240879028276e-06, "loss": 0.0968, "step": 179 }, { "epoch": 0.5930807248764415, "grad_norm": 1.2313668213255908, "learning_rate": 8.869048916898109e-06, "loss": 0.0885, "step": 180 }, { "epoch": 0.5963756177924218, "grad_norm": 0.7935552297095103, "learning_rate": 8.850728393614903e-06, "loss": 0.0919, "step": 181 }, { "epoch": 0.5996705107084019, "grad_norm": 0.6219191194067725, "learning_rate": 8.832279917934881e-06, "loss": 0.0495, "step": 182 }, { "epoch": 0.6029654036243822, "grad_norm": 0.891323405042387, "learning_rate": 8.813704102865881e-06, "loss": 0.1036, "step": 183 }, { "epoch": 0.6062602965403624, "grad_norm": 0.7672391951678563, "learning_rate": 8.795001565646983e-06, "loss": 0.0728, "step": 184 }, { "epoch": 0.6095551894563427, "grad_norm": 0.9915439152928615, "learning_rate": 8.776172927728008e-06, "loss": 0.0744, "step": 185 }, { "epoch": 0.6128500823723229, "grad_norm": 0.7415850366120278, "learning_rate": 8.75721881474886e-06, "loss": 0.0999, "step": 186 }, { "epoch": 0.6161449752883031, "grad_norm": 0.7803751476377428, "learning_rate": 8.738139856518746e-06, "loss": 0.084, "step": 187 }, { "epoch": 0.6194398682042833, "grad_norm": 0.6444369328111929, "learning_rate": 8.718936686995239e-06, "loss": 0.0632, "step": 188 }, { "epoch": 0.6227347611202636, "grad_norm": 1.063336491675898, "learning_rate": 8.699609944263219e-06, "loss": 0.0854, "step": 189 }, { "epoch": 0.6260296540362438, "grad_norm": 0.5254825539939502, "learning_rate": 8.680160270513671e-06, "loss": 0.0658, "step": 190 }, { "epoch": 0.6293245469522241, "grad_norm": 2.1957036152343097, "learning_rate": 8.660588312022345e-06, "loss": 0.0767, "step": 191 }, { "epoch": 0.6326194398682042, "grad_norm": 1.1425043272398887, "learning_rate": 8.640894719128274e-06, "loss": 0.1092, "step": 192 }, { "epoch": 0.6359143327841845, "grad_norm": 0.6811077233527263, "learning_rate": 8.621080146212181e-06, "loss": 0.0552, "step": 193 }, { "epoch": 0.6392092257001647, "grad_norm": 1.0217179933628129, "learning_rate": 8.601145251674718e-06, "loss": 0.0749, "step": 194 }, { "epoch": 0.642504118616145, "grad_norm": 0.9919309113126284, "learning_rate": 8.581090697914602e-06, "loss": 0.0929, "step": 195 }, { "epoch": 0.6457990115321252, "grad_norm": 1.178486792691301, "learning_rate": 8.560917151306594e-06, "loss": 0.1023, "step": 196 }, { "epoch": 0.6490939044481054, "grad_norm": 1.155779967707836, "learning_rate": 8.540625282179364e-06, "loss": 0.0821, "step": 197 }, { "epoch": 0.6523887973640856, "grad_norm": 1.0278193202776953, "learning_rate": 8.520215764793214e-06, "loss": 0.0739, "step": 198 }, { "epoch": 0.6556836902800659, "grad_norm": 5.872968813903652, "learning_rate": 8.499689277317675e-06, "loss": 0.0763, "step": 199 }, { "epoch": 0.6589785831960461, "grad_norm": 1.4949053388183082, "learning_rate": 8.479046501808971e-06, "loss": 0.0696, "step": 200 }, { "epoch": 0.6622734761120264, "grad_norm": 0.9342591652212858, "learning_rate": 8.45828812418736e-06, "loss": 0.0629, "step": 201 }, { "epoch": 0.6655683690280065, "grad_norm": 0.5142904539906094, "learning_rate": 8.437414834214333e-06, "loss": 0.0653, "step": 202 }, { "epoch": 0.6688632619439868, "grad_norm": 1.3243894037794444, "learning_rate": 8.416427325469705e-06, "loss": 0.1095, "step": 203 }, { "epoch": 0.6721581548599671, "grad_norm": 1.3555237832265656, "learning_rate": 8.395326295328562e-06, "loss": 0.1028, "step": 204 }, { "epoch": 0.6754530477759473, "grad_norm": 0.818855723195901, "learning_rate": 8.374112444938094e-06, "loss": 0.088, "step": 205 }, { "epoch": 0.6787479406919276, "grad_norm": 0.8852074571699485, "learning_rate": 8.352786479194288e-06, "loss": 0.0526, "step": 206 }, { "epoch": 0.6820428336079077, "grad_norm": 1.372137747701387, "learning_rate": 8.331349106718515e-06, "loss": 0.0957, "step": 207 }, { "epoch": 0.685337726523888, "grad_norm": 0.8727917319312107, "learning_rate": 8.309801039833978e-06, "loss": 0.0895, "step": 208 }, { "epoch": 0.6886326194398682, "grad_norm": 1.0509761315988109, "learning_rate": 8.28814299454205e-06, "loss": 0.0996, "step": 209 }, { "epoch": 0.6919275123558485, "grad_norm": 0.8839477803791086, "learning_rate": 8.266375690498475e-06, "loss": 0.0865, "step": 210 }, { "epoch": 0.6952224052718287, "grad_norm": 0.7064553470448757, "learning_rate": 8.244499850989453e-06, "loss": 0.0728, "step": 211 }, { "epoch": 0.6985172981878089, "grad_norm": 0.9694201547278781, "learning_rate": 8.22251620290762e-06, "loss": 0.0581, "step": 212 }, { "epoch": 0.7018121911037891, "grad_norm": 0.724783690835583, "learning_rate": 8.20042547672788e-06, "loss": 0.0828, "step": 213 }, { "epoch": 0.7051070840197694, "grad_norm": 0.7137797204098316, "learning_rate": 8.178228406483145e-06, "loss": 0.0707, "step": 214 }, { "epoch": 0.7084019769357496, "grad_norm": 0.4606523745916078, "learning_rate": 8.15592572973993e-06, "loss": 0.044, "step": 215 }, { "epoch": 0.7116968698517299, "grad_norm": 0.6640092591585782, "learning_rate": 8.133518187573864e-06, "loss": 0.0561, "step": 216 }, { "epoch": 0.71499176276771, "grad_norm": 0.8170937626114954, "learning_rate": 8.111006524545043e-06, "loss": 0.0823, "step": 217 }, { "epoch": 0.7182866556836903, "grad_norm": 0.6372182616249538, "learning_rate": 8.088391488673313e-06, "loss": 0.066, "step": 218 }, { "epoch": 0.7215815485996705, "grad_norm": 0.6375067936631406, "learning_rate": 8.065673831413396e-06, "loss": 0.0506, "step": 219 }, { "epoch": 0.7248764415156508, "grad_norm": 0.6423309999365212, "learning_rate": 8.042854307629932e-06, "loss": 0.0629, "step": 220 }, { "epoch": 0.728171334431631, "grad_norm": 0.752582677343103, "learning_rate": 8.019933675572389e-06, "loss": 0.0722, "step": 221 }, { "epoch": 0.7314662273476112, "grad_norm": 0.8112402585396246, "learning_rate": 7.996912696849873e-06, "loss": 0.0842, "step": 222 }, { "epoch": 0.7347611202635914, "grad_norm": 0.6919487336036387, "learning_rate": 7.97379213640582e-06, "loss": 0.0684, "step": 223 }, { "epoch": 0.7380560131795717, "grad_norm": 0.7434807539917022, "learning_rate": 7.950572762492577e-06, "loss": 0.0682, "step": 224 }, { "epoch": 0.7413509060955519, "grad_norm": 0.6848656249105235, "learning_rate": 7.927255346645872e-06, "loss": 0.0546, "step": 225 }, { "epoch": 0.7446457990115322, "grad_norm": 0.7826479624770332, "learning_rate": 7.903840663659186e-06, "loss": 0.0684, "step": 226 }, { "epoch": 0.7479406919275123, "grad_norm": 0.6927518800734283, "learning_rate": 7.880329491557996e-06, "loss": 0.079, "step": 227 }, { "epoch": 0.7512355848434926, "grad_norm": 0.8763045243203113, "learning_rate": 7.856722611573938e-06, "loss": 0.1068, "step": 228 }, { "epoch": 0.7545304777594728, "grad_norm": 0.8300681217056403, "learning_rate": 7.83302080811883e-06, "loss": 0.0667, "step": 229 }, { "epoch": 0.7578253706754531, "grad_norm": 0.5437395859594083, "learning_rate": 7.809224868758621e-06, "loss": 0.0671, "step": 230 }, { "epoch": 0.7611202635914333, "grad_norm": 0.7134167417475868, "learning_rate": 7.78533558418722e-06, "loss": 0.079, "step": 231 }, { "epoch": 0.7644151565074135, "grad_norm": 0.8367162252369527, "learning_rate": 7.761353748200213e-06, "loss": 0.075, "step": 232 }, { "epoch": 0.7677100494233937, "grad_norm": 0.6993381735068975, "learning_rate": 7.737280157668503e-06, "loss": 0.0665, "step": 233 }, { "epoch": 0.771004942339374, "grad_norm": 0.644489745443266, "learning_rate": 7.713115612511815e-06, "loss": 0.0704, "step": 234 }, { "epoch": 0.7742998352553542, "grad_norm": 0.6337392482963783, "learning_rate": 7.688860915672129e-06, "loss": 0.0487, "step": 235 }, { "epoch": 0.7775947281713345, "grad_norm": 0.4306575759208823, "learning_rate": 7.664516873086987e-06, "loss": 0.0498, "step": 236 }, { "epoch": 0.7808896210873146, "grad_norm": 0.6371209076121114, "learning_rate": 7.640084293662731e-06, "loss": 0.0581, "step": 237 }, { "epoch": 0.7841845140032949, "grad_norm": 0.809205628205596, "learning_rate": 7.615563989247604e-06, "loss": 0.0886, "step": 238 }, { "epoch": 0.7874794069192751, "grad_norm": 0.6807826450879982, "learning_rate": 7.590956774604791e-06, "loss": 0.0824, "step": 239 }, { "epoch": 0.7907742998352554, "grad_norm": 0.9092838195300236, "learning_rate": 7.566263467385335e-06, "loss": 0.0703, "step": 240 }, { "epoch": 0.7940691927512356, "grad_norm": 0.736565350841279, "learning_rate": 7.541484888100974e-06, "loss": 0.0695, "step": 241 }, { "epoch": 0.7973640856672158, "grad_norm": 0.7220288466907268, "learning_rate": 7.516621860096873e-06, "loss": 0.0707, "step": 242 }, { "epoch": 0.800658978583196, "grad_norm": 0.6829838831547227, "learning_rate": 7.491675209524272e-06, "loss": 0.0666, "step": 243 }, { "epoch": 0.8039538714991763, "grad_norm": 0.8226949141177504, "learning_rate": 7.466645765313023e-06, "loss": 0.0752, "step": 244 }, { "epoch": 0.8072487644151565, "grad_norm": 0.5909405519820083, "learning_rate": 7.4415343591440604e-06, "loss": 0.0582, "step": 245 }, { "epoch": 0.8105436573311368, "grad_norm": 0.7318765147109815, "learning_rate": 7.416341825421755e-06, "loss": 0.078, "step": 246 }, { "epoch": 0.8138385502471169, "grad_norm": 0.7063912838195767, "learning_rate": 7.391069001246193e-06, "loss": 0.0868, "step": 247 }, { "epoch": 0.8171334431630972, "grad_norm": 0.6799477779267012, "learning_rate": 7.365716726385361e-06, "loss": 0.0681, "step": 248 }, { "epoch": 0.8204283360790774, "grad_norm": 0.8516971338664023, "learning_rate": 7.3402858432472416e-06, "loss": 0.0761, "step": 249 }, { "epoch": 0.8237232289950577, "grad_norm": 0.8051104503646311, "learning_rate": 7.3147771968518175e-06, "loss": 0.077, "step": 250 }, { "epoch": 0.8270181219110379, "grad_norm": 0.8417638265928152, "learning_rate": 7.289191634803002e-06, "loss": 0.0721, "step": 251 }, { "epoch": 0.8303130148270181, "grad_norm": 0.9280576906426667, "learning_rate": 7.263530007260466e-06, "loss": 0.0839, "step": 252 }, { "epoch": 0.8336079077429983, "grad_norm": 0.8205604877193189, "learning_rate": 7.2377931669113934e-06, "loss": 0.084, "step": 253 }, { "epoch": 0.8369028006589786, "grad_norm": 0.7347246169190605, "learning_rate": 7.211981968942147e-06, "loss": 0.0508, "step": 254 }, { "epoch": 0.8401976935749588, "grad_norm": 0.7727540137134915, "learning_rate": 7.186097271009852e-06, "loss": 0.0504, "step": 255 }, { "epoch": 0.8434925864909391, "grad_norm": 0.6116838823458901, "learning_rate": 7.160139933213899e-06, "loss": 0.0533, "step": 256 }, { "epoch": 0.8467874794069192, "grad_norm": 0.8518782068127816, "learning_rate": 7.134110818067361e-06, "loss": 0.0775, "step": 257 }, { "epoch": 0.8500823723228995, "grad_norm": 0.9449160515812749, "learning_rate": 7.1080107904683405e-06, "loss": 0.0721, "step": 258 }, { "epoch": 0.8533772652388797, "grad_norm": 0.6964873142430633, "learning_rate": 7.08184071767122e-06, "loss": 0.0673, "step": 259 }, { "epoch": 0.85667215815486, "grad_norm": 0.768104304709271, "learning_rate": 7.0556014692578554e-06, "loss": 0.0749, "step": 260 }, { "epoch": 0.8599670510708401, "grad_norm": 0.7599189113700034, "learning_rate": 7.029293917108678e-06, "loss": 0.0684, "step": 261 }, { "epoch": 0.8632619439868204, "grad_norm": 0.777387517223909, "learning_rate": 7.0029189353737195e-06, "loss": 0.0656, "step": 262 }, { "epoch": 0.8665568369028006, "grad_norm": 0.7045793540209936, "learning_rate": 6.9764774004435685e-06, "loss": 0.0619, "step": 263 }, { "epoch": 0.8698517298187809, "grad_norm": 0.6234760268316166, "learning_rate": 6.949970190920255e-06, "loss": 0.0708, "step": 264 }, { "epoch": 0.8731466227347611, "grad_norm": 0.7124980322892176, "learning_rate": 6.9233981875880416e-06, "loss": 0.0521, "step": 265 }, { "epoch": 0.8764415156507414, "grad_norm": 0.8490902000387839, "learning_rate": 6.896762273384179e-06, "loss": 0.0632, "step": 266 }, { "epoch": 0.8797364085667215, "grad_norm": 0.6944201065528963, "learning_rate": 6.870063333369543e-06, "loss": 0.0716, "step": 267 }, { "epoch": 0.8830313014827018, "grad_norm": 0.758349126043532, "learning_rate": 6.8433022546992444e-06, "loss": 0.0596, "step": 268 }, { "epoch": 0.886326194398682, "grad_norm": 1.2664444257431744, "learning_rate": 6.81647992659314e-06, "loss": 0.0628, "step": 269 }, { "epoch": 0.8896210873146623, "grad_norm": 0.8379324844684077, "learning_rate": 6.789597240306295e-06, "loss": 0.0674, "step": 270 }, { "epoch": 0.8929159802306426, "grad_norm": 0.8462600835900949, "learning_rate": 6.762655089099353e-06, "loss": 0.0659, "step": 271 }, { "epoch": 0.8962108731466227, "grad_norm": 0.9094387161179498, "learning_rate": 6.735654368208875e-06, "loss": 0.0623, "step": 272 }, { "epoch": 0.899505766062603, "grad_norm": 0.7877875224066865, "learning_rate": 6.7085959748175685e-06, "loss": 0.0696, "step": 273 }, { "epoch": 0.9028006589785832, "grad_norm": 0.6514864513423558, "learning_rate": 6.681480808024503e-06, "loss": 0.0766, "step": 274 }, { "epoch": 0.9060955518945635, "grad_norm": 1.148236164352365, "learning_rate": 6.654309768815208e-06, "loss": 0.0903, "step": 275 }, { "epoch": 0.9093904448105437, "grad_norm": 0.7078109102899715, "learning_rate": 6.627083760031755e-06, "loss": 0.0607, "step": 276 }, { "epoch": 0.9126853377265239, "grad_norm": 0.613094345393223, "learning_rate": 6.599803686342748e-06, "loss": 0.0655, "step": 277 }, { "epoch": 0.9159802306425041, "grad_norm": 0.6642339763695972, "learning_rate": 6.572470454213266e-06, "loss": 0.0731, "step": 278 }, { "epoch": 0.9192751235584844, "grad_norm": 0.6971630112819691, "learning_rate": 6.545084971874738e-06, "loss": 0.0473, "step": 279 }, { "epoch": 0.9225700164744646, "grad_norm": 0.7592858638911076, "learning_rate": 6.517648149294774e-06, "loss": 0.0581, "step": 280 }, { "epoch": 0.9258649093904449, "grad_norm": 0.7189143571066544, "learning_rate": 6.490160898146919e-06, "loss": 0.0733, "step": 281 }, { "epoch": 0.929159802306425, "grad_norm": 0.8305599945381572, "learning_rate": 6.4626241317803665e-06, "loss": 0.0807, "step": 282 }, { "epoch": 0.9324546952224053, "grad_norm": 0.8787944618632045, "learning_rate": 6.4350387651896025e-06, "loss": 0.0648, "step": 283 }, { "epoch": 0.9357495881383855, "grad_norm": 0.649270561331511, "learning_rate": 6.407405714984011e-06, "loss": 0.0921, "step": 284 }, { "epoch": 0.9390444810543658, "grad_norm": 0.9873611661857511, "learning_rate": 6.379725899357408e-06, "loss": 0.0847, "step": 285 }, { "epoch": 0.942339373970346, "grad_norm": 0.8338719043181901, "learning_rate": 6.3520002380575395e-06, "loss": 0.0673, "step": 286 }, { "epoch": 0.9456342668863262, "grad_norm": 0.8390156519820746, "learning_rate": 6.324229652355513e-06, "loss": 0.0626, "step": 287 }, { "epoch": 0.9489291598023064, "grad_norm": 0.7197773939188823, "learning_rate": 6.29641506501519e-06, "loss": 0.0864, "step": 288 }, { "epoch": 0.9522240527182867, "grad_norm": 0.942984980454084, "learning_rate": 6.2685574002625235e-06, "loss": 0.0686, "step": 289 }, { "epoch": 0.9555189456342669, "grad_norm": 0.9649936636393807, "learning_rate": 6.2406575837548455e-06, "loss": 0.0599, "step": 290 }, { "epoch": 0.9588138385502472, "grad_norm": 0.6889881534410974, "learning_rate": 6.212716542550112e-06, "loss": 0.101, "step": 291 }, { "epoch": 0.9621087314662273, "grad_norm": 0.9632795509211302, "learning_rate": 6.184735205076097e-06, "loss": 0.0773, "step": 292 }, { "epoch": 0.9654036243822076, "grad_norm": 1.0400767819370376, "learning_rate": 6.156714501099544e-06, "loss": 0.0638, "step": 293 }, { "epoch": 0.9686985172981878, "grad_norm": 1.0147243725605253, "learning_rate": 6.1286553616952705e-06, "loss": 0.0593, "step": 294 }, { "epoch": 0.9719934102141681, "grad_norm": 0.6613193470791487, "learning_rate": 6.100558719215228e-06, "loss": 0.0632, "step": 295 }, { "epoch": 0.9752883031301482, "grad_norm": 1.0408938474730054, "learning_rate": 6.072425507257528e-06, "loss": 0.0876, "step": 296 }, { "epoch": 0.9785831960461285, "grad_norm": 0.712701647042842, "learning_rate": 6.044256660635412e-06, "loss": 0.0733, "step": 297 }, { "epoch": 0.9818780889621087, "grad_norm": 0.6397491114376809, "learning_rate": 6.016053115346197e-06, "loss": 0.0561, "step": 298 }, { "epoch": 0.985172981878089, "grad_norm": 0.7191102659386986, "learning_rate": 5.987815808540169e-06, "loss": 0.0791, "step": 299 }, { "epoch": 0.9884678747940692, "grad_norm": 0.4709712102337363, "learning_rate": 5.959545678489447e-06, "loss": 0.0475, "step": 300 }, { "epoch": 0.9917627677100495, "grad_norm": 0.8715274588578796, "learning_rate": 5.931243664556803e-06, "loss": 0.0771, "step": 301 }, { "epoch": 0.9950576606260296, "grad_norm": 0.7017524340447387, "learning_rate": 5.902910707164449e-06, "loss": 0.0712, "step": 302 }, { "epoch": 0.9983525535420099, "grad_norm": 0.7619744594259967, "learning_rate": 5.874547747762792e-06, "loss": 0.0585, "step": 303 }, { "epoch": 0.9983525535420099, "eval_loss": 0.07007648050785065, "eval_runtime": 143.0638, "eval_samples_per_second": 35.683, "eval_steps_per_second": 1.118, "step": 303 }, { "epoch": 1.00164744645799, "grad_norm": 0.5917774055195716, "learning_rate": 5.8461557287991455e-06, "loss": 0.0686, "step": 304 }, { "epoch": 1.0049423393739703, "grad_norm": 0.4973762275932349, "learning_rate": 5.81773559368642e-06, "loss": 0.0524, "step": 305 }, { "epoch": 1.0082372322899507, "grad_norm": 0.5021975231329254, "learning_rate": 5.7892882867717705e-06, "loss": 0.0577, "step": 306 }, { "epoch": 1.0115321252059308, "grad_norm": 0.704352626743678, "learning_rate": 5.7608147533052194e-06, "loss": 0.0509, "step": 307 }, { "epoch": 1.014827018121911, "grad_norm": 1.189723828759097, "learning_rate": 5.732315939408251e-06, "loss": 0.0815, "step": 308 }, { "epoch": 1.0181219110378912, "grad_norm": 0.6036027009145574, "learning_rate": 5.703792792042363e-06, "loss": 0.0556, "step": 309 }, { "epoch": 1.0214168039538716, "grad_norm": 0.5342904909103813, "learning_rate": 5.675246258977617e-06, "loss": 0.0487, "step": 310 }, { "epoch": 1.0247116968698518, "grad_norm": 0.46763620767148034, "learning_rate": 5.646677288761132e-06, "loss": 0.0491, "step": 311 }, { "epoch": 1.028006589785832, "grad_norm": 0.5696375911949768, "learning_rate": 5.618086830685569e-06, "loss": 0.047, "step": 312 }, { "epoch": 1.031301482701812, "grad_norm": 0.38600791899244996, "learning_rate": 5.589475834757595e-06, "loss": 0.032, "step": 313 }, { "epoch": 1.0345963756177925, "grad_norm": 1.0072710877393638, "learning_rate": 5.560845251666307e-06, "loss": 0.063, "step": 314 }, { "epoch": 1.0378912685337727, "grad_norm": 0.663725882779124, "learning_rate": 5.532196032751647e-06, "loss": 0.0563, "step": 315 }, { "epoch": 1.0411861614497528, "grad_norm": 0.6135177621912624, "learning_rate": 5.503529129972792e-06, "loss": 0.0514, "step": 316 }, { "epoch": 1.044481054365733, "grad_norm": 0.7549455934476204, "learning_rate": 5.474845495876518e-06, "loss": 0.0563, "step": 317 }, { "epoch": 1.0477759472817134, "grad_norm": 0.8244910748727189, "learning_rate": 5.4461460835655535e-06, "loss": 0.0804, "step": 318 }, { "epoch": 1.0510708401976936, "grad_norm": 0.604488079236042, "learning_rate": 5.417431846666903e-06, "loss": 0.0679, "step": 319 }, { "epoch": 1.0543657331136738, "grad_norm": 0.5136265587955748, "learning_rate": 5.388703739300167e-06, "loss": 0.0388, "step": 320 }, { "epoch": 1.057660626029654, "grad_norm": 0.6007478171198604, "learning_rate": 5.359962716045836e-06, "loss": 0.0632, "step": 321 }, { "epoch": 1.0609555189456343, "grad_norm": 0.4928892879154173, "learning_rate": 5.331209731913568e-06, "loss": 0.058, "step": 322 }, { "epoch": 1.0642504118616145, "grad_norm": 0.5300520318408385, "learning_rate": 5.30244574231046e-06, "loss": 0.0528, "step": 323 }, { "epoch": 1.0675453047775947, "grad_norm": 0.49159187140329286, "learning_rate": 5.273671703009301e-06, "loss": 0.046, "step": 324 }, { "epoch": 1.0708401976935749, "grad_norm": 0.804620211006138, "learning_rate": 5.2448885701168094e-06, "loss": 0.0601, "step": 325 }, { "epoch": 1.0741350906095553, "grad_norm": 0.4648011852930538, "learning_rate": 5.21609730004187e-06, "loss": 0.0438, "step": 326 }, { "epoch": 1.0774299835255354, "grad_norm": 0.5362596735899865, "learning_rate": 5.187298849463748e-06, "loss": 0.0507, "step": 327 }, { "epoch": 1.0807248764415156, "grad_norm": 0.5443586783585722, "learning_rate": 5.158494175300304e-06, "loss": 0.053, "step": 328 }, { "epoch": 1.084019769357496, "grad_norm": 0.6076056192307563, "learning_rate": 5.129684234676195e-06, "loss": 0.0594, "step": 329 }, { "epoch": 1.0873146622734762, "grad_norm": 0.9033252357763137, "learning_rate": 5.100869984891077e-06, "loss": 0.06, "step": 330 }, { "epoch": 1.0906095551894563, "grad_norm": 0.454480847306655, "learning_rate": 5.072052383387787e-06, "loss": 0.0424, "step": 331 }, { "epoch": 1.0939044481054365, "grad_norm": 0.46517988927206794, "learning_rate": 5.043232387720532e-06, "loss": 0.0443, "step": 332 }, { "epoch": 1.0971993410214167, "grad_norm": 0.4148720401510593, "learning_rate": 5.014410955523079e-06, "loss": 0.0387, "step": 333 }, { "epoch": 1.100494233937397, "grad_norm": 0.5146539821704307, "learning_rate": 4.9855890444769226e-06, "loss": 0.0563, "step": 334 }, { "epoch": 1.1037891268533773, "grad_norm": 0.5267211782218569, "learning_rate": 4.956767612279468e-06, "loss": 0.044, "step": 335 }, { "epoch": 1.1070840197693574, "grad_norm": 0.5731696810590752, "learning_rate": 4.927947616612216e-06, "loss": 0.0469, "step": 336 }, { "epoch": 1.1103789126853378, "grad_norm": 0.4606767989043497, "learning_rate": 4.899130015108923e-06, "loss": 0.0556, "step": 337 }, { "epoch": 1.113673805601318, "grad_norm": 0.5591348812226693, "learning_rate": 4.8703157653238055e-06, "loss": 0.0526, "step": 338 }, { "epoch": 1.1169686985172982, "grad_norm": 0.5103079438074868, "learning_rate": 4.841505824699697e-06, "loss": 0.0651, "step": 339 }, { "epoch": 1.1202635914332784, "grad_norm": 0.6163138349681117, "learning_rate": 4.812701150536254e-06, "loss": 0.0509, "step": 340 }, { "epoch": 1.1235584843492585, "grad_norm": 0.4842115475256147, "learning_rate": 4.78390269995813e-06, "loss": 0.035, "step": 341 }, { "epoch": 1.126853377265239, "grad_norm": 0.4047877822645327, "learning_rate": 4.755111429883191e-06, "loss": 0.0342, "step": 342 }, { "epoch": 1.130148270181219, "grad_norm": 0.5782935405242332, "learning_rate": 4.726328296990699e-06, "loss": 0.0416, "step": 343 }, { "epoch": 1.1334431630971993, "grad_norm": 0.5846524401590787, "learning_rate": 4.697554257689541e-06, "loss": 0.0419, "step": 344 }, { "epoch": 1.1367380560131797, "grad_norm": 0.5096985328650335, "learning_rate": 4.668790268086432e-06, "loss": 0.044, "step": 345 }, { "epoch": 1.1400329489291599, "grad_norm": 0.5796683420196656, "learning_rate": 4.640037283954165e-06, "loss": 0.0634, "step": 346 }, { "epoch": 1.14332784184514, "grad_norm": 0.5897186824110954, "learning_rate": 4.611296260699833e-06, "loss": 0.0511, "step": 347 }, { "epoch": 1.1466227347611202, "grad_norm": 0.5043407904517478, "learning_rate": 4.582568153333098e-06, "loss": 0.0474, "step": 348 }, { "epoch": 1.1499176276771004, "grad_norm": 0.49203813884361564, "learning_rate": 4.553853916434448e-06, "loss": 0.0399, "step": 349 }, { "epoch": 1.1532125205930808, "grad_norm": 0.6380533145833258, "learning_rate": 4.525154504123483e-06, "loss": 0.0628, "step": 350 }, { "epoch": 1.156507413509061, "grad_norm": 0.6307435685302706, "learning_rate": 4.496470870027209e-06, "loss": 0.0544, "step": 351 }, { "epoch": 1.1598023064250411, "grad_norm": 0.58051661483701, "learning_rate": 4.467803967248354e-06, "loss": 0.0549, "step": 352 }, { "epoch": 1.1630971993410215, "grad_norm": 0.45506940053593953, "learning_rate": 4.439154748333695e-06, "loss": 0.0455, "step": 353 }, { "epoch": 1.1663920922570017, "grad_norm": 0.4477960561383021, "learning_rate": 4.410524165242407e-06, "loss": 0.0417, "step": 354 }, { "epoch": 1.1696869851729819, "grad_norm": 0.5024790404868378, "learning_rate": 4.381913169314432e-06, "loss": 0.0483, "step": 355 }, { "epoch": 1.172981878088962, "grad_norm": 0.43352510094853813, "learning_rate": 4.3533227112388694e-06, "loss": 0.0381, "step": 356 }, { "epoch": 1.1762767710049424, "grad_norm": 0.8015757322992388, "learning_rate": 4.324753741022383e-06, "loss": 0.0589, "step": 357 }, { "epoch": 1.1795716639209226, "grad_norm": 0.554923192898479, "learning_rate": 4.296207207957638e-06, "loss": 0.0469, "step": 358 }, { "epoch": 1.1828665568369028, "grad_norm": 0.4540612599730088, "learning_rate": 4.26768406059175e-06, "loss": 0.0469, "step": 359 }, { "epoch": 1.186161449752883, "grad_norm": 0.4977016265485015, "learning_rate": 4.239185246694781e-06, "loss": 0.0486, "step": 360 }, { "epoch": 1.1894563426688634, "grad_norm": 0.5773178206107633, "learning_rate": 4.21071171322823e-06, "loss": 0.0588, "step": 361 }, { "epoch": 1.1927512355848435, "grad_norm": 0.5714806332591411, "learning_rate": 4.182264406313582e-06, "loss": 0.0473, "step": 362 }, { "epoch": 1.1960461285008237, "grad_norm": 0.5399317568380463, "learning_rate": 4.1538442712008545e-06, "loss": 0.0515, "step": 363 }, { "epoch": 1.1993410214168039, "grad_norm": 0.5077736606662918, "learning_rate": 4.12545225223721e-06, "loss": 0.0473, "step": 364 }, { "epoch": 1.2026359143327843, "grad_norm": 0.65833510309246, "learning_rate": 4.097089292835551e-06, "loss": 0.0574, "step": 365 }, { "epoch": 1.2059308072487644, "grad_norm": 0.5750314764693017, "learning_rate": 4.0687563354431986e-06, "loss": 0.033, "step": 366 }, { "epoch": 1.2092257001647446, "grad_norm": 0.6672168173906087, "learning_rate": 4.040454321510554e-06, "loss": 0.0507, "step": 367 }, { "epoch": 1.2125205930807248, "grad_norm": 0.46572043828398524, "learning_rate": 4.012184191459832e-06, "loss": 0.0448, "step": 368 }, { "epoch": 1.2158154859967052, "grad_norm": 0.5294456067061011, "learning_rate": 3.983946884653804e-06, "loss": 0.0421, "step": 369 }, { "epoch": 1.2191103789126854, "grad_norm": 0.7181848630920071, "learning_rate": 3.95574333936459e-06, "loss": 0.0609, "step": 370 }, { "epoch": 1.2224052718286655, "grad_norm": 0.4872681980462519, "learning_rate": 3.927574492742473e-06, "loss": 0.0332, "step": 371 }, { "epoch": 1.2257001647446457, "grad_norm": 0.5978073219647344, "learning_rate": 3.899441280784773e-06, "loss": 0.0557, "step": 372 }, { "epoch": 1.2289950576606261, "grad_norm": 0.49268040219816195, "learning_rate": 3.8713446383047295e-06, "loss": 0.0539, "step": 373 }, { "epoch": 1.2322899505766063, "grad_norm": 0.553488767277818, "learning_rate": 3.843285498900457e-06, "loss": 0.0438, "step": 374 }, { "epoch": 1.2355848434925865, "grad_norm": 0.5769809240481462, "learning_rate": 3.815264794923903e-06, "loss": 0.0438, "step": 375 }, { "epoch": 1.2388797364085666, "grad_norm": 0.4680099999633115, "learning_rate": 3.7872834574498894e-06, "loss": 0.0391, "step": 376 }, { "epoch": 1.242174629324547, "grad_norm": 0.4990397184455205, "learning_rate": 3.7593424162451553e-06, "loss": 0.0513, "step": 377 }, { "epoch": 1.2454695222405272, "grad_norm": 0.5670279278262034, "learning_rate": 3.731442599737478e-06, "loss": 0.0611, "step": 378 }, { "epoch": 1.2487644151565074, "grad_norm": 0.4178810778744549, "learning_rate": 3.70358493498481e-06, "loss": 0.0461, "step": 379 }, { "epoch": 1.2520593080724876, "grad_norm": 0.5498450231361147, "learning_rate": 3.6757703476444885e-06, "loss": 0.0372, "step": 380 }, { "epoch": 1.255354200988468, "grad_norm": 0.45367014770072983, "learning_rate": 3.6479997619424605e-06, "loss": 0.0423, "step": 381 }, { "epoch": 1.2586490939044481, "grad_norm": 0.4294200611194709, "learning_rate": 3.620274100642593e-06, "loss": 0.0552, "step": 382 }, { "epoch": 1.2619439868204283, "grad_norm": 0.6276700882265509, "learning_rate": 3.5925942850159895e-06, "loss": 0.0659, "step": 383 }, { "epoch": 1.2652388797364087, "grad_norm": 0.7113783547292587, "learning_rate": 3.564961234810399e-06, "loss": 0.067, "step": 384 }, { "epoch": 1.2685337726523889, "grad_norm": 0.6367177743488461, "learning_rate": 3.5373758682196347e-06, "loss": 0.0626, "step": 385 }, { "epoch": 1.271828665568369, "grad_norm": 0.6068919065481327, "learning_rate": 3.509839101853082e-06, "loss": 0.0546, "step": 386 }, { "epoch": 1.2751235584843492, "grad_norm": 0.742600911574775, "learning_rate": 3.4823518507052277e-06, "loss": 0.061, "step": 387 }, { "epoch": 1.2784184514003294, "grad_norm": 0.4142179254874713, "learning_rate": 3.4549150281252635e-06, "loss": 0.0405, "step": 388 }, { "epoch": 1.2817133443163098, "grad_norm": 0.7545310044049625, "learning_rate": 3.427529545786736e-06, "loss": 0.055, "step": 389 }, { "epoch": 1.28500823723229, "grad_norm": 0.5556958907162003, "learning_rate": 3.400196313657253e-06, "loss": 0.0469, "step": 390 }, { "epoch": 1.2883031301482701, "grad_norm": 0.5082442265119059, "learning_rate": 3.372916239968246e-06, "loss": 0.048, "step": 391 }, { "epoch": 1.2915980230642505, "grad_norm": 0.5683891171997948, "learning_rate": 3.345690231184794e-06, "loss": 0.0413, "step": 392 }, { "epoch": 1.2948929159802307, "grad_norm": 0.6720011431709395, "learning_rate": 3.318519191975499e-06, "loss": 0.0604, "step": 393 }, { "epoch": 1.2981878088962109, "grad_norm": 0.5633138961258451, "learning_rate": 3.291404025182432e-06, "loss": 0.065, "step": 394 }, { "epoch": 1.301482701812191, "grad_norm": 0.7184125904469478, "learning_rate": 3.264345631791127e-06, "loss": 0.0653, "step": 395 }, { "epoch": 1.3047775947281712, "grad_norm": 0.6249487221408845, "learning_rate": 3.2373449109006476e-06, "loss": 0.0476, "step": 396 }, { "epoch": 1.3080724876441516, "grad_norm": 0.8032427870564648, "learning_rate": 3.210402759693706e-06, "loss": 0.0731, "step": 397 }, { "epoch": 1.3113673805601318, "grad_norm": 0.56609919677685, "learning_rate": 3.1835200734068604e-06, "loss": 0.0484, "step": 398 }, { "epoch": 1.314662273476112, "grad_norm": 0.6259799233731378, "learning_rate": 3.1566977453007564e-06, "loss": 0.0621, "step": 399 }, { "epoch": 1.3179571663920924, "grad_norm": 0.5571126132586377, "learning_rate": 3.1299366666304586e-06, "loss": 0.0536, "step": 400 }, { "epoch": 1.3212520593080725, "grad_norm": 0.7177292111816991, "learning_rate": 3.103237726615822e-06, "loss": 0.0634, "step": 401 }, { "epoch": 1.3245469522240527, "grad_norm": 0.4655411649268851, "learning_rate": 3.076601812411959e-06, "loss": 0.0386, "step": 402 }, { "epoch": 1.327841845140033, "grad_norm": 0.5171630532203868, "learning_rate": 3.0500298090797465e-06, "loss": 0.0483, "step": 403 }, { "epoch": 1.331136738056013, "grad_norm": 0.624563937797765, "learning_rate": 3.0235225995564323e-06, "loss": 0.0556, "step": 404 }, { "epoch": 1.3344316309719935, "grad_norm": 0.5622033908392009, "learning_rate": 2.9970810646262805e-06, "loss": 0.0478, "step": 405 }, { "epoch": 1.3377265238879736, "grad_norm": 0.4858535947041361, "learning_rate": 2.9707060828913226e-06, "loss": 0.0478, "step": 406 }, { "epoch": 1.3410214168039538, "grad_norm": 0.40196887575712115, "learning_rate": 2.944398530742144e-06, "loss": 0.0462, "step": 407 }, { "epoch": 1.3443163097199342, "grad_norm": 0.5650818660979695, "learning_rate": 2.9181592823287807e-06, "loss": 0.0655, "step": 408 }, { "epoch": 1.3476112026359144, "grad_norm": 0.5475272072919456, "learning_rate": 2.8919892095316616e-06, "loss": 0.0519, "step": 409 }, { "epoch": 1.3509060955518946, "grad_norm": 0.6655092099152591, "learning_rate": 2.865889181932639e-06, "loss": 0.0416, "step": 410 }, { "epoch": 1.3542009884678747, "grad_norm": 0.8536644782226072, "learning_rate": 2.8398600667861032e-06, "loss": 0.0669, "step": 411 }, { "epoch": 1.357495881383855, "grad_norm": 0.48323829394508544, "learning_rate": 2.813902728990149e-06, "loss": 0.0367, "step": 412 }, { "epoch": 1.3607907742998353, "grad_norm": 0.6949809437868909, "learning_rate": 2.7880180310578546e-06, "loss": 0.0523, "step": 413 }, { "epoch": 1.3640856672158155, "grad_norm": 0.43770382604271224, "learning_rate": 2.762206833088608e-06, "loss": 0.0527, "step": 414 }, { "epoch": 1.3673805601317957, "grad_norm": 0.5081547664603686, "learning_rate": 2.7364699927395355e-06, "loss": 0.0613, "step": 415 }, { "epoch": 1.370675453047776, "grad_norm": 0.6539834762026684, "learning_rate": 2.710808365197e-06, "loss": 0.0555, "step": 416 }, { "epoch": 1.3739703459637562, "grad_norm": 0.39905302203795334, "learning_rate": 2.6852228031481837e-06, "loss": 0.0408, "step": 417 }, { "epoch": 1.3772652388797364, "grad_norm": 0.45958447904532335, "learning_rate": 2.6597141567527614e-06, "loss": 0.0503, "step": 418 }, { "epoch": 1.3805601317957166, "grad_norm": 0.4995326285015215, "learning_rate": 2.6342832736146403e-06, "loss": 0.0605, "step": 419 }, { "epoch": 1.3838550247116967, "grad_norm": 0.4884609849725302, "learning_rate": 2.608930998753809e-06, "loss": 0.0602, "step": 420 }, { "epoch": 1.3871499176276771, "grad_norm": 0.48846990238806653, "learning_rate": 2.5836581745782474e-06, "loss": 0.0429, "step": 421 }, { "epoch": 1.3904448105436573, "grad_norm": 0.5807005137969414, "learning_rate": 2.558465640855943e-06, "loss": 0.052, "step": 422 }, { "epoch": 1.3937397034596375, "grad_norm": 0.3856666040192254, "learning_rate": 2.533354234686979e-06, "loss": 0.0422, "step": 423 }, { "epoch": 1.3970345963756179, "grad_norm": 0.3701363763937253, "learning_rate": 2.508324790475731e-06, "loss": 0.0449, "step": 424 }, { "epoch": 1.400329489291598, "grad_norm": 0.5283156125790535, "learning_rate": 2.4833781399031275e-06, "loss": 0.0583, "step": 425 }, { "epoch": 1.4036243822075782, "grad_norm": 0.5143083208475716, "learning_rate": 2.4585151118990286e-06, "loss": 0.0582, "step": 426 }, { "epoch": 1.4069192751235584, "grad_norm": 0.4580082823859306, "learning_rate": 2.433736532614666e-06, "loss": 0.0503, "step": 427 }, { "epoch": 1.4102141680395386, "grad_norm": 0.4733586276806861, "learning_rate": 2.4090432253952113e-06, "loss": 0.0595, "step": 428 }, { "epoch": 1.413509060955519, "grad_norm": 0.46027613089003067, "learning_rate": 2.3844360107523973e-06, "loss": 0.0334, "step": 429 }, { "epoch": 1.4168039538714992, "grad_norm": 0.618163403358967, "learning_rate": 2.3599157063372712e-06, "loss": 0.0505, "step": 430 }, { "epoch": 1.4200988467874793, "grad_norm": 0.5692914543756001, "learning_rate": 2.3354831269130133e-06, "loss": 0.047, "step": 431 }, { "epoch": 1.4233937397034597, "grad_norm": 0.5569933619176715, "learning_rate": 2.3111390843278743e-06, "loss": 0.0506, "step": 432 }, { "epoch": 1.42668863261944, "grad_norm": 0.4384099356121434, "learning_rate": 2.2868843874881856e-06, "loss": 0.0453, "step": 433 }, { "epoch": 1.42998352553542, "grad_norm": 0.5320473584418453, "learning_rate": 2.2627198423314988e-06, "loss": 0.0547, "step": 434 }, { "epoch": 1.4332784184514002, "grad_norm": 0.4951776816150561, "learning_rate": 2.238646251799787e-06, "loss": 0.0517, "step": 435 }, { "epoch": 1.4365733113673804, "grad_norm": 0.5305051346570233, "learning_rate": 2.2146644158127827e-06, "loss": 0.0508, "step": 436 }, { "epoch": 1.4398682042833608, "grad_norm": 0.48235120417487776, "learning_rate": 2.1907751312413793e-06, "loss": 0.0498, "step": 437 }, { "epoch": 1.443163097199341, "grad_norm": 0.7575565682766872, "learning_rate": 2.1669791918811724e-06, "loss": 0.0482, "step": 438 }, { "epoch": 1.4464579901153214, "grad_norm": 0.6122464829305898, "learning_rate": 2.1432773884260627e-06, "loss": 0.0661, "step": 439 }, { "epoch": 1.4497528830313016, "grad_norm": 0.49382428143445756, "learning_rate": 2.119670508442004e-06, "loss": 0.0372, "step": 440 }, { "epoch": 1.4530477759472817, "grad_norm": 0.6113296705934868, "learning_rate": 2.0961593363408154e-06, "loss": 0.0489, "step": 441 }, { "epoch": 1.456342668863262, "grad_norm": 0.4764803472849658, "learning_rate": 2.0727446533541302e-06, "loss": 0.0426, "step": 442 }, { "epoch": 1.459637561779242, "grad_norm": 0.5321931460957434, "learning_rate": 2.0494272375074247e-06, "loss": 0.0428, "step": 443 }, { "epoch": 1.4629324546952225, "grad_norm": 0.43368533141343174, "learning_rate": 2.0262078635941818e-06, "loss": 0.0377, "step": 444 }, { "epoch": 1.4662273476112027, "grad_norm": 0.5227900476116077, "learning_rate": 2.0030873031501274e-06, "loss": 0.048, "step": 445 }, { "epoch": 1.4695222405271828, "grad_norm": 0.40044438580877817, "learning_rate": 1.980066324427613e-06, "loss": 0.0367, "step": 446 }, { "epoch": 1.4728171334431632, "grad_norm": 0.42569057497544066, "learning_rate": 1.9571456923700696e-06, "loss": 0.0485, "step": 447 }, { "epoch": 1.4761120263591434, "grad_norm": 0.5011955876540544, "learning_rate": 1.9343261685866054e-06, "loss": 0.0684, "step": 448 }, { "epoch": 1.4794069192751236, "grad_norm": 0.5257059685422952, "learning_rate": 1.911608511326688e-06, "loss": 0.0469, "step": 449 }, { "epoch": 1.4827018121911038, "grad_norm": 0.5330212717649231, "learning_rate": 1.8889934754549583e-06, "loss": 0.0615, "step": 450 }, { "epoch": 1.485996705107084, "grad_norm": 0.4377288880184422, "learning_rate": 1.8664818124261375e-06, "loss": 0.04, "step": 451 }, { "epoch": 1.4892915980230643, "grad_norm": 0.4821221712040424, "learning_rate": 1.8440742702600706e-06, "loss": 0.0496, "step": 452 }, { "epoch": 1.4925864909390445, "grad_norm": 0.42358079608202237, "learning_rate": 1.8217715935168562e-06, "loss": 0.0446, "step": 453 }, { "epoch": 1.4958813838550247, "grad_norm": 0.6521628225316723, "learning_rate": 1.7995745232721207e-06, "loss": 0.0665, "step": 454 }, { "epoch": 1.499176276771005, "grad_norm": 0.5512352891912379, "learning_rate": 1.777483797092381e-06, "loss": 0.0527, "step": 455 }, { "epoch": 1.5024711696869852, "grad_norm": 0.4132207895248971, "learning_rate": 1.755500149010549e-06, "loss": 0.0369, "step": 456 }, { "epoch": 1.5057660626029654, "grad_norm": 0.5452488198197322, "learning_rate": 1.7336243095015271e-06, "loss": 0.0457, "step": 457 }, { "epoch": 1.5090609555189456, "grad_norm": 1.8013972479013802, "learning_rate": 1.7118570054579508e-06, "loss": 0.0788, "step": 458 }, { "epoch": 1.5123558484349258, "grad_norm": 0.6158971711077378, "learning_rate": 1.6901989601660224e-06, "loss": 0.0577, "step": 459 }, { "epoch": 1.515650741350906, "grad_norm": 0.6402888520963839, "learning_rate": 1.6686508932814871e-06, "loss": 0.0426, "step": 460 }, { "epoch": 1.5189456342668863, "grad_norm": 0.5815365915637473, "learning_rate": 1.6472135208057128e-06, "loss": 0.0526, "step": 461 }, { "epoch": 1.5222405271828665, "grad_norm": 0.5219074399966507, "learning_rate": 1.625887555061907e-06, "loss": 0.0428, "step": 462 }, { "epoch": 1.525535420098847, "grad_norm": 0.5007230705662209, "learning_rate": 1.6046737046714366e-06, "loss": 0.0386, "step": 463 }, { "epoch": 1.528830313014827, "grad_norm": 0.492364769802372, "learning_rate": 1.5835726745302953e-06, "loss": 0.0364, "step": 464 }, { "epoch": 1.5321252059308073, "grad_norm": 0.5652000556154251, "learning_rate": 1.5625851657856666e-06, "loss": 0.0546, "step": 465 }, { "epoch": 1.5354200988467874, "grad_norm": 0.5993897339775979, "learning_rate": 1.5417118758126408e-06, "loss": 0.0579, "step": 466 }, { "epoch": 1.5387149917627676, "grad_norm": 1.3251444571487765, "learning_rate": 1.520953498191028e-06, "loss": 0.0747, "step": 467 }, { "epoch": 1.5420098846787478, "grad_norm": 0.440371155414081, "learning_rate": 1.5003107226823255e-06, "loss": 0.0495, "step": 468 }, { "epoch": 1.5453047775947282, "grad_norm": 0.5274460518323345, "learning_rate": 1.479784235206786e-06, "loss": 0.0457, "step": 469 }, { "epoch": 1.5485996705107083, "grad_norm": 0.4509159507608483, "learning_rate": 1.459374717820637e-06, "loss": 0.0441, "step": 470 }, { "epoch": 1.5518945634266887, "grad_norm": 0.5787329784185842, "learning_rate": 1.439082848693406e-06, "loss": 0.0455, "step": 471 }, { "epoch": 1.555189456342669, "grad_norm": 0.9428413760935695, "learning_rate": 1.4189093020853989e-06, "loss": 0.0635, "step": 472 }, { "epoch": 1.558484349258649, "grad_norm": 0.5810607886116554, "learning_rate": 1.3988547483252812e-06, "loss": 0.0591, "step": 473 }, { "epoch": 1.5617792421746293, "grad_norm": 0.5621586581333317, "learning_rate": 1.3789198537878202e-06, "loss": 0.048, "step": 474 }, { "epoch": 1.5650741350906094, "grad_norm": 0.614476271893997, "learning_rate": 1.3591052808717258e-06, "loss": 0.0574, "step": 475 }, { "epoch": 1.5683690280065898, "grad_norm": 0.4366280882804736, "learning_rate": 1.339411687977657e-06, "loss": 0.0387, "step": 476 }, { "epoch": 1.57166392092257, "grad_norm": 1.654229386447125, "learning_rate": 1.3198397294863285e-06, "loss": 0.0525, "step": 477 }, { "epoch": 1.5749588138385504, "grad_norm": 0.5124984935464315, "learning_rate": 1.3003900557367816e-06, "loss": 0.0586, "step": 478 }, { "epoch": 1.5782537067545306, "grad_norm": 0.5039902746309534, "learning_rate": 1.281063313004761e-06, "loss": 0.0409, "step": 479 }, { "epoch": 1.5815485996705108, "grad_norm": 0.4453799136874429, "learning_rate": 1.261860143481255e-06, "loss": 0.0437, "step": 480 }, { "epoch": 1.584843492586491, "grad_norm": 0.44736265726220936, "learning_rate": 1.2427811852511396e-06, "loss": 0.05, "step": 481 }, { "epoch": 1.588138385502471, "grad_norm": 0.5751552043472024, "learning_rate": 1.223827072271993e-06, "loss": 0.0513, "step": 482 }, { "epoch": 1.5914332784184513, "grad_norm": 0.4854076213664054, "learning_rate": 1.204998434353018e-06, "loss": 0.0434, "step": 483 }, { "epoch": 1.5947281713344317, "grad_norm": 0.5304616858985192, "learning_rate": 1.1862958971341199e-06, "loss": 0.0537, "step": 484 }, { "epoch": 1.5980230642504119, "grad_norm": 0.5357970833666896, "learning_rate": 1.1677200820651197e-06, "loss": 0.049, "step": 485 }, { "epoch": 1.6013179571663922, "grad_norm": 0.6703644083736745, "learning_rate": 1.1492716063850973e-06, "loss": 0.0553, "step": 486 }, { "epoch": 1.6046128500823724, "grad_norm": 0.5104289346948437, "learning_rate": 1.1309510831018927e-06, "loss": 0.0484, "step": 487 }, { "epoch": 1.6079077429983526, "grad_norm": 0.791691486031595, "learning_rate": 1.112759120971723e-06, "loss": 0.0516, "step": 488 }, { "epoch": 1.6112026359143328, "grad_norm": 0.5044446696201748, "learning_rate": 1.09469632447897e-06, "loss": 0.0412, "step": 489 }, { "epoch": 1.614497528830313, "grad_norm": 0.44744812374789733, "learning_rate": 1.0767632938160787e-06, "loss": 0.0441, "step": 490 }, { "epoch": 1.6177924217462931, "grad_norm": 0.689652904031741, "learning_rate": 1.0589606248636291e-06, "loss": 0.0468, "step": 491 }, { "epoch": 1.6210873146622735, "grad_norm": 0.4644587386351254, "learning_rate": 1.0412889091705242e-06, "loss": 0.0356, "step": 492 }, { "epoch": 1.6243822075782537, "grad_norm": 0.6499107202235256, "learning_rate": 1.0237487339343382e-06, "loss": 0.0574, "step": 493 }, { "epoch": 1.627677100494234, "grad_norm": 0.4542177117918383, "learning_rate": 1.0063406819818106e-06, "loss": 0.0443, "step": 494 }, { "epoch": 1.6309719934102143, "grad_norm": 0.6343789726555299, "learning_rate": 9.890653317494686e-07, "loss": 0.0524, "step": 495 }, { "epoch": 1.6342668863261944, "grad_norm": 0.4208852075289343, "learning_rate": 9.719232572644189e-07, "loss": 0.0407, "step": 496 }, { "epoch": 1.6375617792421746, "grad_norm": 0.45018417664569393, "learning_rate": 9.549150281252633e-07, "loss": 0.0382, "step": 497 }, { "epoch": 1.6408566721581548, "grad_norm": 0.4664038740894182, "learning_rate": 9.380412094831809e-07, "loss": 0.0413, "step": 498 }, { "epoch": 1.644151565074135, "grad_norm": 0.5658115763517576, "learning_rate": 9.213023620231404e-07, "loss": 0.055, "step": 499 }, { "epoch": 1.6474464579901154, "grad_norm": 0.42505631586549236, "learning_rate": 9.046990419452795e-07, "loss": 0.0374, "step": 500 }, { "epoch": 1.6507413509060955, "grad_norm": 0.5554370313066022, "learning_rate": 8.882318009464124e-07, "loss": 0.0758, "step": 501 }, { "epoch": 1.654036243822076, "grad_norm": 0.6820183163387327, "learning_rate": 8.719011862017108e-07, "loss": 0.067, "step": 502 }, { "epoch": 1.657331136738056, "grad_norm": 0.8815741831945997, "learning_rate": 8.557077403465069e-07, "loss": 0.0635, "step": 503 }, { "epoch": 1.6606260296540363, "grad_norm": 0.6530261534927284, "learning_rate": 8.396520014582798e-07, "loss": 0.0564, "step": 504 }, { "epoch": 1.6639209225700164, "grad_norm": 0.5563910425802013, "learning_rate": 8.237345030387589e-07, "loss": 0.0568, "step": 505 }, { "epoch": 1.6672158154859966, "grad_norm": 1.6616538016948608, "learning_rate": 8.079557739962129e-07, "loss": 0.0433, "step": 506 }, { "epoch": 1.6705107084019768, "grad_norm": 0.4729743527848457, "learning_rate": 7.923163386278615e-07, "loss": 0.0477, "step": 507 }, { "epoch": 1.6738056013179572, "grad_norm": 0.484207261501026, "learning_rate": 7.768167166024637e-07, "loss": 0.0393, "step": 508 }, { "epoch": 1.6771004942339374, "grad_norm": 0.4347790202564516, "learning_rate": 7.614574229430432e-07, "loss": 0.0348, "step": 509 }, { "epoch": 1.6803953871499178, "grad_norm": 0.5159230901740568, "learning_rate": 7.462389680097831e-07, "loss": 0.0511, "step": 510 }, { "epoch": 1.683690280065898, "grad_norm": 0.7846302974584749, "learning_rate": 7.31161857483057e-07, "loss": 0.0428, "step": 511 }, { "epoch": 1.6869851729818781, "grad_norm": 0.5154541689981792, "learning_rate": 7.162265923466383e-07, "loss": 0.0481, "step": 512 }, { "epoch": 1.6902800658978583, "grad_norm": 0.5103769113667321, "learning_rate": 7.014336688710411e-07, "loss": 0.0559, "step": 513 }, { "epoch": 1.6935749588138385, "grad_norm": 0.5601518054326986, "learning_rate": 6.867835785970417e-07, "loss": 0.0383, "step": 514 }, { "epoch": 1.6968698517298186, "grad_norm": 0.452690076025677, "learning_rate": 6.722768083193354e-07, "loss": 0.0393, "step": 515 }, { "epoch": 1.700164744645799, "grad_norm": 0.46134612749678455, "learning_rate": 6.579138400703716e-07, "loss": 0.0515, "step": 516 }, { "epoch": 1.7034596375617792, "grad_norm": 0.41746324511751276, "learning_rate": 6.436951511043243e-07, "loss": 0.0445, "step": 517 }, { "epoch": 1.7067545304777596, "grad_norm": 0.45014253067830906, "learning_rate": 6.296212138812474e-07, "loss": 0.0438, "step": 518 }, { "epoch": 1.7100494233937398, "grad_norm": 0.49892377023307155, "learning_rate": 6.156924960513638e-07, "loss": 0.0452, "step": 519 }, { "epoch": 1.71334431630972, "grad_norm": 0.47105725038366275, "learning_rate": 6.019094604395359e-07, "loss": 0.054, "step": 520 }, { "epoch": 1.7166392092257001, "grad_norm": 0.4259013974972623, "learning_rate": 5.882725650298787e-07, "loss": 0.0377, "step": 521 }, { "epoch": 1.7199341021416803, "grad_norm": 0.7122055487087868, "learning_rate": 5.747822629505484e-07, "loss": 0.0606, "step": 522 }, { "epoch": 1.7232289950576605, "grad_norm": 0.6463022317948001, "learning_rate": 5.614390024586808e-07, "loss": 0.0948, "step": 523 }, { "epoch": 1.7265238879736409, "grad_norm": 0.6557315175026269, "learning_rate": 5.482432269255011e-07, "loss": 0.0594, "step": 524 }, { "epoch": 1.729818780889621, "grad_norm": 0.48159732485479767, "learning_rate": 5.351953748215872e-07, "loss": 0.0562, "step": 525 }, { "epoch": 1.7331136738056014, "grad_norm": 0.5429610524246544, "learning_rate": 5.222958797023036e-07, "loss": 0.0469, "step": 526 }, { "epoch": 1.7364085667215816, "grad_norm": 0.4703329992162841, "learning_rate": 5.095451701933923e-07, "loss": 0.0495, "step": 527 }, { "epoch": 1.7397034596375618, "grad_norm": 0.4436763771779832, "learning_rate": 4.969436699767344e-07, "loss": 0.0354, "step": 528 }, { "epoch": 1.742998352553542, "grad_norm": 0.5025190325700492, "learning_rate": 4.844917977762653e-07, "loss": 0.056, "step": 529 }, { "epoch": 1.7462932454695221, "grad_norm": 0.661761525862607, "learning_rate": 4.721899673440694e-07, "loss": 0.0436, "step": 530 }, { "epoch": 1.7495881383855023, "grad_norm": 0.5762138308981635, "learning_rate": 4.6003858744662564e-07, "loss": 0.0552, "step": 531 }, { "epoch": 1.7528830313014827, "grad_norm": 0.5387051088420545, "learning_rate": 4.4803806185122866e-07, "loss": 0.0479, "step": 532 }, { "epoch": 1.7561779242174629, "grad_norm": 0.537079178923195, "learning_rate": 4.361887893125677e-07, "loss": 0.0565, "step": 533 }, { "epoch": 1.7594728171334433, "grad_norm": 0.5780295144664594, "learning_rate": 4.244911635594856e-07, "loss": 0.0555, "step": 534 }, { "epoch": 1.7627677100494235, "grad_norm": 0.6971174595241891, "learning_rate": 4.1294557328188376e-07, "loss": 0.0468, "step": 535 }, { "epoch": 1.7660626029654036, "grad_norm": 0.5714544842706962, "learning_rate": 4.0155240211781966e-07, "loss": 0.069, "step": 536 }, { "epoch": 1.7693574958813838, "grad_norm": 0.5560933792076089, "learning_rate": 3.9031202864074634e-07, "loss": 0.0526, "step": 537 }, { "epoch": 1.772652388797364, "grad_norm": 0.5076535062447541, "learning_rate": 3.7922482634694667e-07, "loss": 0.0495, "step": 538 }, { "epoch": 1.7759472817133442, "grad_norm": 0.4733107950145234, "learning_rate": 3.6829116364310914e-07, "loss": 0.048, "step": 539 }, { "epoch": 1.7792421746293245, "grad_norm": 0.5645137566585913, "learning_rate": 3.575114038340977e-07, "loss": 0.0503, "step": 540 }, { "epoch": 1.782537067545305, "grad_norm": 0.5224355605212421, "learning_rate": 3.4688590511087304e-07, "loss": 0.0553, "step": 541 }, { "epoch": 1.7858319604612851, "grad_norm": 0.495152224806279, "learning_rate": 3.3641502053859355e-07, "loss": 0.0304, "step": 542 }, { "epoch": 1.7891268533772653, "grad_norm": 0.40973303317244536, "learning_rate": 3.2609909804488195e-07, "loss": 0.0313, "step": 543 }, { "epoch": 1.7924217462932455, "grad_norm": 0.47058046777642437, "learning_rate": 3.159384804082666e-07, "loss": 0.0526, "step": 544 }, { "epoch": 1.7957166392092256, "grad_norm": 0.4838577514212764, "learning_rate": 3.0593350524678823e-07, "loss": 0.0371, "step": 545 }, { "epoch": 1.7990115321252058, "grad_norm": 0.6690590772761237, "learning_rate": 2.9608450500678566e-07, "loss": 0.0604, "step": 546 }, { "epoch": 1.8023064250411862, "grad_norm": 0.4109056760035354, "learning_rate": 2.863918069518451e-07, "loss": 0.0331, "step": 547 }, { "epoch": 1.8056013179571664, "grad_norm": 0.7159007380510568, "learning_rate": 2.7685573315192895e-07, "loss": 0.0721, "step": 548 }, { "epoch": 1.8088962108731468, "grad_norm": 0.4865674890840018, "learning_rate": 2.67476600472672e-07, "loss": 0.0451, "step": 549 }, { "epoch": 1.812191103789127, "grad_norm": 0.5942738860891101, "learning_rate": 2.5825472056485556e-07, "loss": 0.062, "step": 550 }, { "epoch": 1.8154859967051071, "grad_norm": 0.5357536121944303, "learning_rate": 2.4919039985404626e-07, "loss": 0.0609, "step": 551 }, { "epoch": 1.8187808896210873, "grad_norm": 0.3866866163849165, "learning_rate": 2.4028393953042074e-07, "loss": 0.0296, "step": 552 }, { "epoch": 1.8220757825370675, "grad_norm": 0.49750309911350266, "learning_rate": 2.315356355387527e-07, "loss": 0.0444, "step": 553 }, { "epoch": 1.8253706754530477, "grad_norm": 0.6734726196583376, "learning_rate": 2.2294577856858236e-07, "loss": 0.0552, "step": 554 }, { "epoch": 1.828665568369028, "grad_norm": 0.4599661443293077, "learning_rate": 2.1451465404455473e-07, "loss": 0.041, "step": 555 }, { "epoch": 1.8319604612850082, "grad_norm": 0.691277736971953, "learning_rate": 2.0624254211693894e-07, "loss": 0.061, "step": 556 }, { "epoch": 1.8352553542009886, "grad_norm": 0.42780093096680555, "learning_rate": 1.9812971765231394e-07, "loss": 0.0364, "step": 557 }, { "epoch": 1.8385502471169688, "grad_norm": 0.578589972444535, "learning_rate": 1.901764502244424e-07, "loss": 0.0613, "step": 558 }, { "epoch": 1.841845140032949, "grad_norm": 0.4763862013898226, "learning_rate": 1.823830041053065e-07, "loss": 0.0437, "step": 559 }, { "epoch": 1.8451400329489291, "grad_norm": 0.45718093901396817, "learning_rate": 1.7474963825633185e-07, "loss": 0.043, "step": 560 }, { "epoch": 1.8484349258649093, "grad_norm": 0.6366482024659451, "learning_rate": 1.6727660631977894e-07, "loss": 0.0556, "step": 561 }, { "epoch": 1.8517298187808895, "grad_norm": 0.7471772626827173, "learning_rate": 1.5996415661031662e-07, "loss": 0.0425, "step": 562 }, { "epoch": 1.8550247116968699, "grad_norm": 0.3521582825319659, "learning_rate": 1.528125321067725e-07, "loss": 0.0382, "step": 563 }, { "epoch": 1.85831960461285, "grad_norm": 0.5973061078363581, "learning_rate": 1.4582197044405556e-07, "loss": 0.0509, "step": 564 }, { "epoch": 1.8616144975288305, "grad_norm": 0.5093523377380892, "learning_rate": 1.389927039052652e-07, "loss": 0.0444, "step": 565 }, { "epoch": 1.8649093904448106, "grad_norm": 0.44171165477963154, "learning_rate": 1.323249594139664e-07, "loss": 0.0468, "step": 566 }, { "epoch": 1.8682042833607908, "grad_norm": 0.44675757705445573, "learning_rate": 1.2581895852665671e-07, "loss": 0.0374, "step": 567 }, { "epoch": 1.871499176276771, "grad_norm": 0.6020514933209717, "learning_rate": 1.1947491742539841e-07, "loss": 0.0503, "step": 568 }, { "epoch": 1.8747940691927512, "grad_norm": 0.42586058680868216, "learning_rate": 1.1329304691063692e-07, "loss": 0.0392, "step": 569 }, { "epoch": 1.8780889621087313, "grad_norm": 0.4423611658945744, "learning_rate": 1.0727355239419868e-07, "loss": 0.0469, "step": 570 }, { "epoch": 1.8813838550247117, "grad_norm": 0.424883733072018, "learning_rate": 1.014166338924627e-07, "loss": 0.0475, "step": 571 }, { "epoch": 1.884678747940692, "grad_norm": 0.508275043696402, "learning_rate": 9.572248601971646e-08, "loss": 0.0588, "step": 572 }, { "epoch": 1.8879736408566723, "grad_norm": 0.532268589710946, "learning_rate": 9.019129798168658e-08, "loss": 0.0413, "step": 573 }, { "epoch": 1.8912685337726525, "grad_norm": 0.49075007985093444, "learning_rate": 8.482325356925614e-08, "loss": 0.0438, "step": 574 }, { "epoch": 1.8945634266886326, "grad_norm": 0.6405783999237776, "learning_rate": 7.96185311523523e-08, "loss": 0.0517, "step": 575 }, { "epoch": 1.8978583196046128, "grad_norm": 0.4374824821968711, "learning_rate": 7.45773036740255e-08, "loss": 0.0594, "step": 576 }, { "epoch": 1.901153212520593, "grad_norm": 0.4881691605292657, "learning_rate": 6.969973864469626e-08, "loss": 0.0478, "step": 577 }, { "epoch": 1.9044481054365732, "grad_norm": 0.4169109199296669, "learning_rate": 6.498599813659524e-08, "loss": 0.0329, "step": 578 }, { "epoch": 1.9077429983525536, "grad_norm": 0.7757102581030492, "learning_rate": 6.043623877837301e-08, "loss": 0.0459, "step": 579 }, { "epoch": 1.9110378912685337, "grad_norm": 0.6871632648014142, "learning_rate": 5.6050611749899896e-08, "loss": 0.0499, "step": 580 }, { "epoch": 1.9143327841845141, "grad_norm": 0.4909555169664079, "learning_rate": 5.182926277723821e-08, "loss": 0.0439, "step": 581 }, { "epoch": 1.9176276771004943, "grad_norm": 0.5085351517406403, "learning_rate": 4.777233212780396e-08, "loss": 0.043, "step": 582 }, { "epoch": 1.9209225700164745, "grad_norm": 0.4133261269232907, "learning_rate": 4.387995460570282e-08, "loss": 0.0422, "step": 583 }, { "epoch": 1.9242174629324547, "grad_norm": 0.3457486280213094, "learning_rate": 4.015225954725421e-08, "loss": 0.0302, "step": 584 }, { "epoch": 1.9275123558484348, "grad_norm": 0.5306123017488412, "learning_rate": 3.658937081669034e-08, "loss": 0.0347, "step": 585 }, { "epoch": 1.930807248764415, "grad_norm": 0.6170841106215647, "learning_rate": 3.3191406802041693e-08, "loss": 0.0427, "step": 586 }, { "epoch": 1.9341021416803954, "grad_norm": 0.45743745755641063, "learning_rate": 2.9958480411204086e-08, "loss": 0.0487, "step": 587 }, { "epoch": 1.9373970345963756, "grad_norm": 0.7985516650647139, "learning_rate": 2.6890699068187197e-08, "loss": 0.0598, "step": 588 }, { "epoch": 1.940691927512356, "grad_norm": 0.969435591211712, "learning_rate": 2.3988164709542462e-08, "loss": 0.046, "step": 589 }, { "epoch": 1.9439868204283361, "grad_norm": 0.5530499902423203, "learning_rate": 2.1250973780977957e-08, "loss": 0.0626, "step": 590 }, { "epoch": 1.9472817133443163, "grad_norm": 0.4726855774076432, "learning_rate": 1.8679217234154335e-08, "loss": 0.0442, "step": 591 }, { "epoch": 1.9505766062602965, "grad_norm": 0.5910043256950622, "learning_rate": 1.627298052366111e-08, "loss": 0.0533, "step": 592 }, { "epoch": 1.9538714991762767, "grad_norm": 0.37046514170506584, "learning_rate": 1.4032343604177267e-08, "loss": 0.0436, "step": 593 }, { "epoch": 1.9571663920922568, "grad_norm": 0.45020576348906904, "learning_rate": 1.1957380927816176e-08, "loss": 0.0392, "step": 594 }, { "epoch": 1.9604612850082372, "grad_norm": 0.4456283814258553, "learning_rate": 1.0048161441649217e-08, "loss": 0.0464, "step": 595 }, { "epoch": 1.9637561779242174, "grad_norm": 0.5191136185767354, "learning_rate": 8.304748585417077e-09, "loss": 0.0432, "step": 596 }, { "epoch": 1.9670510708401978, "grad_norm": 0.7541935440652175, "learning_rate": 6.72720028942031e-09, "loss": 0.0417, "step": 597 }, { "epoch": 1.970345963756178, "grad_norm": 0.4930059149169314, "learning_rate": 5.315568972594775e-09, "loss": 0.0522, "step": 598 }, { "epoch": 1.9736408566721582, "grad_norm": 0.43182626877243907, "learning_rate": 4.0699015407702495e-09, "loss": 0.0426, "step": 599 }, { "epoch": 1.9769357495881383, "grad_norm": 0.5518177594295087, "learning_rate": 2.990239385112226e-09, "loss": 0.0565, "step": 600 }, { "epoch": 1.9802306425041185, "grad_norm": 0.5188892218788691, "learning_rate": 2.076618380744133e-09, "loss": 0.0684, "step": 601 }, { "epoch": 1.9835255354200987, "grad_norm": 0.43199397069665996, "learning_rate": 1.3290688855588374e-09, "loss": 0.0396, "step": 602 }, { "epoch": 1.986820428336079, "grad_norm": 0.4470921544911916, "learning_rate": 7.476157392072303e-10, "loss": 0.0385, "step": 603 }, { "epoch": 1.9901153212520593, "grad_norm": 0.5332848840635471, "learning_rate": 3.322782622738885e-10, "loss": 0.0585, "step": 604 }, { "epoch": 1.9934102141680397, "grad_norm": 0.6284366325020901, "learning_rate": 8.307025563536464e-11, "loss": 0.0517, "step": 605 }, { "epoch": 1.9967051070840198, "grad_norm": 0.6056246452486423, "learning_rate": 0.0, "loss": 0.048, "step": 606 }, { "epoch": 1.9967051070840198, "eval_loss": 0.0587012954056263, "eval_runtime": 144.0131, "eval_samples_per_second": 35.448, "eval_steps_per_second": 1.111, "step": 606 }, { "epoch": 1.9967051070840198, "step": 606, "total_flos": 1.811911707237417e+17, "train_loss": 0.0716345354195426, "train_runtime": 6724.9995, "train_samples_per_second": 11.537, "train_steps_per_second": 0.09 } ], "logging_steps": 1, "max_steps": 606, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.811911707237417e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }