|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.9967051070840198, |
|
"eval_steps": 500, |
|
"global_step": 606, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0032948929159802307, |
|
"grad_norm": 12.442080297711913, |
|
"learning_rate": 1.639344262295082e-07, |
|
"loss": 0.383, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.006589785831960461, |
|
"grad_norm": 13.440096974039598, |
|
"learning_rate": 3.278688524590164e-07, |
|
"loss": 0.3663, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.009884678747940691, |
|
"grad_norm": 13.796855251219549, |
|
"learning_rate": 4.918032786885246e-07, |
|
"loss": 0.3663, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.013179571663920923, |
|
"grad_norm": 12.70572090942759, |
|
"learning_rate": 6.557377049180328e-07, |
|
"loss": 0.3061, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.016474464579901153, |
|
"grad_norm": 11.468076158597158, |
|
"learning_rate": 8.196721311475409e-07, |
|
"loss": 0.3387, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.019769357495881382, |
|
"grad_norm": 10.302737632617346, |
|
"learning_rate": 9.836065573770493e-07, |
|
"loss": 0.2814, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.023064250411861616, |
|
"grad_norm": 9.415064505565756, |
|
"learning_rate": 1.1475409836065575e-06, |
|
"loss": 0.2815, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.026359143327841845, |
|
"grad_norm": 8.420756675130125, |
|
"learning_rate": 1.3114754098360657e-06, |
|
"loss": 0.2483, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.029654036243822075, |
|
"grad_norm": 5.932373113068906, |
|
"learning_rate": 1.4754098360655739e-06, |
|
"loss": 0.1744, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.032948929159802305, |
|
"grad_norm": 4.602925255551132, |
|
"learning_rate": 1.6393442622950819e-06, |
|
"loss": 0.1821, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.036243822075782535, |
|
"grad_norm": 4.1437512962542025, |
|
"learning_rate": 1.8032786885245903e-06, |
|
"loss": 0.1557, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.039538714991762765, |
|
"grad_norm": 4.1008407646409495, |
|
"learning_rate": 1.9672131147540985e-06, |
|
"loss": 0.1901, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.042833607907743, |
|
"grad_norm": 3.2895218176449177, |
|
"learning_rate": 2.1311475409836067e-06, |
|
"loss": 0.1695, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.04612850082372323, |
|
"grad_norm": 2.578808113814672, |
|
"learning_rate": 2.295081967213115e-06, |
|
"loss": 0.1431, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.04942339373970346, |
|
"grad_norm": 2.0925168309074804, |
|
"learning_rate": 2.459016393442623e-06, |
|
"loss": 0.1459, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.05271828665568369, |
|
"grad_norm": 5.776302012227753, |
|
"learning_rate": 2.6229508196721314e-06, |
|
"loss": 0.2116, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.05601317957166392, |
|
"grad_norm": 4.863535909815622, |
|
"learning_rate": 2.786885245901639e-06, |
|
"loss": 0.157, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.05930807248764415, |
|
"grad_norm": 2.709665323729833, |
|
"learning_rate": 2.9508196721311478e-06, |
|
"loss": 0.1463, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.06260296540362438, |
|
"grad_norm": 2.911144968708877, |
|
"learning_rate": 3.114754098360656e-06, |
|
"loss": 0.1283, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.06589785831960461, |
|
"grad_norm": 2.4985115549367873, |
|
"learning_rate": 3.2786885245901638e-06, |
|
"loss": 0.13, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.06919275123558484, |
|
"grad_norm": 2.158826062926261, |
|
"learning_rate": 3.4426229508196724e-06, |
|
"loss": 0.1135, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.07248764415156507, |
|
"grad_norm": 1.8839820680217123, |
|
"learning_rate": 3.6065573770491806e-06, |
|
"loss": 0.1196, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.0757825370675453, |
|
"grad_norm": 2.208880925801888, |
|
"learning_rate": 3.7704918032786884e-06, |
|
"loss": 0.1642, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.07907742998352553, |
|
"grad_norm": 1.948969438238695, |
|
"learning_rate": 3.934426229508197e-06, |
|
"loss": 0.1057, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.08237232289950576, |
|
"grad_norm": 1.5855531759235064, |
|
"learning_rate": 4.098360655737705e-06, |
|
"loss": 0.0908, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.085667215815486, |
|
"grad_norm": 1.7484734055235995, |
|
"learning_rate": 4.2622950819672135e-06, |
|
"loss": 0.1088, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.08896210873146623, |
|
"grad_norm": 1.7763409124600038, |
|
"learning_rate": 4.426229508196722e-06, |
|
"loss": 0.1264, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.09225700164744646, |
|
"grad_norm": 1.943799751354267, |
|
"learning_rate": 4.59016393442623e-06, |
|
"loss": 0.1204, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.09555189456342669, |
|
"grad_norm": 2.0024112586646723, |
|
"learning_rate": 4.754098360655738e-06, |
|
"loss": 0.132, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.09884678747940692, |
|
"grad_norm": 1.927718117469908, |
|
"learning_rate": 4.918032786885246e-06, |
|
"loss": 0.0988, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.10214168039538715, |
|
"grad_norm": 1.8175857487104838, |
|
"learning_rate": 5.0819672131147545e-06, |
|
"loss": 0.0997, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.10543657331136738, |
|
"grad_norm": 1.1821292523186457, |
|
"learning_rate": 5.245901639344263e-06, |
|
"loss": 0.0874, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.10873146622734761, |
|
"grad_norm": 1.6925074350020268, |
|
"learning_rate": 5.409836065573772e-06, |
|
"loss": 0.115, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.11202635914332784, |
|
"grad_norm": 1.4637368762611331, |
|
"learning_rate": 5.573770491803278e-06, |
|
"loss": 0.0916, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.11532125205930807, |
|
"grad_norm": 1.4814671153620174, |
|
"learning_rate": 5.737704918032787e-06, |
|
"loss": 0.0853, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.1186161449752883, |
|
"grad_norm": 1.243594463126339, |
|
"learning_rate": 5.9016393442622956e-06, |
|
"loss": 0.0903, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.12191103789126853, |
|
"grad_norm": 1.2713537957193175, |
|
"learning_rate": 6.065573770491804e-06, |
|
"loss": 0.1271, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.12520593080724876, |
|
"grad_norm": 2.2092835366807893, |
|
"learning_rate": 6.229508196721312e-06, |
|
"loss": 0.1137, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.128500823723229, |
|
"grad_norm": 1.1846569097593065, |
|
"learning_rate": 6.393442622950821e-06, |
|
"loss": 0.0714, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.13179571663920922, |
|
"grad_norm": 3.001321928490275, |
|
"learning_rate": 6.5573770491803276e-06, |
|
"loss": 0.1129, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.13509060955518945, |
|
"grad_norm": 1.8758843455974739, |
|
"learning_rate": 6.721311475409837e-06, |
|
"loss": 0.1021, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.13838550247116968, |
|
"grad_norm": 2.993102960488022, |
|
"learning_rate": 6.885245901639345e-06, |
|
"loss": 0.1155, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.1416803953871499, |
|
"grad_norm": 1.8441651337946723, |
|
"learning_rate": 7.049180327868853e-06, |
|
"loss": 0.1004, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.14497528830313014, |
|
"grad_norm": 1.495517808358825, |
|
"learning_rate": 7.213114754098361e-06, |
|
"loss": 0.0868, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.14827018121911037, |
|
"grad_norm": 1.8953155374136303, |
|
"learning_rate": 7.3770491803278695e-06, |
|
"loss": 0.1371, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.1515650741350906, |
|
"grad_norm": 1.702133714045992, |
|
"learning_rate": 7.540983606557377e-06, |
|
"loss": 0.1597, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.15485996705107083, |
|
"grad_norm": 1.0196937438072402, |
|
"learning_rate": 7.704918032786886e-06, |
|
"loss": 0.0892, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.15815485996705106, |
|
"grad_norm": 1.2171514922994324, |
|
"learning_rate": 7.868852459016394e-06, |
|
"loss": 0.0778, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.1614497528830313, |
|
"grad_norm": 1.0680772396455713, |
|
"learning_rate": 8.032786885245902e-06, |
|
"loss": 0.084, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.16474464579901152, |
|
"grad_norm": 2.9252478676320557, |
|
"learning_rate": 8.19672131147541e-06, |
|
"loss": 0.1275, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.16803953871499178, |
|
"grad_norm": 2.422192715088673, |
|
"learning_rate": 8.360655737704919e-06, |
|
"loss": 0.1473, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.171334431630972, |
|
"grad_norm": 1.8804479896519768, |
|
"learning_rate": 8.524590163934427e-06, |
|
"loss": 0.1036, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.17462932454695224, |
|
"grad_norm": 1.568191135105415, |
|
"learning_rate": 8.688524590163935e-06, |
|
"loss": 0.1109, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.17792421746293247, |
|
"grad_norm": 1.7822383034585774, |
|
"learning_rate": 8.852459016393443e-06, |
|
"loss": 0.1138, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.1812191103789127, |
|
"grad_norm": 2.39128847516323, |
|
"learning_rate": 9.016393442622952e-06, |
|
"loss": 0.0995, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.18451400329489293, |
|
"grad_norm": 2.005472201286874, |
|
"learning_rate": 9.18032786885246e-06, |
|
"loss": 0.1071, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.18780889621087316, |
|
"grad_norm": 1.771963789348757, |
|
"learning_rate": 9.344262295081968e-06, |
|
"loss": 0.1194, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.19110378912685339, |
|
"grad_norm": 1.9883587282426103, |
|
"learning_rate": 9.508196721311476e-06, |
|
"loss": 0.1263, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.19439868204283361, |
|
"grad_norm": 1.869337708289752, |
|
"learning_rate": 9.672131147540984e-06, |
|
"loss": 0.0906, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.19769357495881384, |
|
"grad_norm": 1.95284840649176, |
|
"learning_rate": 9.836065573770493e-06, |
|
"loss": 0.0941, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.20098846787479407, |
|
"grad_norm": 0.9408728760921459, |
|
"learning_rate": 1e-05, |
|
"loss": 0.0892, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.2042833607907743, |
|
"grad_norm": 1.4827130661563888, |
|
"learning_rate": 9.999916929744365e-06, |
|
"loss": 0.0897, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.20757825370675453, |
|
"grad_norm": 1.7548705917176584, |
|
"learning_rate": 9.999667721737726e-06, |
|
"loss": 0.1052, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.21087314662273476, |
|
"grad_norm": 1.5160957230364678, |
|
"learning_rate": 9.999252384260794e-06, |
|
"loss": 0.0959, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.214168039538715, |
|
"grad_norm": 1.4961421846399192, |
|
"learning_rate": 9.998670931114443e-06, |
|
"loss": 0.0738, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.21746293245469522, |
|
"grad_norm": 1.6241097546909007, |
|
"learning_rate": 9.997923381619257e-06, |
|
"loss": 0.1057, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.22075782537067545, |
|
"grad_norm": 1.3354496262353763, |
|
"learning_rate": 9.99700976061489e-06, |
|
"loss": 0.0951, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.22405271828665568, |
|
"grad_norm": 1.0847564491580965, |
|
"learning_rate": 9.99593009845923e-06, |
|
"loss": 0.0863, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.2273476112026359, |
|
"grad_norm": 1.6871231023520186, |
|
"learning_rate": 9.994684431027407e-06, |
|
"loss": 0.0804, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.23064250411861614, |
|
"grad_norm": 1.428280535176863, |
|
"learning_rate": 9.99327279971058e-06, |
|
"loss": 0.0865, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.23393739703459637, |
|
"grad_norm": 1.1179352637786575, |
|
"learning_rate": 9.991695251414584e-06, |
|
"loss": 0.0734, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.2372322899505766, |
|
"grad_norm": 1.2033588488413391, |
|
"learning_rate": 9.989951838558352e-06, |
|
"loss": 0.105, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.24052718286655683, |
|
"grad_norm": 1.465169988779563, |
|
"learning_rate": 9.988042619072185e-06, |
|
"loss": 0.0729, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.24382207578253706, |
|
"grad_norm": 0.8979589269956051, |
|
"learning_rate": 9.985967656395823e-06, |
|
"loss": 0.0802, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.2471169686985173, |
|
"grad_norm": 1.733084591549866, |
|
"learning_rate": 9.98372701947634e-06, |
|
"loss": 0.1105, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.2504118616144975, |
|
"grad_norm": 1.3264470296840676, |
|
"learning_rate": 9.981320782765847e-06, |
|
"loss": 0.0994, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.25370675453047775, |
|
"grad_norm": 1.6544247928428517, |
|
"learning_rate": 9.978749026219023e-06, |
|
"loss": 0.0729, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.257001647446458, |
|
"grad_norm": 1.7379037158314299, |
|
"learning_rate": 9.976011835290457e-06, |
|
"loss": 0.1084, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.2602965403624382, |
|
"grad_norm": 2.1898271902712465, |
|
"learning_rate": 9.973109300931813e-06, |
|
"loss": 0.14, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.26359143327841844, |
|
"grad_norm": 1.5180418313244188, |
|
"learning_rate": 9.970041519588797e-06, |
|
"loss": 0.1032, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.26688632619439867, |
|
"grad_norm": 0.9058107350669203, |
|
"learning_rate": 9.966808593197959e-06, |
|
"loss": 0.0659, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.2701812191103789, |
|
"grad_norm": 1.6740173453146032, |
|
"learning_rate": 9.963410629183311e-06, |
|
"loss": 0.0903, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.27347611202635913, |
|
"grad_norm": 1.6927879596763102, |
|
"learning_rate": 9.959847740452746e-06, |
|
"loss": 0.1011, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.27677100494233936, |
|
"grad_norm": 1.1858716785831125, |
|
"learning_rate": 9.956120045394297e-06, |
|
"loss": 0.1001, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.2800658978583196, |
|
"grad_norm": 1.7472793459338325, |
|
"learning_rate": 9.952227667872197e-06, |
|
"loss": 0.0999, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.2833607907742998, |
|
"grad_norm": 1.3355726739748197, |
|
"learning_rate": 9.948170737222763e-06, |
|
"loss": 0.0869, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.28665568369028005, |
|
"grad_norm": 1.6554862377061288, |
|
"learning_rate": 9.943949388250102e-06, |
|
"loss": 0.0956, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.2899505766062603, |
|
"grad_norm": 1.4959889750287383, |
|
"learning_rate": 9.939563761221628e-06, |
|
"loss": 0.1011, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.2932454695222405, |
|
"grad_norm": 1.90960683038771, |
|
"learning_rate": 9.935014001863405e-06, |
|
"loss": 0.086, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.29654036243822074, |
|
"grad_norm": 1.33889912510083, |
|
"learning_rate": 9.930300261355305e-06, |
|
"loss": 0.0884, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.29983525535420097, |
|
"grad_norm": 1.8364942381661888, |
|
"learning_rate": 9.925422696325976e-06, |
|
"loss": 0.1198, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.3031301482701812, |
|
"grad_norm": 0.9853906443215683, |
|
"learning_rate": 9.920381468847648e-06, |
|
"loss": 0.0805, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.30642504118616143, |
|
"grad_norm": 1.802022784884664, |
|
"learning_rate": 9.915176746430746e-06, |
|
"loss": 0.1, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.30971993410214166, |
|
"grad_norm": 1.6019896189177425, |
|
"learning_rate": 9.909808702018315e-06, |
|
"loss": 0.1063, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.3130148270181219, |
|
"grad_norm": 1.7270522226909808, |
|
"learning_rate": 9.904277513980285e-06, |
|
"loss": 0.1009, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.3163097199341021, |
|
"grad_norm": 1.355594119054743, |
|
"learning_rate": 9.898583366107539e-06, |
|
"loss": 0.0875, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.31960461285008235, |
|
"grad_norm": 1.8076099955595957, |
|
"learning_rate": 9.892726447605803e-06, |
|
"loss": 0.1236, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.3228995057660626, |
|
"grad_norm": 1.6099906623254512, |
|
"learning_rate": 9.886706953089364e-06, |
|
"loss": 0.0873, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.3261943986820428, |
|
"grad_norm": 1.1035445355215792, |
|
"learning_rate": 9.880525082574604e-06, |
|
"loss": 0.0869, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.32948929159802304, |
|
"grad_norm": 1.4670801213842743, |
|
"learning_rate": 9.874181041473344e-06, |
|
"loss": 0.1076, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.33278418451400327, |
|
"grad_norm": 1.1434779708887954, |
|
"learning_rate": 9.867675040586035e-06, |
|
"loss": 0.0987, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.33607907742998355, |
|
"grad_norm": 1.2329688966692105, |
|
"learning_rate": 9.861007296094736e-06, |
|
"loss": 0.0685, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.3393739703459638, |
|
"grad_norm": 1.33973829264815, |
|
"learning_rate": 9.854178029555945e-06, |
|
"loss": 0.0926, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.342668863261944, |
|
"grad_norm": 2.4442395017358565, |
|
"learning_rate": 9.847187467893228e-06, |
|
"loss": 0.0942, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.34596375617792424, |
|
"grad_norm": 1.5545572705565789, |
|
"learning_rate": 9.840035843389684e-06, |
|
"loss": 0.0849, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.34925864909390447, |
|
"grad_norm": 0.8361181058922856, |
|
"learning_rate": 9.832723393680222e-06, |
|
"loss": 0.0678, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.3525535420098847, |
|
"grad_norm": 1.1875767399598742, |
|
"learning_rate": 9.825250361743667e-06, |
|
"loss": 0.0922, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.35584843492586493, |
|
"grad_norm": 1.018855526493989, |
|
"learning_rate": 9.817616995894694e-06, |
|
"loss": 0.0893, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.35914332784184516, |
|
"grad_norm": 0.8793948272396893, |
|
"learning_rate": 9.809823549775559e-06, |
|
"loss": 0.0816, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.3624382207578254, |
|
"grad_norm": 1.1843731418180148, |
|
"learning_rate": 9.801870282347686e-06, |
|
"loss": 0.0815, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.3657331136738056, |
|
"grad_norm": 1.3513059654506845, |
|
"learning_rate": 9.793757457883062e-06, |
|
"loss": 0.0838, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.36902800658978585, |
|
"grad_norm": 0.9877986179117915, |
|
"learning_rate": 9.785485345955446e-06, |
|
"loss": 0.0873, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.3723228995057661, |
|
"grad_norm": 0.8970818631280407, |
|
"learning_rate": 9.777054221431418e-06, |
|
"loss": 0.0611, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.3756177924217463, |
|
"grad_norm": 0.9226544421763598, |
|
"learning_rate": 9.768464364461248e-06, |
|
"loss": 0.078, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.37891268533772654, |
|
"grad_norm": 1.4766754423858683, |
|
"learning_rate": 9.75971606046958e-06, |
|
"loss": 0.0858, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.38220757825370677, |
|
"grad_norm": 1.320312770948565, |
|
"learning_rate": 9.750809600145955e-06, |
|
"loss": 0.0872, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.385502471169687, |
|
"grad_norm": 0.9205622576015271, |
|
"learning_rate": 9.741745279435144e-06, |
|
"loss": 0.0855, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.38879736408566723, |
|
"grad_norm": 2.1130956623266384, |
|
"learning_rate": 9.732523399527328e-06, |
|
"loss": 0.0869, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.39209225700164746, |
|
"grad_norm": 1.346773242530782, |
|
"learning_rate": 9.723144266848073e-06, |
|
"loss": 0.0891, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.3953871499176277, |
|
"grad_norm": 0.954418724615336, |
|
"learning_rate": 9.713608193048156e-06, |
|
"loss": 0.0927, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.3986820428336079, |
|
"grad_norm": 1.1613672266850283, |
|
"learning_rate": 9.703915494993215e-06, |
|
"loss": 0.0946, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.40197693574958815, |
|
"grad_norm": 0.9668677562255413, |
|
"learning_rate": 9.694066494753211e-06, |
|
"loss": 0.0828, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.4052718286655684, |
|
"grad_norm": 1.1556457477946604, |
|
"learning_rate": 9.684061519591734e-06, |
|
"loss": 0.0926, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.4085667215815486, |
|
"grad_norm": 1.0348616873807939, |
|
"learning_rate": 9.673900901955118e-06, |
|
"loss": 0.0942, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.41186161449752884, |
|
"grad_norm": 0.954839683053947, |
|
"learning_rate": 9.663584979461407e-06, |
|
"loss": 0.0841, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.41515650741350907, |
|
"grad_norm": 0.8951987570639582, |
|
"learning_rate": 9.653114094889128e-06, |
|
"loss": 0.082, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.4184514003294893, |
|
"grad_norm": 0.5582851260265275, |
|
"learning_rate": 9.642488596165903e-06, |
|
"loss": 0.0579, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.42174629324546953, |
|
"grad_norm": 0.7282031291743011, |
|
"learning_rate": 9.631708836356893e-06, |
|
"loss": 0.0686, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.42504118616144976, |
|
"grad_norm": 0.7341380700784487, |
|
"learning_rate": 9.620775173653055e-06, |
|
"loss": 0.0581, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.42833607907743, |
|
"grad_norm": 1.0426311808706392, |
|
"learning_rate": 9.609687971359254e-06, |
|
"loss": 0.0863, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.4316309719934102, |
|
"grad_norm": 0.7221932362532579, |
|
"learning_rate": 9.598447597882181e-06, |
|
"loss": 0.0904, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.43492586490939045, |
|
"grad_norm": 1.057905146517346, |
|
"learning_rate": 9.587054426718117e-06, |
|
"loss": 0.087, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.4382207578253707, |
|
"grad_norm": 1.0654844742045295, |
|
"learning_rate": 9.575508836440516e-06, |
|
"loss": 0.0833, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.4415156507413509, |
|
"grad_norm": 0.9708392842496616, |
|
"learning_rate": 9.563811210687433e-06, |
|
"loss": 0.07, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.44481054365733114, |
|
"grad_norm": 0.869842645454385, |
|
"learning_rate": 9.551961938148772e-06, |
|
"loss": 0.0798, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.44810543657331137, |
|
"grad_norm": 0.9291669588809491, |
|
"learning_rate": 9.539961412553375e-06, |
|
"loss": 0.0717, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.4514003294892916, |
|
"grad_norm": 0.971502019921878, |
|
"learning_rate": 9.52781003265593e-06, |
|
"loss": 0.0936, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.4546952224052718, |
|
"grad_norm": 0.7797238947376094, |
|
"learning_rate": 9.515508202223735e-06, |
|
"loss": 0.0711, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.45799011532125206, |
|
"grad_norm": 1.0743726394776476, |
|
"learning_rate": 9.503056330023267e-06, |
|
"loss": 0.0755, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.4612850082372323, |
|
"grad_norm": 1.1954773475892864, |
|
"learning_rate": 9.490454829806609e-06, |
|
"loss": 0.1304, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.4645799011532125, |
|
"grad_norm": 0.5763939890921029, |
|
"learning_rate": 9.477704120297698e-06, |
|
"loss": 0.0614, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.46787479406919275, |
|
"grad_norm": 0.9472151466304063, |
|
"learning_rate": 9.464804625178414e-06, |
|
"loss": 0.0712, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.471169686985173, |
|
"grad_norm": 0.886844022382936, |
|
"learning_rate": 9.4517567730745e-06, |
|
"loss": 0.0797, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.4744645799011532, |
|
"grad_norm": 1.0134166748412918, |
|
"learning_rate": 9.438560997541319e-06, |
|
"loss": 0.0899, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.47775947281713343, |
|
"grad_norm": 0.8070690603660651, |
|
"learning_rate": 9.425217737049452e-06, |
|
"loss": 0.0826, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.48105436573311366, |
|
"grad_norm": 0.8619994488550511, |
|
"learning_rate": 9.411727434970121e-06, |
|
"loss": 0.086, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.4843492586490939, |
|
"grad_norm": 0.8778816857460797, |
|
"learning_rate": 9.398090539560465e-06, |
|
"loss": 0.0854, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.4876441515650741, |
|
"grad_norm": 1.1118672558694775, |
|
"learning_rate": 9.384307503948637e-06, |
|
"loss": 0.1105, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.49093904448105435, |
|
"grad_norm": 1.032735807082786, |
|
"learning_rate": 9.370378786118755e-06, |
|
"loss": 0.0783, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.4942339373970346, |
|
"grad_norm": 1.0495708384737894, |
|
"learning_rate": 9.356304848895676e-06, |
|
"loss": 0.0815, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.4975288303130148, |
|
"grad_norm": 1.0945068947939716, |
|
"learning_rate": 9.342086159929629e-06, |
|
"loss": 0.0875, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.500823723228995, |
|
"grad_norm": 1.1286751183737302, |
|
"learning_rate": 9.327723191680666e-06, |
|
"loss": 0.0545, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.5041186161449753, |
|
"grad_norm": 0.9183247855730471, |
|
"learning_rate": 9.31321642140296e-06, |
|
"loss": 0.0757, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.5074135090609555, |
|
"grad_norm": 1.0502035656993673, |
|
"learning_rate": 9.29856633112896e-06, |
|
"loss": 0.0809, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.5107084019769358, |
|
"grad_norm": 0.8738749178267629, |
|
"learning_rate": 9.283773407653363e-06, |
|
"loss": 0.0562, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.514003294892916, |
|
"grad_norm": 1.0813438666782247, |
|
"learning_rate": 9.268838142516943e-06, |
|
"loss": 0.085, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.5172981878088962, |
|
"grad_norm": 1.0115574109758954, |
|
"learning_rate": 9.253761031990218e-06, |
|
"loss": 0.0749, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.5205930807248764, |
|
"grad_norm": 1.375042365212545, |
|
"learning_rate": 9.238542577056957e-06, |
|
"loss": 0.078, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.5238879736408567, |
|
"grad_norm": 1.59130187611513, |
|
"learning_rate": 9.223183283397538e-06, |
|
"loss": 0.1029, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.5271828665568369, |
|
"grad_norm": 1.1395752837101527, |
|
"learning_rate": 9.20768366137214e-06, |
|
"loss": 0.1128, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.5304777594728172, |
|
"grad_norm": 0.8079597092244131, |
|
"learning_rate": 9.19204422600379e-06, |
|
"loss": 0.0527, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.5337726523887973, |
|
"grad_norm": 1.0336684369348528, |
|
"learning_rate": 9.176265496961242e-06, |
|
"loss": 0.0828, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.5370675453047776, |
|
"grad_norm": 0.8656528491730779, |
|
"learning_rate": 9.160347998541722e-06, |
|
"loss": 0.0704, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 0.5403624382207578, |
|
"grad_norm": 2.086614798853126, |
|
"learning_rate": 9.144292259653493e-06, |
|
"loss": 0.104, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.5436573311367381, |
|
"grad_norm": 1.5779000987891205, |
|
"learning_rate": 9.128098813798291e-06, |
|
"loss": 0.0996, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.5469522240527183, |
|
"grad_norm": 0.7283818376854391, |
|
"learning_rate": 9.111768199053588e-06, |
|
"loss": 0.0621, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.5502471169686985, |
|
"grad_norm": 1.2155979859224575, |
|
"learning_rate": 9.095300958054722e-06, |
|
"loss": 0.0653, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 0.5535420098846787, |
|
"grad_norm": 1.3669964434695867, |
|
"learning_rate": 9.078697637976861e-06, |
|
"loss": 0.1071, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.556836902800659, |
|
"grad_norm": 0.659337598200629, |
|
"learning_rate": 9.061958790516821e-06, |
|
"loss": 0.101, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 0.5601317957166392, |
|
"grad_norm": 3.064428601730586, |
|
"learning_rate": 9.045084971874738e-06, |
|
"loss": 0.0631, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.5634266886326195, |
|
"grad_norm": 1.8617437169334994, |
|
"learning_rate": 9.028076742735583e-06, |
|
"loss": 0.1062, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.5667215815485996, |
|
"grad_norm": 1.005975190266642, |
|
"learning_rate": 9.010934668250533e-06, |
|
"loss": 0.0706, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.5700164744645799, |
|
"grad_norm": 1.2704133524125742, |
|
"learning_rate": 8.993659318018191e-06, |
|
"loss": 0.1047, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 0.5733113673805601, |
|
"grad_norm": 1.5091840035688024, |
|
"learning_rate": 8.976251266065663e-06, |
|
"loss": 0.0915, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.5766062602965404, |
|
"grad_norm": 0.9983867645520093, |
|
"learning_rate": 8.958711090829477e-06, |
|
"loss": 0.0868, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.5799011532125206, |
|
"grad_norm": 0.9695191782522918, |
|
"learning_rate": 8.94103937513637e-06, |
|
"loss": 0.0782, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.5831960461285008, |
|
"grad_norm": 1.485821220772562, |
|
"learning_rate": 8.923236706183923e-06, |
|
"loss": 0.088, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.586490939044481, |
|
"grad_norm": 0.8778171297271887, |
|
"learning_rate": 8.905303675521031e-06, |
|
"loss": 0.0675, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.5897858319604613, |
|
"grad_norm": 1.4148551976101613, |
|
"learning_rate": 8.887240879028276e-06, |
|
"loss": 0.0968, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 0.5930807248764415, |
|
"grad_norm": 1.2313668213255908, |
|
"learning_rate": 8.869048916898109e-06, |
|
"loss": 0.0885, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.5963756177924218, |
|
"grad_norm": 0.7935552297095103, |
|
"learning_rate": 8.850728393614903e-06, |
|
"loss": 0.0919, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 0.5996705107084019, |
|
"grad_norm": 0.6219191194067725, |
|
"learning_rate": 8.832279917934881e-06, |
|
"loss": 0.0495, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.6029654036243822, |
|
"grad_norm": 0.891323405042387, |
|
"learning_rate": 8.813704102865881e-06, |
|
"loss": 0.1036, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.6062602965403624, |
|
"grad_norm": 0.7672391951678563, |
|
"learning_rate": 8.795001565646983e-06, |
|
"loss": 0.0728, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.6095551894563427, |
|
"grad_norm": 0.9915439152928615, |
|
"learning_rate": 8.776172927728008e-06, |
|
"loss": 0.0744, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.6128500823723229, |
|
"grad_norm": 0.7415850366120278, |
|
"learning_rate": 8.75721881474886e-06, |
|
"loss": 0.0999, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.6161449752883031, |
|
"grad_norm": 0.7803751476377428, |
|
"learning_rate": 8.738139856518746e-06, |
|
"loss": 0.084, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 0.6194398682042833, |
|
"grad_norm": 0.6444369328111929, |
|
"learning_rate": 8.718936686995239e-06, |
|
"loss": 0.0632, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.6227347611202636, |
|
"grad_norm": 1.063336491675898, |
|
"learning_rate": 8.699609944263219e-06, |
|
"loss": 0.0854, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.6260296540362438, |
|
"grad_norm": 0.5254825539939502, |
|
"learning_rate": 8.680160270513671e-06, |
|
"loss": 0.0658, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.6293245469522241, |
|
"grad_norm": 2.1957036152343097, |
|
"learning_rate": 8.660588312022345e-06, |
|
"loss": 0.0767, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 0.6326194398682042, |
|
"grad_norm": 1.1425043272398887, |
|
"learning_rate": 8.640894719128274e-06, |
|
"loss": 0.1092, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.6359143327841845, |
|
"grad_norm": 0.6811077233527263, |
|
"learning_rate": 8.621080146212181e-06, |
|
"loss": 0.0552, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 0.6392092257001647, |
|
"grad_norm": 1.0217179933628129, |
|
"learning_rate": 8.601145251674718e-06, |
|
"loss": 0.0749, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.642504118616145, |
|
"grad_norm": 0.9919309113126284, |
|
"learning_rate": 8.581090697914602e-06, |
|
"loss": 0.0929, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.6457990115321252, |
|
"grad_norm": 1.178486792691301, |
|
"learning_rate": 8.560917151306594e-06, |
|
"loss": 0.1023, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.6490939044481054, |
|
"grad_norm": 1.155779967707836, |
|
"learning_rate": 8.540625282179364e-06, |
|
"loss": 0.0821, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 0.6523887973640856, |
|
"grad_norm": 1.0278193202776953, |
|
"learning_rate": 8.520215764793214e-06, |
|
"loss": 0.0739, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.6556836902800659, |
|
"grad_norm": 5.872968813903652, |
|
"learning_rate": 8.499689277317675e-06, |
|
"loss": 0.0763, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 0.6589785831960461, |
|
"grad_norm": 1.4949053388183082, |
|
"learning_rate": 8.479046501808971e-06, |
|
"loss": 0.0696, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.6622734761120264, |
|
"grad_norm": 0.9342591652212858, |
|
"learning_rate": 8.45828812418736e-06, |
|
"loss": 0.0629, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 0.6655683690280065, |
|
"grad_norm": 0.5142904539906094, |
|
"learning_rate": 8.437414834214333e-06, |
|
"loss": 0.0653, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 0.6688632619439868, |
|
"grad_norm": 1.3243894037794444, |
|
"learning_rate": 8.416427325469705e-06, |
|
"loss": 0.1095, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 0.6721581548599671, |
|
"grad_norm": 1.3555237832265656, |
|
"learning_rate": 8.395326295328562e-06, |
|
"loss": 0.1028, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.6754530477759473, |
|
"grad_norm": 0.818855723195901, |
|
"learning_rate": 8.374112444938094e-06, |
|
"loss": 0.088, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.6787479406919276, |
|
"grad_norm": 0.8852074571699485, |
|
"learning_rate": 8.352786479194288e-06, |
|
"loss": 0.0526, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 0.6820428336079077, |
|
"grad_norm": 1.372137747701387, |
|
"learning_rate": 8.331349106718515e-06, |
|
"loss": 0.0957, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 0.685337726523888, |
|
"grad_norm": 0.8727917319312107, |
|
"learning_rate": 8.309801039833978e-06, |
|
"loss": 0.0895, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.6886326194398682, |
|
"grad_norm": 1.0509761315988109, |
|
"learning_rate": 8.28814299454205e-06, |
|
"loss": 0.0996, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 0.6919275123558485, |
|
"grad_norm": 0.8839477803791086, |
|
"learning_rate": 8.266375690498475e-06, |
|
"loss": 0.0865, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.6952224052718287, |
|
"grad_norm": 0.7064553470448757, |
|
"learning_rate": 8.244499850989453e-06, |
|
"loss": 0.0728, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 0.6985172981878089, |
|
"grad_norm": 0.9694201547278781, |
|
"learning_rate": 8.22251620290762e-06, |
|
"loss": 0.0581, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.7018121911037891, |
|
"grad_norm": 0.724783690835583, |
|
"learning_rate": 8.20042547672788e-06, |
|
"loss": 0.0828, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 0.7051070840197694, |
|
"grad_norm": 0.7137797204098316, |
|
"learning_rate": 8.178228406483145e-06, |
|
"loss": 0.0707, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 0.7084019769357496, |
|
"grad_norm": 0.4606523745916078, |
|
"learning_rate": 8.15592572973993e-06, |
|
"loss": 0.044, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.7116968698517299, |
|
"grad_norm": 0.6640092591585782, |
|
"learning_rate": 8.133518187573864e-06, |
|
"loss": 0.0561, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.71499176276771, |
|
"grad_norm": 0.8170937626114954, |
|
"learning_rate": 8.111006524545043e-06, |
|
"loss": 0.0823, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 0.7182866556836903, |
|
"grad_norm": 0.6372182616249538, |
|
"learning_rate": 8.088391488673313e-06, |
|
"loss": 0.066, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 0.7215815485996705, |
|
"grad_norm": 0.6375067936631406, |
|
"learning_rate": 8.065673831413396e-06, |
|
"loss": 0.0506, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 0.7248764415156508, |
|
"grad_norm": 0.6423309999365212, |
|
"learning_rate": 8.042854307629932e-06, |
|
"loss": 0.0629, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.728171334431631, |
|
"grad_norm": 0.752582677343103, |
|
"learning_rate": 8.019933675572389e-06, |
|
"loss": 0.0722, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 0.7314662273476112, |
|
"grad_norm": 0.8112402585396246, |
|
"learning_rate": 7.996912696849873e-06, |
|
"loss": 0.0842, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 0.7347611202635914, |
|
"grad_norm": 0.6919487336036387, |
|
"learning_rate": 7.97379213640582e-06, |
|
"loss": 0.0684, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 0.7380560131795717, |
|
"grad_norm": 0.7434807539917022, |
|
"learning_rate": 7.950572762492577e-06, |
|
"loss": 0.0682, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.7413509060955519, |
|
"grad_norm": 0.6848656249105235, |
|
"learning_rate": 7.927255346645872e-06, |
|
"loss": 0.0546, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.7446457990115322, |
|
"grad_norm": 0.7826479624770332, |
|
"learning_rate": 7.903840663659186e-06, |
|
"loss": 0.0684, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 0.7479406919275123, |
|
"grad_norm": 0.6927518800734283, |
|
"learning_rate": 7.880329491557996e-06, |
|
"loss": 0.079, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 0.7512355848434926, |
|
"grad_norm": 0.8763045243203113, |
|
"learning_rate": 7.856722611573938e-06, |
|
"loss": 0.1068, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.7545304777594728, |
|
"grad_norm": 0.8300681217056403, |
|
"learning_rate": 7.83302080811883e-06, |
|
"loss": 0.0667, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 0.7578253706754531, |
|
"grad_norm": 0.5437395859594083, |
|
"learning_rate": 7.809224868758621e-06, |
|
"loss": 0.0671, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.7611202635914333, |
|
"grad_norm": 0.7134167417475868, |
|
"learning_rate": 7.78533558418722e-06, |
|
"loss": 0.079, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 0.7644151565074135, |
|
"grad_norm": 0.8367162252369527, |
|
"learning_rate": 7.761353748200213e-06, |
|
"loss": 0.075, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.7677100494233937, |
|
"grad_norm": 0.6993381735068975, |
|
"learning_rate": 7.737280157668503e-06, |
|
"loss": 0.0665, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 0.771004942339374, |
|
"grad_norm": 0.644489745443266, |
|
"learning_rate": 7.713115612511815e-06, |
|
"loss": 0.0704, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 0.7742998352553542, |
|
"grad_norm": 0.6337392482963783, |
|
"learning_rate": 7.688860915672129e-06, |
|
"loss": 0.0487, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.7775947281713345, |
|
"grad_norm": 0.4306575759208823, |
|
"learning_rate": 7.664516873086987e-06, |
|
"loss": 0.0498, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 0.7808896210873146, |
|
"grad_norm": 0.6371209076121114, |
|
"learning_rate": 7.640084293662731e-06, |
|
"loss": 0.0581, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 0.7841845140032949, |
|
"grad_norm": 0.809205628205596, |
|
"learning_rate": 7.615563989247604e-06, |
|
"loss": 0.0886, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 0.7874794069192751, |
|
"grad_norm": 0.6807826450879982, |
|
"learning_rate": 7.590956774604791e-06, |
|
"loss": 0.0824, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 0.7907742998352554, |
|
"grad_norm": 0.9092838195300236, |
|
"learning_rate": 7.566263467385335e-06, |
|
"loss": 0.0703, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.7940691927512356, |
|
"grad_norm": 0.736565350841279, |
|
"learning_rate": 7.541484888100974e-06, |
|
"loss": 0.0695, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 0.7973640856672158, |
|
"grad_norm": 0.7220288466907268, |
|
"learning_rate": 7.516621860096873e-06, |
|
"loss": 0.0707, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 0.800658978583196, |
|
"grad_norm": 0.6829838831547227, |
|
"learning_rate": 7.491675209524272e-06, |
|
"loss": 0.0666, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 0.8039538714991763, |
|
"grad_norm": 0.8226949141177504, |
|
"learning_rate": 7.466645765313023e-06, |
|
"loss": 0.0752, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 0.8072487644151565, |
|
"grad_norm": 0.5909405519820083, |
|
"learning_rate": 7.4415343591440604e-06, |
|
"loss": 0.0582, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.8105436573311368, |
|
"grad_norm": 0.7318765147109815, |
|
"learning_rate": 7.416341825421755e-06, |
|
"loss": 0.078, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 0.8138385502471169, |
|
"grad_norm": 0.7063912838195767, |
|
"learning_rate": 7.391069001246193e-06, |
|
"loss": 0.0868, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 0.8171334431630972, |
|
"grad_norm": 0.6799477779267012, |
|
"learning_rate": 7.365716726385361e-06, |
|
"loss": 0.0681, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 0.8204283360790774, |
|
"grad_norm": 0.8516971338664023, |
|
"learning_rate": 7.3402858432472416e-06, |
|
"loss": 0.0761, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 0.8237232289950577, |
|
"grad_norm": 0.8051104503646311, |
|
"learning_rate": 7.3147771968518175e-06, |
|
"loss": 0.077, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.8270181219110379, |
|
"grad_norm": 0.8417638265928152, |
|
"learning_rate": 7.289191634803002e-06, |
|
"loss": 0.0721, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 0.8303130148270181, |
|
"grad_norm": 0.9280576906426667, |
|
"learning_rate": 7.263530007260466e-06, |
|
"loss": 0.0839, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 0.8336079077429983, |
|
"grad_norm": 0.8205604877193189, |
|
"learning_rate": 7.2377931669113934e-06, |
|
"loss": 0.084, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 0.8369028006589786, |
|
"grad_norm": 0.7347246169190605, |
|
"learning_rate": 7.211981968942147e-06, |
|
"loss": 0.0508, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 0.8401976935749588, |
|
"grad_norm": 0.7727540137134915, |
|
"learning_rate": 7.186097271009852e-06, |
|
"loss": 0.0504, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.8434925864909391, |
|
"grad_norm": 0.6116838823458901, |
|
"learning_rate": 7.160139933213899e-06, |
|
"loss": 0.0533, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 0.8467874794069192, |
|
"grad_norm": 0.8518782068127816, |
|
"learning_rate": 7.134110818067361e-06, |
|
"loss": 0.0775, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 0.8500823723228995, |
|
"grad_norm": 0.9449160515812749, |
|
"learning_rate": 7.1080107904683405e-06, |
|
"loss": 0.0721, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 0.8533772652388797, |
|
"grad_norm": 0.6964873142430633, |
|
"learning_rate": 7.08184071767122e-06, |
|
"loss": 0.0673, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 0.85667215815486, |
|
"grad_norm": 0.768104304709271, |
|
"learning_rate": 7.0556014692578554e-06, |
|
"loss": 0.0749, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.8599670510708401, |
|
"grad_norm": 0.7599189113700034, |
|
"learning_rate": 7.029293917108678e-06, |
|
"loss": 0.0684, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 0.8632619439868204, |
|
"grad_norm": 0.777387517223909, |
|
"learning_rate": 7.0029189353737195e-06, |
|
"loss": 0.0656, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 0.8665568369028006, |
|
"grad_norm": 0.7045793540209936, |
|
"learning_rate": 6.9764774004435685e-06, |
|
"loss": 0.0619, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 0.8698517298187809, |
|
"grad_norm": 0.6234760268316166, |
|
"learning_rate": 6.949970190920255e-06, |
|
"loss": 0.0708, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 0.8731466227347611, |
|
"grad_norm": 0.7124980322892176, |
|
"learning_rate": 6.9233981875880416e-06, |
|
"loss": 0.0521, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.8764415156507414, |
|
"grad_norm": 0.8490902000387839, |
|
"learning_rate": 6.896762273384179e-06, |
|
"loss": 0.0632, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 0.8797364085667215, |
|
"grad_norm": 0.6944201065528963, |
|
"learning_rate": 6.870063333369543e-06, |
|
"loss": 0.0716, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 0.8830313014827018, |
|
"grad_norm": 0.758349126043532, |
|
"learning_rate": 6.8433022546992444e-06, |
|
"loss": 0.0596, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 0.886326194398682, |
|
"grad_norm": 1.2664444257431744, |
|
"learning_rate": 6.81647992659314e-06, |
|
"loss": 0.0628, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 0.8896210873146623, |
|
"grad_norm": 0.8379324844684077, |
|
"learning_rate": 6.789597240306295e-06, |
|
"loss": 0.0674, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.8929159802306426, |
|
"grad_norm": 0.8462600835900949, |
|
"learning_rate": 6.762655089099353e-06, |
|
"loss": 0.0659, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 0.8962108731466227, |
|
"grad_norm": 0.9094387161179498, |
|
"learning_rate": 6.735654368208875e-06, |
|
"loss": 0.0623, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 0.899505766062603, |
|
"grad_norm": 0.7877875224066865, |
|
"learning_rate": 6.7085959748175685e-06, |
|
"loss": 0.0696, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 0.9028006589785832, |
|
"grad_norm": 0.6514864513423558, |
|
"learning_rate": 6.681480808024503e-06, |
|
"loss": 0.0766, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 0.9060955518945635, |
|
"grad_norm": 1.148236164352365, |
|
"learning_rate": 6.654309768815208e-06, |
|
"loss": 0.0903, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.9093904448105437, |
|
"grad_norm": 0.7078109102899715, |
|
"learning_rate": 6.627083760031755e-06, |
|
"loss": 0.0607, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 0.9126853377265239, |
|
"grad_norm": 0.613094345393223, |
|
"learning_rate": 6.599803686342748e-06, |
|
"loss": 0.0655, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 0.9159802306425041, |
|
"grad_norm": 0.6642339763695972, |
|
"learning_rate": 6.572470454213266e-06, |
|
"loss": 0.0731, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 0.9192751235584844, |
|
"grad_norm": 0.6971630112819691, |
|
"learning_rate": 6.545084971874738e-06, |
|
"loss": 0.0473, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 0.9225700164744646, |
|
"grad_norm": 0.7592858638911076, |
|
"learning_rate": 6.517648149294774e-06, |
|
"loss": 0.0581, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.9258649093904449, |
|
"grad_norm": 0.7189143571066544, |
|
"learning_rate": 6.490160898146919e-06, |
|
"loss": 0.0733, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 0.929159802306425, |
|
"grad_norm": 0.8305599945381572, |
|
"learning_rate": 6.4626241317803665e-06, |
|
"loss": 0.0807, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 0.9324546952224053, |
|
"grad_norm": 0.8787944618632045, |
|
"learning_rate": 6.4350387651896025e-06, |
|
"loss": 0.0648, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 0.9357495881383855, |
|
"grad_norm": 0.649270561331511, |
|
"learning_rate": 6.407405714984011e-06, |
|
"loss": 0.0921, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 0.9390444810543658, |
|
"grad_norm": 0.9873611661857511, |
|
"learning_rate": 6.379725899357408e-06, |
|
"loss": 0.0847, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.942339373970346, |
|
"grad_norm": 0.8338719043181901, |
|
"learning_rate": 6.3520002380575395e-06, |
|
"loss": 0.0673, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 0.9456342668863262, |
|
"grad_norm": 0.8390156519820746, |
|
"learning_rate": 6.324229652355513e-06, |
|
"loss": 0.0626, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 0.9489291598023064, |
|
"grad_norm": 0.7197773939188823, |
|
"learning_rate": 6.29641506501519e-06, |
|
"loss": 0.0864, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 0.9522240527182867, |
|
"grad_norm": 0.942984980454084, |
|
"learning_rate": 6.2685574002625235e-06, |
|
"loss": 0.0686, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 0.9555189456342669, |
|
"grad_norm": 0.9649936636393807, |
|
"learning_rate": 6.2406575837548455e-06, |
|
"loss": 0.0599, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.9588138385502472, |
|
"grad_norm": 0.6889881534410974, |
|
"learning_rate": 6.212716542550112e-06, |
|
"loss": 0.101, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 0.9621087314662273, |
|
"grad_norm": 0.9632795509211302, |
|
"learning_rate": 6.184735205076097e-06, |
|
"loss": 0.0773, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 0.9654036243822076, |
|
"grad_norm": 1.0400767819370376, |
|
"learning_rate": 6.156714501099544e-06, |
|
"loss": 0.0638, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 0.9686985172981878, |
|
"grad_norm": 1.0147243725605253, |
|
"learning_rate": 6.1286553616952705e-06, |
|
"loss": 0.0593, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 0.9719934102141681, |
|
"grad_norm": 0.6613193470791487, |
|
"learning_rate": 6.100558719215228e-06, |
|
"loss": 0.0632, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.9752883031301482, |
|
"grad_norm": 1.0408938474730054, |
|
"learning_rate": 6.072425507257528e-06, |
|
"loss": 0.0876, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 0.9785831960461285, |
|
"grad_norm": 0.712701647042842, |
|
"learning_rate": 6.044256660635412e-06, |
|
"loss": 0.0733, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 0.9818780889621087, |
|
"grad_norm": 0.6397491114376809, |
|
"learning_rate": 6.016053115346197e-06, |
|
"loss": 0.0561, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 0.985172981878089, |
|
"grad_norm": 0.7191102659386986, |
|
"learning_rate": 5.987815808540169e-06, |
|
"loss": 0.0791, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 0.9884678747940692, |
|
"grad_norm": 0.4709712102337363, |
|
"learning_rate": 5.959545678489447e-06, |
|
"loss": 0.0475, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.9917627677100495, |
|
"grad_norm": 0.8715274588578796, |
|
"learning_rate": 5.931243664556803e-06, |
|
"loss": 0.0771, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 0.9950576606260296, |
|
"grad_norm": 0.7017524340447387, |
|
"learning_rate": 5.902910707164449e-06, |
|
"loss": 0.0712, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 0.9983525535420099, |
|
"grad_norm": 0.7619744594259967, |
|
"learning_rate": 5.874547747762792e-06, |
|
"loss": 0.0585, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 0.9983525535420099, |
|
"eval_loss": 0.07007648050785065, |
|
"eval_runtime": 143.0638, |
|
"eval_samples_per_second": 35.683, |
|
"eval_steps_per_second": 1.118, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 1.00164744645799, |
|
"grad_norm": 0.5917774055195716, |
|
"learning_rate": 5.8461557287991455e-06, |
|
"loss": 0.0686, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 1.0049423393739703, |
|
"grad_norm": 0.4973762275932349, |
|
"learning_rate": 5.81773559368642e-06, |
|
"loss": 0.0524, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 1.0082372322899507, |
|
"grad_norm": 0.5021975231329254, |
|
"learning_rate": 5.7892882867717705e-06, |
|
"loss": 0.0577, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 1.0115321252059308, |
|
"grad_norm": 0.704352626743678, |
|
"learning_rate": 5.7608147533052194e-06, |
|
"loss": 0.0509, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 1.014827018121911, |
|
"grad_norm": 1.189723828759097, |
|
"learning_rate": 5.732315939408251e-06, |
|
"loss": 0.0815, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 1.0181219110378912, |
|
"grad_norm": 0.6036027009145574, |
|
"learning_rate": 5.703792792042363e-06, |
|
"loss": 0.0556, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 1.0214168039538716, |
|
"grad_norm": 0.5342904909103813, |
|
"learning_rate": 5.675246258977617e-06, |
|
"loss": 0.0487, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 1.0247116968698518, |
|
"grad_norm": 0.46763620767148034, |
|
"learning_rate": 5.646677288761132e-06, |
|
"loss": 0.0491, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 1.028006589785832, |
|
"grad_norm": 0.5696375911949768, |
|
"learning_rate": 5.618086830685569e-06, |
|
"loss": 0.047, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 1.031301482701812, |
|
"grad_norm": 0.38600791899244996, |
|
"learning_rate": 5.589475834757595e-06, |
|
"loss": 0.032, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 1.0345963756177925, |
|
"grad_norm": 1.0072710877393638, |
|
"learning_rate": 5.560845251666307e-06, |
|
"loss": 0.063, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 1.0378912685337727, |
|
"grad_norm": 0.663725882779124, |
|
"learning_rate": 5.532196032751647e-06, |
|
"loss": 0.0563, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 1.0411861614497528, |
|
"grad_norm": 0.6135177621912624, |
|
"learning_rate": 5.503529129972792e-06, |
|
"loss": 0.0514, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 1.044481054365733, |
|
"grad_norm": 0.7549455934476204, |
|
"learning_rate": 5.474845495876518e-06, |
|
"loss": 0.0563, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 1.0477759472817134, |
|
"grad_norm": 0.8244910748727189, |
|
"learning_rate": 5.4461460835655535e-06, |
|
"loss": 0.0804, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 1.0510708401976936, |
|
"grad_norm": 0.604488079236042, |
|
"learning_rate": 5.417431846666903e-06, |
|
"loss": 0.0679, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 1.0543657331136738, |
|
"grad_norm": 0.5136265587955748, |
|
"learning_rate": 5.388703739300167e-06, |
|
"loss": 0.0388, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 1.057660626029654, |
|
"grad_norm": 0.6007478171198604, |
|
"learning_rate": 5.359962716045836e-06, |
|
"loss": 0.0632, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 1.0609555189456343, |
|
"grad_norm": 0.4928892879154173, |
|
"learning_rate": 5.331209731913568e-06, |
|
"loss": 0.058, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 1.0642504118616145, |
|
"grad_norm": 0.5300520318408385, |
|
"learning_rate": 5.30244574231046e-06, |
|
"loss": 0.0528, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 1.0675453047775947, |
|
"grad_norm": 0.49159187140329286, |
|
"learning_rate": 5.273671703009301e-06, |
|
"loss": 0.046, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 1.0708401976935749, |
|
"grad_norm": 0.804620211006138, |
|
"learning_rate": 5.2448885701168094e-06, |
|
"loss": 0.0601, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 1.0741350906095553, |
|
"grad_norm": 0.4648011852930538, |
|
"learning_rate": 5.21609730004187e-06, |
|
"loss": 0.0438, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 1.0774299835255354, |
|
"grad_norm": 0.5362596735899865, |
|
"learning_rate": 5.187298849463748e-06, |
|
"loss": 0.0507, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 1.0807248764415156, |
|
"grad_norm": 0.5443586783585722, |
|
"learning_rate": 5.158494175300304e-06, |
|
"loss": 0.053, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 1.084019769357496, |
|
"grad_norm": 0.6076056192307563, |
|
"learning_rate": 5.129684234676195e-06, |
|
"loss": 0.0594, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 1.0873146622734762, |
|
"grad_norm": 0.9033252357763137, |
|
"learning_rate": 5.100869984891077e-06, |
|
"loss": 0.06, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 1.0906095551894563, |
|
"grad_norm": 0.454480847306655, |
|
"learning_rate": 5.072052383387787e-06, |
|
"loss": 0.0424, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 1.0939044481054365, |
|
"grad_norm": 0.46517988927206794, |
|
"learning_rate": 5.043232387720532e-06, |
|
"loss": 0.0443, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 1.0971993410214167, |
|
"grad_norm": 0.4148720401510593, |
|
"learning_rate": 5.014410955523079e-06, |
|
"loss": 0.0387, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 1.100494233937397, |
|
"grad_norm": 0.5146539821704307, |
|
"learning_rate": 4.9855890444769226e-06, |
|
"loss": 0.0563, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 1.1037891268533773, |
|
"grad_norm": 0.5267211782218569, |
|
"learning_rate": 4.956767612279468e-06, |
|
"loss": 0.044, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 1.1070840197693574, |
|
"grad_norm": 0.5731696810590752, |
|
"learning_rate": 4.927947616612216e-06, |
|
"loss": 0.0469, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 1.1103789126853378, |
|
"grad_norm": 0.4606767989043497, |
|
"learning_rate": 4.899130015108923e-06, |
|
"loss": 0.0556, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 1.113673805601318, |
|
"grad_norm": 0.5591348812226693, |
|
"learning_rate": 4.8703157653238055e-06, |
|
"loss": 0.0526, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 1.1169686985172982, |
|
"grad_norm": 0.5103079438074868, |
|
"learning_rate": 4.841505824699697e-06, |
|
"loss": 0.0651, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 1.1202635914332784, |
|
"grad_norm": 0.6163138349681117, |
|
"learning_rate": 4.812701150536254e-06, |
|
"loss": 0.0509, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 1.1235584843492585, |
|
"grad_norm": 0.4842115475256147, |
|
"learning_rate": 4.78390269995813e-06, |
|
"loss": 0.035, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 1.126853377265239, |
|
"grad_norm": 0.4047877822645327, |
|
"learning_rate": 4.755111429883191e-06, |
|
"loss": 0.0342, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 1.130148270181219, |
|
"grad_norm": 0.5782935405242332, |
|
"learning_rate": 4.726328296990699e-06, |
|
"loss": 0.0416, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 1.1334431630971993, |
|
"grad_norm": 0.5846524401590787, |
|
"learning_rate": 4.697554257689541e-06, |
|
"loss": 0.0419, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 1.1367380560131797, |
|
"grad_norm": 0.5096985328650335, |
|
"learning_rate": 4.668790268086432e-06, |
|
"loss": 0.044, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 1.1400329489291599, |
|
"grad_norm": 0.5796683420196656, |
|
"learning_rate": 4.640037283954165e-06, |
|
"loss": 0.0634, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 1.14332784184514, |
|
"grad_norm": 0.5897186824110954, |
|
"learning_rate": 4.611296260699833e-06, |
|
"loss": 0.0511, |
|
"step": 347 |
|
}, |
|
{ |
|
"epoch": 1.1466227347611202, |
|
"grad_norm": 0.5043407904517478, |
|
"learning_rate": 4.582568153333098e-06, |
|
"loss": 0.0474, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 1.1499176276771004, |
|
"grad_norm": 0.49203813884361564, |
|
"learning_rate": 4.553853916434448e-06, |
|
"loss": 0.0399, |
|
"step": 349 |
|
}, |
|
{ |
|
"epoch": 1.1532125205930808, |
|
"grad_norm": 0.6380533145833258, |
|
"learning_rate": 4.525154504123483e-06, |
|
"loss": 0.0628, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 1.156507413509061, |
|
"grad_norm": 0.6307435685302706, |
|
"learning_rate": 4.496470870027209e-06, |
|
"loss": 0.0544, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 1.1598023064250411, |
|
"grad_norm": 0.58051661483701, |
|
"learning_rate": 4.467803967248354e-06, |
|
"loss": 0.0549, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 1.1630971993410215, |
|
"grad_norm": 0.45506940053593953, |
|
"learning_rate": 4.439154748333695e-06, |
|
"loss": 0.0455, |
|
"step": 353 |
|
}, |
|
{ |
|
"epoch": 1.1663920922570017, |
|
"grad_norm": 0.4477960561383021, |
|
"learning_rate": 4.410524165242407e-06, |
|
"loss": 0.0417, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 1.1696869851729819, |
|
"grad_norm": 0.5024790404868378, |
|
"learning_rate": 4.381913169314432e-06, |
|
"loss": 0.0483, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 1.172981878088962, |
|
"grad_norm": 0.43352510094853813, |
|
"learning_rate": 4.3533227112388694e-06, |
|
"loss": 0.0381, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 1.1762767710049424, |
|
"grad_norm": 0.8015757322992388, |
|
"learning_rate": 4.324753741022383e-06, |
|
"loss": 0.0589, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 1.1795716639209226, |
|
"grad_norm": 0.554923192898479, |
|
"learning_rate": 4.296207207957638e-06, |
|
"loss": 0.0469, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 1.1828665568369028, |
|
"grad_norm": 0.4540612599730088, |
|
"learning_rate": 4.26768406059175e-06, |
|
"loss": 0.0469, |
|
"step": 359 |
|
}, |
|
{ |
|
"epoch": 1.186161449752883, |
|
"grad_norm": 0.4977016265485015, |
|
"learning_rate": 4.239185246694781e-06, |
|
"loss": 0.0486, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 1.1894563426688634, |
|
"grad_norm": 0.5773178206107633, |
|
"learning_rate": 4.21071171322823e-06, |
|
"loss": 0.0588, |
|
"step": 361 |
|
}, |
|
{ |
|
"epoch": 1.1927512355848435, |
|
"grad_norm": 0.5714806332591411, |
|
"learning_rate": 4.182264406313582e-06, |
|
"loss": 0.0473, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 1.1960461285008237, |
|
"grad_norm": 0.5399317568380463, |
|
"learning_rate": 4.1538442712008545e-06, |
|
"loss": 0.0515, |
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 1.1993410214168039, |
|
"grad_norm": 0.5077736606662918, |
|
"learning_rate": 4.12545225223721e-06, |
|
"loss": 0.0473, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 1.2026359143327843, |
|
"grad_norm": 0.65833510309246, |
|
"learning_rate": 4.097089292835551e-06, |
|
"loss": 0.0574, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 1.2059308072487644, |
|
"grad_norm": 0.5750314764693017, |
|
"learning_rate": 4.0687563354431986e-06, |
|
"loss": 0.033, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 1.2092257001647446, |
|
"grad_norm": 0.6672168173906087, |
|
"learning_rate": 4.040454321510554e-06, |
|
"loss": 0.0507, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 1.2125205930807248, |
|
"grad_norm": 0.46572043828398524, |
|
"learning_rate": 4.012184191459832e-06, |
|
"loss": 0.0448, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 1.2158154859967052, |
|
"grad_norm": 0.5294456067061011, |
|
"learning_rate": 3.983946884653804e-06, |
|
"loss": 0.0421, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 1.2191103789126854, |
|
"grad_norm": 0.7181848630920071, |
|
"learning_rate": 3.95574333936459e-06, |
|
"loss": 0.0609, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 1.2224052718286655, |
|
"grad_norm": 0.4872681980462519, |
|
"learning_rate": 3.927574492742473e-06, |
|
"loss": 0.0332, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 1.2257001647446457, |
|
"grad_norm": 0.5978073219647344, |
|
"learning_rate": 3.899441280784773e-06, |
|
"loss": 0.0557, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 1.2289950576606261, |
|
"grad_norm": 0.49268040219816195, |
|
"learning_rate": 3.8713446383047295e-06, |
|
"loss": 0.0539, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 1.2322899505766063, |
|
"grad_norm": 0.553488767277818, |
|
"learning_rate": 3.843285498900457e-06, |
|
"loss": 0.0438, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 1.2355848434925865, |
|
"grad_norm": 0.5769809240481462, |
|
"learning_rate": 3.815264794923903e-06, |
|
"loss": 0.0438, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 1.2388797364085666, |
|
"grad_norm": 0.4680099999633115, |
|
"learning_rate": 3.7872834574498894e-06, |
|
"loss": 0.0391, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 1.242174629324547, |
|
"grad_norm": 0.4990397184455205, |
|
"learning_rate": 3.7593424162451553e-06, |
|
"loss": 0.0513, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 1.2454695222405272, |
|
"grad_norm": 0.5670279278262034, |
|
"learning_rate": 3.731442599737478e-06, |
|
"loss": 0.0611, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 1.2487644151565074, |
|
"grad_norm": 0.4178810778744549, |
|
"learning_rate": 3.70358493498481e-06, |
|
"loss": 0.0461, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 1.2520593080724876, |
|
"grad_norm": 0.5498450231361147, |
|
"learning_rate": 3.6757703476444885e-06, |
|
"loss": 0.0372, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 1.255354200988468, |
|
"grad_norm": 0.45367014770072983, |
|
"learning_rate": 3.6479997619424605e-06, |
|
"loss": 0.0423, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 1.2586490939044481, |
|
"grad_norm": 0.4294200611194709, |
|
"learning_rate": 3.620274100642593e-06, |
|
"loss": 0.0552, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 1.2619439868204283, |
|
"grad_norm": 0.6276700882265509, |
|
"learning_rate": 3.5925942850159895e-06, |
|
"loss": 0.0659, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 1.2652388797364087, |
|
"grad_norm": 0.7113783547292587, |
|
"learning_rate": 3.564961234810399e-06, |
|
"loss": 0.067, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 1.2685337726523889, |
|
"grad_norm": 0.6367177743488461, |
|
"learning_rate": 3.5373758682196347e-06, |
|
"loss": 0.0626, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 1.271828665568369, |
|
"grad_norm": 0.6068919065481327, |
|
"learning_rate": 3.509839101853082e-06, |
|
"loss": 0.0546, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 1.2751235584843492, |
|
"grad_norm": 0.742600911574775, |
|
"learning_rate": 3.4823518507052277e-06, |
|
"loss": 0.061, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 1.2784184514003294, |
|
"grad_norm": 0.4142179254874713, |
|
"learning_rate": 3.4549150281252635e-06, |
|
"loss": 0.0405, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 1.2817133443163098, |
|
"grad_norm": 0.7545310044049625, |
|
"learning_rate": 3.427529545786736e-06, |
|
"loss": 0.055, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 1.28500823723229, |
|
"grad_norm": 0.5556958907162003, |
|
"learning_rate": 3.400196313657253e-06, |
|
"loss": 0.0469, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 1.2883031301482701, |
|
"grad_norm": 0.5082442265119059, |
|
"learning_rate": 3.372916239968246e-06, |
|
"loss": 0.048, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 1.2915980230642505, |
|
"grad_norm": 0.5683891171997948, |
|
"learning_rate": 3.345690231184794e-06, |
|
"loss": 0.0413, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 1.2948929159802307, |
|
"grad_norm": 0.6720011431709395, |
|
"learning_rate": 3.318519191975499e-06, |
|
"loss": 0.0604, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 1.2981878088962109, |
|
"grad_norm": 0.5633138961258451, |
|
"learning_rate": 3.291404025182432e-06, |
|
"loss": 0.065, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 1.301482701812191, |
|
"grad_norm": 0.7184125904469478, |
|
"learning_rate": 3.264345631791127e-06, |
|
"loss": 0.0653, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 1.3047775947281712, |
|
"grad_norm": 0.6249487221408845, |
|
"learning_rate": 3.2373449109006476e-06, |
|
"loss": 0.0476, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 1.3080724876441516, |
|
"grad_norm": 0.8032427870564648, |
|
"learning_rate": 3.210402759693706e-06, |
|
"loss": 0.0731, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 1.3113673805601318, |
|
"grad_norm": 0.56609919677685, |
|
"learning_rate": 3.1835200734068604e-06, |
|
"loss": 0.0484, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 1.314662273476112, |
|
"grad_norm": 0.6259799233731378, |
|
"learning_rate": 3.1566977453007564e-06, |
|
"loss": 0.0621, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 1.3179571663920924, |
|
"grad_norm": 0.5571126132586377, |
|
"learning_rate": 3.1299366666304586e-06, |
|
"loss": 0.0536, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.3212520593080725, |
|
"grad_norm": 0.7177292111816991, |
|
"learning_rate": 3.103237726615822e-06, |
|
"loss": 0.0634, |
|
"step": 401 |
|
}, |
|
{ |
|
"epoch": 1.3245469522240527, |
|
"grad_norm": 0.4655411649268851, |
|
"learning_rate": 3.076601812411959e-06, |
|
"loss": 0.0386, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 1.327841845140033, |
|
"grad_norm": 0.5171630532203868, |
|
"learning_rate": 3.0500298090797465e-06, |
|
"loss": 0.0483, |
|
"step": 403 |
|
}, |
|
{ |
|
"epoch": 1.331136738056013, |
|
"grad_norm": 0.624563937797765, |
|
"learning_rate": 3.0235225995564323e-06, |
|
"loss": 0.0556, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 1.3344316309719935, |
|
"grad_norm": 0.5622033908392009, |
|
"learning_rate": 2.9970810646262805e-06, |
|
"loss": 0.0478, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 1.3377265238879736, |
|
"grad_norm": 0.4858535947041361, |
|
"learning_rate": 2.9707060828913226e-06, |
|
"loss": 0.0478, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 1.3410214168039538, |
|
"grad_norm": 0.40196887575712115, |
|
"learning_rate": 2.944398530742144e-06, |
|
"loss": 0.0462, |
|
"step": 407 |
|
}, |
|
{ |
|
"epoch": 1.3443163097199342, |
|
"grad_norm": 0.5650818660979695, |
|
"learning_rate": 2.9181592823287807e-06, |
|
"loss": 0.0655, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 1.3476112026359144, |
|
"grad_norm": 0.5475272072919456, |
|
"learning_rate": 2.8919892095316616e-06, |
|
"loss": 0.0519, |
|
"step": 409 |
|
}, |
|
{ |
|
"epoch": 1.3509060955518946, |
|
"grad_norm": 0.6655092099152591, |
|
"learning_rate": 2.865889181932639e-06, |
|
"loss": 0.0416, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 1.3542009884678747, |
|
"grad_norm": 0.8536644782226072, |
|
"learning_rate": 2.8398600667861032e-06, |
|
"loss": 0.0669, |
|
"step": 411 |
|
}, |
|
{ |
|
"epoch": 1.357495881383855, |
|
"grad_norm": 0.48323829394508544, |
|
"learning_rate": 2.813902728990149e-06, |
|
"loss": 0.0367, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 1.3607907742998353, |
|
"grad_norm": 0.6949809437868909, |
|
"learning_rate": 2.7880180310578546e-06, |
|
"loss": 0.0523, |
|
"step": 413 |
|
}, |
|
{ |
|
"epoch": 1.3640856672158155, |
|
"grad_norm": 0.43770382604271224, |
|
"learning_rate": 2.762206833088608e-06, |
|
"loss": 0.0527, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 1.3673805601317957, |
|
"grad_norm": 0.5081547664603686, |
|
"learning_rate": 2.7364699927395355e-06, |
|
"loss": 0.0613, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 1.370675453047776, |
|
"grad_norm": 0.6539834762026684, |
|
"learning_rate": 2.710808365197e-06, |
|
"loss": 0.0555, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 1.3739703459637562, |
|
"grad_norm": 0.39905302203795334, |
|
"learning_rate": 2.6852228031481837e-06, |
|
"loss": 0.0408, |
|
"step": 417 |
|
}, |
|
{ |
|
"epoch": 1.3772652388797364, |
|
"grad_norm": 0.45958447904532335, |
|
"learning_rate": 2.6597141567527614e-06, |
|
"loss": 0.0503, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 1.3805601317957166, |
|
"grad_norm": 0.4995326285015215, |
|
"learning_rate": 2.6342832736146403e-06, |
|
"loss": 0.0605, |
|
"step": 419 |
|
}, |
|
{ |
|
"epoch": 1.3838550247116967, |
|
"grad_norm": 0.4884609849725302, |
|
"learning_rate": 2.608930998753809e-06, |
|
"loss": 0.0602, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 1.3871499176276771, |
|
"grad_norm": 0.48846990238806653, |
|
"learning_rate": 2.5836581745782474e-06, |
|
"loss": 0.0429, |
|
"step": 421 |
|
}, |
|
{ |
|
"epoch": 1.3904448105436573, |
|
"grad_norm": 0.5807005137969414, |
|
"learning_rate": 2.558465640855943e-06, |
|
"loss": 0.052, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 1.3937397034596375, |
|
"grad_norm": 0.3856666040192254, |
|
"learning_rate": 2.533354234686979e-06, |
|
"loss": 0.0422, |
|
"step": 423 |
|
}, |
|
{ |
|
"epoch": 1.3970345963756179, |
|
"grad_norm": 0.3701363763937253, |
|
"learning_rate": 2.508324790475731e-06, |
|
"loss": 0.0449, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 1.400329489291598, |
|
"grad_norm": 0.5283156125790535, |
|
"learning_rate": 2.4833781399031275e-06, |
|
"loss": 0.0583, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 1.4036243822075782, |
|
"grad_norm": 0.5143083208475716, |
|
"learning_rate": 2.4585151118990286e-06, |
|
"loss": 0.0582, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 1.4069192751235584, |
|
"grad_norm": 0.4580082823859306, |
|
"learning_rate": 2.433736532614666e-06, |
|
"loss": 0.0503, |
|
"step": 427 |
|
}, |
|
{ |
|
"epoch": 1.4102141680395386, |
|
"grad_norm": 0.4733586276806861, |
|
"learning_rate": 2.4090432253952113e-06, |
|
"loss": 0.0595, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 1.413509060955519, |
|
"grad_norm": 0.46027613089003067, |
|
"learning_rate": 2.3844360107523973e-06, |
|
"loss": 0.0334, |
|
"step": 429 |
|
}, |
|
{ |
|
"epoch": 1.4168039538714992, |
|
"grad_norm": 0.618163403358967, |
|
"learning_rate": 2.3599157063372712e-06, |
|
"loss": 0.0505, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 1.4200988467874793, |
|
"grad_norm": 0.5692914543756001, |
|
"learning_rate": 2.3354831269130133e-06, |
|
"loss": 0.047, |
|
"step": 431 |
|
}, |
|
{ |
|
"epoch": 1.4233937397034597, |
|
"grad_norm": 0.5569933619176715, |
|
"learning_rate": 2.3111390843278743e-06, |
|
"loss": 0.0506, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 1.42668863261944, |
|
"grad_norm": 0.4384099356121434, |
|
"learning_rate": 2.2868843874881856e-06, |
|
"loss": 0.0453, |
|
"step": 433 |
|
}, |
|
{ |
|
"epoch": 1.42998352553542, |
|
"grad_norm": 0.5320473584418453, |
|
"learning_rate": 2.2627198423314988e-06, |
|
"loss": 0.0547, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 1.4332784184514002, |
|
"grad_norm": 0.4951776816150561, |
|
"learning_rate": 2.238646251799787e-06, |
|
"loss": 0.0517, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 1.4365733113673804, |
|
"grad_norm": 0.5305051346570233, |
|
"learning_rate": 2.2146644158127827e-06, |
|
"loss": 0.0508, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 1.4398682042833608, |
|
"grad_norm": 0.48235120417487776, |
|
"learning_rate": 2.1907751312413793e-06, |
|
"loss": 0.0498, |
|
"step": 437 |
|
}, |
|
{ |
|
"epoch": 1.443163097199341, |
|
"grad_norm": 0.7575565682766872, |
|
"learning_rate": 2.1669791918811724e-06, |
|
"loss": 0.0482, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 1.4464579901153214, |
|
"grad_norm": 0.6122464829305898, |
|
"learning_rate": 2.1432773884260627e-06, |
|
"loss": 0.0661, |
|
"step": 439 |
|
}, |
|
{ |
|
"epoch": 1.4497528830313016, |
|
"grad_norm": 0.49382428143445756, |
|
"learning_rate": 2.119670508442004e-06, |
|
"loss": 0.0372, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 1.4530477759472817, |
|
"grad_norm": 0.6113296705934868, |
|
"learning_rate": 2.0961593363408154e-06, |
|
"loss": 0.0489, |
|
"step": 441 |
|
}, |
|
{ |
|
"epoch": 1.456342668863262, |
|
"grad_norm": 0.4764803472849658, |
|
"learning_rate": 2.0727446533541302e-06, |
|
"loss": 0.0426, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 1.459637561779242, |
|
"grad_norm": 0.5321931460957434, |
|
"learning_rate": 2.0494272375074247e-06, |
|
"loss": 0.0428, |
|
"step": 443 |
|
}, |
|
{ |
|
"epoch": 1.4629324546952225, |
|
"grad_norm": 0.43368533141343174, |
|
"learning_rate": 2.0262078635941818e-06, |
|
"loss": 0.0377, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 1.4662273476112027, |
|
"grad_norm": 0.5227900476116077, |
|
"learning_rate": 2.0030873031501274e-06, |
|
"loss": 0.048, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 1.4695222405271828, |
|
"grad_norm": 0.40044438580877817, |
|
"learning_rate": 1.980066324427613e-06, |
|
"loss": 0.0367, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 1.4728171334431632, |
|
"grad_norm": 0.42569057497544066, |
|
"learning_rate": 1.9571456923700696e-06, |
|
"loss": 0.0485, |
|
"step": 447 |
|
}, |
|
{ |
|
"epoch": 1.4761120263591434, |
|
"grad_norm": 0.5011955876540544, |
|
"learning_rate": 1.9343261685866054e-06, |
|
"loss": 0.0684, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 1.4794069192751236, |
|
"grad_norm": 0.5257059685422952, |
|
"learning_rate": 1.911608511326688e-06, |
|
"loss": 0.0469, |
|
"step": 449 |
|
}, |
|
{ |
|
"epoch": 1.4827018121911038, |
|
"grad_norm": 0.5330212717649231, |
|
"learning_rate": 1.8889934754549583e-06, |
|
"loss": 0.0615, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 1.485996705107084, |
|
"grad_norm": 0.4377288880184422, |
|
"learning_rate": 1.8664818124261375e-06, |
|
"loss": 0.04, |
|
"step": 451 |
|
}, |
|
{ |
|
"epoch": 1.4892915980230643, |
|
"grad_norm": 0.4821221712040424, |
|
"learning_rate": 1.8440742702600706e-06, |
|
"loss": 0.0496, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 1.4925864909390445, |
|
"grad_norm": 0.42358079608202237, |
|
"learning_rate": 1.8217715935168562e-06, |
|
"loss": 0.0446, |
|
"step": 453 |
|
}, |
|
{ |
|
"epoch": 1.4958813838550247, |
|
"grad_norm": 0.6521628225316723, |
|
"learning_rate": 1.7995745232721207e-06, |
|
"loss": 0.0665, |
|
"step": 454 |
|
}, |
|
{ |
|
"epoch": 1.499176276771005, |
|
"grad_norm": 0.5512352891912379, |
|
"learning_rate": 1.777483797092381e-06, |
|
"loss": 0.0527, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 1.5024711696869852, |
|
"grad_norm": 0.4132207895248971, |
|
"learning_rate": 1.755500149010549e-06, |
|
"loss": 0.0369, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 1.5057660626029654, |
|
"grad_norm": 0.5452488198197322, |
|
"learning_rate": 1.7336243095015271e-06, |
|
"loss": 0.0457, |
|
"step": 457 |
|
}, |
|
{ |
|
"epoch": 1.5090609555189456, |
|
"grad_norm": 1.8013972479013802, |
|
"learning_rate": 1.7118570054579508e-06, |
|
"loss": 0.0788, |
|
"step": 458 |
|
}, |
|
{ |
|
"epoch": 1.5123558484349258, |
|
"grad_norm": 0.6158971711077378, |
|
"learning_rate": 1.6901989601660224e-06, |
|
"loss": 0.0577, |
|
"step": 459 |
|
}, |
|
{ |
|
"epoch": 1.515650741350906, |
|
"grad_norm": 0.6402888520963839, |
|
"learning_rate": 1.6686508932814871e-06, |
|
"loss": 0.0426, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 1.5189456342668863, |
|
"grad_norm": 0.5815365915637473, |
|
"learning_rate": 1.6472135208057128e-06, |
|
"loss": 0.0526, |
|
"step": 461 |
|
}, |
|
{ |
|
"epoch": 1.5222405271828665, |
|
"grad_norm": 0.5219074399966507, |
|
"learning_rate": 1.625887555061907e-06, |
|
"loss": 0.0428, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 1.525535420098847, |
|
"grad_norm": 0.5007230705662209, |
|
"learning_rate": 1.6046737046714366e-06, |
|
"loss": 0.0386, |
|
"step": 463 |
|
}, |
|
{ |
|
"epoch": 1.528830313014827, |
|
"grad_norm": 0.492364769802372, |
|
"learning_rate": 1.5835726745302953e-06, |
|
"loss": 0.0364, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 1.5321252059308073, |
|
"grad_norm": 0.5652000556154251, |
|
"learning_rate": 1.5625851657856666e-06, |
|
"loss": 0.0546, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 1.5354200988467874, |
|
"grad_norm": 0.5993897339775979, |
|
"learning_rate": 1.5417118758126408e-06, |
|
"loss": 0.0579, |
|
"step": 466 |
|
}, |
|
{ |
|
"epoch": 1.5387149917627676, |
|
"grad_norm": 1.3251444571487765, |
|
"learning_rate": 1.520953498191028e-06, |
|
"loss": 0.0747, |
|
"step": 467 |
|
}, |
|
{ |
|
"epoch": 1.5420098846787478, |
|
"grad_norm": 0.440371155414081, |
|
"learning_rate": 1.5003107226823255e-06, |
|
"loss": 0.0495, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 1.5453047775947282, |
|
"grad_norm": 0.5274460518323345, |
|
"learning_rate": 1.479784235206786e-06, |
|
"loss": 0.0457, |
|
"step": 469 |
|
}, |
|
{ |
|
"epoch": 1.5485996705107083, |
|
"grad_norm": 0.4509159507608483, |
|
"learning_rate": 1.459374717820637e-06, |
|
"loss": 0.0441, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 1.5518945634266887, |
|
"grad_norm": 0.5787329784185842, |
|
"learning_rate": 1.439082848693406e-06, |
|
"loss": 0.0455, |
|
"step": 471 |
|
}, |
|
{ |
|
"epoch": 1.555189456342669, |
|
"grad_norm": 0.9428413760935695, |
|
"learning_rate": 1.4189093020853989e-06, |
|
"loss": 0.0635, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 1.558484349258649, |
|
"grad_norm": 0.5810607886116554, |
|
"learning_rate": 1.3988547483252812e-06, |
|
"loss": 0.0591, |
|
"step": 473 |
|
}, |
|
{ |
|
"epoch": 1.5617792421746293, |
|
"grad_norm": 0.5621586581333317, |
|
"learning_rate": 1.3789198537878202e-06, |
|
"loss": 0.048, |
|
"step": 474 |
|
}, |
|
{ |
|
"epoch": 1.5650741350906094, |
|
"grad_norm": 0.614476271893997, |
|
"learning_rate": 1.3591052808717258e-06, |
|
"loss": 0.0574, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 1.5683690280065898, |
|
"grad_norm": 0.4366280882804736, |
|
"learning_rate": 1.339411687977657e-06, |
|
"loss": 0.0387, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 1.57166392092257, |
|
"grad_norm": 1.654229386447125, |
|
"learning_rate": 1.3198397294863285e-06, |
|
"loss": 0.0525, |
|
"step": 477 |
|
}, |
|
{ |
|
"epoch": 1.5749588138385504, |
|
"grad_norm": 0.5124984935464315, |
|
"learning_rate": 1.3003900557367816e-06, |
|
"loss": 0.0586, |
|
"step": 478 |
|
}, |
|
{ |
|
"epoch": 1.5782537067545306, |
|
"grad_norm": 0.5039902746309534, |
|
"learning_rate": 1.281063313004761e-06, |
|
"loss": 0.0409, |
|
"step": 479 |
|
}, |
|
{ |
|
"epoch": 1.5815485996705108, |
|
"grad_norm": 0.4453799136874429, |
|
"learning_rate": 1.261860143481255e-06, |
|
"loss": 0.0437, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 1.584843492586491, |
|
"grad_norm": 0.44736265726220936, |
|
"learning_rate": 1.2427811852511396e-06, |
|
"loss": 0.05, |
|
"step": 481 |
|
}, |
|
{ |
|
"epoch": 1.588138385502471, |
|
"grad_norm": 0.5751552043472024, |
|
"learning_rate": 1.223827072271993e-06, |
|
"loss": 0.0513, |
|
"step": 482 |
|
}, |
|
{ |
|
"epoch": 1.5914332784184513, |
|
"grad_norm": 0.4854076213664054, |
|
"learning_rate": 1.204998434353018e-06, |
|
"loss": 0.0434, |
|
"step": 483 |
|
}, |
|
{ |
|
"epoch": 1.5947281713344317, |
|
"grad_norm": 0.5304616858985192, |
|
"learning_rate": 1.1862958971341199e-06, |
|
"loss": 0.0537, |
|
"step": 484 |
|
}, |
|
{ |
|
"epoch": 1.5980230642504119, |
|
"grad_norm": 0.5357970833666896, |
|
"learning_rate": 1.1677200820651197e-06, |
|
"loss": 0.049, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 1.6013179571663922, |
|
"grad_norm": 0.6703644083736745, |
|
"learning_rate": 1.1492716063850973e-06, |
|
"loss": 0.0553, |
|
"step": 486 |
|
}, |
|
{ |
|
"epoch": 1.6046128500823724, |
|
"grad_norm": 0.5104289346948437, |
|
"learning_rate": 1.1309510831018927e-06, |
|
"loss": 0.0484, |
|
"step": 487 |
|
}, |
|
{ |
|
"epoch": 1.6079077429983526, |
|
"grad_norm": 0.791691486031595, |
|
"learning_rate": 1.112759120971723e-06, |
|
"loss": 0.0516, |
|
"step": 488 |
|
}, |
|
{ |
|
"epoch": 1.6112026359143328, |
|
"grad_norm": 0.5044446696201748, |
|
"learning_rate": 1.09469632447897e-06, |
|
"loss": 0.0412, |
|
"step": 489 |
|
}, |
|
{ |
|
"epoch": 1.614497528830313, |
|
"grad_norm": 0.44744812374789733, |
|
"learning_rate": 1.0767632938160787e-06, |
|
"loss": 0.0441, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 1.6177924217462931, |
|
"grad_norm": 0.689652904031741, |
|
"learning_rate": 1.0589606248636291e-06, |
|
"loss": 0.0468, |
|
"step": 491 |
|
}, |
|
{ |
|
"epoch": 1.6210873146622735, |
|
"grad_norm": 0.4644587386351254, |
|
"learning_rate": 1.0412889091705242e-06, |
|
"loss": 0.0356, |
|
"step": 492 |
|
}, |
|
{ |
|
"epoch": 1.6243822075782537, |
|
"grad_norm": 0.6499107202235256, |
|
"learning_rate": 1.0237487339343382e-06, |
|
"loss": 0.0574, |
|
"step": 493 |
|
}, |
|
{ |
|
"epoch": 1.627677100494234, |
|
"grad_norm": 0.4542177117918383, |
|
"learning_rate": 1.0063406819818106e-06, |
|
"loss": 0.0443, |
|
"step": 494 |
|
}, |
|
{ |
|
"epoch": 1.6309719934102143, |
|
"grad_norm": 0.6343789726555299, |
|
"learning_rate": 9.890653317494686e-07, |
|
"loss": 0.0524, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 1.6342668863261944, |
|
"grad_norm": 0.4208852075289343, |
|
"learning_rate": 9.719232572644189e-07, |
|
"loss": 0.0407, |
|
"step": 496 |
|
}, |
|
{ |
|
"epoch": 1.6375617792421746, |
|
"grad_norm": 0.45018417664569393, |
|
"learning_rate": 9.549150281252633e-07, |
|
"loss": 0.0382, |
|
"step": 497 |
|
}, |
|
{ |
|
"epoch": 1.6408566721581548, |
|
"grad_norm": 0.4664038740894182, |
|
"learning_rate": 9.380412094831809e-07, |
|
"loss": 0.0413, |
|
"step": 498 |
|
}, |
|
{ |
|
"epoch": 1.644151565074135, |
|
"grad_norm": 0.5658115763517576, |
|
"learning_rate": 9.213023620231404e-07, |
|
"loss": 0.055, |
|
"step": 499 |
|
}, |
|
{ |
|
"epoch": 1.6474464579901154, |
|
"grad_norm": 0.42505631586549236, |
|
"learning_rate": 9.046990419452795e-07, |
|
"loss": 0.0374, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.6507413509060955, |
|
"grad_norm": 0.5554370313066022, |
|
"learning_rate": 8.882318009464124e-07, |
|
"loss": 0.0758, |
|
"step": 501 |
|
}, |
|
{ |
|
"epoch": 1.654036243822076, |
|
"grad_norm": 0.6820183163387327, |
|
"learning_rate": 8.719011862017108e-07, |
|
"loss": 0.067, |
|
"step": 502 |
|
}, |
|
{ |
|
"epoch": 1.657331136738056, |
|
"grad_norm": 0.8815741831945997, |
|
"learning_rate": 8.557077403465069e-07, |
|
"loss": 0.0635, |
|
"step": 503 |
|
}, |
|
{ |
|
"epoch": 1.6606260296540363, |
|
"grad_norm": 0.6530261534927284, |
|
"learning_rate": 8.396520014582798e-07, |
|
"loss": 0.0564, |
|
"step": 504 |
|
}, |
|
{ |
|
"epoch": 1.6639209225700164, |
|
"grad_norm": 0.5563910425802013, |
|
"learning_rate": 8.237345030387589e-07, |
|
"loss": 0.0568, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 1.6672158154859966, |
|
"grad_norm": 1.6616538016948608, |
|
"learning_rate": 8.079557739962129e-07, |
|
"loss": 0.0433, |
|
"step": 506 |
|
}, |
|
{ |
|
"epoch": 1.6705107084019768, |
|
"grad_norm": 0.4729743527848457, |
|
"learning_rate": 7.923163386278615e-07, |
|
"loss": 0.0477, |
|
"step": 507 |
|
}, |
|
{ |
|
"epoch": 1.6738056013179572, |
|
"grad_norm": 0.484207261501026, |
|
"learning_rate": 7.768167166024637e-07, |
|
"loss": 0.0393, |
|
"step": 508 |
|
}, |
|
{ |
|
"epoch": 1.6771004942339374, |
|
"grad_norm": 0.4347790202564516, |
|
"learning_rate": 7.614574229430432e-07, |
|
"loss": 0.0348, |
|
"step": 509 |
|
}, |
|
{ |
|
"epoch": 1.6803953871499178, |
|
"grad_norm": 0.5159230901740568, |
|
"learning_rate": 7.462389680097831e-07, |
|
"loss": 0.0511, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 1.683690280065898, |
|
"grad_norm": 0.7846302974584749, |
|
"learning_rate": 7.31161857483057e-07, |
|
"loss": 0.0428, |
|
"step": 511 |
|
}, |
|
{ |
|
"epoch": 1.6869851729818781, |
|
"grad_norm": 0.5154541689981792, |
|
"learning_rate": 7.162265923466383e-07, |
|
"loss": 0.0481, |
|
"step": 512 |
|
}, |
|
{ |
|
"epoch": 1.6902800658978583, |
|
"grad_norm": 0.5103769113667321, |
|
"learning_rate": 7.014336688710411e-07, |
|
"loss": 0.0559, |
|
"step": 513 |
|
}, |
|
{ |
|
"epoch": 1.6935749588138385, |
|
"grad_norm": 0.5601518054326986, |
|
"learning_rate": 6.867835785970417e-07, |
|
"loss": 0.0383, |
|
"step": 514 |
|
}, |
|
{ |
|
"epoch": 1.6968698517298186, |
|
"grad_norm": 0.452690076025677, |
|
"learning_rate": 6.722768083193354e-07, |
|
"loss": 0.0393, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 1.700164744645799, |
|
"grad_norm": 0.46134612749678455, |
|
"learning_rate": 6.579138400703716e-07, |
|
"loss": 0.0515, |
|
"step": 516 |
|
}, |
|
{ |
|
"epoch": 1.7034596375617792, |
|
"grad_norm": 0.41746324511751276, |
|
"learning_rate": 6.436951511043243e-07, |
|
"loss": 0.0445, |
|
"step": 517 |
|
}, |
|
{ |
|
"epoch": 1.7067545304777596, |
|
"grad_norm": 0.45014253067830906, |
|
"learning_rate": 6.296212138812474e-07, |
|
"loss": 0.0438, |
|
"step": 518 |
|
}, |
|
{ |
|
"epoch": 1.7100494233937398, |
|
"grad_norm": 0.49892377023307155, |
|
"learning_rate": 6.156924960513638e-07, |
|
"loss": 0.0452, |
|
"step": 519 |
|
}, |
|
{ |
|
"epoch": 1.71334431630972, |
|
"grad_norm": 0.47105725038366275, |
|
"learning_rate": 6.019094604395359e-07, |
|
"loss": 0.054, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 1.7166392092257001, |
|
"grad_norm": 0.4259013974972623, |
|
"learning_rate": 5.882725650298787e-07, |
|
"loss": 0.0377, |
|
"step": 521 |
|
}, |
|
{ |
|
"epoch": 1.7199341021416803, |
|
"grad_norm": 0.7122055487087868, |
|
"learning_rate": 5.747822629505484e-07, |
|
"loss": 0.0606, |
|
"step": 522 |
|
}, |
|
{ |
|
"epoch": 1.7232289950576605, |
|
"grad_norm": 0.6463022317948001, |
|
"learning_rate": 5.614390024586808e-07, |
|
"loss": 0.0948, |
|
"step": 523 |
|
}, |
|
{ |
|
"epoch": 1.7265238879736409, |
|
"grad_norm": 0.6557315175026269, |
|
"learning_rate": 5.482432269255011e-07, |
|
"loss": 0.0594, |
|
"step": 524 |
|
}, |
|
{ |
|
"epoch": 1.729818780889621, |
|
"grad_norm": 0.48159732485479767, |
|
"learning_rate": 5.351953748215872e-07, |
|
"loss": 0.0562, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 1.7331136738056014, |
|
"grad_norm": 0.5429610524246544, |
|
"learning_rate": 5.222958797023036e-07, |
|
"loss": 0.0469, |
|
"step": 526 |
|
}, |
|
{ |
|
"epoch": 1.7364085667215816, |
|
"grad_norm": 0.4703329992162841, |
|
"learning_rate": 5.095451701933923e-07, |
|
"loss": 0.0495, |
|
"step": 527 |
|
}, |
|
{ |
|
"epoch": 1.7397034596375618, |
|
"grad_norm": 0.4436763771779832, |
|
"learning_rate": 4.969436699767344e-07, |
|
"loss": 0.0354, |
|
"step": 528 |
|
}, |
|
{ |
|
"epoch": 1.742998352553542, |
|
"grad_norm": 0.5025190325700492, |
|
"learning_rate": 4.844917977762653e-07, |
|
"loss": 0.056, |
|
"step": 529 |
|
}, |
|
{ |
|
"epoch": 1.7462932454695221, |
|
"grad_norm": 0.661761525862607, |
|
"learning_rate": 4.721899673440694e-07, |
|
"loss": 0.0436, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 1.7495881383855023, |
|
"grad_norm": 0.5762138308981635, |
|
"learning_rate": 4.6003858744662564e-07, |
|
"loss": 0.0552, |
|
"step": 531 |
|
}, |
|
{ |
|
"epoch": 1.7528830313014827, |
|
"grad_norm": 0.5387051088420545, |
|
"learning_rate": 4.4803806185122866e-07, |
|
"loss": 0.0479, |
|
"step": 532 |
|
}, |
|
{ |
|
"epoch": 1.7561779242174629, |
|
"grad_norm": 0.537079178923195, |
|
"learning_rate": 4.361887893125677e-07, |
|
"loss": 0.0565, |
|
"step": 533 |
|
}, |
|
{ |
|
"epoch": 1.7594728171334433, |
|
"grad_norm": 0.5780295144664594, |
|
"learning_rate": 4.244911635594856e-07, |
|
"loss": 0.0555, |
|
"step": 534 |
|
}, |
|
{ |
|
"epoch": 1.7627677100494235, |
|
"grad_norm": 0.6971174595241891, |
|
"learning_rate": 4.1294557328188376e-07, |
|
"loss": 0.0468, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 1.7660626029654036, |
|
"grad_norm": 0.5714544842706962, |
|
"learning_rate": 4.0155240211781966e-07, |
|
"loss": 0.069, |
|
"step": 536 |
|
}, |
|
{ |
|
"epoch": 1.7693574958813838, |
|
"grad_norm": 0.5560933792076089, |
|
"learning_rate": 3.9031202864074634e-07, |
|
"loss": 0.0526, |
|
"step": 537 |
|
}, |
|
{ |
|
"epoch": 1.772652388797364, |
|
"grad_norm": 0.5076535062447541, |
|
"learning_rate": 3.7922482634694667e-07, |
|
"loss": 0.0495, |
|
"step": 538 |
|
}, |
|
{ |
|
"epoch": 1.7759472817133442, |
|
"grad_norm": 0.4733107950145234, |
|
"learning_rate": 3.6829116364310914e-07, |
|
"loss": 0.048, |
|
"step": 539 |
|
}, |
|
{ |
|
"epoch": 1.7792421746293245, |
|
"grad_norm": 0.5645137566585913, |
|
"learning_rate": 3.575114038340977e-07, |
|
"loss": 0.0503, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 1.782537067545305, |
|
"grad_norm": 0.5224355605212421, |
|
"learning_rate": 3.4688590511087304e-07, |
|
"loss": 0.0553, |
|
"step": 541 |
|
}, |
|
{ |
|
"epoch": 1.7858319604612851, |
|
"grad_norm": 0.495152224806279, |
|
"learning_rate": 3.3641502053859355e-07, |
|
"loss": 0.0304, |
|
"step": 542 |
|
}, |
|
{ |
|
"epoch": 1.7891268533772653, |
|
"grad_norm": 0.40973303317244536, |
|
"learning_rate": 3.2609909804488195e-07, |
|
"loss": 0.0313, |
|
"step": 543 |
|
}, |
|
{ |
|
"epoch": 1.7924217462932455, |
|
"grad_norm": 0.47058046777642437, |
|
"learning_rate": 3.159384804082666e-07, |
|
"loss": 0.0526, |
|
"step": 544 |
|
}, |
|
{ |
|
"epoch": 1.7957166392092256, |
|
"grad_norm": 0.4838577514212764, |
|
"learning_rate": 3.0593350524678823e-07, |
|
"loss": 0.0371, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 1.7990115321252058, |
|
"grad_norm": 0.6690590772761237, |
|
"learning_rate": 2.9608450500678566e-07, |
|
"loss": 0.0604, |
|
"step": 546 |
|
}, |
|
{ |
|
"epoch": 1.8023064250411862, |
|
"grad_norm": 0.4109056760035354, |
|
"learning_rate": 2.863918069518451e-07, |
|
"loss": 0.0331, |
|
"step": 547 |
|
}, |
|
{ |
|
"epoch": 1.8056013179571664, |
|
"grad_norm": 0.7159007380510568, |
|
"learning_rate": 2.7685573315192895e-07, |
|
"loss": 0.0721, |
|
"step": 548 |
|
}, |
|
{ |
|
"epoch": 1.8088962108731468, |
|
"grad_norm": 0.4865674890840018, |
|
"learning_rate": 2.67476600472672e-07, |
|
"loss": 0.0451, |
|
"step": 549 |
|
}, |
|
{ |
|
"epoch": 1.812191103789127, |
|
"grad_norm": 0.5942738860891101, |
|
"learning_rate": 2.5825472056485556e-07, |
|
"loss": 0.062, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 1.8154859967051071, |
|
"grad_norm": 0.5357536121944303, |
|
"learning_rate": 2.4919039985404626e-07, |
|
"loss": 0.0609, |
|
"step": 551 |
|
}, |
|
{ |
|
"epoch": 1.8187808896210873, |
|
"grad_norm": 0.3866866163849165, |
|
"learning_rate": 2.4028393953042074e-07, |
|
"loss": 0.0296, |
|
"step": 552 |
|
}, |
|
{ |
|
"epoch": 1.8220757825370675, |
|
"grad_norm": 0.49750309911350266, |
|
"learning_rate": 2.315356355387527e-07, |
|
"loss": 0.0444, |
|
"step": 553 |
|
}, |
|
{ |
|
"epoch": 1.8253706754530477, |
|
"grad_norm": 0.6734726196583376, |
|
"learning_rate": 2.2294577856858236e-07, |
|
"loss": 0.0552, |
|
"step": 554 |
|
}, |
|
{ |
|
"epoch": 1.828665568369028, |
|
"grad_norm": 0.4599661443293077, |
|
"learning_rate": 2.1451465404455473e-07, |
|
"loss": 0.041, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 1.8319604612850082, |
|
"grad_norm": 0.691277736971953, |
|
"learning_rate": 2.0624254211693894e-07, |
|
"loss": 0.061, |
|
"step": 556 |
|
}, |
|
{ |
|
"epoch": 1.8352553542009886, |
|
"grad_norm": 0.42780093096680555, |
|
"learning_rate": 1.9812971765231394e-07, |
|
"loss": 0.0364, |
|
"step": 557 |
|
}, |
|
{ |
|
"epoch": 1.8385502471169688, |
|
"grad_norm": 0.578589972444535, |
|
"learning_rate": 1.901764502244424e-07, |
|
"loss": 0.0613, |
|
"step": 558 |
|
}, |
|
{ |
|
"epoch": 1.841845140032949, |
|
"grad_norm": 0.4763862013898226, |
|
"learning_rate": 1.823830041053065e-07, |
|
"loss": 0.0437, |
|
"step": 559 |
|
}, |
|
{ |
|
"epoch": 1.8451400329489291, |
|
"grad_norm": 0.45718093901396817, |
|
"learning_rate": 1.7474963825633185e-07, |
|
"loss": 0.043, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 1.8484349258649093, |
|
"grad_norm": 0.6366482024659451, |
|
"learning_rate": 1.6727660631977894e-07, |
|
"loss": 0.0556, |
|
"step": 561 |
|
}, |
|
{ |
|
"epoch": 1.8517298187808895, |
|
"grad_norm": 0.7471772626827173, |
|
"learning_rate": 1.5996415661031662e-07, |
|
"loss": 0.0425, |
|
"step": 562 |
|
}, |
|
{ |
|
"epoch": 1.8550247116968699, |
|
"grad_norm": 0.3521582825319659, |
|
"learning_rate": 1.528125321067725e-07, |
|
"loss": 0.0382, |
|
"step": 563 |
|
}, |
|
{ |
|
"epoch": 1.85831960461285, |
|
"grad_norm": 0.5973061078363581, |
|
"learning_rate": 1.4582197044405556e-07, |
|
"loss": 0.0509, |
|
"step": 564 |
|
}, |
|
{ |
|
"epoch": 1.8616144975288305, |
|
"grad_norm": 0.5093523377380892, |
|
"learning_rate": 1.389927039052652e-07, |
|
"loss": 0.0444, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 1.8649093904448106, |
|
"grad_norm": 0.44171165477963154, |
|
"learning_rate": 1.323249594139664e-07, |
|
"loss": 0.0468, |
|
"step": 566 |
|
}, |
|
{ |
|
"epoch": 1.8682042833607908, |
|
"grad_norm": 0.44675757705445573, |
|
"learning_rate": 1.2581895852665671e-07, |
|
"loss": 0.0374, |
|
"step": 567 |
|
}, |
|
{ |
|
"epoch": 1.871499176276771, |
|
"grad_norm": 0.6020514933209717, |
|
"learning_rate": 1.1947491742539841e-07, |
|
"loss": 0.0503, |
|
"step": 568 |
|
}, |
|
{ |
|
"epoch": 1.8747940691927512, |
|
"grad_norm": 0.42586058680868216, |
|
"learning_rate": 1.1329304691063692e-07, |
|
"loss": 0.0392, |
|
"step": 569 |
|
}, |
|
{ |
|
"epoch": 1.8780889621087313, |
|
"grad_norm": 0.4423611658945744, |
|
"learning_rate": 1.0727355239419868e-07, |
|
"loss": 0.0469, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 1.8813838550247117, |
|
"grad_norm": 0.424883733072018, |
|
"learning_rate": 1.014166338924627e-07, |
|
"loss": 0.0475, |
|
"step": 571 |
|
}, |
|
{ |
|
"epoch": 1.884678747940692, |
|
"grad_norm": 0.508275043696402, |
|
"learning_rate": 9.572248601971646e-08, |
|
"loss": 0.0588, |
|
"step": 572 |
|
}, |
|
{ |
|
"epoch": 1.8879736408566723, |
|
"grad_norm": 0.532268589710946, |
|
"learning_rate": 9.019129798168658e-08, |
|
"loss": 0.0413, |
|
"step": 573 |
|
}, |
|
{ |
|
"epoch": 1.8912685337726525, |
|
"grad_norm": 0.49075007985093444, |
|
"learning_rate": 8.482325356925614e-08, |
|
"loss": 0.0438, |
|
"step": 574 |
|
}, |
|
{ |
|
"epoch": 1.8945634266886326, |
|
"grad_norm": 0.6405783999237776, |
|
"learning_rate": 7.96185311523523e-08, |
|
"loss": 0.0517, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 1.8978583196046128, |
|
"grad_norm": 0.4374824821968711, |
|
"learning_rate": 7.45773036740255e-08, |
|
"loss": 0.0594, |
|
"step": 576 |
|
}, |
|
{ |
|
"epoch": 1.901153212520593, |
|
"grad_norm": 0.4881691605292657, |
|
"learning_rate": 6.969973864469626e-08, |
|
"loss": 0.0478, |
|
"step": 577 |
|
}, |
|
{ |
|
"epoch": 1.9044481054365732, |
|
"grad_norm": 0.4169109199296669, |
|
"learning_rate": 6.498599813659524e-08, |
|
"loss": 0.0329, |
|
"step": 578 |
|
}, |
|
{ |
|
"epoch": 1.9077429983525536, |
|
"grad_norm": 0.7757102581030492, |
|
"learning_rate": 6.043623877837301e-08, |
|
"loss": 0.0459, |
|
"step": 579 |
|
}, |
|
{ |
|
"epoch": 1.9110378912685337, |
|
"grad_norm": 0.6871632648014142, |
|
"learning_rate": 5.6050611749899896e-08, |
|
"loss": 0.0499, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 1.9143327841845141, |
|
"grad_norm": 0.4909555169664079, |
|
"learning_rate": 5.182926277723821e-08, |
|
"loss": 0.0439, |
|
"step": 581 |
|
}, |
|
{ |
|
"epoch": 1.9176276771004943, |
|
"grad_norm": 0.5085351517406403, |
|
"learning_rate": 4.777233212780396e-08, |
|
"loss": 0.043, |
|
"step": 582 |
|
}, |
|
{ |
|
"epoch": 1.9209225700164745, |
|
"grad_norm": 0.4133261269232907, |
|
"learning_rate": 4.387995460570282e-08, |
|
"loss": 0.0422, |
|
"step": 583 |
|
}, |
|
{ |
|
"epoch": 1.9242174629324547, |
|
"grad_norm": 0.3457486280213094, |
|
"learning_rate": 4.015225954725421e-08, |
|
"loss": 0.0302, |
|
"step": 584 |
|
}, |
|
{ |
|
"epoch": 1.9275123558484348, |
|
"grad_norm": 0.5306123017488412, |
|
"learning_rate": 3.658937081669034e-08, |
|
"loss": 0.0347, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 1.930807248764415, |
|
"grad_norm": 0.6170841106215647, |
|
"learning_rate": 3.3191406802041693e-08, |
|
"loss": 0.0427, |
|
"step": 586 |
|
}, |
|
{ |
|
"epoch": 1.9341021416803954, |
|
"grad_norm": 0.45743745755641063, |
|
"learning_rate": 2.9958480411204086e-08, |
|
"loss": 0.0487, |
|
"step": 587 |
|
}, |
|
{ |
|
"epoch": 1.9373970345963756, |
|
"grad_norm": 0.7985516650647139, |
|
"learning_rate": 2.6890699068187197e-08, |
|
"loss": 0.0598, |
|
"step": 588 |
|
}, |
|
{ |
|
"epoch": 1.940691927512356, |
|
"grad_norm": 0.969435591211712, |
|
"learning_rate": 2.3988164709542462e-08, |
|
"loss": 0.046, |
|
"step": 589 |
|
}, |
|
{ |
|
"epoch": 1.9439868204283361, |
|
"grad_norm": 0.5530499902423203, |
|
"learning_rate": 2.1250973780977957e-08, |
|
"loss": 0.0626, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 1.9472817133443163, |
|
"grad_norm": 0.4726855774076432, |
|
"learning_rate": 1.8679217234154335e-08, |
|
"loss": 0.0442, |
|
"step": 591 |
|
}, |
|
{ |
|
"epoch": 1.9505766062602965, |
|
"grad_norm": 0.5910043256950622, |
|
"learning_rate": 1.627298052366111e-08, |
|
"loss": 0.0533, |
|
"step": 592 |
|
}, |
|
{ |
|
"epoch": 1.9538714991762767, |
|
"grad_norm": 0.37046514170506584, |
|
"learning_rate": 1.4032343604177267e-08, |
|
"loss": 0.0436, |
|
"step": 593 |
|
}, |
|
{ |
|
"epoch": 1.9571663920922568, |
|
"grad_norm": 0.45020576348906904, |
|
"learning_rate": 1.1957380927816176e-08, |
|
"loss": 0.0392, |
|
"step": 594 |
|
}, |
|
{ |
|
"epoch": 1.9604612850082372, |
|
"grad_norm": 0.4456283814258553, |
|
"learning_rate": 1.0048161441649217e-08, |
|
"loss": 0.0464, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 1.9637561779242174, |
|
"grad_norm": 0.5191136185767354, |
|
"learning_rate": 8.304748585417077e-09, |
|
"loss": 0.0432, |
|
"step": 596 |
|
}, |
|
{ |
|
"epoch": 1.9670510708401978, |
|
"grad_norm": 0.7541935440652175, |
|
"learning_rate": 6.72720028942031e-09, |
|
"loss": 0.0417, |
|
"step": 597 |
|
}, |
|
{ |
|
"epoch": 1.970345963756178, |
|
"grad_norm": 0.4930059149169314, |
|
"learning_rate": 5.315568972594775e-09, |
|
"loss": 0.0522, |
|
"step": 598 |
|
}, |
|
{ |
|
"epoch": 1.9736408566721582, |
|
"grad_norm": 0.43182626877243907, |
|
"learning_rate": 4.0699015407702495e-09, |
|
"loss": 0.0426, |
|
"step": 599 |
|
}, |
|
{ |
|
"epoch": 1.9769357495881383, |
|
"grad_norm": 0.5518177594295087, |
|
"learning_rate": 2.990239385112226e-09, |
|
"loss": 0.0565, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.9802306425041185, |
|
"grad_norm": 0.5188892218788691, |
|
"learning_rate": 2.076618380744133e-09, |
|
"loss": 0.0684, |
|
"step": 601 |
|
}, |
|
{ |
|
"epoch": 1.9835255354200987, |
|
"grad_norm": 0.43199397069665996, |
|
"learning_rate": 1.3290688855588374e-09, |
|
"loss": 0.0396, |
|
"step": 602 |
|
}, |
|
{ |
|
"epoch": 1.986820428336079, |
|
"grad_norm": 0.4470921544911916, |
|
"learning_rate": 7.476157392072303e-10, |
|
"loss": 0.0385, |
|
"step": 603 |
|
}, |
|
{ |
|
"epoch": 1.9901153212520593, |
|
"grad_norm": 0.5332848840635471, |
|
"learning_rate": 3.322782622738885e-10, |
|
"loss": 0.0585, |
|
"step": 604 |
|
}, |
|
{ |
|
"epoch": 1.9934102141680397, |
|
"grad_norm": 0.6284366325020901, |
|
"learning_rate": 8.307025563536464e-11, |
|
"loss": 0.0517, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 1.9967051070840198, |
|
"grad_norm": 0.6056246452486423, |
|
"learning_rate": 0.0, |
|
"loss": 0.048, |
|
"step": 606 |
|
}, |
|
{ |
|
"epoch": 1.9967051070840198, |
|
"eval_loss": 0.0587012954056263, |
|
"eval_runtime": 144.0131, |
|
"eval_samples_per_second": 35.448, |
|
"eval_steps_per_second": 1.111, |
|
"step": 606 |
|
}, |
|
{ |
|
"epoch": 1.9967051070840198, |
|
"step": 606, |
|
"total_flos": 1.811911707237417e+17, |
|
"train_loss": 0.0716345354195426, |
|
"train_runtime": 6724.9995, |
|
"train_samples_per_second": 11.537, |
|
"train_steps_per_second": 0.09 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 606, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.811911707237417e+17, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|