|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.026314635323012148, |
|
"eval_steps": 500, |
|
"global_step": 300, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 8.771545107670716e-05, |
|
"grad_norm": 28.291993022356824, |
|
"learning_rate": 4.385964912280702e-08, |
|
"loss": 0.9764, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.00017543090215341433, |
|
"grad_norm": 11.00431285069151, |
|
"learning_rate": 8.771929824561404e-08, |
|
"loss": 0.7373, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.0002631463532301215, |
|
"grad_norm": 19.575902791602918, |
|
"learning_rate": 1.3157894736842107e-07, |
|
"loss": 0.92, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.00035086180430682866, |
|
"grad_norm": 28.862884630243123, |
|
"learning_rate": 1.7543859649122808e-07, |
|
"loss": 0.9196, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.0004385772553835358, |
|
"grad_norm": 15.982248327528751, |
|
"learning_rate": 2.192982456140351e-07, |
|
"loss": 0.8366, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.000526292706460243, |
|
"grad_norm": 31.85723876161732, |
|
"learning_rate": 2.6315789473684213e-07, |
|
"loss": 0.9335, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.0006140081575369502, |
|
"grad_norm": 21.310207454796295, |
|
"learning_rate": 3.070175438596491e-07, |
|
"loss": 0.8362, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.0007017236086136573, |
|
"grad_norm": 20.052830776823505, |
|
"learning_rate": 3.5087719298245616e-07, |
|
"loss": 0.8015, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.0007894390596903645, |
|
"grad_norm": 16.06788143210757, |
|
"learning_rate": 3.9473684210526315e-07, |
|
"loss": 0.8729, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.0008771545107670716, |
|
"grad_norm": 29.100726513914584, |
|
"learning_rate": 4.385964912280702e-07, |
|
"loss": 0.9058, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.0009648699618437788, |
|
"grad_norm": 13.993390572028792, |
|
"learning_rate": 4.824561403508772e-07, |
|
"loss": 0.7093, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.001052585412920486, |
|
"grad_norm": 21.107935511000072, |
|
"learning_rate": 5.263157894736843e-07, |
|
"loss": 0.8955, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.0011403008639971931, |
|
"grad_norm": 13.66193898339087, |
|
"learning_rate": 5.701754385964912e-07, |
|
"loss": 0.7219, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.0012280163150739003, |
|
"grad_norm": 10.537203866107753, |
|
"learning_rate": 6.140350877192982e-07, |
|
"loss": 0.8429, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.0013157317661506075, |
|
"grad_norm": 12.393106853157317, |
|
"learning_rate": 6.578947368421053e-07, |
|
"loss": 0.6708, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.0014034472172273146, |
|
"grad_norm": 8.734604355126535, |
|
"learning_rate": 7.017543859649123e-07, |
|
"loss": 0.6507, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.0014911626683040218, |
|
"grad_norm": 9.124362491394539, |
|
"learning_rate": 7.456140350877194e-07, |
|
"loss": 0.838, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.001578878119380729, |
|
"grad_norm": 8.958389642999963, |
|
"learning_rate": 7.894736842105263e-07, |
|
"loss": 0.6849, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.0016665935704574361, |
|
"grad_norm": 11.542677492312867, |
|
"learning_rate": 8.333333333333333e-07, |
|
"loss": 0.6926, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.0017543090215341433, |
|
"grad_norm": 8.045066225626593, |
|
"learning_rate": 8.771929824561404e-07, |
|
"loss": 0.7006, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.0018420244726108505, |
|
"grad_norm": 8.146906074379428, |
|
"learning_rate": 9.210526315789474e-07, |
|
"loss": 0.6737, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.0019297399236875576, |
|
"grad_norm": 6.502955757535831, |
|
"learning_rate": 9.649122807017545e-07, |
|
"loss": 0.7495, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.002017455374764265, |
|
"grad_norm": 8.736982858234592, |
|
"learning_rate": 1.0087719298245615e-06, |
|
"loss": 0.7324, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.002105170825840972, |
|
"grad_norm": 7.851959741269017, |
|
"learning_rate": 1.0526315789473685e-06, |
|
"loss": 0.6686, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.002192886276917679, |
|
"grad_norm": 8.594840793358543, |
|
"learning_rate": 1.0964912280701756e-06, |
|
"loss": 0.8064, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.0022806017279943863, |
|
"grad_norm": 8.935665287337994, |
|
"learning_rate": 1.1403508771929824e-06, |
|
"loss": 0.6751, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.0023683171790710934, |
|
"grad_norm": 11.146850280588064, |
|
"learning_rate": 1.1842105263157894e-06, |
|
"loss": 0.7884, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.0024560326301478006, |
|
"grad_norm": 6.917869007862471, |
|
"learning_rate": 1.2280701754385965e-06, |
|
"loss": 0.8772, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.0025437480812245078, |
|
"grad_norm": 9.32145567192897, |
|
"learning_rate": 1.2719298245614037e-06, |
|
"loss": 0.6486, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.002631463532301215, |
|
"grad_norm": 7.83399807213587, |
|
"learning_rate": 1.3157894736842106e-06, |
|
"loss": 0.7793, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.002719178983377922, |
|
"grad_norm": 5.701851482721999, |
|
"learning_rate": 1.3596491228070178e-06, |
|
"loss": 0.6418, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.0028068944344546293, |
|
"grad_norm": 6.357569510522249, |
|
"learning_rate": 1.4035087719298246e-06, |
|
"loss": 0.7803, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.0028946098855313364, |
|
"grad_norm": 6.1458878660724, |
|
"learning_rate": 1.4473684210526317e-06, |
|
"loss": 0.6075, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.0029823253366080436, |
|
"grad_norm": 5.258525934759675, |
|
"learning_rate": 1.4912280701754387e-06, |
|
"loss": 0.7558, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.0030700407876847508, |
|
"grad_norm": 5.96497463401995, |
|
"learning_rate": 1.5350877192982458e-06, |
|
"loss": 0.5807, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.003157756238761458, |
|
"grad_norm": 9.97378904781871, |
|
"learning_rate": 1.5789473684210526e-06, |
|
"loss": 0.6766, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.003245471689838165, |
|
"grad_norm": 10.558130153122322, |
|
"learning_rate": 1.6228070175438598e-06, |
|
"loss": 0.6318, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.0033331871409148723, |
|
"grad_norm": 7.730592682668347, |
|
"learning_rate": 1.6666666666666667e-06, |
|
"loss": 0.5723, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.0034209025919915794, |
|
"grad_norm": 6.513997535111305, |
|
"learning_rate": 1.710526315789474e-06, |
|
"loss": 0.7381, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.0035086180430682866, |
|
"grad_norm": 6.4186997859745185, |
|
"learning_rate": 1.7543859649122807e-06, |
|
"loss": 0.676, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.0035963334941449938, |
|
"grad_norm": 4.789756704738587, |
|
"learning_rate": 1.798245614035088e-06, |
|
"loss": 0.8106, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.003684048945221701, |
|
"grad_norm": 8.552415866186008, |
|
"learning_rate": 1.8421052631578948e-06, |
|
"loss": 0.7834, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.003771764396298408, |
|
"grad_norm": 5.104236885105078, |
|
"learning_rate": 1.8859649122807019e-06, |
|
"loss": 0.6694, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.0038594798473751152, |
|
"grad_norm": 6.998642641947579, |
|
"learning_rate": 1.929824561403509e-06, |
|
"loss": 0.7184, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.003947195298451822, |
|
"grad_norm": 6.754484565741454, |
|
"learning_rate": 1.973684210526316e-06, |
|
"loss": 0.7682, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.00403491074952853, |
|
"grad_norm": 5.702466747706841, |
|
"learning_rate": 2.017543859649123e-06, |
|
"loss": 0.7167, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.004122626200605236, |
|
"grad_norm": 7.038100758557257, |
|
"learning_rate": 2.06140350877193e-06, |
|
"loss": 0.6709, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.004210341651681944, |
|
"grad_norm": 8.659378609826204, |
|
"learning_rate": 2.105263157894737e-06, |
|
"loss": 0.6508, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.004298057102758651, |
|
"grad_norm": 9.315174303463822, |
|
"learning_rate": 2.149122807017544e-06, |
|
"loss": 0.6168, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.004385772553835358, |
|
"grad_norm": 7.447716885721135, |
|
"learning_rate": 2.192982456140351e-06, |
|
"loss": 0.6738, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.004473488004912065, |
|
"grad_norm": 5.600770404460154, |
|
"learning_rate": 2.236842105263158e-06, |
|
"loss": 0.6311, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.004561203455988773, |
|
"grad_norm": 7.059691201242354, |
|
"learning_rate": 2.280701754385965e-06, |
|
"loss": 0.7204, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.004648918907065479, |
|
"grad_norm": 5.589092290239263, |
|
"learning_rate": 2.324561403508772e-06, |
|
"loss": 0.7266, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.004736634358142187, |
|
"grad_norm": 5.801762781587569, |
|
"learning_rate": 2.368421052631579e-06, |
|
"loss": 0.5336, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.004824349809218894, |
|
"grad_norm": 5.599754768073974, |
|
"learning_rate": 2.412280701754386e-06, |
|
"loss": 0.6338, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.004912065260295601, |
|
"grad_norm": 5.66437398031977, |
|
"learning_rate": 2.456140350877193e-06, |
|
"loss": 0.7813, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.004999780711372308, |
|
"grad_norm": 6.32022790188225, |
|
"learning_rate": 2.5e-06, |
|
"loss": 0.6613, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.0050874961624490156, |
|
"grad_norm": 8.01474270706056, |
|
"learning_rate": 2.5438596491228075e-06, |
|
"loss": 0.6451, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.005175211613525722, |
|
"grad_norm": 6.586182462850705, |
|
"learning_rate": 2.5877192982456147e-06, |
|
"loss": 0.6984, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.00526292706460243, |
|
"grad_norm": 5.61553252576188, |
|
"learning_rate": 2.631578947368421e-06, |
|
"loss": 0.5773, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.005350642515679137, |
|
"grad_norm": 5.5274818204706895, |
|
"learning_rate": 2.6754385964912284e-06, |
|
"loss": 0.6083, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.005438357966755844, |
|
"grad_norm": 3.8762804528384254, |
|
"learning_rate": 2.7192982456140356e-06, |
|
"loss": 0.7174, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.005526073417832551, |
|
"grad_norm": 5.248404081335598, |
|
"learning_rate": 2.7631578947368424e-06, |
|
"loss": 0.7066, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.0056137888689092585, |
|
"grad_norm": 7.214109517049078, |
|
"learning_rate": 2.8070175438596493e-06, |
|
"loss": 0.692, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.005701504319985965, |
|
"grad_norm": 5.429278596290352, |
|
"learning_rate": 2.8508771929824565e-06, |
|
"loss": 0.6145, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.005789219771062673, |
|
"grad_norm": 17.638205100824422, |
|
"learning_rate": 2.8947368421052634e-06, |
|
"loss": 0.7677, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.00587693522213938, |
|
"grad_norm": 5.677374136021176, |
|
"learning_rate": 2.9385964912280706e-06, |
|
"loss": 0.6779, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.005964650673216087, |
|
"grad_norm": 5.453107411280262, |
|
"learning_rate": 2.9824561403508774e-06, |
|
"loss": 0.6428, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.006052366124292794, |
|
"grad_norm": 5.888626008478417, |
|
"learning_rate": 3.0263157894736843e-06, |
|
"loss": 0.6342, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.0061400815753695015, |
|
"grad_norm": 5.3185045733144225, |
|
"learning_rate": 3.0701754385964915e-06, |
|
"loss": 0.5644, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.006227797026446208, |
|
"grad_norm": 4.902919731780363, |
|
"learning_rate": 3.1140350877192988e-06, |
|
"loss": 0.709, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.006315512477522916, |
|
"grad_norm": 8.773622618503456, |
|
"learning_rate": 3.157894736842105e-06, |
|
"loss": 0.6674, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.006403227928599623, |
|
"grad_norm": 6.7570883776978174, |
|
"learning_rate": 3.2017543859649124e-06, |
|
"loss": 0.6918, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.00649094337967633, |
|
"grad_norm": 5.597179964370573, |
|
"learning_rate": 3.2456140350877197e-06, |
|
"loss": 0.7119, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.006578658830753037, |
|
"grad_norm": 5.4824260737552795, |
|
"learning_rate": 3.289473684210527e-06, |
|
"loss": 0.5667, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.0066663742818297445, |
|
"grad_norm": 6.083422094529157, |
|
"learning_rate": 3.3333333333333333e-06, |
|
"loss": 0.5972, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.006754089732906451, |
|
"grad_norm": 6.688559230122185, |
|
"learning_rate": 3.3771929824561406e-06, |
|
"loss": 0.6079, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.006841805183983159, |
|
"grad_norm": 4.675152512564395, |
|
"learning_rate": 3.421052631578948e-06, |
|
"loss": 0.6431, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.006929520635059866, |
|
"grad_norm": 6.61824094926871, |
|
"learning_rate": 3.464912280701755e-06, |
|
"loss": 0.7219, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.007017236086136573, |
|
"grad_norm": 4.3090639659166685, |
|
"learning_rate": 3.5087719298245615e-06, |
|
"loss": 0.6267, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.00710495153721328, |
|
"grad_norm": 5.908526205124108, |
|
"learning_rate": 3.5526315789473687e-06, |
|
"loss": 0.5598, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.0071926669882899875, |
|
"grad_norm": 4.954945711406169, |
|
"learning_rate": 3.596491228070176e-06, |
|
"loss": 0.6251, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.007280382439366694, |
|
"grad_norm": 6.403352381905709, |
|
"learning_rate": 3.640350877192983e-06, |
|
"loss": 0.6921, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.007368097890443402, |
|
"grad_norm": 5.8960340556018505, |
|
"learning_rate": 3.6842105263157896e-06, |
|
"loss": 0.5803, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.007455813341520109, |
|
"grad_norm": 5.5832723717085795, |
|
"learning_rate": 3.728070175438597e-06, |
|
"loss": 0.7109, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.007543528792596816, |
|
"grad_norm": 6.9538610646678425, |
|
"learning_rate": 3.7719298245614037e-06, |
|
"loss": 0.57, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.007631244243673523, |
|
"grad_norm": 4.9040721673618615, |
|
"learning_rate": 3.815789473684211e-06, |
|
"loss": 0.6681, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.0077189596947502305, |
|
"grad_norm": 4.367227562952691, |
|
"learning_rate": 3.859649122807018e-06, |
|
"loss": 0.5881, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.007806675145826937, |
|
"grad_norm": 6.135869823936115, |
|
"learning_rate": 3.903508771929825e-06, |
|
"loss": 0.6333, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.007894390596903644, |
|
"grad_norm": 5.26232269598073, |
|
"learning_rate": 3.947368421052632e-06, |
|
"loss": 0.6228, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.007982106047980352, |
|
"grad_norm": 5.478510766614749, |
|
"learning_rate": 3.991228070175439e-06, |
|
"loss": 0.6889, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.00806982149905706, |
|
"grad_norm": 7.252221492478827, |
|
"learning_rate": 4.035087719298246e-06, |
|
"loss": 0.6726, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.008157536950133767, |
|
"grad_norm": 6.810323867433885, |
|
"learning_rate": 4.078947368421053e-06, |
|
"loss": 0.6186, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.008245252401210473, |
|
"grad_norm": 5.1477310672971965, |
|
"learning_rate": 4.12280701754386e-06, |
|
"loss": 0.6739, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.00833296785228718, |
|
"grad_norm": 4.455009313283226, |
|
"learning_rate": 4.166666666666667e-06, |
|
"loss": 0.6676, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.008420683303363888, |
|
"grad_norm": 4.854476484535793, |
|
"learning_rate": 4.210526315789474e-06, |
|
"loss": 0.624, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.008508398754440595, |
|
"grad_norm": 8.775528791539337, |
|
"learning_rate": 4.254385964912281e-06, |
|
"loss": 0.7236, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.008596114205517301, |
|
"grad_norm": 4.656928105654083, |
|
"learning_rate": 4.298245614035088e-06, |
|
"loss": 0.4853, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.008683829656594009, |
|
"grad_norm": 6.1151229878888795, |
|
"learning_rate": 4.342105263157895e-06, |
|
"loss": 0.6611, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.008771545107670716, |
|
"grad_norm": 4.846266795088099, |
|
"learning_rate": 4.385964912280702e-06, |
|
"loss": 0.6899, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.008859260558747424, |
|
"grad_norm": 5.63076019856985, |
|
"learning_rate": 4.429824561403509e-06, |
|
"loss": 0.7394, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.00894697600982413, |
|
"grad_norm": 6.152211661702361, |
|
"learning_rate": 4.473684210526316e-06, |
|
"loss": 0.6366, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.009034691460900838, |
|
"grad_norm": 5.271237730819475, |
|
"learning_rate": 4.517543859649123e-06, |
|
"loss": 0.6776, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.009122406911977545, |
|
"grad_norm": 6.150704296921181, |
|
"learning_rate": 4.56140350877193e-06, |
|
"loss": 0.7287, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.009210122363054253, |
|
"grad_norm": 5.511353295743786, |
|
"learning_rate": 4.605263157894737e-06, |
|
"loss": 0.7156, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.009297837814130959, |
|
"grad_norm": 5.651321362023493, |
|
"learning_rate": 4.649122807017544e-06, |
|
"loss": 0.5971, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.009385553265207666, |
|
"grad_norm": 4.521052312786367, |
|
"learning_rate": 4.692982456140351e-06, |
|
"loss": 0.662, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.009473268716284374, |
|
"grad_norm": 6.5893774516601775, |
|
"learning_rate": 4.736842105263158e-06, |
|
"loss": 0.6838, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.009560984167361081, |
|
"grad_norm": 7.413604525506308, |
|
"learning_rate": 4.780701754385965e-06, |
|
"loss": 0.6798, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.009648699618437787, |
|
"grad_norm": 5.258683042524991, |
|
"learning_rate": 4.824561403508772e-06, |
|
"loss": 0.7137, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.009736415069514495, |
|
"grad_norm": 3.56629655229689, |
|
"learning_rate": 4.8684210526315795e-06, |
|
"loss": 0.5524, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.009824130520591202, |
|
"grad_norm": 7.972594797604, |
|
"learning_rate": 4.912280701754386e-06, |
|
"loss": 0.7946, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.00991184597166791, |
|
"grad_norm": 5.9169587346561965, |
|
"learning_rate": 4.956140350877193e-06, |
|
"loss": 0.6985, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.009999561422744616, |
|
"grad_norm": 4.9028768240583895, |
|
"learning_rate": 5e-06, |
|
"loss": 0.7471, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.010087276873821324, |
|
"grad_norm": 4.952040118758915, |
|
"learning_rate": 4.999999903143301e-06, |
|
"loss": 0.6645, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.010174992324898031, |
|
"grad_norm": 5.307375041926707, |
|
"learning_rate": 4.999999612573212e-06, |
|
"loss": 0.6568, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.010262707775974739, |
|
"grad_norm": 4.417210142946582, |
|
"learning_rate": 4.9999991282897545e-06, |
|
"loss": 0.6633, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.010350423227051445, |
|
"grad_norm": 6.813103500844099, |
|
"learning_rate": 4.999998450292966e-06, |
|
"loss": 0.7479, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.010438138678128152, |
|
"grad_norm": 5.220452049535287, |
|
"learning_rate": 4.9999975785829e-06, |
|
"loss": 0.5982, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.01052585412920486, |
|
"grad_norm": 6.470241976711781, |
|
"learning_rate": 4.999996513159624e-06, |
|
"loss": 0.5915, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.010613569580281567, |
|
"grad_norm": 5.236784827517624, |
|
"learning_rate": 4.99999525402322e-06, |
|
"loss": 0.665, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.010701285031358273, |
|
"grad_norm": 5.5322906674158565, |
|
"learning_rate": 4.999993801173785e-06, |
|
"loss": 0.473, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.01078900048243498, |
|
"grad_norm": 5.643434680672429, |
|
"learning_rate": 4.999992154611433e-06, |
|
"loss": 0.5802, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.010876715933511688, |
|
"grad_norm": 4.909123022379139, |
|
"learning_rate": 4.9999903143362905e-06, |
|
"loss": 0.6103, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.010964431384588396, |
|
"grad_norm": 7.046173121098522, |
|
"learning_rate": 4.999988280348501e-06, |
|
"loss": 0.6601, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.011052146835665102, |
|
"grad_norm": 5.567754476589664, |
|
"learning_rate": 4.99998605264822e-06, |
|
"loss": 0.7144, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.01113986228674181, |
|
"grad_norm": 6.670512866037107, |
|
"learning_rate": 4.999983631235623e-06, |
|
"loss": 0.5034, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.011227577737818517, |
|
"grad_norm": 5.068760146843144, |
|
"learning_rate": 4.999981016110896e-06, |
|
"loss": 0.5965, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.011315293188895225, |
|
"grad_norm": 5.493410339028754, |
|
"learning_rate": 4.999978207274243e-06, |
|
"loss": 0.6697, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.01140300863997193, |
|
"grad_norm": 5.662089015796081, |
|
"learning_rate": 4.999975204725879e-06, |
|
"loss": 0.7182, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.011490724091048638, |
|
"grad_norm": 3.734356064938746, |
|
"learning_rate": 4.999972008466039e-06, |
|
"loss": 0.632, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.011578439542125346, |
|
"grad_norm": 4.29907687663725, |
|
"learning_rate": 4.99996861849497e-06, |
|
"loss": 0.6321, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.011666154993202053, |
|
"grad_norm": 5.292963722155827, |
|
"learning_rate": 4.999965034812934e-06, |
|
"loss": 0.5768, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.01175387044427876, |
|
"grad_norm": 4.564589196086129, |
|
"learning_rate": 4.99996125742021e-06, |
|
"loss": 0.5991, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.011841585895355467, |
|
"grad_norm": 5.889974426321806, |
|
"learning_rate": 4.99995728631709e-06, |
|
"loss": 0.568, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.011929301346432174, |
|
"grad_norm": 4.903556688362067, |
|
"learning_rate": 4.999953121503881e-06, |
|
"loss": 0.6221, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.012017016797508882, |
|
"grad_norm": 4.652137494582458, |
|
"learning_rate": 4.999948762980906e-06, |
|
"loss": 0.6499, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.012104732248585588, |
|
"grad_norm": 7.2681565015460965, |
|
"learning_rate": 4.999944210748504e-06, |
|
"loss": 0.7997, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.012192447699662295, |
|
"grad_norm": 4.498966830496647, |
|
"learning_rate": 4.999939464807027e-06, |
|
"loss": 0.7033, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.012280163150739003, |
|
"grad_norm": 5.658829625864849, |
|
"learning_rate": 4.999934525156842e-06, |
|
"loss": 0.6234, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.01236787860181571, |
|
"grad_norm": 6.170987539440289, |
|
"learning_rate": 4.9999293917983325e-06, |
|
"loss": 0.7359, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.012455594052892417, |
|
"grad_norm": 4.889450035742974, |
|
"learning_rate": 4.999924064731896e-06, |
|
"loss": 0.6418, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.012543309503969124, |
|
"grad_norm": 5.565665252735285, |
|
"learning_rate": 4.9999185439579445e-06, |
|
"loss": 0.8114, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.012631024955045832, |
|
"grad_norm": 5.009655972578068, |
|
"learning_rate": 4.9999128294769075e-06, |
|
"loss": 0.7307, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.01271874040612254, |
|
"grad_norm": 5.011444448419762, |
|
"learning_rate": 4.999906921289227e-06, |
|
"loss": 0.6434, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.012806455857199245, |
|
"grad_norm": 5.91290249112379, |
|
"learning_rate": 4.999900819395361e-06, |
|
"loss": 0.7576, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.012894171308275953, |
|
"grad_norm": 5.291827066915767, |
|
"learning_rate": 4.9998945237957814e-06, |
|
"loss": 0.717, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.01298188675935266, |
|
"grad_norm": 6.889695918810895, |
|
"learning_rate": 4.9998880344909765e-06, |
|
"loss": 0.6566, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.013069602210429368, |
|
"grad_norm": 4.139725258131711, |
|
"learning_rate": 4.999881351481449e-06, |
|
"loss": 0.6139, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.013157317661506074, |
|
"grad_norm": 5.041147601092224, |
|
"learning_rate": 4.999874474767718e-06, |
|
"loss": 0.7046, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.013245033112582781, |
|
"grad_norm": 4.850191233243735, |
|
"learning_rate": 4.999867404350315e-06, |
|
"loss": 0.6494, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.013332748563659489, |
|
"grad_norm": 5.608814210289025, |
|
"learning_rate": 4.999860140229788e-06, |
|
"loss": 0.8654, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.013420464014736197, |
|
"grad_norm": 4.097824317856954, |
|
"learning_rate": 4.9998526824067e-06, |
|
"loss": 0.6889, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.013508179465812903, |
|
"grad_norm": 6.425927321695068, |
|
"learning_rate": 4.999845030881629e-06, |
|
"loss": 0.5837, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.01359589491688961, |
|
"grad_norm": 7.686652681051417, |
|
"learning_rate": 4.999837185655168e-06, |
|
"loss": 0.6869, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.013683610367966318, |
|
"grad_norm": 6.199666417167642, |
|
"learning_rate": 4.9998291467279245e-06, |
|
"loss": 0.7371, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.013771325819043024, |
|
"grad_norm": 6.797879751043678, |
|
"learning_rate": 4.999820914100522e-06, |
|
"loss": 0.6912, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.013859041270119731, |
|
"grad_norm": 9.837640179642968, |
|
"learning_rate": 4.999812487773597e-06, |
|
"loss": 0.8045, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.013946756721196439, |
|
"grad_norm": 6.620454193744729, |
|
"learning_rate": 4.9998038677478044e-06, |
|
"loss": 0.6018, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.014034472172273146, |
|
"grad_norm": 4.952380418390811, |
|
"learning_rate": 4.99979505402381e-06, |
|
"loss": 0.5851, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.014122187623349852, |
|
"grad_norm": 4.571346505498035, |
|
"learning_rate": 4.999786046602299e-06, |
|
"loss": 0.6633, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.01420990307442656, |
|
"grad_norm": 6.745466717777739, |
|
"learning_rate": 4.999776845483968e-06, |
|
"loss": 0.714, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.014297618525503267, |
|
"grad_norm": 4.888639355192875, |
|
"learning_rate": 4.999767450669531e-06, |
|
"loss": 0.5328, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 0.014385333976579975, |
|
"grad_norm": 5.263414218540685, |
|
"learning_rate": 4.999757862159713e-06, |
|
"loss": 0.6746, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.014473049427656681, |
|
"grad_norm": 5.8723140369149895, |
|
"learning_rate": 4.99974807995526e-06, |
|
"loss": 0.7101, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.014560764878733388, |
|
"grad_norm": 4.125348885535371, |
|
"learning_rate": 4.999738104056931e-06, |
|
"loss": 0.6418, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.014648480329810096, |
|
"grad_norm": 5.079939786355144, |
|
"learning_rate": 4.999727934465495e-06, |
|
"loss": 0.6757, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 0.014736195780886804, |
|
"grad_norm": 4.436648943550616, |
|
"learning_rate": 4.999717571181742e-06, |
|
"loss": 0.6878, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.01482391123196351, |
|
"grad_norm": 4.6070293178483706, |
|
"learning_rate": 4.999707014206475e-06, |
|
"loss": 0.6882, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 0.014911626683040217, |
|
"grad_norm": 4.337658765605819, |
|
"learning_rate": 4.999696263540513e-06, |
|
"loss": 0.6418, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.014999342134116925, |
|
"grad_norm": 5.834498841218243, |
|
"learning_rate": 4.999685319184688e-06, |
|
"loss": 0.6367, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.015087057585193632, |
|
"grad_norm": 6.027148776110112, |
|
"learning_rate": 4.999674181139848e-06, |
|
"loss": 0.7505, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.015174773036270338, |
|
"grad_norm": 4.712652033599274, |
|
"learning_rate": 4.999662849406855e-06, |
|
"loss": 0.7515, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 0.015262488487347046, |
|
"grad_norm": 5.325275991673836, |
|
"learning_rate": 4.99965132398659e-06, |
|
"loss": 0.7871, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.015350203938423753, |
|
"grad_norm": 5.006048437293231, |
|
"learning_rate": 4.999639604879943e-06, |
|
"loss": 0.6038, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.015437919389500461, |
|
"grad_norm": 4.692976251794895, |
|
"learning_rate": 4.999627692087824e-06, |
|
"loss": 0.7106, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.015525634840577167, |
|
"grad_norm": 6.484912012474024, |
|
"learning_rate": 4.999615585611156e-06, |
|
"loss": 0.6456, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.015613350291653874, |
|
"grad_norm": 7.072312221146792, |
|
"learning_rate": 4.999603285450875e-06, |
|
"loss": 0.6986, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.015701065742730582, |
|
"grad_norm": 5.072158684292459, |
|
"learning_rate": 4.999590791607936e-06, |
|
"loss": 0.6386, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 0.015788781193807288, |
|
"grad_norm": 5.674801641765509, |
|
"learning_rate": 4.999578104083307e-06, |
|
"loss": 0.6512, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.015876496644883997, |
|
"grad_norm": 6.011232915930249, |
|
"learning_rate": 4.9995652228779715e-06, |
|
"loss": 0.6166, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 0.015964212095960703, |
|
"grad_norm": 7.067996556252431, |
|
"learning_rate": 4.999552147992926e-06, |
|
"loss": 0.8316, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.01605192754703741, |
|
"grad_norm": 6.191586224655665, |
|
"learning_rate": 4.999538879429183e-06, |
|
"loss": 0.7167, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.01613964299811412, |
|
"grad_norm": 5.40861794404673, |
|
"learning_rate": 4.999525417187774e-06, |
|
"loss": 0.6604, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.016227358449190824, |
|
"grad_norm": 5.619694849325643, |
|
"learning_rate": 4.999511761269739e-06, |
|
"loss": 0.7141, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.016315073900267534, |
|
"grad_norm": 7.467663008400906, |
|
"learning_rate": 4.999497911676138e-06, |
|
"loss": 0.6086, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.01640278935134424, |
|
"grad_norm": 4.645589903763359, |
|
"learning_rate": 4.999483868408043e-06, |
|
"loss": 0.6932, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 0.016490504802420945, |
|
"grad_norm": 4.819294533224638, |
|
"learning_rate": 4.999469631466544e-06, |
|
"loss": 0.6256, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.016578220253497655, |
|
"grad_norm": 4.711171445741636, |
|
"learning_rate": 4.999455200852741e-06, |
|
"loss": 0.7445, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.01666593570457436, |
|
"grad_norm": 4.371758877075776, |
|
"learning_rate": 4.999440576567755e-06, |
|
"loss": 0.6801, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.016753651155651066, |
|
"grad_norm": 5.761171404408883, |
|
"learning_rate": 4.999425758612718e-06, |
|
"loss": 0.6701, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 0.016841366606727776, |
|
"grad_norm": 4.340375314807721, |
|
"learning_rate": 4.999410746988778e-06, |
|
"loss": 0.5556, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.01692908205780448, |
|
"grad_norm": 4.775058922031801, |
|
"learning_rate": 4.9993955416970986e-06, |
|
"loss": 0.6915, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 0.01701679750888119, |
|
"grad_norm": 4.301940379009061, |
|
"learning_rate": 4.999380142738857e-06, |
|
"loss": 0.6982, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.017104512959957897, |
|
"grad_norm": 4.746670538298819, |
|
"learning_rate": 4.9993645501152485e-06, |
|
"loss": 0.5392, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.017192228411034603, |
|
"grad_norm": 5.312812102176541, |
|
"learning_rate": 4.999348763827479e-06, |
|
"loss": 0.6254, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.017279943862111312, |
|
"grad_norm": 6.073252701324542, |
|
"learning_rate": 4.999332783876774e-06, |
|
"loss": 0.7221, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 0.017367659313188018, |
|
"grad_norm": 6.783014797465277, |
|
"learning_rate": 4.999316610264369e-06, |
|
"loss": 0.5914, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.017455374764264724, |
|
"grad_norm": 5.105373260000072, |
|
"learning_rate": 4.999300242991519e-06, |
|
"loss": 0.4895, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 0.017543090215341433, |
|
"grad_norm": 5.3256898167081825, |
|
"learning_rate": 4.999283682059493e-06, |
|
"loss": 0.714, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.01763080566641814, |
|
"grad_norm": 7.815945435660424, |
|
"learning_rate": 4.999266927469572e-06, |
|
"loss": 0.7691, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 0.017718521117494848, |
|
"grad_norm": 4.350216346007481, |
|
"learning_rate": 4.999249979223056e-06, |
|
"loss": 0.7205, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 0.017806236568571554, |
|
"grad_norm": 4.167534183562087, |
|
"learning_rate": 4.999232837321257e-06, |
|
"loss": 0.6716, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 0.01789395201964826, |
|
"grad_norm": 6.564156035042191, |
|
"learning_rate": 4.999215501765504e-06, |
|
"loss": 0.6139, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.01798166747072497, |
|
"grad_norm": 4.58988335300785, |
|
"learning_rate": 4.9991979725571395e-06, |
|
"loss": 0.6241, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.018069382921801675, |
|
"grad_norm": 7.14774553510386, |
|
"learning_rate": 4.999180249697524e-06, |
|
"loss": 0.7338, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 0.01815709837287838, |
|
"grad_norm": 4.3154768710391656, |
|
"learning_rate": 4.999162333188028e-06, |
|
"loss": 0.646, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 0.01824481382395509, |
|
"grad_norm": 3.930924147546703, |
|
"learning_rate": 4.999144223030041e-06, |
|
"loss": 0.7162, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.018332529275031796, |
|
"grad_norm": 3.75066761929553, |
|
"learning_rate": 4.999125919224966e-06, |
|
"loss": 0.6283, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 0.018420244726108505, |
|
"grad_norm": 4.916459254987505, |
|
"learning_rate": 4.999107421774222e-06, |
|
"loss": 0.6716, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.01850796017718521, |
|
"grad_norm": 4.570226928027306, |
|
"learning_rate": 4.999088730679241e-06, |
|
"loss": 0.6527, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 0.018595675628261917, |
|
"grad_norm": 3.6658012035372605, |
|
"learning_rate": 4.999069845941472e-06, |
|
"loss": 0.5452, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.018683391079338627, |
|
"grad_norm": 4.697816375671605, |
|
"learning_rate": 4.999050767562379e-06, |
|
"loss": 0.7316, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 0.018771106530415332, |
|
"grad_norm": 5.639876519194002, |
|
"learning_rate": 4.99903149554344e-06, |
|
"loss": 0.5152, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 0.018858821981492038, |
|
"grad_norm": 5.527702869650481, |
|
"learning_rate": 4.999012029886147e-06, |
|
"loss": 0.6119, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.018946537432568748, |
|
"grad_norm": 6.019639388484205, |
|
"learning_rate": 4.998992370592008e-06, |
|
"loss": 0.7366, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.019034252883645453, |
|
"grad_norm": 4.014799337285965, |
|
"learning_rate": 4.998972517662549e-06, |
|
"loss": 0.7088, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 0.019121968334722163, |
|
"grad_norm": 7.876499612097003, |
|
"learning_rate": 4.998952471099307e-06, |
|
"loss": 0.5565, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 0.01920968378579887, |
|
"grad_norm": 7.386792956892447, |
|
"learning_rate": 4.998932230903835e-06, |
|
"loss": 0.6387, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 0.019297399236875575, |
|
"grad_norm": 5.346097163630257, |
|
"learning_rate": 4.998911797077701e-06, |
|
"loss": 0.6237, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.019385114687952284, |
|
"grad_norm": 6.133310652425816, |
|
"learning_rate": 4.998891169622488e-06, |
|
"loss": 0.7428, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 0.01947283013902899, |
|
"grad_norm": 4.224801633855712, |
|
"learning_rate": 4.998870348539797e-06, |
|
"loss": 0.7206, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 0.019560545590105696, |
|
"grad_norm": 5.648869005800134, |
|
"learning_rate": 4.998849333831238e-06, |
|
"loss": 0.6249, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 0.019648261041182405, |
|
"grad_norm": 4.634920959306503, |
|
"learning_rate": 4.998828125498441e-06, |
|
"loss": 0.6764, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.01973597649225911, |
|
"grad_norm": 4.882651557085375, |
|
"learning_rate": 4.998806723543049e-06, |
|
"loss": 0.6682, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.01982369194333582, |
|
"grad_norm": 4.5073631852916645, |
|
"learning_rate": 4.998785127966721e-06, |
|
"loss": 0.7658, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 0.019911407394412526, |
|
"grad_norm": 6.444404326993186, |
|
"learning_rate": 4.99876333877113e-06, |
|
"loss": 0.7161, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 0.019999122845489232, |
|
"grad_norm": 5.926254683053582, |
|
"learning_rate": 4.998741355957963e-06, |
|
"loss": 0.6083, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.02008683829656594, |
|
"grad_norm": 4.715935033600424, |
|
"learning_rate": 4.998719179528925e-06, |
|
"loss": 0.5764, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 0.020174553747642647, |
|
"grad_norm": 4.06642116262848, |
|
"learning_rate": 4.998696809485734e-06, |
|
"loss": 0.6436, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.020262269198719353, |
|
"grad_norm": 4.060536926809771, |
|
"learning_rate": 4.998674245830123e-06, |
|
"loss": 0.6455, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 0.020349984649796062, |
|
"grad_norm": 5.769596888340199, |
|
"learning_rate": 4.9986514885638405e-06, |
|
"loss": 0.6422, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.020437700100872768, |
|
"grad_norm": 5.619149975421577, |
|
"learning_rate": 4.99862853768865e-06, |
|
"loss": 0.5151, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 0.020525415551949477, |
|
"grad_norm": 5.738973149236573, |
|
"learning_rate": 4.998605393206329e-06, |
|
"loss": 0.5698, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 0.020613131003026183, |
|
"grad_norm": 3.9117936997485443, |
|
"learning_rate": 4.998582055118672e-06, |
|
"loss": 0.6139, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.02070084645410289, |
|
"grad_norm": 5.594946157519774, |
|
"learning_rate": 4.998558523427488e-06, |
|
"loss": 0.6305, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 0.0207885619051796, |
|
"grad_norm": 3.7796595114227816, |
|
"learning_rate": 4.998534798134598e-06, |
|
"loss": 0.6064, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 0.020876277356256304, |
|
"grad_norm": 5.530110712124758, |
|
"learning_rate": 4.998510879241842e-06, |
|
"loss": 0.7404, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 0.02096399280733301, |
|
"grad_norm": 5.795681054870311, |
|
"learning_rate": 4.998486766751073e-06, |
|
"loss": 0.6637, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 0.02105170825840972, |
|
"grad_norm": 5.250443330736557, |
|
"learning_rate": 4.99846246066416e-06, |
|
"loss": 0.7229, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.021139423709486425, |
|
"grad_norm": 5.307033877732376, |
|
"learning_rate": 4.998437960982985e-06, |
|
"loss": 0.729, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 0.021227139160563135, |
|
"grad_norm": 4.264326950314863, |
|
"learning_rate": 4.998413267709446e-06, |
|
"loss": 0.6363, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 0.02131485461163984, |
|
"grad_norm": 4.56674428695937, |
|
"learning_rate": 4.99838838084546e-06, |
|
"loss": 0.573, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 0.021402570062716546, |
|
"grad_norm": 5.367393577306364, |
|
"learning_rate": 4.998363300392951e-06, |
|
"loss": 0.6187, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 0.021490285513793256, |
|
"grad_norm": 5.58627031411974, |
|
"learning_rate": 4.998338026353865e-06, |
|
"loss": 0.635, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.02157800096486996, |
|
"grad_norm": 4.1536241104050005, |
|
"learning_rate": 4.9983125587301594e-06, |
|
"loss": 0.7296, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 0.021665716415946668, |
|
"grad_norm": 5.369955138376355, |
|
"learning_rate": 4.998286897523808e-06, |
|
"loss": 0.5939, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 0.021753431867023377, |
|
"grad_norm": 4.749169550030242, |
|
"learning_rate": 4.998261042736799e-06, |
|
"loss": 0.7125, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 0.021841147318100083, |
|
"grad_norm": 3.847851803716185, |
|
"learning_rate": 4.998234994371135e-06, |
|
"loss": 0.6874, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 0.021928862769176792, |
|
"grad_norm": 6.3610718821634755, |
|
"learning_rate": 4.998208752428836e-06, |
|
"loss": 0.6839, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.022016578220253498, |
|
"grad_norm": 6.90892255007994, |
|
"learning_rate": 4.998182316911934e-06, |
|
"loss": 0.6706, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 0.022104293671330204, |
|
"grad_norm": 4.842858396629252, |
|
"learning_rate": 4.998155687822478e-06, |
|
"loss": 0.7887, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 0.022192009122406913, |
|
"grad_norm": 6.80960196083629, |
|
"learning_rate": 4.99812886516253e-06, |
|
"loss": 0.6891, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 0.02227972457348362, |
|
"grad_norm": 6.897100992823047, |
|
"learning_rate": 4.998101848934171e-06, |
|
"loss": 0.7213, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 0.022367440024560325, |
|
"grad_norm": 4.383904436150581, |
|
"learning_rate": 4.9980746391394916e-06, |
|
"loss": 0.5472, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.022455155475637034, |
|
"grad_norm": 6.136102422729719, |
|
"learning_rate": 4.998047235780603e-06, |
|
"loss": 0.7462, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 0.02254287092671374, |
|
"grad_norm": 5.873462354540876, |
|
"learning_rate": 4.9980196388596255e-06, |
|
"loss": 0.6893, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 0.02263058637779045, |
|
"grad_norm": 5.36389164609212, |
|
"learning_rate": 4.9979918483787e-06, |
|
"loss": 0.725, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 0.022718301828867155, |
|
"grad_norm": 6.634852411669424, |
|
"learning_rate": 4.997963864339978e-06, |
|
"loss": 0.7619, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 0.02280601727994386, |
|
"grad_norm": 4.201015694891079, |
|
"learning_rate": 4.99793568674563e-06, |
|
"loss": 0.653, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.02289373273102057, |
|
"grad_norm": 4.951129353141893, |
|
"learning_rate": 4.997907315597836e-06, |
|
"loss": 0.7543, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 0.022981448182097276, |
|
"grad_norm": 4.331792323630216, |
|
"learning_rate": 4.997878750898798e-06, |
|
"loss": 0.6553, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 0.023069163633173982, |
|
"grad_norm": 4.764837636647203, |
|
"learning_rate": 4.997849992650727e-06, |
|
"loss": 0.719, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 0.02315687908425069, |
|
"grad_norm": 7.315146297212186, |
|
"learning_rate": 4.997821040855852e-06, |
|
"loss": 0.8217, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 0.023244594535327397, |
|
"grad_norm": 4.5164891139288015, |
|
"learning_rate": 4.997791895516417e-06, |
|
"loss": 0.5553, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.023332309986404107, |
|
"grad_norm": 4.651549875308793, |
|
"learning_rate": 4.99776255663468e-06, |
|
"loss": 0.6981, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 0.023420025437480813, |
|
"grad_norm": 4.941120481014187, |
|
"learning_rate": 4.997733024212913e-06, |
|
"loss": 0.604, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 0.02350774088855752, |
|
"grad_norm": 6.3616778757465315, |
|
"learning_rate": 4.997703298253406e-06, |
|
"loss": 0.7253, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 0.023595456339634228, |
|
"grad_norm": 4.723855693485358, |
|
"learning_rate": 4.997673378758462e-06, |
|
"loss": 0.7335, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 0.023683171790710934, |
|
"grad_norm": 4.336523073382538, |
|
"learning_rate": 4.997643265730399e-06, |
|
"loss": 0.5665, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.02377088724178764, |
|
"grad_norm": 6.547875149524498, |
|
"learning_rate": 4.997612959171549e-06, |
|
"loss": 0.6542, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 0.02385860269286435, |
|
"grad_norm": 5.285021138793967, |
|
"learning_rate": 4.997582459084264e-06, |
|
"loss": 0.7824, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 0.023946318143941055, |
|
"grad_norm": 4.447718203152539, |
|
"learning_rate": 4.9975517654709025e-06, |
|
"loss": 0.6728, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 0.024034033595017764, |
|
"grad_norm": 4.323105158596241, |
|
"learning_rate": 4.997520878333847e-06, |
|
"loss": 0.6516, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 0.02412174904609447, |
|
"grad_norm": 4.091596093860627, |
|
"learning_rate": 4.997489797675489e-06, |
|
"loss": 0.5786, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.024209464497171176, |
|
"grad_norm": 4.50262054947591, |
|
"learning_rate": 4.997458523498236e-06, |
|
"loss": 0.6632, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 0.024297179948247885, |
|
"grad_norm": 5.394966563241667, |
|
"learning_rate": 4.997427055804513e-06, |
|
"loss": 0.7415, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 0.02438489539932459, |
|
"grad_norm": 5.134838704391961, |
|
"learning_rate": 4.9973953945967565e-06, |
|
"loss": 0.6225, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 0.024472610850401297, |
|
"grad_norm": 4.555937935551801, |
|
"learning_rate": 4.9973635398774226e-06, |
|
"loss": 0.7451, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 0.024560326301478006, |
|
"grad_norm": 4.014041307501394, |
|
"learning_rate": 4.997331491648976e-06, |
|
"loss": 0.607, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.024648041752554712, |
|
"grad_norm": 5.398424400960683, |
|
"learning_rate": 4.9972992499139025e-06, |
|
"loss": 0.665, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 0.02473575720363142, |
|
"grad_norm": 6.959554022697295, |
|
"learning_rate": 4.9972668146746995e-06, |
|
"loss": 0.8175, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 0.024823472654708127, |
|
"grad_norm": 5.048396931572014, |
|
"learning_rate": 4.997234185933879e-06, |
|
"loss": 0.6961, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 0.024911188105784833, |
|
"grad_norm": 4.737474855724115, |
|
"learning_rate": 4.997201363693972e-06, |
|
"loss": 0.5337, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 0.024998903556861542, |
|
"grad_norm": 7.374843310231967, |
|
"learning_rate": 4.997168347957521e-06, |
|
"loss": 0.6791, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.025086619007938248, |
|
"grad_norm": 4.306967488515473, |
|
"learning_rate": 4.997135138727081e-06, |
|
"loss": 0.8791, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 0.025174334459014954, |
|
"grad_norm": 3.7949900410813737, |
|
"learning_rate": 4.99710173600523e-06, |
|
"loss": 0.7743, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 0.025262049910091663, |
|
"grad_norm": 4.842604758031469, |
|
"learning_rate": 4.997068139794554e-06, |
|
"loss": 0.6602, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 0.02534976536116837, |
|
"grad_norm": 3.531764677671023, |
|
"learning_rate": 4.9970343500976545e-06, |
|
"loss": 0.6317, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 0.02543748081224508, |
|
"grad_norm": 5.68234167540357, |
|
"learning_rate": 4.997000366917153e-06, |
|
"loss": 0.7404, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.025525196263321785, |
|
"grad_norm": 4.623883782994243, |
|
"learning_rate": 4.9969661902556804e-06, |
|
"loss": 0.6093, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 0.02561291171439849, |
|
"grad_norm": 5.9956405593570175, |
|
"learning_rate": 4.996931820115885e-06, |
|
"loss": 0.6773, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 0.0257006271654752, |
|
"grad_norm": 5.06274620174889, |
|
"learning_rate": 4.996897256500433e-06, |
|
"loss": 0.7249, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 0.025788342616551906, |
|
"grad_norm": 5.989915075597491, |
|
"learning_rate": 4.996862499411998e-06, |
|
"loss": 0.7526, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 0.02587605806762861, |
|
"grad_norm": 4.58567195302804, |
|
"learning_rate": 4.996827548853276e-06, |
|
"loss": 0.6762, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.02596377351870532, |
|
"grad_norm": 4.097368677404026, |
|
"learning_rate": 4.996792404826974e-06, |
|
"loss": 0.6238, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 0.026051488969782027, |
|
"grad_norm": 4.021749832913485, |
|
"learning_rate": 4.996757067335816e-06, |
|
"loss": 0.7958, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 0.026139204420858736, |
|
"grad_norm": 4.679522912267575, |
|
"learning_rate": 4.99672153638254e-06, |
|
"loss": 0.6583, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 0.026226919871935442, |
|
"grad_norm": 4.256974035317045, |
|
"learning_rate": 4.996685811969898e-06, |
|
"loss": 0.6464, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 0.026314635323012148, |
|
"grad_norm": 4.4862335847168096, |
|
"learning_rate": 4.996649894100659e-06, |
|
"loss": 0.6116, |
|
"step": 300 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 11400, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 150, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 7802380615680.0, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |