{
  "best_metric": 2.395761489868164,
  "best_model_checkpoint": "../../saves/Baichuan2-7B-Chat/lora/sft/checkpoint-2000",
  "epoch": 7.901234567901234,
  "eval_steps": 400,
  "global_step": 2000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.04,
      "grad_norm": 1.1405200958251953,
      "learning_rate": 2.25e-05,
      "loss": 3.6308,
      "step": 10
    },
    {
      "epoch": 0.08,
      "grad_norm": 1.766953945159912,
      "learning_rate": 4.75e-05,
      "loss": 3.4926,
      "step": 20
    },
    {
      "epoch": 0.12,
      "grad_norm": 1.2588557004928589,
      "learning_rate": 4.99984138555282e-05,
      "loss": 3.2621,
      "step": 30
    },
    {
      "epoch": 0.16,
      "grad_norm": 0.9718258380889893,
      "learning_rate": 4.999293114538139e-05,
      "loss": 3.0924,
      "step": 40
    },
    {
      "epoch": 0.2,
      "grad_norm": 0.9004219770431519,
      "learning_rate": 4.998353314622318e-05,
      "loss": 3.0325,
      "step": 50
    },
    {
      "epoch": 0.24,
      "grad_norm": 0.7595831751823425,
      "learning_rate": 4.997022133030516e-05,
      "loss": 2.9351,
      "step": 60
    },
    {
      "epoch": 0.28,
      "grad_norm": 0.8930522799491882,
      "learning_rate": 4.9952997783001254e-05,
      "loss": 2.8068,
      "step": 70
    },
    {
      "epoch": 0.32,
      "grad_norm": 0.7985192537307739,
      "learning_rate": 4.9931865202480996e-05,
      "loss": 2.8503,
      "step": 80
    },
    {
      "epoch": 0.36,
      "grad_norm": 0.9129031896591187,
      "learning_rate": 4.990682689928687e-05,
      "loss": 2.7241,
      "step": 90
    },
    {
      "epoch": 0.4,
      "grad_norm": 0.8816404342651367,
      "learning_rate": 4.9877886795815685e-05,
      "loss": 2.8525,
      "step": 100
    },
    {
      "epoch": 0.43,
      "grad_norm": 0.8212659358978271,
      "learning_rate": 4.98450494257041e-05,
      "loss": 2.7173,
      "step": 110
    },
    {
      "epoch": 0.47,
      "grad_norm": 0.9286770224571228,
      "learning_rate": 4.980831993311844e-05,
      "loss": 2.7857,
      "step": 120
    },
    {
      "epoch": 0.51,
      "grad_norm": 0.95149165391922,
      "learning_rate": 4.976770407194877e-05,
      "loss": 2.6764,
      "step": 130
    },
    {
      "epoch": 0.55,
      "grad_norm": 1.1459342241287231,
      "learning_rate": 4.972320820490759e-05,
      "loss": 2.7001,
      "step": 140
    },
    {
      "epoch": 0.59,
      "grad_norm": 1.1330541372299194,
      "learning_rate": 4.967483930253302e-05,
      "loss": 2.7024,
      "step": 150
    },
    {
      "epoch": 0.63,
      "grad_norm": 0.9277874827384949,
      "learning_rate": 4.962260494209683e-05,
      "loss": 2.7039,
      "step": 160
    },
    {
      "epoch": 0.67,
      "grad_norm": 1.0230640172958374,
      "learning_rate": 4.9566513306417444e-05,
      "loss": 2.7423,
      "step": 170
    },
    {
      "epoch": 0.71,
      "grad_norm": 0.9915482997894287,
      "learning_rate": 4.950657318257805e-05,
      "loss": 2.7303,
      "step": 180
    },
    {
      "epoch": 0.75,
      "grad_norm": 1.105600357055664,
      "learning_rate": 4.944279396055003e-05,
      "loss": 2.6616,
      "step": 190
    },
    {
      "epoch": 0.79,
      "grad_norm": 1.1231801509857178,
      "learning_rate": 4.937518563172196e-05,
      "loss": 2.655,
      "step": 200
    },
    {
      "epoch": 0.83,
      "grad_norm": 0.908206582069397,
      "learning_rate": 4.930375878733445e-05,
      "loss": 2.6541,
      "step": 210
    },
    {
      "epoch": 0.87,
      "grad_norm": 1.087323546409607,
      "learning_rate": 4.922852461682093e-05,
      "loss": 2.5646,
      "step": 220
    },
    {
      "epoch": 0.91,
      "grad_norm": 1.0399665832519531,
      "learning_rate": 4.9149494906054716e-05,
      "loss": 2.6036,
      "step": 230
    },
    {
      "epoch": 0.95,
      "grad_norm": 0.9571551084518433,
      "learning_rate": 4.906668203550279e-05,
      "loss": 2.6212,
      "step": 240
    },
    {
      "epoch": 0.99,
      "grad_norm": 0.9485632181167603,
      "learning_rate": 4.8980098978286215e-05,
      "loss": 2.6717,
      "step": 250
    },
    {
      "epoch": 1.03,
      "grad_norm": 0.9359139204025269,
      "learning_rate": 4.888975929814792e-05,
      "loss": 2.5967,
      "step": 260
    },
    {
      "epoch": 1.07,
      "grad_norm": 1.2552564144134521,
      "learning_rate": 4.8795677147327776e-05,
      "loss": 2.5608,
      "step": 270
    },
    {
      "epoch": 1.11,
      "grad_norm": 0.9426449537277222,
      "learning_rate": 4.8697867264345616e-05,
      "loss": 2.5731,
      "step": 280
    },
    {
      "epoch": 1.15,
      "grad_norm": 1.132430076599121,
      "learning_rate": 4.859634497169233e-05,
      "loss": 2.5884,
      "step": 290
    },
    {
      "epoch": 1.19,
      "grad_norm": 0.9066994786262512,
      "learning_rate": 4.849112617342955e-05,
      "loss": 2.5888,
      "step": 300
    },
    {
      "epoch": 1.22,
      "grad_norm": 1.0188608169555664,
      "learning_rate": 4.8382227352698115e-05,
      "loss": 2.5849,
      "step": 310
    },
    {
      "epoch": 1.26,
      "grad_norm": 1.3850712776184082,
      "learning_rate": 4.826966556913597e-05,
      "loss": 2.485,
      "step": 320
    },
    {
      "epoch": 1.3,
      "grad_norm": 1.1342747211456299,
      "learning_rate": 4.815345845620563e-05,
      "loss": 2.5624,
      "step": 330
    },
    {
      "epoch": 1.34,
      "grad_norm": 1.0687206983566284,
      "learning_rate": 4.803362421843177e-05,
      "loss": 2.5051,
      "step": 340
    },
    {
      "epoch": 1.38,
      "grad_norm": 1.5436629056930542,
      "learning_rate": 4.7910181628549454e-05,
      "loss": 2.5185,
      "step": 350
    },
    {
      "epoch": 1.42,
      "grad_norm": 1.2030800580978394,
      "learning_rate": 4.77831500245632e-05,
      "loss": 2.5122,
      "step": 360
    },
    {
      "epoch": 1.46,
      "grad_norm": 1.2365000247955322,
      "learning_rate": 4.765254930671762e-05,
      "loss": 2.5704,
      "step": 370
    },
    {
      "epoch": 1.5,
      "grad_norm": 1.1403887271881104,
      "learning_rate": 4.75183999343799e-05,
      "loss": 2.5605,
      "step": 380
    },
    {
      "epoch": 1.54,
      "grad_norm": 1.2193725109100342,
      "learning_rate": 4.738072292283473e-05,
      "loss": 2.569,
      "step": 390
    },
    {
      "epoch": 1.58,
      "grad_norm": 1.4231560230255127,
      "learning_rate": 4.723953983999215e-05,
      "loss": 2.4809,
      "step": 400
    },
    {
      "epoch": 1.58,
      "eval_loss": 2.4551122188568115,
      "eval_runtime": 134.6274,
      "eval_samples_per_second": 6.685,
      "eval_steps_per_second": 3.343,
      "step": 400
    },
    {
      "epoch": 1.62,
      "grad_norm": 1.26221764087677,
      "learning_rate": 4.70948728030088e-05,
      "loss": 2.6339,
      "step": 410
    },
    {
      "epoch": 1.66,
      "grad_norm": 1.2207887172698975,
      "learning_rate": 4.694674447482312e-05,
      "loss": 2.5877,
      "step": 420
    },
    {
      "epoch": 1.7,
      "grad_norm": 1.2746591567993164,
      "learning_rate": 4.679517806060509e-05,
      "loss": 2.5866,
      "step": 430
    },
    {
      "epoch": 1.74,
      "grad_norm": 1.774005651473999,
      "learning_rate": 4.664019730412101e-05,
      "loss": 2.5073,
      "step": 440
    },
    {
      "epoch": 1.78,
      "grad_norm": 1.4896618127822876,
      "learning_rate": 4.648182648401389e-05,
      "loss": 2.4688,
      "step": 450
    },
    {
      "epoch": 1.82,
      "grad_norm": 1.3457367420196533,
      "learning_rate": 4.6320090410000027e-05,
      "loss": 2.527,
      "step": 460
    },
    {
      "epoch": 1.86,
      "grad_norm": 1.2498492002487183,
      "learning_rate": 4.615501441898248e-05,
      "loss": 2.625,
      "step": 470
    },
    {
      "epoch": 1.9,
      "grad_norm": 1.3643558025360107,
      "learning_rate": 4.598662437108186e-05,
      "loss": 2.4755,
      "step": 480
    },
    {
      "epoch": 1.94,
      "grad_norm": 1.198166847229004,
      "learning_rate": 4.581494664558518e-05,
      "loss": 2.5688,
      "step": 490
    },
    {
      "epoch": 1.98,
      "grad_norm": 3.3917434215545654,
      "learning_rate": 4.564000813681342e-05,
      "loss": 2.5182,
      "step": 500
    },
    {
      "epoch": 2.01,
      "grad_norm": 1.562139630317688,
      "learning_rate": 4.546183624990832e-05,
      "loss": 2.4533,
      "step": 510
    },
    {
      "epoch": 2.05,
      "grad_norm": 1.1284795999526978,
      "learning_rate": 4.528045889653927e-05,
      "loss": 2.4901,
      "step": 520
    },
    {
      "epoch": 2.09,
      "grad_norm": 1.7664827108383179,
      "learning_rate": 4.509590449053074e-05,
      "loss": 2.5075,
      "step": 530
    },
    {
      "epoch": 2.13,
      "grad_norm": 1.6162073612213135,
      "learning_rate": 4.49082019434111e-05,
      "loss": 2.4769,
      "step": 540
    },
    {
      "epoch": 2.17,
      "grad_norm": 1.3468163013458252,
      "learning_rate": 4.471738065988347e-05,
      "loss": 2.4979,
      "step": 550
    },
    {
      "epoch": 2.21,
      "grad_norm": 1.0762629508972168,
      "learning_rate": 4.452347053321926e-05,
      "loss": 2.5436,
      "step": 560
    },
    {
      "epoch": 2.25,
      "grad_norm": 1.1567480564117432,
      "learning_rate": 4.432650194057527e-05,
      "loss": 2.5454,
      "step": 570
    },
    {
      "epoch": 2.29,
      "grad_norm": 1.419041395187378,
      "learning_rate": 4.412650573823489e-05,
      "loss": 2.4681,
      "step": 580
    },
    {
      "epoch": 2.33,
      "grad_norm": 1.2923465967178345,
      "learning_rate": 4.392351325677433e-05,
      "loss": 2.565,
      "step": 590
    },
    {
      "epoch": 2.37,
      "grad_norm": 1.2892262935638428,
      "learning_rate": 4.371755629615442e-05,
      "loss": 2.5258,
      "step": 600
    },
    {
      "epoch": 2.41,
      "grad_norm": 1.467966914176941,
      "learning_rate": 4.3508667120739046e-05,
      "loss": 2.5776,
      "step": 610
    },
    {
      "epoch": 2.45,
      "grad_norm": 1.2278165817260742,
      "learning_rate": 4.329687845424069e-05,
      "loss": 2.4175,
      "step": 620
    },
    {
      "epoch": 2.49,
      "grad_norm": 1.3225311040878296,
      "learning_rate": 4.308222347459411e-05,
      "loss": 2.4561,
      "step": 630
    },
    {
      "epoch": 2.53,
      "grad_norm": 1.2582958936691284,
      "learning_rate": 4.286473580875878e-05,
      "loss": 2.3885,
      "step": 640
    },
    {
      "epoch": 2.57,
      "grad_norm": 1.206189751625061,
      "learning_rate": 4.264444952745108e-05,
      "loss": 2.5041,
      "step": 650
    },
    {
      "epoch": 2.61,
      "grad_norm": 1.9777090549468994,
      "learning_rate": 4.242139913980686e-05,
      "loss": 2.4763,
      "step": 660
    },
    {
      "epoch": 2.65,
      "grad_norm": 1.91414475440979,
      "learning_rate": 4.219561958797543e-05,
      "loss": 2.37,
      "step": 670
    },
    {
      "epoch": 2.69,
      "grad_norm": 1.0806653499603271,
      "learning_rate": 4.196714624164565e-05,
      "loss": 2.5985,
      "step": 680
    },
    {
      "epoch": 2.73,
      "grad_norm": 1.2435009479522705,
      "learning_rate": 4.1736014892505064e-05,
      "loss": 2.4765,
      "step": 690
    },
    {
      "epoch": 2.77,
      "grad_norm": 1.3920471668243408,
      "learning_rate": 4.150226174863292e-05,
      "loss": 2.4446,
      "step": 700
    },
    {
      "epoch": 2.8,
      "grad_norm": 1.949141263961792,
      "learning_rate": 4.126592342882795e-05,
      "loss": 2.4979,
      "step": 710
    },
    {
      "epoch": 2.84,
      "grad_norm": 1.1306403875350952,
      "learning_rate": 4.1027036956871854e-05,
      "loss": 2.4096,
      "step": 720
    },
    {
      "epoch": 2.88,
      "grad_norm": 0.9906802773475647,
      "learning_rate": 4.078563975572928e-05,
      "loss": 2.5409,
      "step": 730
    },
    {
      "epoch": 2.92,
      "grad_norm": 1.4917031526565552,
      "learning_rate": 4.054176964168528e-05,
      "loss": 2.4508,
      "step": 740
    },
    {
      "epoch": 2.96,
      "grad_norm": 1.554909110069275,
      "learning_rate": 4.029546481842123e-05,
      "loss": 2.4673,
      "step": 750
    },
    {
      "epoch": 3.0,
      "grad_norm": 1.2943602800369263,
      "learning_rate": 4.004676387102995e-05,
      "loss": 2.4801,
      "step": 760
    },
    {
      "epoch": 3.04,
      "grad_norm": 1.301687240600586,
      "learning_rate": 3.9795705759971116e-05,
      "loss": 2.4779,
      "step": 770
    },
    {
      "epoch": 3.08,
      "grad_norm": 1.2175750732421875,
      "learning_rate": 3.9542329814967914e-05,
      "loss": 2.3964,
      "step": 780
    },
    {
      "epoch": 3.12,
      "grad_norm": 2.502758502960205,
      "learning_rate": 3.92866757288458e-05,
      "loss": 2.4044,
      "step": 790
    },
    {
      "epoch": 3.16,
      "grad_norm": 1.508583664894104,
      "learning_rate": 3.9028783551314347e-05,
      "loss": 2.5229,
      "step": 800
    },
    {
      "epoch": 3.16,
      "eval_loss": 2.413785696029663,
      "eval_runtime": 133.2078,
      "eval_samples_per_second": 6.756,
      "eval_steps_per_second": 3.378,
      "step": 800
    },
    {
      "epoch": 3.2,
      "grad_norm": 1.3288512229919434,
      "learning_rate": 3.876869368269327e-05,
      "loss": 2.4517,
      "step": 810
    },
    {
      "epoch": 3.24,
      "grad_norm": 1.4469561576843262,
      "learning_rate": 3.850644686758346e-05,
      "loss": 2.5377,
      "step": 820
    },
    {
      "epoch": 3.28,
      "grad_norm": 1.560353398323059,
      "learning_rate": 3.82420841884841e-05,
      "loss": 2.3569,
      "step": 830
    },
    {
      "epoch": 3.32,
      "grad_norm": 1.9207900762557983,
      "learning_rate": 3.7975647059356875e-05,
      "loss": 2.4131,
      "step": 840
    },
    {
      "epoch": 3.36,
      "grad_norm": 1.685535192489624,
      "learning_rate": 3.770717721913819e-05,
      "loss": 2.5124,
      "step": 850
    },
    {
      "epoch": 3.4,
      "grad_norm": 1.3592054843902588,
      "learning_rate": 3.743671672520054e-05,
      "loss": 2.3343,
      "step": 860
    },
    {
      "epoch": 3.44,
      "grad_norm": 1.9445059299468994,
      "learning_rate": 3.716430794676402e-05,
      "loss": 2.4614,
      "step": 870
    },
    {
      "epoch": 3.48,
      "grad_norm": 1.6313419342041016,
      "learning_rate": 3.688999355825887e-05,
      "loss": 2.4678,
      "step": 880
    },
    {
      "epoch": 3.52,
      "grad_norm": 2.071474313735962,
      "learning_rate": 3.661381653264031e-05,
      "loss": 2.4016,
      "step": 890
    },
    {
      "epoch": 3.56,
      "grad_norm": 6.210580825805664,
      "learning_rate": 3.633582013465658e-05,
      "loss": 2.3772,
      "step": 900
    },
    {
      "epoch": 3.6,
      "grad_norm": 1.459627628326416,
      "learning_rate": 3.605604791407124e-05,
      "loss": 2.4438,
      "step": 910
    },
    {
      "epoch": 3.63,
      "grad_norm": 1.3812425136566162,
      "learning_rate": 3.577454369884086e-05,
      "loss": 2.4352,
      "step": 920
    },
    {
      "epoch": 3.67,
      "grad_norm": 1.443032145500183,
      "learning_rate": 3.549135158824913e-05,
      "loss": 2.3374,
      "step": 930
    },
    {
      "epoch": 3.71,
      "grad_norm": 2.8968636989593506,
      "learning_rate": 3.520651594599842e-05,
      "loss": 2.3911,
      "step": 940
    },
    {
      "epoch": 3.75,
      "grad_norm": 1.7020437717437744,
      "learning_rate": 3.4920081393259955e-05,
      "loss": 2.5022,
      "step": 950
    },
    {
      "epoch": 3.79,
      "grad_norm": 1.4983431100845337,
      "learning_rate": 3.463209280168365e-05,
      "loss": 2.4919,
      "step": 960
    },
    {
      "epoch": 3.83,
      "grad_norm": 1.527735948562622,
      "learning_rate": 3.434259528636872e-05,
      "loss": 2.423,
      "step": 970
    },
    {
      "epoch": 3.87,
      "grad_norm": 1.3608715534210205,
      "learning_rate": 3.405163419879611e-05,
      "loss": 2.4668,
      "step": 980
    },
    {
      "epoch": 3.91,
      "grad_norm": 1.6936486959457397,
      "learning_rate": 3.37592551197239e-05,
      "loss": 2.4736,
      "step": 990
    },
    {
      "epoch": 3.95,
      "grad_norm": 1.6318974494934082,
      "learning_rate": 3.34655038520469e-05,
      "loss": 2.4683,
      "step": 1000
    },
    {
      "epoch": 3.99,
      "grad_norm": 1.3295326232910156,
      "learning_rate": 3.317042641362126e-05,
      "loss": 2.3889,
      "step": 1010
    },
    {
      "epoch": 4.03,
      "grad_norm": 1.5521697998046875,
      "learning_rate": 3.2874069030055534e-05,
      "loss": 2.4913,
      "step": 1020
    },
    {
      "epoch": 4.07,
      "grad_norm": 1.2893122434616089,
      "learning_rate": 3.257647812746922e-05,
      "loss": 2.4289,
      "step": 1030
    },
    {
      "epoch": 4.11,
      "grad_norm": 1.4011497497558594,
      "learning_rate": 3.227770032521975e-05,
      "loss": 2.4604,
      "step": 1040
    },
    {
      "epoch": 4.15,
      "grad_norm": 1.7100721597671509,
      "learning_rate": 3.1977782428599364e-05,
      "loss": 2.3778,
      "step": 1050
    },
    {
      "epoch": 4.19,
      "grad_norm": 1.4909169673919678,
      "learning_rate": 3.1676771421502746e-05,
      "loss": 2.4634,
      "step": 1060
    },
    {
      "epoch": 4.23,
      "grad_norm": 2.009910821914673,
      "learning_rate": 3.137471445906675e-05,
      "loss": 2.4035,
      "step": 1070
    },
    {
      "epoch": 4.27,
      "grad_norm": 1.4564893245697021,
      "learning_rate": 3.107165886028326e-05,
      "loss": 2.4581,
      "step": 1080
    },
    {
      "epoch": 4.31,
      "grad_norm": 1.6162135601043701,
      "learning_rate": 3.076765210058638e-05,
      "loss": 2.4216,
      "step": 1090
    },
    {
      "epoch": 4.35,
      "grad_norm": 1.469684362411499,
      "learning_rate": 3.046274180441512e-05,
      "loss": 2.3395,
      "step": 1100
    },
    {
      "epoch": 4.39,
      "grad_norm": 2.3828556537628174,
      "learning_rate": 3.015697573775283e-05,
      "loss": 2.4602,
      "step": 1110
    },
    {
      "epoch": 4.42,
      "grad_norm": 1.5302035808563232,
      "learning_rate": 2.9850401800644257e-05,
      "loss": 2.4116,
      "step": 1120
    },
    {
      "epoch": 4.46,
      "grad_norm": 2.1008236408233643,
      "learning_rate": 2.9543068019691833e-05,
      "loss": 2.2545,
      "step": 1130
    },
    {
      "epoch": 4.5,
      "grad_norm": 1.4228670597076416,
      "learning_rate": 2.923502254053193e-05,
      "loss": 2.4589,
      "step": 1140
    },
    {
      "epoch": 4.54,
      "grad_norm": 1.4719305038452148,
      "learning_rate": 2.892631362029265e-05,
      "loss": 2.3918,
      "step": 1150
    },
    {
      "epoch": 4.58,
      "grad_norm": 1.771802544593811,
      "learning_rate": 2.8616989620034013e-05,
      "loss": 2.3929,
      "step": 1160
    },
    {
      "epoch": 4.62,
      "grad_norm": 1.5566627979278564,
      "learning_rate": 2.83070989971719e-05,
      "loss": 2.3442,
      "step": 1170
    },
    {
      "epoch": 4.66,
      "grad_norm": 1.8499693870544434,
      "learning_rate": 2.7996690297886995e-05,
      "loss": 2.4422,
      "step": 1180
    },
    {
      "epoch": 4.7,
      "grad_norm": 1.5866152048110962,
      "learning_rate": 2.768581214951964e-05,
      "loss": 2.4489,
      "step": 1190
    },
    {
      "epoch": 4.74,
      "grad_norm": 1.5571675300598145,
      "learning_rate": 2.737451325295214e-05,
      "loss": 2.3453,
      "step": 1200
    },
    {
      "epoch": 4.74,
      "eval_loss": 2.4050841331481934,
      "eval_runtime": 133.8041,
      "eval_samples_per_second": 6.726,
      "eval_steps_per_second": 3.363,
      "step": 1200
    },
    {
      "epoch": 4.78,
      "grad_norm": 1.371382474899292,
      "learning_rate": 2.706284237497948e-05,
      "loss": 2.3094,
      "step": 1210
    },
    {
      "epoch": 4.82,
      "grad_norm": 1.5894430875778198,
      "learning_rate": 2.675084834066968e-05,
      "loss": 2.352,
      "step": 1220
    },
    {
      "epoch": 4.86,
      "grad_norm": 1.9093360900878906,
      "learning_rate": 2.6438580025715138e-05,
      "loss": 2.3941,
      "step": 1230
    },
    {
      "epoch": 4.9,
      "grad_norm": 1.7057812213897705,
      "learning_rate": 2.612608634877588e-05,
      "loss": 2.408,
      "step": 1240
    },
    {
      "epoch": 4.94,
      "grad_norm": 4.448972225189209,
      "learning_rate": 2.5813416263816227e-05,
      "loss": 2.4234,
      "step": 1250
    },
    {
      "epoch": 4.98,
      "grad_norm": 1.4726636409759521,
      "learning_rate": 2.550061875243584e-05,
      "loss": 2.4223,
      "step": 1260
    },
    {
      "epoch": 5.02,
      "grad_norm": 1.3479527235031128,
      "learning_rate": 2.5187742816196487e-05,
      "loss": 2.3444,
      "step": 1270
    },
    {
      "epoch": 5.06,
      "grad_norm": 1.584678292274475,
      "learning_rate": 2.487483746894563e-05,
      "loss": 2.4881,
      "step": 1280
    },
    {
      "epoch": 5.1,
      "grad_norm": 1.5328477621078491,
      "learning_rate": 2.4561951729138167e-05,
      "loss": 2.3752,
      "step": 1290
    },
    {
      "epoch": 5.14,
      "grad_norm": 2.0441181659698486,
      "learning_rate": 2.4249134612157346e-05,
      "loss": 2.4605,
      "step": 1300
    },
    {
      "epoch": 5.18,
      "grad_norm": 1.5883582830429077,
      "learning_rate": 2.393643512263627e-05,
      "loss": 2.3095,
      "step": 1310
    },
    {
      "epoch": 5.21,
      "grad_norm": 1.6504632234573364,
      "learning_rate": 2.3623902246780994e-05,
      "loss": 2.3773,
      "step": 1320
    },
    {
      "epoch": 5.25,
      "grad_norm": 2.101841926574707,
      "learning_rate": 2.331158494469657e-05,
      "loss": 2.3966,
      "step": 1330
    },
    {
      "epoch": 5.29,
      "grad_norm": 1.5765920877456665,
      "learning_rate": 2.2999532142717174e-05,
      "loss": 2.4361,
      "step": 1340
    },
    {
      "epoch": 5.33,
      "grad_norm": 2.0858278274536133,
      "learning_rate": 2.268779272574146e-05,
      "loss": 2.3576,
      "step": 1350
    },
    {
      "epoch": 5.37,
      "grad_norm": 1.7046364545822144,
      "learning_rate": 2.2376415529574525e-05,
      "loss": 2.4298,
      "step": 1360
    },
    {
      "epoch": 5.41,
      "grad_norm": 1.7461296319961548,
      "learning_rate": 2.206544933327742e-05,
      "loss": 2.3175,
      "step": 1370
    },
    {
      "epoch": 5.45,
      "grad_norm": 2.0052788257598877,
      "learning_rate": 2.1754942851525677e-05,
      "loss": 2.3432,
      "step": 1380
    },
    {
      "epoch": 5.49,
      "grad_norm": 1.8527193069458008,
      "learning_rate": 2.1444944726977857e-05,
      "loss": 2.2937,
      "step": 1390
    },
    {
      "epoch": 5.53,
      "grad_norm": 1.8431612253189087,
      "learning_rate": 2.1135503522655374e-05,
      "loss": 2.3031,
      "step": 1400
    },
    {
      "epoch": 5.57,
      "grad_norm": 1.8249716758728027,
      "learning_rate": 2.082666771433484e-05,
      "loss": 2.4171,
      "step": 1410
    },
    {
      "epoch": 5.61,
      "grad_norm": 1.6596335172653198,
      "learning_rate": 2.0518485682954025e-05,
      "loss": 2.4917,
      "step": 1420
    },
    {
      "epoch": 5.65,
      "grad_norm": 1.8855317831039429,
      "learning_rate": 2.0211005707032733e-05,
      "loss": 2.3648,
      "step": 1430
    },
    {
      "epoch": 5.69,
      "grad_norm": 1.6180534362792969,
      "learning_rate": 1.9904275955109652e-05,
      "loss": 2.4083,
      "step": 1440
    },
    {
      "epoch": 5.73,
      "grad_norm": 1.5273176431655884,
      "learning_rate": 1.959834447819649e-05,
      "loss": 2.4187,
      "step": 1450
    },
    {
      "epoch": 5.77,
      "grad_norm": 1.8004169464111328,
      "learning_rate": 1.9293259202250517e-05,
      "loss": 2.4147,
      "step": 1460
    },
    {
      "epoch": 5.81,
      "grad_norm": 1.641048550605774,
      "learning_rate": 1.8989067920666633e-05,
      "loss": 2.3738,
      "step": 1470
    },
    {
      "epoch": 5.85,
      "grad_norm": 1.586946964263916,
      "learning_rate": 1.8685818286790325e-05,
      "loss": 2.4126,
      "step": 1480
    },
    {
      "epoch": 5.89,
      "grad_norm": 2.0825576782226562,
      "learning_rate": 1.8383557806452433e-05,
      "loss": 2.3781,
      "step": 1490
    },
    {
      "epoch": 5.93,
      "grad_norm": 1.7725000381469727,
      "learning_rate": 1.808233383052709e-05,
      "loss": 2.2956,
      "step": 1500
    },
    {
      "epoch": 5.97,
      "grad_norm": 1.7009029388427734,
      "learning_rate": 1.7782193547513974e-05,
      "loss": 2.3416,
      "step": 1510
    },
    {
      "epoch": 6.0,
      "grad_norm": 1.7379595041275024,
      "learning_rate": 1.7483183976145894e-05,
      "loss": 2.3466,
      "step": 1520
    },
    {
      "epoch": 6.04,
      "grad_norm": 1.678911805152893,
      "learning_rate": 1.7185351958023082e-05,
      "loss": 2.4167,
      "step": 1530
    },
    {
      "epoch": 6.08,
      "grad_norm": 1.663160800933838,
      "learning_rate": 1.6888744150275148e-05,
      "loss": 2.4156,
      "step": 1540
    },
    {
      "epoch": 6.12,
      "grad_norm": 1.5950766801834106,
      "learning_rate": 1.6593407018251973e-05,
      "loss": 2.3795,
      "step": 1550
    },
    {
      "epoch": 6.16,
      "grad_norm": 1.5608184337615967,
      "learning_rate": 1.6299386828244645e-05,
      "loss": 2.3945,
      "step": 1560
    },
    {
      "epoch": 6.2,
      "grad_norm": 2.6302921772003174,
      "learning_rate": 1.60067296402376e-05,
      "loss": 2.3195,
      "step": 1570
    },
    {
      "epoch": 6.24,
      "grad_norm": 1.7563198804855347,
      "learning_rate": 1.5715481300692993e-05,
      "loss": 2.3551,
      "step": 1580
    },
    {
      "epoch": 6.28,
      "grad_norm": 2.2081732749938965,
      "learning_rate": 1.5425687435368648e-05,
      "loss": 2.3597,
      "step": 1590
    },
    {
      "epoch": 6.32,
      "grad_norm": 2.120513916015625,
      "learning_rate": 1.5137393442170461e-05,
      "loss": 2.3758,
      "step": 1600
    },
    {
      "epoch": 6.32,
      "eval_loss": 2.400667428970337,
      "eval_runtime": 133.4489,
      "eval_samples_per_second": 6.744,
      "eval_steps_per_second": 3.372,
      "step": 1600
    },
    {
      "epoch": 6.36,
      "grad_norm": 1.9258661270141602,
      "learning_rate": 1.4850644484040584e-05,
      "loss": 2.3852,
      "step": 1610
    },
    {
      "epoch": 6.4,
      "grad_norm": 1.749426007270813,
      "learning_rate": 1.4565485481882396e-05,
      "loss": 2.3067,
      "step": 1620
    },
    {
      "epoch": 6.44,
      "grad_norm": 1.9953992366790771,
      "learning_rate": 1.4281961107523336e-05,
      "loss": 2.3013,
      "step": 1630
    },
    {
      "epoch": 6.48,
      "grad_norm": 2.156952381134033,
      "learning_rate": 1.4000115776716849e-05,
      "loss": 2.3504,
      "step": 1640
    },
    {
      "epoch": 6.52,
      "grad_norm": 2.4170098304748535,
      "learning_rate": 1.371999364218437e-05,
      "loss": 2.3035,
      "step": 1650
    },
    {
      "epoch": 6.56,
      "grad_norm": 2.338102340698242,
      "learning_rate": 1.3441638586698527e-05,
      "loss": 2.2753,
      "step": 1660
    },
    {
      "epoch": 6.6,
      "grad_norm": 2.286085605621338,
      "learning_rate": 1.3165094216208696e-05,
      "loss": 2.3644,
      "step": 1670
    },
    {
      "epoch": 6.64,
      "grad_norm": 2.505244016647339,
      "learning_rate": 1.2890403853009847e-05,
      "loss": 2.371,
      "step": 1680
    },
    {
      "epoch": 6.68,
      "grad_norm": 1.636423110961914,
      "learning_rate": 1.2617610528955814e-05,
      "loss": 2.3602,
      "step": 1690
    },
    {
      "epoch": 6.72,
      "grad_norm": 1.6253471374511719,
      "learning_rate": 1.234675697871818e-05,
      "loss": 2.3858,
      "step": 1700
    },
    {
      "epoch": 6.76,
      "grad_norm": 1.9490761756896973,
      "learning_rate": 1.2077885633091595e-05,
      "loss": 2.2864,
      "step": 1710
    },
    {
      "epoch": 6.8,
      "grad_norm": 1.7611408233642578,
      "learning_rate": 1.1811038612346728e-05,
      "loss": 2.2646,
      "step": 1720
    },
    {
      "epoch": 6.83,
      "grad_norm": 1.9415556192398071,
      "learning_rate": 1.154625771963192e-05,
      "loss": 2.311,
      "step": 1730
    },
    {
      "epoch": 6.87,
      "grad_norm": 2.0429086685180664,
      "learning_rate": 1.1283584434424455e-05,
      "loss": 2.3504,
      "step": 1740
    },
    {
      "epoch": 6.91,
      "grad_norm": 2.0815227031707764,
      "learning_rate": 1.102305990603257e-05,
      "loss": 2.3426,
      "step": 1750
    },
    {
      "epoch": 6.95,
      "grad_norm": 1.8559825420379639,
      "learning_rate": 1.0764724947149132e-05,
      "loss": 2.3183,
      "step": 1760
    },
    {
      "epoch": 6.99,
      "grad_norm": 2.6576716899871826,
      "learning_rate": 1.0508620027458158e-05,
      "loss": 2.378,
      "step": 1770
    },
    {
      "epoch": 7.03,
      "grad_norm": 1.9085129499435425,
      "learning_rate": 1.0254785267294958e-05,
      "loss": 2.3286,
      "step": 1780
    },
    {
      "epoch": 7.07,
      "grad_norm": 1.899032711982727,
      "learning_rate": 1.0003260431361039e-05,
      "loss": 2.3615,
      "step": 1790
    },
    {
      "epoch": 7.11,
      "grad_norm": 1.8750344514846802,
      "learning_rate": 9.75408492249478e-06,
      "loss": 2.3459,
      "step": 1800
    },
    {
      "epoch": 7.15,
      "grad_norm": 2.1118319034576416,
      "learning_rate": 9.507297775498707e-06,
      "loss": 2.4204,
      "step": 1810
    },
    {
      "epoch": 7.19,
      "grad_norm": 1.971839189529419,
      "learning_rate": 9.262937651024462e-06,
      "loss": 2.3497,
      "step": 1820
    },
    {
      "epoch": 7.23,
      "grad_norm": 2.0775558948516846,
      "learning_rate": 9.02104282951641e-06,
      "loss": 2.3027,
      "step": 1830
    },
    {
      "epoch": 7.27,
      "grad_norm": 2.3251700401306152,
      "learning_rate": 8.781651205214775e-06,
      "loss": 2.3317,
      "step": 1840
    },
    {
      "epoch": 7.31,
      "grad_norm": 2.0580742359161377,
      "learning_rate": 8.544800280219282e-06,
      "loss": 2.3516,
      "step": 1850
    },
    {
      "epoch": 7.35,
      "grad_norm": 2.2532641887664795,
      "learning_rate": 8.310527158614204e-06,
      "loss": 2.2712,
      "step": 1860
    },
    {
      "epoch": 7.39,
      "grad_norm": 2.1395344734191895,
      "learning_rate": 8.07886854065585e-06,
      "loss": 2.3357,
      "step": 1870
    },
    {
      "epoch": 7.43,
      "grad_norm": 1.6818287372589111,
      "learning_rate": 7.849860717023217e-06,
      "loss": 2.3414,
      "step": 1880
    },
    {
      "epoch": 7.47,
      "grad_norm": 2.29758882522583,
      "learning_rate": 7.62353956313284e-06,
      "loss": 2.2435,
      "step": 1890
    },
    {
      "epoch": 7.51,
      "grad_norm": 2.3084988594055176,
      "learning_rate": 7.3999405335187124e-06,
      "loss": 2.3185,
      "step": 1900
    },
    {
      "epoch": 7.55,
      "grad_norm": 2.1061959266662598,
      "learning_rate": 7.17909865627813e-06,
      "loss": 2.3499,
      "step": 1910
    },
    {
      "epoch": 7.59,
      "grad_norm": 1.7624549865722656,
      "learning_rate": 6.961048527584296e-06,
      "loss": 2.3895,
      "step": 1920
    },
    {
      "epoch": 7.62,
      "grad_norm": 2.2806477546691895,
      "learning_rate": 6.745824306266685e-06,
      "loss": 2.3313,
      "step": 1930
    },
    {
      "epoch": 7.66,
      "grad_norm": 1.7848331928253174,
      "learning_rate": 6.533459708459827e-06,
      "loss": 2.4686,
      "step": 1940
    },
    {
      "epoch": 7.7,
      "grad_norm": 1.8504621982574463,
      "learning_rate": 6.323988002321471e-06,
      "loss": 2.2985,
      "step": 1950
    },
    {
      "epoch": 7.74,
      "grad_norm": 2.1124908924102783,
      "learning_rate": 6.1174420028209585e-06,
      "loss": 2.3432,
      "step": 1960
    },
    {
      "epoch": 7.78,
      "grad_norm": 1.7486050128936768,
      "learning_rate": 5.9138540665985595e-06,
      "loss": 2.3414,
      "step": 1970
    },
    {
      "epoch": 7.82,
      "grad_norm": 2.394221782684326,
      "learning_rate": 5.713256086896604e-06,
      "loss": 2.3297,
      "step": 1980
    },
    {
      "epoch": 7.86,
      "grad_norm": 2.157335042953491,
      "learning_rate": 5.5156794885632165e-06,
      "loss": 2.2748,
      "step": 1990
    },
    {
      "epoch": 7.9,
      "grad_norm": 2.8654778003692627,
      "learning_rate": 5.3211552231294485e-06,
      "loss": 2.3073,
      "step": 2000
    },
    {
      "epoch": 7.9,
      "eval_loss": 2.395761489868164,
      "eval_runtime": 134.6111,
      "eval_samples_per_second": 6.686,
      "eval_steps_per_second": 3.343,
      "step": 2000
    }
  ],
  "logging_steps": 10,
  "max_steps": 2530,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 10,
  "save_steps": 400,
  "total_flos": 2.2040949247338086e+18,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}