{
"best_metric": 2.395761489868164,
"best_model_checkpoint": "../../saves/Baichuan2-7B-Chat/lora/sft/checkpoint-2000",
"epoch": 7.901234567901234,
"eval_steps": 400,
"global_step": 2000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.04,
"grad_norm": 1.1405200958251953,
"learning_rate": 2.25e-05,
"loss": 3.6308,
"step": 10
},
{
"epoch": 0.08,
"grad_norm": 1.766953945159912,
"learning_rate": 4.75e-05,
"loss": 3.4926,
"step": 20
},
{
"epoch": 0.12,
"grad_norm": 1.2588557004928589,
"learning_rate": 4.99984138555282e-05,
"loss": 3.2621,
"step": 30
},
{
"epoch": 0.16,
"grad_norm": 0.9718258380889893,
"learning_rate": 4.999293114538139e-05,
"loss": 3.0924,
"step": 40
},
{
"epoch": 0.2,
"grad_norm": 0.9004219770431519,
"learning_rate": 4.998353314622318e-05,
"loss": 3.0325,
"step": 50
},
{
"epoch": 0.24,
"grad_norm": 0.7595831751823425,
"learning_rate": 4.997022133030516e-05,
"loss": 2.9351,
"step": 60
},
{
"epoch": 0.28,
"grad_norm": 0.8930522799491882,
"learning_rate": 4.9952997783001254e-05,
"loss": 2.8068,
"step": 70
},
{
"epoch": 0.32,
"grad_norm": 0.7985192537307739,
"learning_rate": 4.9931865202480996e-05,
"loss": 2.8503,
"step": 80
},
{
"epoch": 0.36,
"grad_norm": 0.9129031896591187,
"learning_rate": 4.990682689928687e-05,
"loss": 2.7241,
"step": 90
},
{
"epoch": 0.4,
"grad_norm": 0.8816404342651367,
"learning_rate": 4.9877886795815685e-05,
"loss": 2.8525,
"step": 100
},
{
"epoch": 0.43,
"grad_norm": 0.8212659358978271,
"learning_rate": 4.98450494257041e-05,
"loss": 2.7173,
"step": 110
},
{
"epoch": 0.47,
"grad_norm": 0.9286770224571228,
"learning_rate": 4.980831993311844e-05,
"loss": 2.7857,
"step": 120
},
{
"epoch": 0.51,
"grad_norm": 0.95149165391922,
"learning_rate": 4.976770407194877e-05,
"loss": 2.6764,
"step": 130
},
{
"epoch": 0.55,
"grad_norm": 1.1459342241287231,
"learning_rate": 4.972320820490759e-05,
"loss": 2.7001,
"step": 140
},
{
"epoch": 0.59,
"grad_norm": 1.1330541372299194,
"learning_rate": 4.967483930253302e-05,
"loss": 2.7024,
"step": 150
},
{
"epoch": 0.63,
"grad_norm": 0.9277874827384949,
"learning_rate": 4.962260494209683e-05,
"loss": 2.7039,
"step": 160
},
{
"epoch": 0.67,
"grad_norm": 1.0230640172958374,
"learning_rate": 4.9566513306417444e-05,
"loss": 2.7423,
"step": 170
},
{
"epoch": 0.71,
"grad_norm": 0.9915482997894287,
"learning_rate": 4.950657318257805e-05,
"loss": 2.7303,
"step": 180
},
{
"epoch": 0.75,
"grad_norm": 1.105600357055664,
"learning_rate": 4.944279396055003e-05,
"loss": 2.6616,
"step": 190
},
{
"epoch": 0.79,
"grad_norm": 1.1231801509857178,
"learning_rate": 4.937518563172196e-05,
"loss": 2.655,
"step": 200
},
{
"epoch": 0.83,
"grad_norm": 0.908206582069397,
"learning_rate": 4.930375878733445e-05,
"loss": 2.6541,
"step": 210
},
{
"epoch": 0.87,
"grad_norm": 1.087323546409607,
"learning_rate": 4.922852461682093e-05,
"loss": 2.5646,
"step": 220
},
{
"epoch": 0.91,
"grad_norm": 1.0399665832519531,
"learning_rate": 4.9149494906054716e-05,
"loss": 2.6036,
"step": 230
},
{
"epoch": 0.95,
"grad_norm": 0.9571551084518433,
"learning_rate": 4.906668203550279e-05,
"loss": 2.6212,
"step": 240
},
{
"epoch": 0.99,
"grad_norm": 0.9485632181167603,
"learning_rate": 4.8980098978286215e-05,
"loss": 2.6717,
"step": 250
},
{
"epoch": 1.03,
"grad_norm": 0.9359139204025269,
"learning_rate": 4.888975929814792e-05,
"loss": 2.5967,
"step": 260
},
{
"epoch": 1.07,
"grad_norm": 1.2552564144134521,
"learning_rate": 4.8795677147327776e-05,
"loss": 2.5608,
"step": 270
},
{
"epoch": 1.11,
"grad_norm": 0.9426449537277222,
"learning_rate": 4.8697867264345616e-05,
"loss": 2.5731,
"step": 280
},
{
"epoch": 1.15,
"grad_norm": 1.132430076599121,
"learning_rate": 4.859634497169233e-05,
"loss": 2.5884,
"step": 290
},
{
"epoch": 1.19,
"grad_norm": 0.9066994786262512,
"learning_rate": 4.849112617342955e-05,
"loss": 2.5888,
"step": 300
},
{
"epoch": 1.22,
"grad_norm": 1.0188608169555664,
"learning_rate": 4.8382227352698115e-05,
"loss": 2.5849,
"step": 310
},
{
"epoch": 1.26,
"grad_norm": 1.3850712776184082,
"learning_rate": 4.826966556913597e-05,
"loss": 2.485,
"step": 320
},
{
"epoch": 1.3,
"grad_norm": 1.1342747211456299,
"learning_rate": 4.815345845620563e-05,
"loss": 2.5624,
"step": 330
},
{
"epoch": 1.34,
"grad_norm": 1.0687206983566284,
"learning_rate": 4.803362421843177e-05,
"loss": 2.5051,
"step": 340
},
{
"epoch": 1.38,
"grad_norm": 1.5436629056930542,
"learning_rate": 4.7910181628549454e-05,
"loss": 2.5185,
"step": 350
},
{
"epoch": 1.42,
"grad_norm": 1.2030800580978394,
"learning_rate": 4.77831500245632e-05,
"loss": 2.5122,
"step": 360
},
{
"epoch": 1.46,
"grad_norm": 1.2365000247955322,
"learning_rate": 4.765254930671762e-05,
"loss": 2.5704,
"step": 370
},
{
"epoch": 1.5,
"grad_norm": 1.1403887271881104,
"learning_rate": 4.75183999343799e-05,
"loss": 2.5605,
"step": 380
},
{
"epoch": 1.54,
"grad_norm": 1.2193725109100342,
"learning_rate": 4.738072292283473e-05,
"loss": 2.569,
"step": 390
},
{
"epoch": 1.58,
"grad_norm": 1.4231560230255127,
"learning_rate": 4.723953983999215e-05,
"loss": 2.4809,
"step": 400
},
{
"epoch": 1.58,
"eval_loss": 2.4551122188568115,
"eval_runtime": 134.6274,
"eval_samples_per_second": 6.685,
"eval_steps_per_second": 3.343,
"step": 400
},
{
"epoch": 1.62,
"grad_norm": 1.26221764087677,
"learning_rate": 4.70948728030088e-05,
"loss": 2.6339,
"step": 410
},
{
"epoch": 1.66,
"grad_norm": 1.2207887172698975,
"learning_rate": 4.694674447482312e-05,
"loss": 2.5877,
"step": 420
},
{
"epoch": 1.7,
"grad_norm": 1.2746591567993164,
"learning_rate": 4.679517806060509e-05,
"loss": 2.5866,
"step": 430
},
{
"epoch": 1.74,
"grad_norm": 1.774005651473999,
"learning_rate": 4.664019730412101e-05,
"loss": 2.5073,
"step": 440
},
{
"epoch": 1.78,
"grad_norm": 1.4896618127822876,
"learning_rate": 4.648182648401389e-05,
"loss": 2.4688,
"step": 450
},
{
"epoch": 1.82,
"grad_norm": 1.3457367420196533,
"learning_rate": 4.6320090410000027e-05,
"loss": 2.527,
"step": 460
},
{
"epoch": 1.86,
"grad_norm": 1.2498492002487183,
"learning_rate": 4.615501441898248e-05,
"loss": 2.625,
"step": 470
},
{
"epoch": 1.9,
"grad_norm": 1.3643558025360107,
"learning_rate": 4.598662437108186e-05,
"loss": 2.4755,
"step": 480
},
{
"epoch": 1.94,
"grad_norm": 1.198166847229004,
"learning_rate": 4.581494664558518e-05,
"loss": 2.5688,
"step": 490
},
{
"epoch": 1.98,
"grad_norm": 3.3917434215545654,
"learning_rate": 4.564000813681342e-05,
"loss": 2.5182,
"step": 500
},
{
"epoch": 2.01,
"grad_norm": 1.562139630317688,
"learning_rate": 4.546183624990832e-05,
"loss": 2.4533,
"step": 510
},
{
"epoch": 2.05,
"grad_norm": 1.1284795999526978,
"learning_rate": 4.528045889653927e-05,
"loss": 2.4901,
"step": 520
},
{
"epoch": 2.09,
"grad_norm": 1.7664827108383179,
"learning_rate": 4.509590449053074e-05,
"loss": 2.5075,
"step": 530
},
{
"epoch": 2.13,
"grad_norm": 1.6162073612213135,
"learning_rate": 4.49082019434111e-05,
"loss": 2.4769,
"step": 540
},
{
"epoch": 2.17,
"grad_norm": 1.3468163013458252,
"learning_rate": 4.471738065988347e-05,
"loss": 2.4979,
"step": 550
},
{
"epoch": 2.21,
"grad_norm": 1.0762629508972168,
"learning_rate": 4.452347053321926e-05,
"loss": 2.5436,
"step": 560
},
{
"epoch": 2.25,
"grad_norm": 1.1567480564117432,
"learning_rate": 4.432650194057527e-05,
"loss": 2.5454,
"step": 570
},
{
"epoch": 2.29,
"grad_norm": 1.419041395187378,
"learning_rate": 4.412650573823489e-05,
"loss": 2.4681,
"step": 580
},
{
"epoch": 2.33,
"grad_norm": 1.2923465967178345,
"learning_rate": 4.392351325677433e-05,
"loss": 2.565,
"step": 590
},
{
"epoch": 2.37,
"grad_norm": 1.2892262935638428,
"learning_rate": 4.371755629615442e-05,
"loss": 2.5258,
"step": 600
},
{
"epoch": 2.41,
"grad_norm": 1.467966914176941,
"learning_rate": 4.3508667120739046e-05,
"loss": 2.5776,
"step": 610
},
{
"epoch": 2.45,
"grad_norm": 1.2278165817260742,
"learning_rate": 4.329687845424069e-05,
"loss": 2.4175,
"step": 620
},
{
"epoch": 2.49,
"grad_norm": 1.3225311040878296,
"learning_rate": 4.308222347459411e-05,
"loss": 2.4561,
"step": 630
},
{
"epoch": 2.53,
"grad_norm": 1.2582958936691284,
"learning_rate": 4.286473580875878e-05,
"loss": 2.3885,
"step": 640
},
{
"epoch": 2.57,
"grad_norm": 1.206189751625061,
"learning_rate": 4.264444952745108e-05,
"loss": 2.5041,
"step": 650
},
{
"epoch": 2.61,
"grad_norm": 1.9777090549468994,
"learning_rate": 4.242139913980686e-05,
"loss": 2.4763,
"step": 660
},
{
"epoch": 2.65,
"grad_norm": 1.91414475440979,
"learning_rate": 4.219561958797543e-05,
"loss": 2.37,
"step": 670
},
{
"epoch": 2.69,
"grad_norm": 1.0806653499603271,
"learning_rate": 4.196714624164565e-05,
"loss": 2.5985,
"step": 680
},
{
"epoch": 2.73,
"grad_norm": 1.2435009479522705,
"learning_rate": 4.1736014892505064e-05,
"loss": 2.4765,
"step": 690
},
{
"epoch": 2.77,
"grad_norm": 1.3920471668243408,
"learning_rate": 4.150226174863292e-05,
"loss": 2.4446,
"step": 700
},
{
"epoch": 2.8,
"grad_norm": 1.949141263961792,
"learning_rate": 4.126592342882795e-05,
"loss": 2.4979,
"step": 710
},
{
"epoch": 2.84,
"grad_norm": 1.1306403875350952,
"learning_rate": 4.1027036956871854e-05,
"loss": 2.4096,
"step": 720
},
{
"epoch": 2.88,
"grad_norm": 0.9906802773475647,
"learning_rate": 4.078563975572928e-05,
"loss": 2.5409,
"step": 730
},
{
"epoch": 2.92,
"grad_norm": 1.4917031526565552,
"learning_rate": 4.054176964168528e-05,
"loss": 2.4508,
"step": 740
},
{
"epoch": 2.96,
"grad_norm": 1.554909110069275,
"learning_rate": 4.029546481842123e-05,
"loss": 2.4673,
"step": 750
},
{
"epoch": 3.0,
"grad_norm": 1.2943602800369263,
"learning_rate": 4.004676387102995e-05,
"loss": 2.4801,
"step": 760
},
{
"epoch": 3.04,
"grad_norm": 1.301687240600586,
"learning_rate": 3.9795705759971116e-05,
"loss": 2.4779,
"step": 770
},
{
"epoch": 3.08,
"grad_norm": 1.2175750732421875,
"learning_rate": 3.9542329814967914e-05,
"loss": 2.3964,
"step": 780
},
{
"epoch": 3.12,
"grad_norm": 2.502758502960205,
"learning_rate": 3.92866757288458e-05,
"loss": 2.4044,
"step": 790
},
{
"epoch": 3.16,
"grad_norm": 1.508583664894104,
"learning_rate": 3.9028783551314347e-05,
"loss": 2.5229,
"step": 800
},
{
"epoch": 3.16,
"eval_loss": 2.413785696029663,
"eval_runtime": 133.2078,
"eval_samples_per_second": 6.756,
"eval_steps_per_second": 3.378,
"step": 800
},
{
"epoch": 3.2,
"grad_norm": 1.3288512229919434,
"learning_rate": 3.876869368269327e-05,
"loss": 2.4517,
"step": 810
},
{
"epoch": 3.24,
"grad_norm": 1.4469561576843262,
"learning_rate": 3.850644686758346e-05,
"loss": 2.5377,
"step": 820
},
{
"epoch": 3.28,
"grad_norm": 1.560353398323059,
"learning_rate": 3.82420841884841e-05,
"loss": 2.3569,
"step": 830
},
{
"epoch": 3.32,
"grad_norm": 1.9207900762557983,
"learning_rate": 3.7975647059356875e-05,
"loss": 2.4131,
"step": 840
},
{
"epoch": 3.36,
"grad_norm": 1.685535192489624,
"learning_rate": 3.770717721913819e-05,
"loss": 2.5124,
"step": 850
},
{
"epoch": 3.4,
"grad_norm": 1.3592054843902588,
"learning_rate": 3.743671672520054e-05,
"loss": 2.3343,
"step": 860
},
{
"epoch": 3.44,
"grad_norm": 1.9445059299468994,
"learning_rate": 3.716430794676402e-05,
"loss": 2.4614,
"step": 870
},
{
"epoch": 3.48,
"grad_norm": 1.6313419342041016,
"learning_rate": 3.688999355825887e-05,
"loss": 2.4678,
"step": 880
},
{
"epoch": 3.52,
"grad_norm": 2.071474313735962,
"learning_rate": 3.661381653264031e-05,
"loss": 2.4016,
"step": 890
},
{
"epoch": 3.56,
"grad_norm": 6.210580825805664,
"learning_rate": 3.633582013465658e-05,
"loss": 2.3772,
"step": 900
},
{
"epoch": 3.6,
"grad_norm": 1.459627628326416,
"learning_rate": 3.605604791407124e-05,
"loss": 2.4438,
"step": 910
},
{
"epoch": 3.63,
"grad_norm": 1.3812425136566162,
"learning_rate": 3.577454369884086e-05,
"loss": 2.4352,
"step": 920
},
{
"epoch": 3.67,
"grad_norm": 1.443032145500183,
"learning_rate": 3.549135158824913e-05,
"loss": 2.3374,
"step": 930
},
{
"epoch": 3.71,
"grad_norm": 2.8968636989593506,
"learning_rate": 3.520651594599842e-05,
"loss": 2.3911,
"step": 940
},
{
"epoch": 3.75,
"grad_norm": 1.7020437717437744,
"learning_rate": 3.4920081393259955e-05,
"loss": 2.5022,
"step": 950
},
{
"epoch": 3.79,
"grad_norm": 1.4983431100845337,
"learning_rate": 3.463209280168365e-05,
"loss": 2.4919,
"step": 960
},
{
"epoch": 3.83,
"grad_norm": 1.527735948562622,
"learning_rate": 3.434259528636872e-05,
"loss": 2.423,
"step": 970
},
{
"epoch": 3.87,
"grad_norm": 1.3608715534210205,
"learning_rate": 3.405163419879611e-05,
"loss": 2.4668,
"step": 980
},
{
"epoch": 3.91,
"grad_norm": 1.6936486959457397,
"learning_rate": 3.37592551197239e-05,
"loss": 2.4736,
"step": 990
},
{
"epoch": 3.95,
"grad_norm": 1.6318974494934082,
"learning_rate": 3.34655038520469e-05,
"loss": 2.4683,
"step": 1000
},
{
"epoch": 3.99,
"grad_norm": 1.3295326232910156,
"learning_rate": 3.317042641362126e-05,
"loss": 2.3889,
"step": 1010
},
{
"epoch": 4.03,
"grad_norm": 1.5521697998046875,
"learning_rate": 3.2874069030055534e-05,
"loss": 2.4913,
"step": 1020
},
{
"epoch": 4.07,
"grad_norm": 1.2893122434616089,
"learning_rate": 3.257647812746922e-05,
"loss": 2.4289,
"step": 1030
},
{
"epoch": 4.11,
"grad_norm": 1.4011497497558594,
"learning_rate": 3.227770032521975e-05,
"loss": 2.4604,
"step": 1040
},
{
"epoch": 4.15,
"grad_norm": 1.7100721597671509,
"learning_rate": 3.1977782428599364e-05,
"loss": 2.3778,
"step": 1050
},
{
"epoch": 4.19,
"grad_norm": 1.4909169673919678,
"learning_rate": 3.1676771421502746e-05,
"loss": 2.4634,
"step": 1060
},
{
"epoch": 4.23,
"grad_norm": 2.009910821914673,
"learning_rate": 3.137471445906675e-05,
"loss": 2.4035,
"step": 1070
},
{
"epoch": 4.27,
"grad_norm": 1.4564893245697021,
"learning_rate": 3.107165886028326e-05,
"loss": 2.4581,
"step": 1080
},
{
"epoch": 4.31,
"grad_norm": 1.6162135601043701,
"learning_rate": 3.076765210058638e-05,
"loss": 2.4216,
"step": 1090
},
{
"epoch": 4.35,
"grad_norm": 1.469684362411499,
"learning_rate": 3.046274180441512e-05,
"loss": 2.3395,
"step": 1100
},
{
"epoch": 4.39,
"grad_norm": 2.3828556537628174,
"learning_rate": 3.015697573775283e-05,
"loss": 2.4602,
"step": 1110
},
{
"epoch": 4.42,
"grad_norm": 1.5302035808563232,
"learning_rate": 2.9850401800644257e-05,
"loss": 2.4116,
"step": 1120
},
{
"epoch": 4.46,
"grad_norm": 2.1008236408233643,
"learning_rate": 2.9543068019691833e-05,
"loss": 2.2545,
"step": 1130
},
{
"epoch": 4.5,
"grad_norm": 1.4228670597076416,
"learning_rate": 2.923502254053193e-05,
"loss": 2.4589,
"step": 1140
},
{
"epoch": 4.54,
"grad_norm": 1.4719305038452148,
"learning_rate": 2.892631362029265e-05,
"loss": 2.3918,
"step": 1150
},
{
"epoch": 4.58,
"grad_norm": 1.771802544593811,
"learning_rate": 2.8616989620034013e-05,
"loss": 2.3929,
"step": 1160
},
{
"epoch": 4.62,
"grad_norm": 1.5566627979278564,
"learning_rate": 2.83070989971719e-05,
"loss": 2.3442,
"step": 1170
},
{
"epoch": 4.66,
"grad_norm": 1.8499693870544434,
"learning_rate": 2.7996690297886995e-05,
"loss": 2.4422,
"step": 1180
},
{
"epoch": 4.7,
"grad_norm": 1.5866152048110962,
"learning_rate": 2.768581214951964e-05,
"loss": 2.4489,
"step": 1190
},
{
"epoch": 4.74,
"grad_norm": 1.5571675300598145,
"learning_rate": 2.737451325295214e-05,
"loss": 2.3453,
"step": 1200
},
{
"epoch": 4.74,
"eval_loss": 2.4050841331481934,
"eval_runtime": 133.8041,
"eval_samples_per_second": 6.726,
"eval_steps_per_second": 3.363,
"step": 1200
},
{
"epoch": 4.78,
"grad_norm": 1.371382474899292,
"learning_rate": 2.706284237497948e-05,
"loss": 2.3094,
"step": 1210
},
{
"epoch": 4.82,
"grad_norm": 1.5894430875778198,
"learning_rate": 2.675084834066968e-05,
"loss": 2.352,
"step": 1220
},
{
"epoch": 4.86,
"grad_norm": 1.9093360900878906,
"learning_rate": 2.6438580025715138e-05,
"loss": 2.3941,
"step": 1230
},
{
"epoch": 4.9,
"grad_norm": 1.7057812213897705,
"learning_rate": 2.612608634877588e-05,
"loss": 2.408,
"step": 1240
},
{
"epoch": 4.94,
"grad_norm": 4.448972225189209,
"learning_rate": 2.5813416263816227e-05,
"loss": 2.4234,
"step": 1250
},
{
"epoch": 4.98,
"grad_norm": 1.4726636409759521,
"learning_rate": 2.550061875243584e-05,
"loss": 2.4223,
"step": 1260
},
{
"epoch": 5.02,
"grad_norm": 1.3479527235031128,
"learning_rate": 2.5187742816196487e-05,
"loss": 2.3444,
"step": 1270
},
{
"epoch": 5.06,
"grad_norm": 1.584678292274475,
"learning_rate": 2.487483746894563e-05,
"loss": 2.4881,
"step": 1280
},
{
"epoch": 5.1,
"grad_norm": 1.5328477621078491,
"learning_rate": 2.4561951729138167e-05,
"loss": 2.3752,
"step": 1290
},
{
"epoch": 5.14,
"grad_norm": 2.0441181659698486,
"learning_rate": 2.4249134612157346e-05,
"loss": 2.4605,
"step": 1300
},
{
"epoch": 5.18,
"grad_norm": 1.5883582830429077,
"learning_rate": 2.393643512263627e-05,
"loss": 2.3095,
"step": 1310
},
{
"epoch": 5.21,
"grad_norm": 1.6504632234573364,
"learning_rate": 2.3623902246780994e-05,
"loss": 2.3773,
"step": 1320
},
{
"epoch": 5.25,
"grad_norm": 2.101841926574707,
"learning_rate": 2.331158494469657e-05,
"loss": 2.3966,
"step": 1330
},
{
"epoch": 5.29,
"grad_norm": 1.5765920877456665,
"learning_rate": 2.2999532142717174e-05,
"loss": 2.4361,
"step": 1340
},
{
"epoch": 5.33,
"grad_norm": 2.0858278274536133,
"learning_rate": 2.268779272574146e-05,
"loss": 2.3576,
"step": 1350
},
{
"epoch": 5.37,
"grad_norm": 1.7046364545822144,
"learning_rate": 2.2376415529574525e-05,
"loss": 2.4298,
"step": 1360
},
{
"epoch": 5.41,
"grad_norm": 1.7461296319961548,
"learning_rate": 2.206544933327742e-05,
"loss": 2.3175,
"step": 1370
},
{
"epoch": 5.45,
"grad_norm": 2.0052788257598877,
"learning_rate": 2.1754942851525677e-05,
"loss": 2.3432,
"step": 1380
},
{
"epoch": 5.49,
"grad_norm": 1.8527193069458008,
"learning_rate": 2.1444944726977857e-05,
"loss": 2.2937,
"step": 1390
},
{
"epoch": 5.53,
"grad_norm": 1.8431612253189087,
"learning_rate": 2.1135503522655374e-05,
"loss": 2.3031,
"step": 1400
},
{
"epoch": 5.57,
"grad_norm": 1.8249716758728027,
"learning_rate": 2.082666771433484e-05,
"loss": 2.4171,
"step": 1410
},
{
"epoch": 5.61,
"grad_norm": 1.6596335172653198,
"learning_rate": 2.0518485682954025e-05,
"loss": 2.4917,
"step": 1420
},
{
"epoch": 5.65,
"grad_norm": 1.8855317831039429,
"learning_rate": 2.0211005707032733e-05,
"loss": 2.3648,
"step": 1430
},
{
"epoch": 5.69,
"grad_norm": 1.6180534362792969,
"learning_rate": 1.9904275955109652e-05,
"loss": 2.4083,
"step": 1440
},
{
"epoch": 5.73,
"grad_norm": 1.5273176431655884,
"learning_rate": 1.959834447819649e-05,
"loss": 2.4187,
"step": 1450
},
{
"epoch": 5.77,
"grad_norm": 1.8004169464111328,
"learning_rate": 1.9293259202250517e-05,
"loss": 2.4147,
"step": 1460
},
{
"epoch": 5.81,
"grad_norm": 1.641048550605774,
"learning_rate": 1.8989067920666633e-05,
"loss": 2.3738,
"step": 1470
},
{
"epoch": 5.85,
"grad_norm": 1.586946964263916,
"learning_rate": 1.8685818286790325e-05,
"loss": 2.4126,
"step": 1480
},
{
"epoch": 5.89,
"grad_norm": 2.0825576782226562,
"learning_rate": 1.8383557806452433e-05,
"loss": 2.3781,
"step": 1490
},
{
"epoch": 5.93,
"grad_norm": 1.7725000381469727,
"learning_rate": 1.808233383052709e-05,
"loss": 2.2956,
"step": 1500
},
{
"epoch": 5.97,
"grad_norm": 1.7009029388427734,
"learning_rate": 1.7782193547513974e-05,
"loss": 2.3416,
"step": 1510
},
{
"epoch": 6.0,
"grad_norm": 1.7379595041275024,
"learning_rate": 1.7483183976145894e-05,
"loss": 2.3466,
"step": 1520
},
{
"epoch": 6.04,
"grad_norm": 1.678911805152893,
"learning_rate": 1.7185351958023082e-05,
"loss": 2.4167,
"step": 1530
},
{
"epoch": 6.08,
"grad_norm": 1.663160800933838,
"learning_rate": 1.6888744150275148e-05,
"loss": 2.4156,
"step": 1540
},
{
"epoch": 6.12,
"grad_norm": 1.5950766801834106,
"learning_rate": 1.6593407018251973e-05,
"loss": 2.3795,
"step": 1550
},
{
"epoch": 6.16,
"grad_norm": 1.5608184337615967,
"learning_rate": 1.6299386828244645e-05,
"loss": 2.3945,
"step": 1560
},
{
"epoch": 6.2,
"grad_norm": 2.6302921772003174,
"learning_rate": 1.60067296402376e-05,
"loss": 2.3195,
"step": 1570
},
{
"epoch": 6.24,
"grad_norm": 1.7563198804855347,
"learning_rate": 1.5715481300692993e-05,
"loss": 2.3551,
"step": 1580
},
{
"epoch": 6.28,
"grad_norm": 2.2081732749938965,
"learning_rate": 1.5425687435368648e-05,
"loss": 2.3597,
"step": 1590
},
{
"epoch": 6.32,
"grad_norm": 2.120513916015625,
"learning_rate": 1.5137393442170461e-05,
"loss": 2.3758,
"step": 1600
},
{
"epoch": 6.32,
"eval_loss": 2.400667428970337,
"eval_runtime": 133.4489,
"eval_samples_per_second": 6.744,
"eval_steps_per_second": 3.372,
"step": 1600
},
{
"epoch": 6.36,
"grad_norm": 1.9258661270141602,
"learning_rate": 1.4850644484040584e-05,
"loss": 2.3852,
"step": 1610
},
{
"epoch": 6.4,
"grad_norm": 1.749426007270813,
"learning_rate": 1.4565485481882396e-05,
"loss": 2.3067,
"step": 1620
},
{
"epoch": 6.44,
"grad_norm": 1.9953992366790771,
"learning_rate": 1.4281961107523336e-05,
"loss": 2.3013,
"step": 1630
},
{
"epoch": 6.48,
"grad_norm": 2.156952381134033,
"learning_rate": 1.4000115776716849e-05,
"loss": 2.3504,
"step": 1640
},
{
"epoch": 6.52,
"grad_norm": 2.4170098304748535,
"learning_rate": 1.371999364218437e-05,
"loss": 2.3035,
"step": 1650
},
{
"epoch": 6.56,
"grad_norm": 2.338102340698242,
"learning_rate": 1.3441638586698527e-05,
"loss": 2.2753,
"step": 1660
},
{
"epoch": 6.6,
"grad_norm": 2.286085605621338,
"learning_rate": 1.3165094216208696e-05,
"loss": 2.3644,
"step": 1670
},
{
"epoch": 6.64,
"grad_norm": 2.505244016647339,
"learning_rate": 1.2890403853009847e-05,
"loss": 2.371,
"step": 1680
},
{
"epoch": 6.68,
"grad_norm": 1.636423110961914,
"learning_rate": 1.2617610528955814e-05,
"loss": 2.3602,
"step": 1690
},
{
"epoch": 6.72,
"grad_norm": 1.6253471374511719,
"learning_rate": 1.234675697871818e-05,
"loss": 2.3858,
"step": 1700
},
{
"epoch": 6.76,
"grad_norm": 1.9490761756896973,
"learning_rate": 1.2077885633091595e-05,
"loss": 2.2864,
"step": 1710
},
{
"epoch": 6.8,
"grad_norm": 1.7611408233642578,
"learning_rate": 1.1811038612346728e-05,
"loss": 2.2646,
"step": 1720
},
{
"epoch": 6.83,
"grad_norm": 1.9415556192398071,
"learning_rate": 1.154625771963192e-05,
"loss": 2.311,
"step": 1730
},
{
"epoch": 6.87,
"grad_norm": 2.0429086685180664,
"learning_rate": 1.1283584434424455e-05,
"loss": 2.3504,
"step": 1740
},
{
"epoch": 6.91,
"grad_norm": 2.0815227031707764,
"learning_rate": 1.102305990603257e-05,
"loss": 2.3426,
"step": 1750
},
{
"epoch": 6.95,
"grad_norm": 1.8559825420379639,
"learning_rate": 1.0764724947149132e-05,
"loss": 2.3183,
"step": 1760
},
{
"epoch": 6.99,
"grad_norm": 2.6576716899871826,
"learning_rate": 1.0508620027458158e-05,
"loss": 2.378,
"step": 1770
},
{
"epoch": 7.03,
"grad_norm": 1.9085129499435425,
"learning_rate": 1.0254785267294958e-05,
"loss": 2.3286,
"step": 1780
},
{
"epoch": 7.07,
"grad_norm": 1.899032711982727,
"learning_rate": 1.0003260431361039e-05,
"loss": 2.3615,
"step": 1790
},
{
"epoch": 7.11,
"grad_norm": 1.8750344514846802,
"learning_rate": 9.75408492249478e-06,
"loss": 2.3459,
"step": 1800
},
{
"epoch": 7.15,
"grad_norm": 2.1118319034576416,
"learning_rate": 9.507297775498707e-06,
"loss": 2.4204,
"step": 1810
},
{
"epoch": 7.19,
"grad_norm": 1.971839189529419,
"learning_rate": 9.262937651024462e-06,
"loss": 2.3497,
"step": 1820
},
{
"epoch": 7.23,
"grad_norm": 2.0775558948516846,
"learning_rate": 9.02104282951641e-06,
"loss": 2.3027,
"step": 1830
},
{
"epoch": 7.27,
"grad_norm": 2.3251700401306152,
"learning_rate": 8.781651205214775e-06,
"loss": 2.3317,
"step": 1840
},
{
"epoch": 7.31,
"grad_norm": 2.0580742359161377,
"learning_rate": 8.544800280219282e-06,
"loss": 2.3516,
"step": 1850
},
{
"epoch": 7.35,
"grad_norm": 2.2532641887664795,
"learning_rate": 8.310527158614204e-06,
"loss": 2.2712,
"step": 1860
},
{
"epoch": 7.39,
"grad_norm": 2.1395344734191895,
"learning_rate": 8.07886854065585e-06,
"loss": 2.3357,
"step": 1870
},
{
"epoch": 7.43,
"grad_norm": 1.6818287372589111,
"learning_rate": 7.849860717023217e-06,
"loss": 2.3414,
"step": 1880
},
{
"epoch": 7.47,
"grad_norm": 2.29758882522583,
"learning_rate": 7.62353956313284e-06,
"loss": 2.2435,
"step": 1890
},
{
"epoch": 7.51,
"grad_norm": 2.3084988594055176,
"learning_rate": 7.3999405335187124e-06,
"loss": 2.3185,
"step": 1900
},
{
"epoch": 7.55,
"grad_norm": 2.1061959266662598,
"learning_rate": 7.17909865627813e-06,
"loss": 2.3499,
"step": 1910
},
{
"epoch": 7.59,
"grad_norm": 1.7624549865722656,
"learning_rate": 6.961048527584296e-06,
"loss": 2.3895,
"step": 1920
},
{
"epoch": 7.62,
"grad_norm": 2.2806477546691895,
"learning_rate": 6.745824306266685e-06,
"loss": 2.3313,
"step": 1930
},
{
"epoch": 7.66,
"grad_norm": 1.7848331928253174,
"learning_rate": 6.533459708459827e-06,
"loss": 2.4686,
"step": 1940
},
{
"epoch": 7.7,
"grad_norm": 1.8504621982574463,
"learning_rate": 6.323988002321471e-06,
"loss": 2.2985,
"step": 1950
},
{
"epoch": 7.74,
"grad_norm": 2.1124908924102783,
"learning_rate": 6.1174420028209585e-06,
"loss": 2.3432,
"step": 1960
},
{
"epoch": 7.78,
"grad_norm": 1.7486050128936768,
"learning_rate": 5.9138540665985595e-06,
"loss": 2.3414,
"step": 1970
},
{
"epoch": 7.82,
"grad_norm": 2.394221782684326,
"learning_rate": 5.713256086896604e-06,
"loss": 2.3297,
"step": 1980
},
{
"epoch": 7.86,
"grad_norm": 2.157335042953491,
"learning_rate": 5.5156794885632165e-06,
"loss": 2.2748,
"step": 1990
},
{
"epoch": 7.9,
"grad_norm": 2.8654778003692627,
"learning_rate": 5.3211552231294485e-06,
"loss": 2.3073,
"step": 2000
},
{
"epoch": 7.9,
"eval_loss": 2.395761489868164,
"eval_runtime": 134.6111,
"eval_samples_per_second": 6.686,
"eval_steps_per_second": 3.343,
"step": 2000
}
],
"logging_steps": 10,
"max_steps": 2530,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 400,
"total_flos": 2.2040949247338086e+18,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}