{
"best_metric": 1.9801422357559204,
"best_model_checkpoint": "miner_id_24/checkpoint-150",
"epoch": 3.0149726461272675,
"eval_steps": 25,
"global_step": 163,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.018427872156636913,
"grad_norm": 15.235661506652832,
"learning_rate": 1.6666666666666667e-05,
"loss": 10.9897,
"step": 1
},
{
"epoch": 0.018427872156636913,
"eval_loss": 10.919320106506348,
"eval_runtime": 0.2022,
"eval_samples_per_second": 247.286,
"eval_steps_per_second": 64.294,
"step": 1
},
{
"epoch": 0.036855744313273826,
"grad_norm": 17.35055160522461,
"learning_rate": 3.3333333333333335e-05,
"loss": 11.0612,
"step": 2
},
{
"epoch": 0.05528361646991074,
"grad_norm": 21.442893981933594,
"learning_rate": 5e-05,
"loss": 10.9134,
"step": 3
},
{
"epoch": 0.07371148862654765,
"grad_norm": 26.32647132873535,
"learning_rate": 6.666666666666667e-05,
"loss": 10.8533,
"step": 4
},
{
"epoch": 0.09213936078318456,
"grad_norm": 28.515634536743164,
"learning_rate": 8.333333333333334e-05,
"loss": 10.7699,
"step": 5
},
{
"epoch": 0.11056723293982149,
"grad_norm": 30.190385818481445,
"learning_rate": 0.0001,
"loss": 10.458,
"step": 6
},
{
"epoch": 0.1289951050964584,
"grad_norm": 41.93853759765625,
"learning_rate": 9.999099116842838e-05,
"loss": 9.8715,
"step": 7
},
{
"epoch": 0.1474229772530953,
"grad_norm": 49.35661697387695,
"learning_rate": 9.996396828078219e-05,
"loss": 8.9689,
"step": 8
},
{
"epoch": 0.16585084940973222,
"grad_norm": 34.102195739746094,
"learning_rate": 9.99189421568234e-05,
"loss": 8.3176,
"step": 9
},
{
"epoch": 0.18427872156636912,
"grad_norm": 24.88856315612793,
"learning_rate": 9.985593082467497e-05,
"loss": 7.8949,
"step": 10
},
{
"epoch": 0.20270659372300603,
"grad_norm": 23.59130859375,
"learning_rate": 9.977495951360264e-05,
"loss": 7.423,
"step": 11
},
{
"epoch": 0.22113446587964297,
"grad_norm": 23.152849197387695,
"learning_rate": 9.967606064391318e-05,
"loss": 7.1411,
"step": 12
},
{
"epoch": 0.23956233803627988,
"grad_norm": 16.93653678894043,
"learning_rate": 9.955927381397373e-05,
"loss": 7.9503,
"step": 13
},
{
"epoch": 0.2579902101929168,
"grad_norm": 18.210586547851562,
"learning_rate": 9.942464578435674e-05,
"loss": 8.7342,
"step": 14
},
{
"epoch": 0.2764180823495537,
"grad_norm": 24.980066299438477,
"learning_rate": 9.92722304591175e-05,
"loss": 7.7758,
"step": 15
},
{
"epoch": 0.2948459545061906,
"grad_norm": 23.48898696899414,
"learning_rate": 9.910208886421129e-05,
"loss": 6.4183,
"step": 16
},
{
"epoch": 0.31327382666282755,
"grad_norm": 20.72090721130371,
"learning_rate": 9.891428912305911e-05,
"loss": 5.7773,
"step": 17
},
{
"epoch": 0.33170169881946443,
"grad_norm": 17.702342987060547,
"learning_rate": 9.870890642927143e-05,
"loss": 5.4866,
"step": 18
},
{
"epoch": 0.35012957097610137,
"grad_norm": 14.644960403442383,
"learning_rate": 9.84860230165415e-05,
"loss": 5.2156,
"step": 19
},
{
"epoch": 0.36855744313273825,
"grad_norm": 13.854472160339355,
"learning_rate": 9.824572812571928e-05,
"loss": 4.9238,
"step": 20
},
{
"epoch": 0.3869853152893752,
"grad_norm": 13.135542869567871,
"learning_rate": 9.798811796908029e-05,
"loss": 4.7154,
"step": 21
},
{
"epoch": 0.40541318744601207,
"grad_norm": 13.009698867797852,
"learning_rate": 9.771329569180288e-05,
"loss": 4.4549,
"step": 22
},
{
"epoch": 0.423841059602649,
"grad_norm": 13.120542526245117,
"learning_rate": 9.742137133066958e-05,
"loss": 4.1579,
"step": 23
},
{
"epoch": 0.44226893175928594,
"grad_norm": 12.632692337036133,
"learning_rate": 9.711246177000938e-05,
"loss": 3.9329,
"step": 24
},
{
"epoch": 0.4606968039159228,
"grad_norm": 12.716289520263672,
"learning_rate": 9.678669069489793e-05,
"loss": 3.6266,
"step": 25
},
{
"epoch": 0.4606968039159228,
"eval_loss": 3.70853328704834,
"eval_runtime": 0.1911,
"eval_samples_per_second": 261.604,
"eval_steps_per_second": 68.017,
"step": 25
},
{
"epoch": 0.47912467607255976,
"grad_norm": 25.649930953979492,
"learning_rate": 9.644418854163509e-05,
"loss": 5.2592,
"step": 26
},
{
"epoch": 0.49755254822919665,
"grad_norm": 21.4190673828125,
"learning_rate": 9.608509244551916e-05,
"loss": 4.866,
"step": 27
},
{
"epoch": 0.5159804203858336,
"grad_norm": 13.054777145385742,
"learning_rate": 9.570954618593895e-05,
"loss": 3.5799,
"step": 28
},
{
"epoch": 0.5344082925424705,
"grad_norm": 8.924787521362305,
"learning_rate": 9.531770012880553e-05,
"loss": 2.9322,
"step": 29
},
{
"epoch": 0.5528361646991073,
"grad_norm": 7.91404390335083,
"learning_rate": 9.490971116634696e-05,
"loss": 2.8879,
"step": 30
},
{
"epoch": 0.5712640368557443,
"grad_norm": 7.769594192504883,
"learning_rate": 9.448574265428971e-05,
"loss": 2.7929,
"step": 31
},
{
"epoch": 0.5896919090123812,
"grad_norm": 6.016697883605957,
"learning_rate": 9.404596434645231e-05,
"loss": 2.6413,
"step": 32
},
{
"epoch": 0.6081197811690181,
"grad_norm": 5.778473377227783,
"learning_rate": 9.359055232677717e-05,
"loss": 2.5554,
"step": 33
},
{
"epoch": 0.6265476533256551,
"grad_norm": 5.234768390655518,
"learning_rate": 9.31196889388279e-05,
"loss": 2.4804,
"step": 34
},
{
"epoch": 0.644975525482292,
"grad_norm": 4.422307968139648,
"learning_rate": 9.263356271278027e-05,
"loss": 2.3563,
"step": 35
},
{
"epoch": 0.6634033976389289,
"grad_norm": 4.935121536254883,
"learning_rate": 9.213236828993619e-05,
"loss": 2.3584,
"step": 36
},
{
"epoch": 0.6818312697955657,
"grad_norm": 5.38230037689209,
"learning_rate": 9.161630634479079e-05,
"loss": 2.2312,
"step": 37
},
{
"epoch": 0.7002591419522027,
"grad_norm": 6.988575458526611,
"learning_rate": 9.10855835046838e-05,
"loss": 2.6297,
"step": 38
},
{
"epoch": 0.7186870141088396,
"grad_norm": 14.805834770202637,
"learning_rate": 9.054041226706758e-05,
"loss": 2.9602,
"step": 39
},
{
"epoch": 0.7371148862654765,
"grad_norm": 9.114249229431152,
"learning_rate": 8.998101091442468e-05,
"loss": 2.5981,
"step": 40
},
{
"epoch": 0.7555427584221135,
"grad_norm": 5.007589340209961,
"learning_rate": 8.940760342686917e-05,
"loss": 2.2517,
"step": 41
},
{
"epoch": 0.7739706305787504,
"grad_norm": 3.1597187519073486,
"learning_rate": 8.882041939246671e-05,
"loss": 2.1793,
"step": 42
},
{
"epoch": 0.7923985027353873,
"grad_norm": 2.7579259872436523,
"learning_rate": 8.821969391530922e-05,
"loss": 2.1703,
"step": 43
},
{
"epoch": 0.8108263748920241,
"grad_norm": 2.0153744220733643,
"learning_rate": 8.760566752138085e-05,
"loss": 2.1642,
"step": 44
},
{
"epoch": 0.8292542470486611,
"grad_norm": 2.2700977325439453,
"learning_rate": 8.697858606225335e-05,
"loss": 2.1068,
"step": 45
},
{
"epoch": 0.847682119205298,
"grad_norm": 1.8285726308822632,
"learning_rate": 8.633870061664878e-05,
"loss": 2.1288,
"step": 46
},
{
"epoch": 0.8661099913619349,
"grad_norm": 1.4672825336456299,
"learning_rate": 8.568626738990958e-05,
"loss": 2.0792,
"step": 47
},
{
"epoch": 0.8845378635185719,
"grad_norm": 2.0133278369903564,
"learning_rate": 8.50215476114158e-05,
"loss": 2.1263,
"step": 48
},
{
"epoch": 0.9029657356752088,
"grad_norm": 1.6172701120376587,
"learning_rate": 8.434480742999089e-05,
"loss": 2.0611,
"step": 49
},
{
"epoch": 0.9213936078318457,
"grad_norm": 1.496863603591919,
"learning_rate": 8.365631780733758e-05,
"loss": 2.0717,
"step": 50
},
{
"epoch": 0.9213936078318457,
"eval_loss": 2.095752477645874,
"eval_runtime": 0.1966,
"eval_samples_per_second": 254.354,
"eval_steps_per_second": 66.132,
"step": 50
},
{
"epoch": 0.9398214799884825,
"grad_norm": 3.3888142108917236,
"learning_rate": 8.295635440954695e-05,
"loss": 2.2467,
"step": 51
},
{
"epoch": 0.9582493521451195,
"grad_norm": 1.6675165891647339,
"learning_rate": 8.224519749672376e-05,
"loss": 2.1031,
"step": 52
},
{
"epoch": 0.9766772243017564,
"grad_norm": 1.0358479022979736,
"learning_rate": 8.152313181077242e-05,
"loss": 2.0694,
"step": 53
},
{
"epoch": 0.9951050964583933,
"grad_norm": 1.0793170928955078,
"learning_rate": 8.079044646138837e-05,
"loss": 2.0398,
"step": 54
},
{
"epoch": 1.0172761301468471,
"grad_norm": 5.010680675506592,
"learning_rate": 8.004743481030088e-05,
"loss": 4.036,
"step": 55
},
{
"epoch": 1.035704002303484,
"grad_norm": 3.263810873031616,
"learning_rate": 7.929439435381305e-05,
"loss": 2.2462,
"step": 56
},
{
"epoch": 1.054131874460121,
"grad_norm": 2.19437837600708,
"learning_rate": 7.853162660368662e-05,
"loss": 2.1076,
"step": 57
},
{
"epoch": 1.0725597466167578,
"grad_norm": 1.1197891235351562,
"learning_rate": 7.775943696641888e-05,
"loss": 2.0394,
"step": 58
},
{
"epoch": 1.0909876187733947,
"grad_norm": 1.35910964012146,
"learning_rate": 7.697813462096025e-05,
"loss": 2.0473,
"step": 59
},
{
"epoch": 1.1094154909300316,
"grad_norm": 1.9377872943878174,
"learning_rate": 7.618803239492121e-05,
"loss": 2.0849,
"step": 60
},
{
"epoch": 1.1278433630866687,
"grad_norm": 1.510143756866455,
"learning_rate": 7.538944663931862e-05,
"loss": 2.0566,
"step": 61
},
{
"epoch": 1.1462712352433055,
"grad_norm": 2.0934293270111084,
"learning_rate": 7.458269710191101e-05,
"loss": 1.9738,
"step": 62
},
{
"epoch": 1.1646991073999424,
"grad_norm": 0.8613946437835693,
"learning_rate": 7.376810679917411e-05,
"loss": 2.0698,
"step": 63
},
{
"epoch": 1.1831269795565793,
"grad_norm": 0.933632493019104,
"learning_rate": 7.294600188696732e-05,
"loss": 2.0709,
"step": 64
},
{
"epoch": 1.2015548517132162,
"grad_norm": 1.4427233934402466,
"learning_rate": 7.211671152994348e-05,
"loss": 1.9954,
"step": 65
},
{
"epoch": 1.219982723869853,
"grad_norm": 2.0800139904022217,
"learning_rate": 7.128056776975369e-05,
"loss": 1.9794,
"step": 66
},
{
"epoch": 1.23841059602649,
"grad_norm": 1.3812874555587769,
"learning_rate": 7.043790539210045e-05,
"loss": 2.0207,
"step": 67
},
{
"epoch": 1.256838468183127,
"grad_norm": 1.9489840269088745,
"learning_rate": 6.95890617926918e-05,
"loss": 2.0679,
"step": 68
},
{
"epoch": 1.275266340339764,
"grad_norm": 1.545703411102295,
"learning_rate": 6.873437684215077e-05,
"loss": 2.0753,
"step": 69
},
{
"epoch": 1.2936942124964008,
"grad_norm": 1.9554816484451294,
"learning_rate": 6.787419274993366e-05,
"loss": 2.0324,
"step": 70
},
{
"epoch": 1.3121220846530377,
"grad_norm": 2.3300676345825195,
"learning_rate": 6.700885392731187e-05,
"loss": 2.0034,
"step": 71
},
{
"epoch": 1.3305499568096746,
"grad_norm": 1.7361541986465454,
"learning_rate": 6.613870684947231e-05,
"loss": 2.0202,
"step": 72
},
{
"epoch": 1.3489778289663115,
"grad_norm": 1.2443159818649292,
"learning_rate": 6.526409991679134e-05,
"loss": 1.9933,
"step": 73
},
{
"epoch": 1.3674057011229483,
"grad_norm": 0.6612122654914856,
"learning_rate": 6.438538331533768e-05,
"loss": 1.9769,
"step": 74
},
{
"epoch": 1.3858335732795855,
"grad_norm": 0.6065022945404053,
"learning_rate": 6.350290887666078e-05,
"loss": 2.0084,
"step": 75
},
{
"epoch": 1.3858335732795855,
"eval_loss": 2.0034306049346924,
"eval_runtime": 0.1919,
"eval_samples_per_second": 260.533,
"eval_steps_per_second": 67.739,
"step": 75
},
{
"epoch": 1.4042614454362223,
"grad_norm": 0.7181031107902527,
"learning_rate": 6.261702993691994e-05,
"loss": 2.0097,
"step": 76
},
{
"epoch": 1.4226893175928592,
"grad_norm": 1.1980915069580078,
"learning_rate": 6.172810119541118e-05,
"loss": 2.0081,
"step": 77
},
{
"epoch": 1.441117189749496,
"grad_norm": 2.185781478881836,
"learning_rate": 6.083647857254837e-05,
"loss": 1.9817,
"step": 78
},
{
"epoch": 1.459545061906133,
"grad_norm": 0.7530511021614075,
"learning_rate": 5.9942519067355284e-05,
"loss": 1.9986,
"step": 79
},
{
"epoch": 1.4779729340627699,
"grad_norm": 1.7427482604980469,
"learning_rate": 5.904658061452585e-05,
"loss": 2.0496,
"step": 80
},
{
"epoch": 1.4964008062194067,
"grad_norm": 2.031916618347168,
"learning_rate": 5.814902194110988e-05,
"loss": 2.0361,
"step": 81
},
{
"epoch": 1.5148286783760438,
"grad_norm": 1.6156734228134155,
"learning_rate": 5.7250202422881336e-05,
"loss": 2.0416,
"step": 82
},
{
"epoch": 1.5332565505326807,
"grad_norm": 0.9678031206130981,
"learning_rate": 5.635048194044702e-05,
"loss": 2.0342,
"step": 83
},
{
"epoch": 1.5516844226893176,
"grad_norm": 0.4935534596443176,
"learning_rate": 5.5450220735153056e-05,
"loss": 1.9647,
"step": 84
},
{
"epoch": 1.5701122948459545,
"grad_norm": 1.1301989555358887,
"learning_rate": 5.4549779264846955e-05,
"loss": 1.9986,
"step": 85
},
{
"epoch": 1.5885401670025914,
"grad_norm": 2.0241589546203613,
"learning_rate": 5.3649518059552994e-05,
"loss": 2.0601,
"step": 86
},
{
"epoch": 1.6069680391592285,
"grad_norm": 1.204086184501648,
"learning_rate": 5.2749797577118675e-05,
"loss": 1.9727,
"step": 87
},
{
"epoch": 1.6253959113158651,
"grad_norm": 1.2840675115585327,
"learning_rate": 5.185097805889013e-05,
"loss": 2.0326,
"step": 88
},
{
"epoch": 1.6438237834725022,
"grad_norm": 1.003303050994873,
"learning_rate": 5.0953419385474155e-05,
"loss": 1.9445,
"step": 89
},
{
"epoch": 1.6622516556291391,
"grad_norm": 0.5838247537612915,
"learning_rate": 5.005748093264473e-05,
"loss": 1.9922,
"step": 90
},
{
"epoch": 1.680679527785776,
"grad_norm": 0.7593981623649597,
"learning_rate": 4.916352142745163e-05,
"loss": 1.966,
"step": 91
},
{
"epoch": 1.6991073999424129,
"grad_norm": 0.8177780508995056,
"learning_rate": 4.827189880458882e-05,
"loss": 1.9918,
"step": 92
},
{
"epoch": 1.7175352720990498,
"grad_norm": 0.8616487979888916,
"learning_rate": 4.7382970063080076e-05,
"loss": 2.0279,
"step": 93
},
{
"epoch": 1.7359631442556869,
"grad_norm": 1.1605690717697144,
"learning_rate": 4.649709112333923e-05,
"loss": 2.0104,
"step": 94
},
{
"epoch": 1.7543910164123235,
"grad_norm": 1.9952294826507568,
"learning_rate": 4.561461668466233e-05,
"loss": 1.9628,
"step": 95
},
{
"epoch": 1.7728188885689606,
"grad_norm": 1.4495019912719727,
"learning_rate": 4.473590008320868e-05,
"loss": 2.0058,
"step": 96
},
{
"epoch": 1.7912467607255975,
"grad_norm": 1.114969253540039,
"learning_rate": 4.386129315052768e-05,
"loss": 2.0131,
"step": 97
},
{
"epoch": 1.8096746328822344,
"grad_norm": 0.8008613586425781,
"learning_rate": 4.299114607268814e-05,
"loss": 1.9947,
"step": 98
},
{
"epoch": 1.8281025050388713,
"grad_norm": 0.7169589400291443,
"learning_rate": 4.2125807250066354e-05,
"loss": 1.9454,
"step": 99
},
{
"epoch": 1.8465303771955082,
"grad_norm": 1.009277582168579,
"learning_rate": 4.1265623157849235e-05,
"loss": 2.0115,
"step": 100
},
{
"epoch": 1.8465303771955082,
"eval_loss": 1.994510531425476,
"eval_runtime": 0.192,
"eval_samples_per_second": 260.351,
"eval_steps_per_second": 67.691,
"step": 100
},
{
"epoch": 1.8649582493521453,
"grad_norm": 0.5230905413627625,
"learning_rate": 4.041093820730821e-05,
"loss": 1.9376,
"step": 101
},
{
"epoch": 1.883386121508782,
"grad_norm": 1.3596158027648926,
"learning_rate": 3.956209460789957e-05,
"loss": 2.0129,
"step": 102
},
{
"epoch": 1.901813993665419,
"grad_norm": 1.3502169847488403,
"learning_rate": 3.871943223024631e-05,
"loss": 1.9809,
"step": 103
},
{
"epoch": 1.9202418658220557,
"grad_norm": 0.7711046934127808,
"learning_rate": 3.788328847005654e-05,
"loss": 2.0039,
"step": 104
},
{
"epoch": 1.9386697379786928,
"grad_norm": 1.2358088493347168,
"learning_rate": 3.7053998113032694e-05,
"loss": 1.9582,
"step": 105
},
{
"epoch": 1.9570976101353297,
"grad_norm": 0.9068220853805542,
"learning_rate": 3.623189320082592e-05,
"loss": 1.9918,
"step": 106
},
{
"epoch": 1.9755254822919666,
"grad_norm": 0.4698697328567505,
"learning_rate": 3.5417302898089e-05,
"loss": 1.9919,
"step": 107
},
{
"epoch": 1.9939533544486037,
"grad_norm": 0.7360281944274902,
"learning_rate": 3.461055336068141e-05,
"loss": 1.9719,
"step": 108
},
{
"epoch": 2.016124388137057,
"grad_norm": 2.62359356880188,
"learning_rate": 3.3811967605078796e-05,
"loss": 4.039,
"step": 109
},
{
"epoch": 2.0345522602936943,
"grad_norm": 1.5227746963500977,
"learning_rate": 3.3021865379039764e-05,
"loss": 2.0704,
"step": 110
},
{
"epoch": 2.052980132450331,
"grad_norm": 0.9037619829177856,
"learning_rate": 3.224056303358112e-05,
"loss": 2.0033,
"step": 111
},
{
"epoch": 2.071408004606968,
"grad_norm": 1.6077276468276978,
"learning_rate": 3.14683733963134e-05,
"loss": 1.9768,
"step": 112
},
{
"epoch": 2.089835876763605,
"grad_norm": 1.0122666358947754,
"learning_rate": 3.070560564618696e-05,
"loss": 2.03,
"step": 113
},
{
"epoch": 2.108263748920242,
"grad_norm": 1.7775009870529175,
"learning_rate": 2.995256518969914e-05,
"loss": 1.9901,
"step": 114
},
{
"epoch": 2.126691621076879,
"grad_norm": 0.5027958750724792,
"learning_rate": 2.9209553538611633e-05,
"loss": 2.0267,
"step": 115
},
{
"epoch": 2.1451194932335156,
"grad_norm": 1.9773569107055664,
"learning_rate": 2.8476868189227602e-05,
"loss": 1.9419,
"step": 116
},
{
"epoch": 2.1635473653901527,
"grad_norm": 0.8990527391433716,
"learning_rate": 2.7754802503276233e-05,
"loss": 1.9921,
"step": 117
},
{
"epoch": 2.1819752375467893,
"grad_norm": 0.931627094745636,
"learning_rate": 2.7043645590453064e-05,
"loss": 1.975,
"step": 118
},
{
"epoch": 2.2004031097034265,
"grad_norm": 0.4365338683128357,
"learning_rate": 2.6343682192662432e-05,
"loss": 1.9704,
"step": 119
},
{
"epoch": 2.218830981860063,
"grad_norm": 0.7581185698509216,
"learning_rate": 2.5655192570009124e-05,
"loss": 1.9586,
"step": 120
},
{
"epoch": 2.2372588540167,
"grad_norm": 0.7760223746299744,
"learning_rate": 2.497845238858419e-05,
"loss": 2.0055,
"step": 121
},
{
"epoch": 2.2556867261733373,
"grad_norm": 1.0918080806732178,
"learning_rate": 2.4313732610090437e-05,
"loss": 1.9807,
"step": 122
},
{
"epoch": 2.274114598329974,
"grad_norm": 1.8177530765533447,
"learning_rate": 2.366129938335123e-05,
"loss": 2.0595,
"step": 123
},
{
"epoch": 2.292542470486611,
"grad_norm": 1.023606777191162,
"learning_rate": 2.3021413937746657e-05,
"loss": 1.9886,
"step": 124
},
{
"epoch": 2.3109703426432477,
"grad_norm": 0.3760586082935333,
"learning_rate": 2.239433247861915e-05,
"loss": 1.9533,
"step": 125
},
{
"epoch": 2.3109703426432477,
"eval_loss": 1.9844677448272705,
"eval_runtime": 0.1939,
"eval_samples_per_second": 257.909,
"eval_steps_per_second": 67.056,
"step": 125
},
{
"epoch": 2.329398214799885,
"grad_norm": 1.1312675476074219,
"learning_rate": 2.1780306084690794e-05,
"loss": 1.9909,
"step": 126
},
{
"epoch": 2.3478260869565215,
"grad_norm": 0.3952158987522125,
"learning_rate": 2.1179580607533285e-05,
"loss": 1.952,
"step": 127
},
{
"epoch": 2.3662539591131586,
"grad_norm": 0.4795459806919098,
"learning_rate": 2.059239657313084e-05,
"loss": 1.9455,
"step": 128
},
{
"epoch": 2.3846818312697957,
"grad_norm": 1.1665923595428467,
"learning_rate": 2.0018989085575328e-05,
"loss": 2.0425,
"step": 129
},
{
"epoch": 2.4031097034264324,
"grad_norm": 0.8839353919029236,
"learning_rate": 1.9459587732932426e-05,
"loss": 2.0,
"step": 130
},
{
"epoch": 2.4215375755830695,
"grad_norm": 1.1831239461898804,
"learning_rate": 1.89144164953162e-05,
"loss": 2.0223,
"step": 131
},
{
"epoch": 2.439965447739706,
"grad_norm": 0.3851776123046875,
"learning_rate": 1.8383693655209223e-05,
"loss": 1.9546,
"step": 132
},
{
"epoch": 2.4583933198963432,
"grad_norm": 0.37334856390953064,
"learning_rate": 1.7867631710063816e-05,
"loss": 1.9876,
"step": 133
},
{
"epoch": 2.47682119205298,
"grad_norm": 1.424155592918396,
"learning_rate": 1.7366437287219743e-05,
"loss": 1.9977,
"step": 134
},
{
"epoch": 2.495249064209617,
"grad_norm": 1.1031992435455322,
"learning_rate": 1.6880311061172105e-05,
"loss": 1.9801,
"step": 135
},
{
"epoch": 2.513676936366254,
"grad_norm": 1.0327808856964111,
"learning_rate": 1.6409447673222828e-05,
"loss": 1.9884,
"step": 136
},
{
"epoch": 2.5321048085228908,
"grad_norm": 0.5112755298614502,
"learning_rate": 1.595403565354769e-05,
"loss": 1.9828,
"step": 137
},
{
"epoch": 2.550532680679528,
"grad_norm": 0.4185763895511627,
"learning_rate": 1.55142573457103e-05,
"loss": 1.9968,
"step": 138
},
{
"epoch": 2.5689605528361645,
"grad_norm": 0.47376272082328796,
"learning_rate": 1.509028883365305e-05,
"loss": 2.0109,
"step": 139
},
{
"epoch": 2.5873884249928016,
"grad_norm": 0.3518383502960205,
"learning_rate": 1.468229987119448e-05,
"loss": 1.9878,
"step": 140
},
{
"epoch": 2.6058162971494383,
"grad_norm": 0.7104719281196594,
"learning_rate": 1.4290453814061064e-05,
"loss": 1.9562,
"step": 141
},
{
"epoch": 2.6242441693060754,
"grad_norm": 0.49556511640548706,
"learning_rate": 1.391490755448084e-05,
"loss": 1.9646,
"step": 142
},
{
"epoch": 2.6426720414627125,
"grad_norm": 0.2546837329864502,
"learning_rate": 1.3555811458364907e-05,
"loss": 1.9803,
"step": 143
},
{
"epoch": 2.661099913619349,
"grad_norm": 0.785993218421936,
"learning_rate": 1.3213309305102078e-05,
"loss": 1.9414,
"step": 144
},
{
"epoch": 2.6795277857759863,
"grad_norm": 0.6566500663757324,
"learning_rate": 1.2887538229990625e-05,
"loss": 1.9518,
"step": 145
},
{
"epoch": 2.697955657932623,
"grad_norm": 0.5922719836235046,
"learning_rate": 1.2578628669330422e-05,
"loss": 2.0087,
"step": 146
},
{
"epoch": 2.71638353008926,
"grad_norm": 1.8382635116577148,
"learning_rate": 1.2286704308197134e-05,
"loss": 1.9363,
"step": 147
},
{
"epoch": 2.7348114022458967,
"grad_norm": 0.6045182943344116,
"learning_rate": 1.2011882030919708e-05,
"loss": 2.0082,
"step": 148
},
{
"epoch": 2.753239274402534,
"grad_norm": 0.38392022252082825,
"learning_rate": 1.175427187428072e-05,
"loss": 1.9739,
"step": 149
},
{
"epoch": 2.771667146559171,
"grad_norm": 1.0673091411590576,
"learning_rate": 1.1513976983458504e-05,
"loss": 1.992,
"step": 150
},
{
"epoch": 2.771667146559171,
"eval_loss": 1.9801422357559204,
"eval_runtime": 0.1911,
"eval_samples_per_second": 261.698,
"eval_steps_per_second": 68.042,
"step": 150
},
{
"epoch": 2.7900950187158076,
"grad_norm": 0.47503483295440674,
"learning_rate": 1.1291093570728562e-05,
"loss": 1.9973,
"step": 151
},
{
"epoch": 2.8085228908724447,
"grad_norm": 0.7801658511161804,
"learning_rate": 1.1085710876940912e-05,
"loss": 2.0129,
"step": 152
},
{
"epoch": 2.8269507630290813,
"grad_norm": 0.6917530298233032,
"learning_rate": 1.0897911135788709e-05,
"loss": 2.0055,
"step": 153
},
{
"epoch": 2.8453786351857184,
"grad_norm": 0.7652437686920166,
"learning_rate": 1.072776954088251e-05,
"loss": 2.0203,
"step": 154
},
{
"epoch": 2.863806507342355,
"grad_norm": 0.6981119513511658,
"learning_rate": 1.0575354215643268e-05,
"loss": 1.9213,
"step": 155
},
{
"epoch": 2.882234379498992,
"grad_norm": 0.7254536151885986,
"learning_rate": 1.0440726186026288e-05,
"loss": 2.0316,
"step": 156
},
{
"epoch": 2.9006622516556293,
"grad_norm": 0.5838481187820435,
"learning_rate": 1.0323939356086828e-05,
"loss": 1.9561,
"step": 157
},
{
"epoch": 2.919090123812266,
"grad_norm": 0.48237353563308716,
"learning_rate": 1.022504048639738e-05,
"loss": 1.9832,
"step": 158
},
{
"epoch": 2.937517995968903,
"grad_norm": 1.3573530912399292,
"learning_rate": 1.0144069175325029e-05,
"loss": 2.0506,
"step": 159
},
{
"epoch": 2.9559458681255397,
"grad_norm": 1.1986020803451538,
"learning_rate": 1.0081057843176601e-05,
"loss": 1.9252,
"step": 160
},
{
"epoch": 2.974373740282177,
"grad_norm": 0.5769727826118469,
"learning_rate": 1.0036031719217808e-05,
"loss": 1.9839,
"step": 161
},
{
"epoch": 2.9928016124388135,
"grad_norm": 0.28778672218322754,
"learning_rate": 1.0009008831571633e-05,
"loss": 2.0119,
"step": 162
},
{
"epoch": 3.0149726461272675,
"grad_norm": 2.086949110031128,
"learning_rate": 1e-05,
"loss": 3.7483,
"step": 163
}
],
"logging_steps": 1,
"max_steps": 163,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 50,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 1,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1513310000250880.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}