{
  "best_metric": 1.9801422357559204,
  "best_model_checkpoint": "miner_id_24/checkpoint-150",
  "epoch": 3.0149726461272675,
  "eval_steps": 25,
  "global_step": 163,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.018427872156636913,
      "grad_norm": 15.235661506652832,
      "learning_rate": 1.6666666666666667e-05,
      "loss": 10.9897,
      "step": 1
    },
    {
      "epoch": 0.018427872156636913,
      "eval_loss": 10.919320106506348,
      "eval_runtime": 0.2022,
      "eval_samples_per_second": 247.286,
      "eval_steps_per_second": 64.294,
      "step": 1
    },
    {
      "epoch": 0.036855744313273826,
      "grad_norm": 17.35055160522461,
      "learning_rate": 3.3333333333333335e-05,
      "loss": 11.0612,
      "step": 2
    },
    {
      "epoch": 0.05528361646991074,
      "grad_norm": 21.442893981933594,
      "learning_rate": 5e-05,
      "loss": 10.9134,
      "step": 3
    },
    {
      "epoch": 0.07371148862654765,
      "grad_norm": 26.32647132873535,
      "learning_rate": 6.666666666666667e-05,
      "loss": 10.8533,
      "step": 4
    },
    {
      "epoch": 0.09213936078318456,
      "grad_norm": 28.515634536743164,
      "learning_rate": 8.333333333333334e-05,
      "loss": 10.7699,
      "step": 5
    },
    {
      "epoch": 0.11056723293982149,
      "grad_norm": 30.190385818481445,
      "learning_rate": 0.0001,
      "loss": 10.458,
      "step": 6
    },
    {
      "epoch": 0.1289951050964584,
      "grad_norm": 41.93853759765625,
      "learning_rate": 9.999099116842838e-05,
      "loss": 9.8715,
      "step": 7
    },
    {
      "epoch": 0.1474229772530953,
      "grad_norm": 49.35661697387695,
      "learning_rate": 9.996396828078219e-05,
      "loss": 8.9689,
      "step": 8
    },
    {
      "epoch": 0.16585084940973222,
      "grad_norm": 34.102195739746094,
      "learning_rate": 9.99189421568234e-05,
      "loss": 8.3176,
      "step": 9
    },
    {
      "epoch": 0.18427872156636912,
      "grad_norm": 24.88856315612793,
      "learning_rate": 9.985593082467497e-05,
      "loss": 7.8949,
      "step": 10
    },
    {
      "epoch": 0.20270659372300603,
      "grad_norm": 23.59130859375,
      "learning_rate": 9.977495951360264e-05,
      "loss": 7.423,
      "step": 11
    },
    {
      "epoch": 0.22113446587964297,
      "grad_norm": 23.152849197387695,
      "learning_rate": 9.967606064391318e-05,
      "loss": 7.1411,
      "step": 12
    },
    {
      "epoch": 0.23956233803627988,
      "grad_norm": 16.93653678894043,
      "learning_rate": 9.955927381397373e-05,
      "loss": 7.9503,
      "step": 13
    },
    {
      "epoch": 0.2579902101929168,
      "grad_norm": 18.210586547851562,
      "learning_rate": 9.942464578435674e-05,
      "loss": 8.7342,
      "step": 14
    },
    {
      "epoch": 0.2764180823495537,
      "grad_norm": 24.980066299438477,
      "learning_rate": 9.92722304591175e-05,
      "loss": 7.7758,
      "step": 15
    },
    {
      "epoch": 0.2948459545061906,
      "grad_norm": 23.48898696899414,
      "learning_rate": 9.910208886421129e-05,
      "loss": 6.4183,
      "step": 16
    },
    {
      "epoch": 0.31327382666282755,
      "grad_norm": 20.72090721130371,
      "learning_rate": 9.891428912305911e-05,
      "loss": 5.7773,
      "step": 17
    },
    {
      "epoch": 0.33170169881946443,
      "grad_norm": 17.702342987060547,
      "learning_rate": 9.870890642927143e-05,
      "loss": 5.4866,
      "step": 18
    },
    {
      "epoch": 0.35012957097610137,
      "grad_norm": 14.644960403442383,
      "learning_rate": 9.84860230165415e-05,
      "loss": 5.2156,
      "step": 19
    },
    {
      "epoch": 0.36855744313273825,
      "grad_norm": 13.854472160339355,
      "learning_rate": 9.824572812571928e-05,
      "loss": 4.9238,
      "step": 20
    },
    {
      "epoch": 0.3869853152893752,
      "grad_norm": 13.135542869567871,
      "learning_rate": 9.798811796908029e-05,
      "loss": 4.7154,
      "step": 21
    },
    {
      "epoch": 0.40541318744601207,
      "grad_norm": 13.009698867797852,
      "learning_rate": 9.771329569180288e-05,
      "loss": 4.4549,
      "step": 22
    },
    {
      "epoch": 0.423841059602649,
      "grad_norm": 13.120542526245117,
      "learning_rate": 9.742137133066958e-05,
      "loss": 4.1579,
      "step": 23
    },
    {
      "epoch": 0.44226893175928594,
      "grad_norm": 12.632692337036133,
      "learning_rate": 9.711246177000938e-05,
      "loss": 3.9329,
      "step": 24
    },
    {
      "epoch": 0.4606968039159228,
      "grad_norm": 12.716289520263672,
      "learning_rate": 9.678669069489793e-05,
      "loss": 3.6266,
      "step": 25
    },
    {
      "epoch": 0.4606968039159228,
      "eval_loss": 3.70853328704834,
      "eval_runtime": 0.1911,
      "eval_samples_per_second": 261.604,
      "eval_steps_per_second": 68.017,
      "step": 25
    },
    {
      "epoch": 0.47912467607255976,
      "grad_norm": 25.649930953979492,
      "learning_rate": 9.644418854163509e-05,
      "loss": 5.2592,
      "step": 26
    },
    {
      "epoch": 0.49755254822919665,
      "grad_norm": 21.4190673828125,
      "learning_rate": 9.608509244551916e-05,
      "loss": 4.866,
      "step": 27
    },
    {
      "epoch": 0.5159804203858336,
      "grad_norm": 13.054777145385742,
      "learning_rate": 9.570954618593895e-05,
      "loss": 3.5799,
      "step": 28
    },
    {
      "epoch": 0.5344082925424705,
      "grad_norm": 8.924787521362305,
      "learning_rate": 9.531770012880553e-05,
      "loss": 2.9322,
      "step": 29
    },
    {
      "epoch": 0.5528361646991073,
      "grad_norm": 7.91404390335083,
      "learning_rate": 9.490971116634696e-05,
      "loss": 2.8879,
      "step": 30
    },
    {
      "epoch": 0.5712640368557443,
      "grad_norm": 7.769594192504883,
      "learning_rate": 9.448574265428971e-05,
      "loss": 2.7929,
      "step": 31
    },
    {
      "epoch": 0.5896919090123812,
      "grad_norm": 6.016697883605957,
      "learning_rate": 9.404596434645231e-05,
      "loss": 2.6413,
      "step": 32
    },
    {
      "epoch": 0.6081197811690181,
      "grad_norm": 5.778473377227783,
      "learning_rate": 9.359055232677717e-05,
      "loss": 2.5554,
      "step": 33
    },
    {
      "epoch": 0.6265476533256551,
      "grad_norm": 5.234768390655518,
      "learning_rate": 9.31196889388279e-05,
      "loss": 2.4804,
      "step": 34
    },
    {
      "epoch": 0.644975525482292,
      "grad_norm": 4.422307968139648,
      "learning_rate": 9.263356271278027e-05,
      "loss": 2.3563,
      "step": 35
    },
    {
      "epoch": 0.6634033976389289,
      "grad_norm": 4.935121536254883,
      "learning_rate": 9.213236828993619e-05,
      "loss": 2.3584,
      "step": 36
    },
    {
      "epoch": 0.6818312697955657,
      "grad_norm": 5.38230037689209,
      "learning_rate": 9.161630634479079e-05,
      "loss": 2.2312,
      "step": 37
    },
    {
      "epoch": 0.7002591419522027,
      "grad_norm": 6.988575458526611,
      "learning_rate": 9.10855835046838e-05,
      "loss": 2.6297,
      "step": 38
    },
    {
      "epoch": 0.7186870141088396,
      "grad_norm": 14.805834770202637,
      "learning_rate": 9.054041226706758e-05,
      "loss": 2.9602,
      "step": 39
    },
    {
      "epoch": 0.7371148862654765,
      "grad_norm": 9.114249229431152,
      "learning_rate": 8.998101091442468e-05,
      "loss": 2.5981,
      "step": 40
    },
    {
      "epoch": 0.7555427584221135,
      "grad_norm": 5.007589340209961,
      "learning_rate": 8.940760342686917e-05,
      "loss": 2.2517,
      "step": 41
    },
    {
      "epoch": 0.7739706305787504,
      "grad_norm": 3.1597187519073486,
      "learning_rate": 8.882041939246671e-05,
      "loss": 2.1793,
      "step": 42
    },
    {
      "epoch": 0.7923985027353873,
      "grad_norm": 2.7579259872436523,
      "learning_rate": 8.821969391530922e-05,
      "loss": 2.1703,
      "step": 43
    },
    {
      "epoch": 0.8108263748920241,
      "grad_norm": 2.0153744220733643,
      "learning_rate": 8.760566752138085e-05,
      "loss": 2.1642,
      "step": 44
    },
    {
      "epoch": 0.8292542470486611,
      "grad_norm": 2.2700977325439453,
      "learning_rate": 8.697858606225335e-05,
      "loss": 2.1068,
      "step": 45
    },
    {
      "epoch": 0.847682119205298,
      "grad_norm": 1.8285726308822632,
      "learning_rate": 8.633870061664878e-05,
      "loss": 2.1288,
      "step": 46
    },
    {
      "epoch": 0.8661099913619349,
      "grad_norm": 1.4672825336456299,
      "learning_rate": 8.568626738990958e-05,
      "loss": 2.0792,
      "step": 47
    },
    {
      "epoch": 0.8845378635185719,
      "grad_norm": 2.0133278369903564,
      "learning_rate": 8.50215476114158e-05,
      "loss": 2.1263,
      "step": 48
    },
    {
      "epoch": 0.9029657356752088,
      "grad_norm": 1.6172701120376587,
      "learning_rate": 8.434480742999089e-05,
      "loss": 2.0611,
      "step": 49
    },
    {
      "epoch": 0.9213936078318457,
      "grad_norm": 1.496863603591919,
      "learning_rate": 8.365631780733758e-05,
      "loss": 2.0717,
      "step": 50
    },
    {
      "epoch": 0.9213936078318457,
      "eval_loss": 2.095752477645874,
      "eval_runtime": 0.1966,
      "eval_samples_per_second": 254.354,
      "eval_steps_per_second": 66.132,
      "step": 50
    },
    {
      "epoch": 0.9398214799884825,
      "grad_norm": 3.3888142108917236,
      "learning_rate": 8.295635440954695e-05,
      "loss": 2.2467,
      "step": 51
    },
    {
      "epoch": 0.9582493521451195,
      "grad_norm": 1.6675165891647339,
      "learning_rate": 8.224519749672376e-05,
      "loss": 2.1031,
      "step": 52
    },
    {
      "epoch": 0.9766772243017564,
      "grad_norm": 1.0358479022979736,
      "learning_rate": 8.152313181077242e-05,
      "loss": 2.0694,
      "step": 53
    },
    {
      "epoch": 0.9951050964583933,
      "grad_norm": 1.0793170928955078,
      "learning_rate": 8.079044646138837e-05,
      "loss": 2.0398,
      "step": 54
    },
    {
      "epoch": 1.0172761301468471,
      "grad_norm": 5.010680675506592,
      "learning_rate": 8.004743481030088e-05,
      "loss": 4.036,
      "step": 55
    },
    {
      "epoch": 1.035704002303484,
      "grad_norm": 3.263810873031616,
      "learning_rate": 7.929439435381305e-05,
      "loss": 2.2462,
      "step": 56
    },
    {
      "epoch": 1.054131874460121,
      "grad_norm": 2.19437837600708,
      "learning_rate": 7.853162660368662e-05,
      "loss": 2.1076,
      "step": 57
    },
    {
      "epoch": 1.0725597466167578,
      "grad_norm": 1.1197891235351562,
      "learning_rate": 7.775943696641888e-05,
      "loss": 2.0394,
      "step": 58
    },
    {
      "epoch": 1.0909876187733947,
      "grad_norm": 1.35910964012146,
      "learning_rate": 7.697813462096025e-05,
      "loss": 2.0473,
      "step": 59
    },
    {
      "epoch": 1.1094154909300316,
      "grad_norm": 1.9377872943878174,
      "learning_rate": 7.618803239492121e-05,
      "loss": 2.0849,
      "step": 60
    },
    {
      "epoch": 1.1278433630866687,
      "grad_norm": 1.510143756866455,
      "learning_rate": 7.538944663931862e-05,
      "loss": 2.0566,
      "step": 61
    },
    {
      "epoch": 1.1462712352433055,
      "grad_norm": 2.0934293270111084,
      "learning_rate": 7.458269710191101e-05,
      "loss": 1.9738,
      "step": 62
    },
    {
      "epoch": 1.1646991073999424,
      "grad_norm": 0.8613946437835693,
      "learning_rate": 7.376810679917411e-05,
      "loss": 2.0698,
      "step": 63
    },
    {
      "epoch": 1.1831269795565793,
      "grad_norm": 0.933632493019104,
      "learning_rate": 7.294600188696732e-05,
      "loss": 2.0709,
      "step": 64
    },
    {
      "epoch": 1.2015548517132162,
      "grad_norm": 1.4427233934402466,
      "learning_rate": 7.211671152994348e-05,
      "loss": 1.9954,
      "step": 65
    },
    {
      "epoch": 1.219982723869853,
      "grad_norm": 2.0800139904022217,
      "learning_rate": 7.128056776975369e-05,
      "loss": 1.9794,
      "step": 66
    },
    {
      "epoch": 1.23841059602649,
      "grad_norm": 1.3812874555587769,
      "learning_rate": 7.043790539210045e-05,
      "loss": 2.0207,
      "step": 67
    },
    {
      "epoch": 1.256838468183127,
      "grad_norm": 1.9489840269088745,
      "learning_rate": 6.95890617926918e-05,
      "loss": 2.0679,
      "step": 68
    },
    {
      "epoch": 1.275266340339764,
      "grad_norm": 1.545703411102295,
      "learning_rate": 6.873437684215077e-05,
      "loss": 2.0753,
      "step": 69
    },
    {
      "epoch": 1.2936942124964008,
      "grad_norm": 1.9554816484451294,
      "learning_rate": 6.787419274993366e-05,
      "loss": 2.0324,
      "step": 70
    },
    {
      "epoch": 1.3121220846530377,
      "grad_norm": 2.3300676345825195,
      "learning_rate": 6.700885392731187e-05,
      "loss": 2.0034,
      "step": 71
    },
    {
      "epoch": 1.3305499568096746,
      "grad_norm": 1.7361541986465454,
      "learning_rate": 6.613870684947231e-05,
      "loss": 2.0202,
      "step": 72
    },
    {
      "epoch": 1.3489778289663115,
      "grad_norm": 1.2443159818649292,
      "learning_rate": 6.526409991679134e-05,
      "loss": 1.9933,
      "step": 73
    },
    {
      "epoch": 1.3674057011229483,
      "grad_norm": 0.6612122654914856,
      "learning_rate": 6.438538331533768e-05,
      "loss": 1.9769,
      "step": 74
    },
    {
      "epoch": 1.3858335732795855,
      "grad_norm": 0.6065022945404053,
      "learning_rate": 6.350290887666078e-05,
      "loss": 2.0084,
      "step": 75
    },
    {
      "epoch": 1.3858335732795855,
      "eval_loss": 2.0034306049346924,
      "eval_runtime": 0.1919,
      "eval_samples_per_second": 260.533,
      "eval_steps_per_second": 67.739,
      "step": 75
    },
    {
      "epoch": 1.4042614454362223,
      "grad_norm": 0.7181031107902527,
      "learning_rate": 6.261702993691994e-05,
      "loss": 2.0097,
      "step": 76
    },
    {
      "epoch": 1.4226893175928592,
      "grad_norm": 1.1980915069580078,
      "learning_rate": 6.172810119541118e-05,
      "loss": 2.0081,
      "step": 77
    },
    {
      "epoch": 1.441117189749496,
      "grad_norm": 2.185781478881836,
      "learning_rate": 6.083647857254837e-05,
      "loss": 1.9817,
      "step": 78
    },
    {
      "epoch": 1.459545061906133,
      "grad_norm": 0.7530511021614075,
      "learning_rate": 5.9942519067355284e-05,
      "loss": 1.9986,
      "step": 79
    },
    {
      "epoch": 1.4779729340627699,
      "grad_norm": 1.7427482604980469,
      "learning_rate": 5.904658061452585e-05,
      "loss": 2.0496,
      "step": 80
    },
    {
      "epoch": 1.4964008062194067,
      "grad_norm": 2.031916618347168,
      "learning_rate": 5.814902194110988e-05,
      "loss": 2.0361,
      "step": 81
    },
    {
      "epoch": 1.5148286783760438,
      "grad_norm": 1.6156734228134155,
      "learning_rate": 5.7250202422881336e-05,
      "loss": 2.0416,
      "step": 82
    },
    {
      "epoch": 1.5332565505326807,
      "grad_norm": 0.9678031206130981,
      "learning_rate": 5.635048194044702e-05,
      "loss": 2.0342,
      "step": 83
    },
    {
      "epoch": 1.5516844226893176,
      "grad_norm": 0.4935534596443176,
      "learning_rate": 5.5450220735153056e-05,
      "loss": 1.9647,
      "step": 84
    },
    {
      "epoch": 1.5701122948459545,
      "grad_norm": 1.1301989555358887,
      "learning_rate": 5.4549779264846955e-05,
      "loss": 1.9986,
      "step": 85
    },
    {
      "epoch": 1.5885401670025914,
      "grad_norm": 2.0241589546203613,
      "learning_rate": 5.3649518059552994e-05,
      "loss": 2.0601,
      "step": 86
    },
    {
      "epoch": 1.6069680391592285,
      "grad_norm": 1.204086184501648,
      "learning_rate": 5.2749797577118675e-05,
      "loss": 1.9727,
      "step": 87
    },
    {
      "epoch": 1.6253959113158651,
      "grad_norm": 1.2840675115585327,
      "learning_rate": 5.185097805889013e-05,
      "loss": 2.0326,
      "step": 88
    },
    {
      "epoch": 1.6438237834725022,
      "grad_norm": 1.003303050994873,
      "learning_rate": 5.0953419385474155e-05,
      "loss": 1.9445,
      "step": 89
    },
    {
      "epoch": 1.6622516556291391,
      "grad_norm": 0.5838247537612915,
      "learning_rate": 5.005748093264473e-05,
      "loss": 1.9922,
      "step": 90
    },
    {
      "epoch": 1.680679527785776,
      "grad_norm": 0.7593981623649597,
      "learning_rate": 4.916352142745163e-05,
      "loss": 1.966,
      "step": 91
    },
    {
      "epoch": 1.6991073999424129,
      "grad_norm": 0.8177780508995056,
      "learning_rate": 4.827189880458882e-05,
      "loss": 1.9918,
      "step": 92
    },
    {
      "epoch": 1.7175352720990498,
      "grad_norm": 0.8616487979888916,
      "learning_rate": 4.7382970063080076e-05,
      "loss": 2.0279,
      "step": 93
    },
    {
      "epoch": 1.7359631442556869,
      "grad_norm": 1.1605690717697144,
      "learning_rate": 4.649709112333923e-05,
      "loss": 2.0104,
      "step": 94
    },
    {
      "epoch": 1.7543910164123235,
      "grad_norm": 1.9952294826507568,
      "learning_rate": 4.561461668466233e-05,
      "loss": 1.9628,
      "step": 95
    },
    {
      "epoch": 1.7728188885689606,
      "grad_norm": 1.4495019912719727,
      "learning_rate": 4.473590008320868e-05,
      "loss": 2.0058,
      "step": 96
    },
    {
      "epoch": 1.7912467607255975,
      "grad_norm": 1.114969253540039,
      "learning_rate": 4.386129315052768e-05,
      "loss": 2.0131,
      "step": 97
    },
    {
      "epoch": 1.8096746328822344,
      "grad_norm": 0.8008613586425781,
      "learning_rate": 4.299114607268814e-05,
      "loss": 1.9947,
      "step": 98
    },
    {
      "epoch": 1.8281025050388713,
      "grad_norm": 0.7169589400291443,
      "learning_rate": 4.2125807250066354e-05,
      "loss": 1.9454,
      "step": 99
    },
    {
      "epoch": 1.8465303771955082,
      "grad_norm": 1.009277582168579,
      "learning_rate": 4.1265623157849235e-05,
      "loss": 2.0115,
      "step": 100
    },
    {
      "epoch": 1.8465303771955082,
      "eval_loss": 1.994510531425476,
      "eval_runtime": 0.192,
      "eval_samples_per_second": 260.351,
      "eval_steps_per_second": 67.691,
      "step": 100
    },
    {
      "epoch": 1.8649582493521453,
      "grad_norm": 0.5230905413627625,
      "learning_rate": 4.041093820730821e-05,
      "loss": 1.9376,
      "step": 101
    },
    {
      "epoch": 1.883386121508782,
      "grad_norm": 1.3596158027648926,
      "learning_rate": 3.956209460789957e-05,
      "loss": 2.0129,
      "step": 102
    },
    {
      "epoch": 1.901813993665419,
      "grad_norm": 1.3502169847488403,
      "learning_rate": 3.871943223024631e-05,
      "loss": 1.9809,
      "step": 103
    },
    {
      "epoch": 1.9202418658220557,
      "grad_norm": 0.7711046934127808,
      "learning_rate": 3.788328847005654e-05,
      "loss": 2.0039,
      "step": 104
    },
    {
      "epoch": 1.9386697379786928,
      "grad_norm": 1.2358088493347168,
      "learning_rate": 3.7053998113032694e-05,
      "loss": 1.9582,
      "step": 105
    },
    {
      "epoch": 1.9570976101353297,
      "grad_norm": 0.9068220853805542,
      "learning_rate": 3.623189320082592e-05,
      "loss": 1.9918,
      "step": 106
    },
    {
      "epoch": 1.9755254822919666,
      "grad_norm": 0.4698697328567505,
      "learning_rate": 3.5417302898089e-05,
      "loss": 1.9919,
      "step": 107
    },
    {
      "epoch": 1.9939533544486037,
      "grad_norm": 0.7360281944274902,
      "learning_rate": 3.461055336068141e-05,
      "loss": 1.9719,
      "step": 108
    },
    {
      "epoch": 2.016124388137057,
      "grad_norm": 2.62359356880188,
      "learning_rate": 3.3811967605078796e-05,
      "loss": 4.039,
      "step": 109
    },
    {
      "epoch": 2.0345522602936943,
      "grad_norm": 1.5227746963500977,
      "learning_rate": 3.3021865379039764e-05,
      "loss": 2.0704,
      "step": 110
    },
    {
      "epoch": 2.052980132450331,
      "grad_norm": 0.9037619829177856,
      "learning_rate": 3.224056303358112e-05,
      "loss": 2.0033,
      "step": 111
    },
    {
      "epoch": 2.071408004606968,
      "grad_norm": 1.6077276468276978,
      "learning_rate": 3.14683733963134e-05,
      "loss": 1.9768,
      "step": 112
    },
    {
      "epoch": 2.089835876763605,
      "grad_norm": 1.0122666358947754,
      "learning_rate": 3.070560564618696e-05,
      "loss": 2.03,
      "step": 113
    },
    {
      "epoch": 2.108263748920242,
      "grad_norm": 1.7775009870529175,
      "learning_rate": 2.995256518969914e-05,
      "loss": 1.9901,
      "step": 114
    },
    {
      "epoch": 2.126691621076879,
      "grad_norm": 0.5027958750724792,
      "learning_rate": 2.9209553538611633e-05,
      "loss": 2.0267,
      "step": 115
    },
    {
      "epoch": 2.1451194932335156,
      "grad_norm": 1.9773569107055664,
      "learning_rate": 2.8476868189227602e-05,
      "loss": 1.9419,
      "step": 116
    },
    {
      "epoch": 2.1635473653901527,
      "grad_norm": 0.8990527391433716,
      "learning_rate": 2.7754802503276233e-05,
      "loss": 1.9921,
      "step": 117
    },
    {
      "epoch": 2.1819752375467893,
      "grad_norm": 0.931627094745636,
      "learning_rate": 2.7043645590453064e-05,
      "loss": 1.975,
      "step": 118
    },
    {
      "epoch": 2.2004031097034265,
      "grad_norm": 0.4365338683128357,
      "learning_rate": 2.6343682192662432e-05,
      "loss": 1.9704,
      "step": 119
    },
    {
      "epoch": 2.218830981860063,
      "grad_norm": 0.7581185698509216,
      "learning_rate": 2.5655192570009124e-05,
      "loss": 1.9586,
      "step": 120
    },
    {
      "epoch": 2.2372588540167,
      "grad_norm": 0.7760223746299744,
      "learning_rate": 2.497845238858419e-05,
      "loss": 2.0055,
      "step": 121
    },
    {
      "epoch": 2.2556867261733373,
      "grad_norm": 1.0918080806732178,
      "learning_rate": 2.4313732610090437e-05,
      "loss": 1.9807,
      "step": 122
    },
    {
      "epoch": 2.274114598329974,
      "grad_norm": 1.8177530765533447,
      "learning_rate": 2.366129938335123e-05,
      "loss": 2.0595,
      "step": 123
    },
    {
      "epoch": 2.292542470486611,
      "grad_norm": 1.023606777191162,
      "learning_rate": 2.3021413937746657e-05,
      "loss": 1.9886,
      "step": 124
    },
    {
      "epoch": 2.3109703426432477,
      "grad_norm": 0.3760586082935333,
      "learning_rate": 2.239433247861915e-05,
      "loss": 1.9533,
      "step": 125
    },
    {
      "epoch": 2.3109703426432477,
      "eval_loss": 1.9844677448272705,
      "eval_runtime": 0.1939,
      "eval_samples_per_second": 257.909,
      "eval_steps_per_second": 67.056,
      "step": 125
    },
    {
      "epoch": 2.329398214799885,
      "grad_norm": 1.1312675476074219,
      "learning_rate": 2.1780306084690794e-05,
      "loss": 1.9909,
      "step": 126
    },
    {
      "epoch": 2.3478260869565215,
      "grad_norm": 0.3952158987522125,
      "learning_rate": 2.1179580607533285e-05,
      "loss": 1.952,
      "step": 127
    },
    {
      "epoch": 2.3662539591131586,
      "grad_norm": 0.4795459806919098,
      "learning_rate": 2.059239657313084e-05,
      "loss": 1.9455,
      "step": 128
    },
    {
      "epoch": 2.3846818312697957,
      "grad_norm": 1.1665923595428467,
      "learning_rate": 2.0018989085575328e-05,
      "loss": 2.0425,
      "step": 129
    },
    {
      "epoch": 2.4031097034264324,
      "grad_norm": 0.8839353919029236,
      "learning_rate": 1.9459587732932426e-05,
      "loss": 2.0,
      "step": 130
    },
    {
      "epoch": 2.4215375755830695,
      "grad_norm": 1.1831239461898804,
      "learning_rate": 1.89144164953162e-05,
      "loss": 2.0223,
      "step": 131
    },
    {
      "epoch": 2.439965447739706,
      "grad_norm": 0.3851776123046875,
      "learning_rate": 1.8383693655209223e-05,
      "loss": 1.9546,
      "step": 132
    },
    {
      "epoch": 2.4583933198963432,
      "grad_norm": 0.37334856390953064,
      "learning_rate": 1.7867631710063816e-05,
      "loss": 1.9876,
      "step": 133
    },
    {
      "epoch": 2.47682119205298,
      "grad_norm": 1.424155592918396,
      "learning_rate": 1.7366437287219743e-05,
      "loss": 1.9977,
      "step": 134
    },
    {
      "epoch": 2.495249064209617,
      "grad_norm": 1.1031992435455322,
      "learning_rate": 1.6880311061172105e-05,
      "loss": 1.9801,
      "step": 135
    },
    {
      "epoch": 2.513676936366254,
      "grad_norm": 1.0327808856964111,
      "learning_rate": 1.6409447673222828e-05,
      "loss": 1.9884,
      "step": 136
    },
    {
      "epoch": 2.5321048085228908,
      "grad_norm": 0.5112755298614502,
      "learning_rate": 1.595403565354769e-05,
      "loss": 1.9828,
      "step": 137
    },
    {
      "epoch": 2.550532680679528,
      "grad_norm": 0.4185763895511627,
      "learning_rate": 1.55142573457103e-05,
      "loss": 1.9968,
      "step": 138
    },
    {
      "epoch": 2.5689605528361645,
      "grad_norm": 0.47376272082328796,
      "learning_rate": 1.509028883365305e-05,
      "loss": 2.0109,
      "step": 139
    },
    {
      "epoch": 2.5873884249928016,
      "grad_norm": 0.3518383502960205,
      "learning_rate": 1.468229987119448e-05,
      "loss": 1.9878,
      "step": 140
    },
    {
      "epoch": 2.6058162971494383,
      "grad_norm": 0.7104719281196594,
      "learning_rate": 1.4290453814061064e-05,
      "loss": 1.9562,
      "step": 141
    },
    {
      "epoch": 2.6242441693060754,
      "grad_norm": 0.49556511640548706,
      "learning_rate": 1.391490755448084e-05,
      "loss": 1.9646,
      "step": 142
    },
    {
      "epoch": 2.6426720414627125,
      "grad_norm": 0.2546837329864502,
      "learning_rate": 1.3555811458364907e-05,
      "loss": 1.9803,
      "step": 143
    },
    {
      "epoch": 2.661099913619349,
      "grad_norm": 0.785993218421936,
      "learning_rate": 1.3213309305102078e-05,
      "loss": 1.9414,
      "step": 144
    },
    {
      "epoch": 2.6795277857759863,
      "grad_norm": 0.6566500663757324,
      "learning_rate": 1.2887538229990625e-05,
      "loss": 1.9518,
      "step": 145
    },
    {
      "epoch": 2.697955657932623,
      "grad_norm": 0.5922719836235046,
      "learning_rate": 1.2578628669330422e-05,
      "loss": 2.0087,
      "step": 146
    },
    {
      "epoch": 2.71638353008926,
      "grad_norm": 1.8382635116577148,
      "learning_rate": 1.2286704308197134e-05,
      "loss": 1.9363,
      "step": 147
    },
    {
      "epoch": 2.7348114022458967,
      "grad_norm": 0.6045182943344116,
      "learning_rate": 1.2011882030919708e-05,
      "loss": 2.0082,
      "step": 148
    },
    {
      "epoch": 2.753239274402534,
      "grad_norm": 0.38392022252082825,
      "learning_rate": 1.175427187428072e-05,
      "loss": 1.9739,
      "step": 149
    },
    {
      "epoch": 2.771667146559171,
      "grad_norm": 1.0673091411590576,
      "learning_rate": 1.1513976983458504e-05,
      "loss": 1.992,
      "step": 150
    },
    {
      "epoch": 2.771667146559171,
      "eval_loss": 1.9801422357559204,
      "eval_runtime": 0.1911,
      "eval_samples_per_second": 261.698,
      "eval_steps_per_second": 68.042,
      "step": 150
    },
    {
      "epoch": 2.7900950187158076,
      "grad_norm": 0.47503483295440674,
      "learning_rate": 1.1291093570728562e-05,
      "loss": 1.9973,
      "step": 151
    },
    {
      "epoch": 2.8085228908724447,
      "grad_norm": 0.7801658511161804,
      "learning_rate": 1.1085710876940912e-05,
      "loss": 2.0129,
      "step": 152
    },
    {
      "epoch": 2.8269507630290813,
      "grad_norm": 0.6917530298233032,
      "learning_rate": 1.0897911135788709e-05,
      "loss": 2.0055,
      "step": 153
    },
    {
      "epoch": 2.8453786351857184,
      "grad_norm": 0.7652437686920166,
      "learning_rate": 1.072776954088251e-05,
      "loss": 2.0203,
      "step": 154
    },
    {
      "epoch": 2.863806507342355,
      "grad_norm": 0.6981119513511658,
      "learning_rate": 1.0575354215643268e-05,
      "loss": 1.9213,
      "step": 155
    },
    {
      "epoch": 2.882234379498992,
      "grad_norm": 0.7254536151885986,
      "learning_rate": 1.0440726186026288e-05,
      "loss": 2.0316,
      "step": 156
    },
    {
      "epoch": 2.9006622516556293,
      "grad_norm": 0.5838481187820435,
      "learning_rate": 1.0323939356086828e-05,
      "loss": 1.9561,
      "step": 157
    },
    {
      "epoch": 2.919090123812266,
      "grad_norm": 0.48237353563308716,
      "learning_rate": 1.022504048639738e-05,
      "loss": 1.9832,
      "step": 158
    },
    {
      "epoch": 2.937517995968903,
      "grad_norm": 1.3573530912399292,
      "learning_rate": 1.0144069175325029e-05,
      "loss": 2.0506,
      "step": 159
    },
    {
      "epoch": 2.9559458681255397,
      "grad_norm": 1.1986020803451538,
      "learning_rate": 1.0081057843176601e-05,
      "loss": 1.9252,
      "step": 160
    },
    {
      "epoch": 2.974373740282177,
      "grad_norm": 0.5769727826118469,
      "learning_rate": 1.0036031719217808e-05,
      "loss": 1.9839,
      "step": 161
    },
    {
      "epoch": 2.9928016124388135,
      "grad_norm": 0.28778672218322754,
      "learning_rate": 1.0009008831571633e-05,
      "loss": 2.0119,
      "step": 162
    },
    {
      "epoch": 3.0149726461272675,
      "grad_norm": 2.086949110031128,
      "learning_rate": 1e-05,
      "loss": 3.7483,
      "step": 163
    }
  ],
  "logging_steps": 1,
  "max_steps": 163,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 4,
  "save_steps": 50,
  "stateful_callbacks": {
    "EarlyStoppingCallback": {
      "args": {
        "early_stopping_patience": 1,
        "early_stopping_threshold": 0.0
      },
      "attributes": {
        "early_stopping_patience_counter": 0
      }
    },
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1513310000250880.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}