{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.4186289900575615,
"eval_steps": 500,
"global_step": 200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0020931449502878076,
"grad_norm": 2.813615083694458,
"learning_rate": 5e-05,
"loss": 3.0406,
"step": 1
},
{
"epoch": 0.004186289900575615,
"grad_norm": 2.6805036067962646,
"learning_rate": 0.0001,
"loss": 2.9062,
"step": 2
},
{
"epoch": 0.006279434850863423,
"grad_norm": 2.032153367996216,
"learning_rate": 0.00015,
"loss": 2.6258,
"step": 3
},
{
"epoch": 0.00837257980115123,
"grad_norm": 1.7369762659072876,
"learning_rate": 0.0002,
"loss": 2.4318,
"step": 4
},
{
"epoch": 0.010465724751439037,
"grad_norm": 1.5764646530151367,
"learning_rate": 0.00025,
"loss": 2.1646,
"step": 5
},
{
"epoch": 0.012558869701726845,
"grad_norm": 1.1295244693756104,
"learning_rate": 0.0003,
"loss": 2.0415,
"step": 6
},
{
"epoch": 0.014652014652014652,
"grad_norm": 0.9872271418571472,
"learning_rate": 0.00035,
"loss": 2.0001,
"step": 7
},
{
"epoch": 0.01674515960230246,
"grad_norm": 1.6292792558670044,
"learning_rate": 0.0004,
"loss": 1.9348,
"step": 8
},
{
"epoch": 0.018838304552590265,
"grad_norm": 0.6931092143058777,
"learning_rate": 0.00045000000000000004,
"loss": 1.8677,
"step": 9
},
{
"epoch": 0.020931449502878074,
"grad_norm": 0.5845860242843628,
"learning_rate": 0.0005,
"loss": 1.8057,
"step": 10
},
{
"epoch": 0.023024594453165882,
"grad_norm": 0.6705211997032166,
"learning_rate": 0.0004989293361884369,
"loss": 1.7457,
"step": 11
},
{
"epoch": 0.02511773940345369,
"grad_norm": 0.6003366112709045,
"learning_rate": 0.0004978586723768737,
"loss": 1.7477,
"step": 12
},
{
"epoch": 0.027210884353741496,
"grad_norm": 0.764362633228302,
"learning_rate": 0.0004967880085653105,
"loss": 1.6971,
"step": 13
},
{
"epoch": 0.029304029304029304,
"grad_norm": 0.9480692148208618,
"learning_rate": 0.0004957173447537474,
"loss": 1.7387,
"step": 14
},
{
"epoch": 0.03139717425431711,
"grad_norm": 0.8233430981636047,
"learning_rate": 0.0004946466809421842,
"loss": 1.679,
"step": 15
},
{
"epoch": 0.03349031920460492,
"grad_norm": 0.4710249602794647,
"learning_rate": 0.000493576017130621,
"loss": 1.687,
"step": 16
},
{
"epoch": 0.035583464154892726,
"grad_norm": 0.5001205801963806,
"learning_rate": 0.0004925053533190578,
"loss": 1.7042,
"step": 17
},
{
"epoch": 0.03767660910518053,
"grad_norm": 0.37574276328086853,
"learning_rate": 0.0004914346895074946,
"loss": 1.6976,
"step": 18
},
{
"epoch": 0.03976975405546834,
"grad_norm": 0.4445561170578003,
"learning_rate": 0.0004903640256959315,
"loss": 1.711,
"step": 19
},
{
"epoch": 0.04186289900575615,
"grad_norm": 0.32608023285865784,
"learning_rate": 0.0004892933618843683,
"loss": 1.6491,
"step": 20
},
{
"epoch": 0.04395604395604396,
"grad_norm": 0.34311097860336304,
"learning_rate": 0.0004882226980728052,
"loss": 1.6588,
"step": 21
},
{
"epoch": 0.046049188906331764,
"grad_norm": 0.31974583864212036,
"learning_rate": 0.000487152034261242,
"loss": 1.6907,
"step": 22
},
{
"epoch": 0.04814233385661957,
"grad_norm": 0.3181461989879608,
"learning_rate": 0.0004860813704496788,
"loss": 1.6776,
"step": 23
},
{
"epoch": 0.05023547880690738,
"grad_norm": 0.31020134687423706,
"learning_rate": 0.00048501070663811566,
"loss": 1.6665,
"step": 24
},
{
"epoch": 0.052328623757195186,
"grad_norm": 0.3187050521373749,
"learning_rate": 0.00048394004282655245,
"loss": 1.647,
"step": 25
},
{
"epoch": 0.05442176870748299,
"grad_norm": 0.3272475600242615,
"learning_rate": 0.0004828693790149893,
"loss": 1.6769,
"step": 26
},
{
"epoch": 0.0565149136577708,
"grad_norm": 0.29509666562080383,
"learning_rate": 0.00048179871520342613,
"loss": 1.6292,
"step": 27
},
{
"epoch": 0.05860805860805861,
"grad_norm": 0.29900938272476196,
"learning_rate": 0.00048072805139186297,
"loss": 1.6318,
"step": 28
},
{
"epoch": 0.06070120355834641,
"grad_norm": 0.3012602627277374,
"learning_rate": 0.0004796573875802998,
"loss": 1.5997,
"step": 29
},
{
"epoch": 0.06279434850863422,
"grad_norm": 0.3533616065979004,
"learning_rate": 0.00047858672376873665,
"loss": 1.632,
"step": 30
},
{
"epoch": 0.06488749345892203,
"grad_norm": 0.2721816599369049,
"learning_rate": 0.00047751605995717344,
"loss": 1.6432,
"step": 31
},
{
"epoch": 0.06698063840920984,
"grad_norm": 0.29362842440605164,
"learning_rate": 0.0004764453961456103,
"loss": 1.6608,
"step": 32
},
{
"epoch": 0.06907378335949764,
"grad_norm": 0.27665096521377563,
"learning_rate": 0.0004753747323340471,
"loss": 1.6286,
"step": 33
},
{
"epoch": 0.07116692830978545,
"grad_norm": 0.28791311383247375,
"learning_rate": 0.0004743040685224839,
"loss": 1.6093,
"step": 34
},
{
"epoch": 0.07326007326007326,
"grad_norm": 0.31565895676612854,
"learning_rate": 0.0004732334047109208,
"loss": 1.6672,
"step": 35
},
{
"epoch": 0.07535321821036106,
"grad_norm": 0.26670706272125244,
"learning_rate": 0.00047216274089935764,
"loss": 1.6251,
"step": 36
},
{
"epoch": 0.07744636316064887,
"grad_norm": 0.2804130017757416,
"learning_rate": 0.0004710920770877944,
"loss": 1.5653,
"step": 37
},
{
"epoch": 0.07953950811093669,
"grad_norm": 0.27214744687080383,
"learning_rate": 0.00047002141327623126,
"loss": 1.5726,
"step": 38
},
{
"epoch": 0.08163265306122448,
"grad_norm": 0.28986403346061707,
"learning_rate": 0.0004689507494646681,
"loss": 1.601,
"step": 39
},
{
"epoch": 0.0837257980115123,
"grad_norm": 0.3080230951309204,
"learning_rate": 0.0004678800856531049,
"loss": 1.5939,
"step": 40
},
{
"epoch": 0.08581894296180011,
"grad_norm": 0.2734631597995758,
"learning_rate": 0.0004668094218415418,
"loss": 1.5951,
"step": 41
},
{
"epoch": 0.08791208791208792,
"grad_norm": 0.28978678584098816,
"learning_rate": 0.0004657387580299786,
"loss": 1.6146,
"step": 42
},
{
"epoch": 0.09000523286237572,
"grad_norm": 0.27776286005973816,
"learning_rate": 0.00046466809421841546,
"loss": 1.6194,
"step": 43
},
{
"epoch": 0.09209837781266353,
"grad_norm": 0.2763765752315521,
"learning_rate": 0.00046359743040685225,
"loss": 1.555,
"step": 44
},
{
"epoch": 0.09419152276295134,
"grad_norm": 0.28580474853515625,
"learning_rate": 0.0004625267665952891,
"loss": 1.5959,
"step": 45
},
{
"epoch": 0.09628466771323914,
"grad_norm": 0.27958357334136963,
"learning_rate": 0.00046145610278372593,
"loss": 1.5754,
"step": 46
},
{
"epoch": 0.09837781266352695,
"grad_norm": 0.2925872504711151,
"learning_rate": 0.0004603854389721627,
"loss": 1.656,
"step": 47
},
{
"epoch": 0.10047095761381476,
"grad_norm": 0.27339980006217957,
"learning_rate": 0.0004593147751605996,
"loss": 1.6213,
"step": 48
},
{
"epoch": 0.10256410256410256,
"grad_norm": 0.2932608127593994,
"learning_rate": 0.00045824411134903645,
"loss": 1.5749,
"step": 49
},
{
"epoch": 0.10465724751439037,
"grad_norm": 0.26638683676719666,
"learning_rate": 0.00045717344753747323,
"loss": 1.5951,
"step": 50
},
{
"epoch": 0.10675039246467818,
"grad_norm": 0.28242963552474976,
"learning_rate": 0.0004561027837259101,
"loss": 1.5676,
"step": 51
},
{
"epoch": 0.10884353741496598,
"grad_norm": 0.2681107819080353,
"learning_rate": 0.0004550321199143469,
"loss": 1.5513,
"step": 52
},
{
"epoch": 0.1109366823652538,
"grad_norm": 0.27786529064178467,
"learning_rate": 0.0004539614561027837,
"loss": 1.4974,
"step": 53
},
{
"epoch": 0.1130298273155416,
"grad_norm": 0.2654118835926056,
"learning_rate": 0.00045289079229122054,
"loss": 1.6262,
"step": 54
},
{
"epoch": 0.1151229722658294,
"grad_norm": 0.30704954266548157,
"learning_rate": 0.00045182012847965744,
"loss": 1.564,
"step": 55
},
{
"epoch": 0.11721611721611722,
"grad_norm": 0.27236270904541016,
"learning_rate": 0.0004507494646680942,
"loss": 1.6199,
"step": 56
},
{
"epoch": 0.11930926216640503,
"grad_norm": 0.2929720878601074,
"learning_rate": 0.00044967880085653106,
"loss": 1.6008,
"step": 57
},
{
"epoch": 0.12140240711669283,
"grad_norm": 0.28251537680625916,
"learning_rate": 0.0004486081370449679,
"loss": 1.5103,
"step": 58
},
{
"epoch": 0.12349555206698064,
"grad_norm": 0.27533096075057983,
"learning_rate": 0.0004475374732334047,
"loss": 1.5274,
"step": 59
},
{
"epoch": 0.12558869701726844,
"grad_norm": 0.30340835452079773,
"learning_rate": 0.00044646680942184153,
"loss": 1.5842,
"step": 60
},
{
"epoch": 0.12768184196755625,
"grad_norm": 0.2956872880458832,
"learning_rate": 0.00044539614561027837,
"loss": 1.5504,
"step": 61
},
{
"epoch": 0.12977498691784406,
"grad_norm": 0.2717457413673401,
"learning_rate": 0.00044432548179871526,
"loss": 1.5728,
"step": 62
},
{
"epoch": 0.13186813186813187,
"grad_norm": 0.27269890904426575,
"learning_rate": 0.00044325481798715205,
"loss": 1.5891,
"step": 63
},
{
"epoch": 0.13396127681841968,
"grad_norm": 0.294362872838974,
"learning_rate": 0.0004421841541755889,
"loss": 1.5494,
"step": 64
},
{
"epoch": 0.1360544217687075,
"grad_norm": 0.3467015326023102,
"learning_rate": 0.00044111349036402573,
"loss": 1.6157,
"step": 65
},
{
"epoch": 0.13814756671899528,
"grad_norm": 0.26985207200050354,
"learning_rate": 0.0004400428265524625,
"loss": 1.5489,
"step": 66
},
{
"epoch": 0.1402407116692831,
"grad_norm": 0.30386754870414734,
"learning_rate": 0.00043897216274089935,
"loss": 1.547,
"step": 67
},
{
"epoch": 0.1423338566195709,
"grad_norm": 0.2737506330013275,
"learning_rate": 0.0004379014989293362,
"loss": 1.541,
"step": 68
},
{
"epoch": 0.14442700156985872,
"grad_norm": 0.2916475832462311,
"learning_rate": 0.00043683083511777303,
"loss": 1.6008,
"step": 69
},
{
"epoch": 0.14652014652014653,
"grad_norm": 0.27791959047317505,
"learning_rate": 0.0004357601713062099,
"loss": 1.6368,
"step": 70
},
{
"epoch": 0.14861329147043434,
"grad_norm": 0.2925644516944885,
"learning_rate": 0.0004346895074946467,
"loss": 1.5606,
"step": 71
},
{
"epoch": 0.15070643642072212,
"grad_norm": 0.2825354039669037,
"learning_rate": 0.0004336188436830835,
"loss": 1.5759,
"step": 72
},
{
"epoch": 0.15279958137100993,
"grad_norm": 0.27884945273399353,
"learning_rate": 0.00043254817987152034,
"loss": 1.624,
"step": 73
},
{
"epoch": 0.15489272632129775,
"grad_norm": 0.29608336091041565,
"learning_rate": 0.0004314775160599572,
"loss": 1.5619,
"step": 74
},
{
"epoch": 0.15698587127158556,
"grad_norm": 0.2830757200717926,
"learning_rate": 0.00043040685224839397,
"loss": 1.6233,
"step": 75
},
{
"epoch": 0.15907901622187337,
"grad_norm": 0.3144885301589966,
"learning_rate": 0.00042933618843683086,
"loss": 1.567,
"step": 76
},
{
"epoch": 0.16117216117216118,
"grad_norm": 0.29038679599761963,
"learning_rate": 0.0004282655246252677,
"loss": 1.5317,
"step": 77
},
{
"epoch": 0.16326530612244897,
"grad_norm": 0.34527644515037537,
"learning_rate": 0.0004271948608137045,
"loss": 1.5485,
"step": 78
},
{
"epoch": 0.16535845107273678,
"grad_norm": 0.2915840446949005,
"learning_rate": 0.0004261241970021413,
"loss": 1.5792,
"step": 79
},
{
"epoch": 0.1674515960230246,
"grad_norm": 0.30239176750183105,
"learning_rate": 0.00042505353319057817,
"loss": 1.5533,
"step": 80
},
{
"epoch": 0.1695447409733124,
"grad_norm": 0.28941529989242554,
"learning_rate": 0.00042398286937901495,
"loss": 1.5635,
"step": 81
},
{
"epoch": 0.17163788592360021,
"grad_norm": 0.27628207206726074,
"learning_rate": 0.0004229122055674518,
"loss": 1.5542,
"step": 82
},
{
"epoch": 0.17373103087388803,
"grad_norm": 0.28659799695014954,
"learning_rate": 0.0004218415417558887,
"loss": 1.5969,
"step": 83
},
{
"epoch": 0.17582417582417584,
"grad_norm": 0.2995677888393402,
"learning_rate": 0.00042077087794432553,
"loss": 1.5339,
"step": 84
},
{
"epoch": 0.17791732077446362,
"grad_norm": 0.28352785110473633,
"learning_rate": 0.0004197002141327623,
"loss": 1.5739,
"step": 85
},
{
"epoch": 0.18001046572475143,
"grad_norm": 0.296410471200943,
"learning_rate": 0.00041862955032119915,
"loss": 1.5432,
"step": 86
},
{
"epoch": 0.18210361067503925,
"grad_norm": 0.3075838088989258,
"learning_rate": 0.000417558886509636,
"loss": 1.5749,
"step": 87
},
{
"epoch": 0.18419675562532706,
"grad_norm": 0.29746511578559875,
"learning_rate": 0.0004164882226980728,
"loss": 1.5509,
"step": 88
},
{
"epoch": 0.18628990057561487,
"grad_norm": 0.3004538118839264,
"learning_rate": 0.0004154175588865097,
"loss": 1.5732,
"step": 89
},
{
"epoch": 0.18838304552590268,
"grad_norm": 0.287615031003952,
"learning_rate": 0.0004143468950749465,
"loss": 1.5482,
"step": 90
},
{
"epoch": 0.19047619047619047,
"grad_norm": 0.2988753020763397,
"learning_rate": 0.0004132762312633833,
"loss": 1.5656,
"step": 91
},
{
"epoch": 0.19256933542647828,
"grad_norm": 0.29836592078208923,
"learning_rate": 0.00041220556745182014,
"loss": 1.5759,
"step": 92
},
{
"epoch": 0.1946624803767661,
"grad_norm": 0.3167785406112671,
"learning_rate": 0.000411134903640257,
"loss": 1.5362,
"step": 93
},
{
"epoch": 0.1967556253270539,
"grad_norm": 0.27747228741645813,
"learning_rate": 0.00041006423982869377,
"loss": 1.5212,
"step": 94
},
{
"epoch": 0.1988487702773417,
"grad_norm": 0.30162835121154785,
"learning_rate": 0.0004089935760171306,
"loss": 1.5362,
"step": 95
},
{
"epoch": 0.20094191522762953,
"grad_norm": 0.28325414657592773,
"learning_rate": 0.0004079229122055675,
"loss": 1.4925,
"step": 96
},
{
"epoch": 0.2030350601779173,
"grad_norm": 0.28862977027893066,
"learning_rate": 0.0004068522483940043,
"loss": 1.5731,
"step": 97
},
{
"epoch": 0.20512820512820512,
"grad_norm": 0.3084706962108612,
"learning_rate": 0.0004057815845824411,
"loss": 1.5632,
"step": 98
},
{
"epoch": 0.20722135007849293,
"grad_norm": 0.29341885447502136,
"learning_rate": 0.00040471092077087797,
"loss": 1.5436,
"step": 99
},
{
"epoch": 0.20931449502878074,
"grad_norm": 0.3143270015716553,
"learning_rate": 0.00040364025695931475,
"loss": 1.5839,
"step": 100
},
{
"epoch": 0.21140763997906856,
"grad_norm": 0.30276885628700256,
"learning_rate": 0.0004025695931477516,
"loss": 1.6037,
"step": 101
},
{
"epoch": 0.21350078492935637,
"grad_norm": 0.31467488408088684,
"learning_rate": 0.00040149892933618843,
"loss": 1.5227,
"step": 102
},
{
"epoch": 0.21559392987964415,
"grad_norm": 0.28636667132377625,
"learning_rate": 0.0004004282655246253,
"loss": 1.5022,
"step": 103
},
{
"epoch": 0.21768707482993196,
"grad_norm": 0.28783223032951355,
"learning_rate": 0.0003993576017130621,
"loss": 1.5432,
"step": 104
},
{
"epoch": 0.21978021978021978,
"grad_norm": 0.3006027042865753,
"learning_rate": 0.00039828693790149895,
"loss": 1.5305,
"step": 105
},
{
"epoch": 0.2218733647305076,
"grad_norm": 0.3113887310028076,
"learning_rate": 0.0003972162740899358,
"loss": 1.4946,
"step": 106
},
{
"epoch": 0.2239665096807954,
"grad_norm": 0.3233683109283447,
"learning_rate": 0.0003961456102783726,
"loss": 1.5176,
"step": 107
},
{
"epoch": 0.2260596546310832,
"grad_norm": 0.29007241129875183,
"learning_rate": 0.0003950749464668094,
"loss": 1.5661,
"step": 108
},
{
"epoch": 0.228152799581371,
"grad_norm": 0.3270627558231354,
"learning_rate": 0.00039400428265524626,
"loss": 1.5414,
"step": 109
},
{
"epoch": 0.2302459445316588,
"grad_norm": 0.2789075970649719,
"learning_rate": 0.0003929336188436831,
"loss": 1.5324,
"step": 110
},
{
"epoch": 0.23233908948194662,
"grad_norm": 0.3245764672756195,
"learning_rate": 0.00039186295503211994,
"loss": 1.5541,
"step": 111
},
{
"epoch": 0.23443223443223443,
"grad_norm": 0.28058871626853943,
"learning_rate": 0.0003907922912205568,
"loss": 1.5398,
"step": 112
},
{
"epoch": 0.23652537938252224,
"grad_norm": 0.29182901978492737,
"learning_rate": 0.00038972162740899356,
"loss": 1.4795,
"step": 113
},
{
"epoch": 0.23861852433281006,
"grad_norm": 0.3088870346546173,
"learning_rate": 0.0003886509635974304,
"loss": 1.5326,
"step": 114
},
{
"epoch": 0.24071166928309787,
"grad_norm": 0.3134807050228119,
"learning_rate": 0.00038758029978586725,
"loss": 1.5346,
"step": 115
},
{
"epoch": 0.24280481423338565,
"grad_norm": 0.30298101902008057,
"learning_rate": 0.00038650963597430403,
"loss": 1.5134,
"step": 116
},
{
"epoch": 0.24489795918367346,
"grad_norm": 0.2860242426395416,
"learning_rate": 0.0003854389721627409,
"loss": 1.5306,
"step": 117
},
{
"epoch": 0.24699110413396128,
"grad_norm": 0.2905466556549072,
"learning_rate": 0.00038436830835117777,
"loss": 1.5234,
"step": 118
},
{
"epoch": 0.2490842490842491,
"grad_norm": 0.28561463952064514,
"learning_rate": 0.00038329764453961455,
"loss": 1.4752,
"step": 119
},
{
"epoch": 0.25117739403453687,
"grad_norm": 0.2877683639526367,
"learning_rate": 0.0003822269807280514,
"loss": 1.598,
"step": 120
},
{
"epoch": 0.2532705389848247,
"grad_norm": 0.2815863788127899,
"learning_rate": 0.00038115631691648823,
"loss": 1.4643,
"step": 121
},
{
"epoch": 0.2553636839351125,
"grad_norm": 0.28134405612945557,
"learning_rate": 0.000380085653104925,
"loss": 1.5037,
"step": 122
},
{
"epoch": 0.25745682888540034,
"grad_norm": 0.2940825819969177,
"learning_rate": 0.00037901498929336186,
"loss": 1.4963,
"step": 123
},
{
"epoch": 0.2595499738356881,
"grad_norm": 0.28303319215774536,
"learning_rate": 0.00037794432548179875,
"loss": 1.4708,
"step": 124
},
{
"epoch": 0.2616431187859759,
"grad_norm": 0.3112112581729889,
"learning_rate": 0.0003768736616702356,
"loss": 1.4981,
"step": 125
},
{
"epoch": 0.26373626373626374,
"grad_norm": 0.2902218997478485,
"learning_rate": 0.0003758029978586724,
"loss": 1.4654,
"step": 126
},
{
"epoch": 0.2658294086865515,
"grad_norm": 0.31582140922546387,
"learning_rate": 0.0003747323340471092,
"loss": 1.551,
"step": 127
},
{
"epoch": 0.26792255363683937,
"grad_norm": 0.28578075766563416,
"learning_rate": 0.00037366167023554606,
"loss": 1.5018,
"step": 128
},
{
"epoch": 0.27001569858712715,
"grad_norm": 0.33017498254776,
"learning_rate": 0.00037259100642398284,
"loss": 1.5177,
"step": 129
},
{
"epoch": 0.272108843537415,
"grad_norm": 0.2954592704772949,
"learning_rate": 0.0003715203426124197,
"loss": 1.5126,
"step": 130
},
{
"epoch": 0.2742019884877028,
"grad_norm": 0.34393608570098877,
"learning_rate": 0.0003704496788008566,
"loss": 1.5061,
"step": 131
},
{
"epoch": 0.27629513343799056,
"grad_norm": 0.3111407160758972,
"learning_rate": 0.00036937901498929336,
"loss": 1.5599,
"step": 132
},
{
"epoch": 0.2783882783882784,
"grad_norm": 0.2914719581604004,
"learning_rate": 0.0003683083511777302,
"loss": 1.5165,
"step": 133
},
{
"epoch": 0.2804814233385662,
"grad_norm": 0.29973331093788147,
"learning_rate": 0.00036723768736616704,
"loss": 1.4669,
"step": 134
},
{
"epoch": 0.282574568288854,
"grad_norm": 0.3091066777706146,
"learning_rate": 0.00036616702355460383,
"loss": 1.5048,
"step": 135
},
{
"epoch": 0.2846677132391418,
"grad_norm": 0.31531593203544617,
"learning_rate": 0.00036509635974304067,
"loss": 1.5086,
"step": 136
},
{
"epoch": 0.2867608581894296,
"grad_norm": 0.29967445135116577,
"learning_rate": 0.0003640256959314775,
"loss": 1.546,
"step": 137
},
{
"epoch": 0.28885400313971743,
"grad_norm": 0.33331945538520813,
"learning_rate": 0.00036295503211991435,
"loss": 1.4824,
"step": 138
},
{
"epoch": 0.2909471480900052,
"grad_norm": 0.3064332902431488,
"learning_rate": 0.0003618843683083512,
"loss": 1.573,
"step": 139
},
{
"epoch": 0.29304029304029305,
"grad_norm": 0.3276407718658447,
"learning_rate": 0.00036081370449678803,
"loss": 1.5517,
"step": 140
},
{
"epoch": 0.29513343799058084,
"grad_norm": 0.2774730622768402,
"learning_rate": 0.0003597430406852248,
"loss": 1.4853,
"step": 141
},
{
"epoch": 0.2972265829408687,
"grad_norm": 0.33176928758621216,
"learning_rate": 0.00035867237687366166,
"loss": 1.5346,
"step": 142
},
{
"epoch": 0.29931972789115646,
"grad_norm": 0.29119884967803955,
"learning_rate": 0.0003576017130620985,
"loss": 1.4962,
"step": 143
},
{
"epoch": 0.30141287284144425,
"grad_norm": 0.2862621545791626,
"learning_rate": 0.0003565310492505354,
"loss": 1.511,
"step": 144
},
{
"epoch": 0.3035060177917321,
"grad_norm": 0.31120261549949646,
"learning_rate": 0.0003554603854389722,
"loss": 1.5114,
"step": 145
},
{
"epoch": 0.30559916274201987,
"grad_norm": 0.3067992329597473,
"learning_rate": 0.000354389721627409,
"loss": 1.4757,
"step": 146
},
{
"epoch": 0.3076923076923077,
"grad_norm": 0.30063048005104065,
"learning_rate": 0.00035331905781584586,
"loss": 1.5617,
"step": 147
},
{
"epoch": 0.3097854526425955,
"grad_norm": 0.3026478588581085,
"learning_rate": 0.00035224839400428264,
"loss": 1.446,
"step": 148
},
{
"epoch": 0.31187859759288333,
"grad_norm": 0.2846631705760956,
"learning_rate": 0.0003511777301927195,
"loss": 1.4821,
"step": 149
},
{
"epoch": 0.3139717425431711,
"grad_norm": 0.3027445673942566,
"learning_rate": 0.0003501070663811563,
"loss": 1.5073,
"step": 150
},
{
"epoch": 0.3160648874934589,
"grad_norm": 0.29627394676208496,
"learning_rate": 0.00034903640256959316,
"loss": 1.496,
"step": 151
},
{
"epoch": 0.31815803244374674,
"grad_norm": 0.3188508450984955,
"learning_rate": 0.00034796573875803,
"loss": 1.5395,
"step": 152
},
{
"epoch": 0.3202511773940345,
"grad_norm": 0.29025983810424805,
"learning_rate": 0.00034689507494646684,
"loss": 1.5177,
"step": 153
},
{
"epoch": 0.32234432234432236,
"grad_norm": 0.31973665952682495,
"learning_rate": 0.00034582441113490363,
"loss": 1.4774,
"step": 154
},
{
"epoch": 0.32443746729461015,
"grad_norm": 0.29737603664398193,
"learning_rate": 0.00034475374732334047,
"loss": 1.5141,
"step": 155
},
{
"epoch": 0.32653061224489793,
"grad_norm": 0.31660401821136475,
"learning_rate": 0.0003436830835117773,
"loss": 1.4917,
"step": 156
},
{
"epoch": 0.3286237571951858,
"grad_norm": 0.29376548528671265,
"learning_rate": 0.0003426124197002141,
"loss": 1.4832,
"step": 157
},
{
"epoch": 0.33071690214547356,
"grad_norm": 0.3015284836292267,
"learning_rate": 0.000341541755888651,
"loss": 1.5408,
"step": 158
},
{
"epoch": 0.3328100470957614,
"grad_norm": 0.3028362989425659,
"learning_rate": 0.00034047109207708783,
"loss": 1.5326,
"step": 159
},
{
"epoch": 0.3349031920460492,
"grad_norm": 0.292458176612854,
"learning_rate": 0.0003394004282655246,
"loss": 1.5642,
"step": 160
},
{
"epoch": 0.336996336996337,
"grad_norm": 0.29941415786743164,
"learning_rate": 0.00033832976445396146,
"loss": 1.5114,
"step": 161
},
{
"epoch": 0.3390894819466248,
"grad_norm": 0.2882905602455139,
"learning_rate": 0.0003372591006423983,
"loss": 1.5531,
"step": 162
},
{
"epoch": 0.3411826268969126,
"grad_norm": 0.294680655002594,
"learning_rate": 0.00033618843683083514,
"loss": 1.4926,
"step": 163
},
{
"epoch": 0.34327577184720043,
"grad_norm": 0.3013262152671814,
"learning_rate": 0.0003351177730192719,
"loss": 1.5403,
"step": 164
},
{
"epoch": 0.3453689167974882,
"grad_norm": 0.29358139634132385,
"learning_rate": 0.0003340471092077088,
"loss": 1.503,
"step": 165
},
{
"epoch": 0.34746206174777605,
"grad_norm": 0.2902645766735077,
"learning_rate": 0.00033297644539614566,
"loss": 1.5107,
"step": 166
},
{
"epoch": 0.34955520669806384,
"grad_norm": 0.2952733635902405,
"learning_rate": 0.00033190578158458244,
"loss": 1.525,
"step": 167
},
{
"epoch": 0.3516483516483517,
"grad_norm": 0.29654550552368164,
"learning_rate": 0.0003308351177730193,
"loss": 1.4989,
"step": 168
},
{
"epoch": 0.35374149659863946,
"grad_norm": 0.28225046396255493,
"learning_rate": 0.0003297644539614561,
"loss": 1.46,
"step": 169
},
{
"epoch": 0.35583464154892724,
"grad_norm": 0.30447182059288025,
"learning_rate": 0.0003286937901498929,
"loss": 1.5389,
"step": 170
},
{
"epoch": 0.3579277864992151,
"grad_norm": 0.29335105419158936,
"learning_rate": 0.00032762312633832975,
"loss": 1.4962,
"step": 171
},
{
"epoch": 0.36002093144950287,
"grad_norm": 0.3066118061542511,
"learning_rate": 0.00032655246252676664,
"loss": 1.5122,
"step": 172
},
{
"epoch": 0.3621140763997907,
"grad_norm": 0.3051617443561554,
"learning_rate": 0.00032548179871520343,
"loss": 1.4534,
"step": 173
},
{
"epoch": 0.3642072213500785,
"grad_norm": 0.3032102584838867,
"learning_rate": 0.00032441113490364027,
"loss": 1.4937,
"step": 174
},
{
"epoch": 0.3663003663003663,
"grad_norm": 0.28743627667427063,
"learning_rate": 0.0003233404710920771,
"loss": 1.5288,
"step": 175
},
{
"epoch": 0.3683935112506541,
"grad_norm": 0.2957185208797455,
"learning_rate": 0.0003222698072805139,
"loss": 1.4617,
"step": 176
},
{
"epoch": 0.3704866562009419,
"grad_norm": 0.30057474970817566,
"learning_rate": 0.00032119914346895073,
"loss": 1.5141,
"step": 177
},
{
"epoch": 0.37257980115122974,
"grad_norm": 0.3146776556968689,
"learning_rate": 0.0003201284796573876,
"loss": 1.5033,
"step": 178
},
{
"epoch": 0.3746729461015175,
"grad_norm": 0.3096458911895752,
"learning_rate": 0.0003190578158458244,
"loss": 1.5224,
"step": 179
},
{
"epoch": 0.37676609105180536,
"grad_norm": 0.30057886242866516,
"learning_rate": 0.00031798715203426126,
"loss": 1.447,
"step": 180
},
{
"epoch": 0.37885923600209315,
"grad_norm": 0.3033558428287506,
"learning_rate": 0.0003169164882226981,
"loss": 1.5301,
"step": 181
},
{
"epoch": 0.38095238095238093,
"grad_norm": 0.31797683238983154,
"learning_rate": 0.0003158458244111349,
"loss": 1.467,
"step": 182
},
{
"epoch": 0.38304552590266877,
"grad_norm": 0.28933098912239075,
"learning_rate": 0.0003147751605995717,
"loss": 1.4411,
"step": 183
},
{
"epoch": 0.38513867085295656,
"grad_norm": 0.3075162470340729,
"learning_rate": 0.00031370449678800856,
"loss": 1.4969,
"step": 184
},
{
"epoch": 0.3872318158032444,
"grad_norm": 0.2986271381378174,
"learning_rate": 0.0003126338329764454,
"loss": 1.4833,
"step": 185
},
{
"epoch": 0.3893249607535322,
"grad_norm": 0.30813565850257874,
"learning_rate": 0.00031156316916488224,
"loss": 1.4931,
"step": 186
},
{
"epoch": 0.39141810570381996,
"grad_norm": 0.30407366156578064,
"learning_rate": 0.0003104925053533191,
"loss": 1.5209,
"step": 187
},
{
"epoch": 0.3935112506541078,
"grad_norm": 0.3050621747970581,
"learning_rate": 0.0003094218415417559,
"loss": 1.5341,
"step": 188
},
{
"epoch": 0.3956043956043956,
"grad_norm": 0.29337289929389954,
"learning_rate": 0.0003083511777301927,
"loss": 1.4728,
"step": 189
},
{
"epoch": 0.3976975405546834,
"grad_norm": 0.3019981384277344,
"learning_rate": 0.00030728051391862955,
"loss": 1.4761,
"step": 190
},
{
"epoch": 0.3997906855049712,
"grad_norm": 0.30941662192344666,
"learning_rate": 0.0003062098501070664,
"loss": 1.5217,
"step": 191
},
{
"epoch": 0.40188383045525905,
"grad_norm": 0.3021605312824249,
"learning_rate": 0.00030513918629550323,
"loss": 1.5251,
"step": 192
},
{
"epoch": 0.40397697540554683,
"grad_norm": 0.30456283688545227,
"learning_rate": 0.00030406852248394007,
"loss": 1.5013,
"step": 193
},
{
"epoch": 0.4060701203558346,
"grad_norm": 0.3142157793045044,
"learning_rate": 0.0003029978586723769,
"loss": 1.446,
"step": 194
},
{
"epoch": 0.40816326530612246,
"grad_norm": 0.29888784885406494,
"learning_rate": 0.0003019271948608137,
"loss": 1.4682,
"step": 195
},
{
"epoch": 0.41025641025641024,
"grad_norm": 0.29900768399238586,
"learning_rate": 0.00030085653104925053,
"loss": 1.4986,
"step": 196
},
{
"epoch": 0.4123495552066981,
"grad_norm": 0.2862175405025482,
"learning_rate": 0.0002997858672376874,
"loss": 1.4253,
"step": 197
},
{
"epoch": 0.41444270015698587,
"grad_norm": 0.323761910200119,
"learning_rate": 0.00029871520342612416,
"loss": 1.4583,
"step": 198
},
{
"epoch": 0.4165358451072737,
"grad_norm": 0.2848580479621887,
"learning_rate": 0.00029764453961456105,
"loss": 1.4632,
"step": 199
},
{
"epoch": 0.4186289900575615,
"grad_norm": 0.2909344434738159,
"learning_rate": 0.0002965738758029979,
"loss": 1.4738,
"step": 200
}
],
"logging_steps": 1,
"max_steps": 477,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 5.592309546614784e+17,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}