{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.9926958831341302,
"eval_steps": 500,
"global_step": 752,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0026542800265428003,
"grad_norm": 6.981875026564461,
"learning_rate": 1.0000000000000002e-06,
"loss": 1.5142,
"step": 1
},
{
"epoch": 0.0053085600530856005,
"grad_norm": 7.3585790096388095,
"learning_rate": 2.0000000000000003e-06,
"loss": 1.5399,
"step": 2
},
{
"epoch": 0.007962840079628402,
"grad_norm": 6.187456707748293,
"learning_rate": 3e-06,
"loss": 1.4331,
"step": 3
},
{
"epoch": 0.010617120106171201,
"grad_norm": 3.322201654326747,
"learning_rate": 4.000000000000001e-06,
"loss": 1.4069,
"step": 4
},
{
"epoch": 0.013271400132714002,
"grad_norm": 6.898750176066953,
"learning_rate": 5e-06,
"loss": 1.398,
"step": 5
},
{
"epoch": 0.015925680159256803,
"grad_norm": 9.84996090672252,
"learning_rate": 6e-06,
"loss": 1.4207,
"step": 6
},
{
"epoch": 0.0185799601857996,
"grad_norm": 5.911295227152632,
"learning_rate": 7e-06,
"loss": 1.396,
"step": 7
},
{
"epoch": 0.021234240212342402,
"grad_norm": 4.487244776192921,
"learning_rate": 8.000000000000001e-06,
"loss": 1.3842,
"step": 8
},
{
"epoch": 0.023888520238885203,
"grad_norm": 3.963029697998983,
"learning_rate": 9e-06,
"loss": 1.4094,
"step": 9
},
{
"epoch": 0.026542800265428004,
"grad_norm": 3.9061159778838777,
"learning_rate": 1e-05,
"loss": 1.3289,
"step": 10
},
{
"epoch": 0.029197080291970802,
"grad_norm": 4.06704445096823,
"learning_rate": 9.999988945517944e-06,
"loss": 1.3803,
"step": 11
},
{
"epoch": 0.03185136031851361,
"grad_norm": 2.604059841393431,
"learning_rate": 9.999955782120656e-06,
"loss": 1.3673,
"step": 12
},
{
"epoch": 0.034505640345056404,
"grad_norm": 2.257369986938731,
"learning_rate": 9.999900509954779e-06,
"loss": 1.3601,
"step": 13
},
{
"epoch": 0.0371599203715992,
"grad_norm": 2.1847487718443896,
"learning_rate": 9.999823129264712e-06,
"loss": 1.2897,
"step": 14
},
{
"epoch": 0.039814200398142006,
"grad_norm": 2.07192636064569,
"learning_rate": 9.99972364039262e-06,
"loss": 1.3908,
"step": 15
},
{
"epoch": 0.042468480424684804,
"grad_norm": 1.9501283234498499,
"learning_rate": 9.99960204377842e-06,
"loss": 1.3159,
"step": 16
},
{
"epoch": 0.0451227604512276,
"grad_norm": 2.493269875780714,
"learning_rate": 9.999458339959787e-06,
"loss": 1.294,
"step": 17
},
{
"epoch": 0.047777040477770406,
"grad_norm": 2.5949272964482057,
"learning_rate": 9.999292529572152e-06,
"loss": 1.3185,
"step": 18
},
{
"epoch": 0.050431320504313204,
"grad_norm": 2.2388099180255643,
"learning_rate": 9.99910461334869e-06,
"loss": 1.2132,
"step": 19
},
{
"epoch": 0.05308560053085601,
"grad_norm": 2.2899507956205762,
"learning_rate": 9.99889459212033e-06,
"loss": 1.293,
"step": 20
},
{
"epoch": 0.055739880557398806,
"grad_norm": 1.9172996859984803,
"learning_rate": 9.998662466815743e-06,
"loss": 1.2282,
"step": 21
},
{
"epoch": 0.058394160583941604,
"grad_norm": 1.695255684950505,
"learning_rate": 9.99840823846134e-06,
"loss": 1.2453,
"step": 22
},
{
"epoch": 0.06104844061048441,
"grad_norm": 1.9499380269432547,
"learning_rate": 9.998131908181262e-06,
"loss": 1.3126,
"step": 23
},
{
"epoch": 0.06370272063702721,
"grad_norm": 2.235039095134041,
"learning_rate": 9.997833477197386e-06,
"loss": 1.3,
"step": 24
},
{
"epoch": 0.06635700066357,
"grad_norm": 1.7768874556246268,
"learning_rate": 9.997512946829314e-06,
"loss": 1.3027,
"step": 25
},
{
"epoch": 0.06901128069011281,
"grad_norm": 1.8710962469378072,
"learning_rate": 9.997170318494362e-06,
"loss": 1.2571,
"step": 26
},
{
"epoch": 0.07166556071665561,
"grad_norm": 2.1786369089634734,
"learning_rate": 9.996805593707566e-06,
"loss": 1.2633,
"step": 27
},
{
"epoch": 0.0743198407431984,
"grad_norm": 2.3145346512057805,
"learning_rate": 9.996418774081658e-06,
"loss": 1.2439,
"step": 28
},
{
"epoch": 0.07697412076974121,
"grad_norm": 1.740845781272116,
"learning_rate": 9.996009861327077e-06,
"loss": 1.2437,
"step": 29
},
{
"epoch": 0.07962840079628401,
"grad_norm": 1.9183185283288997,
"learning_rate": 9.99557885725195e-06,
"loss": 1.333,
"step": 30
},
{
"epoch": 0.0822826808228268,
"grad_norm": 2.1688553875791987,
"learning_rate": 9.995125763762089e-06,
"loss": 1.3145,
"step": 31
},
{
"epoch": 0.08493696084936961,
"grad_norm": 1.9658120398634014,
"learning_rate": 9.994650582860978e-06,
"loss": 1.2682,
"step": 32
},
{
"epoch": 0.08759124087591241,
"grad_norm": 2.136505316782775,
"learning_rate": 9.994153316649769e-06,
"loss": 1.24,
"step": 33
},
{
"epoch": 0.0902455209024552,
"grad_norm": 1.670383957571605,
"learning_rate": 9.99363396732727e-06,
"loss": 1.2421,
"step": 34
},
{
"epoch": 0.09289980092899801,
"grad_norm": 1.9007693724974954,
"learning_rate": 9.993092537189936e-06,
"loss": 1.1936,
"step": 35
},
{
"epoch": 0.09555408095554081,
"grad_norm": 1.8427231589681057,
"learning_rate": 9.992529028631859e-06,
"loss": 1.2568,
"step": 36
},
{
"epoch": 0.0982083609820836,
"grad_norm": 1.9014135968523682,
"learning_rate": 9.991943444144758e-06,
"loss": 1.231,
"step": 37
},
{
"epoch": 0.10086264100862641,
"grad_norm": 1.714724530777744,
"learning_rate": 9.991335786317964e-06,
"loss": 1.2559,
"step": 38
},
{
"epoch": 0.10351692103516921,
"grad_norm": 1.9540837660082362,
"learning_rate": 9.990706057838417e-06,
"loss": 1.2583,
"step": 39
},
{
"epoch": 0.10617120106171202,
"grad_norm": 1.7120831927587263,
"learning_rate": 9.990054261490643e-06,
"loss": 1.2095,
"step": 40
},
{
"epoch": 0.10882548108825481,
"grad_norm": 1.7124302876215762,
"learning_rate": 9.989380400156752e-06,
"loss": 1.2361,
"step": 41
},
{
"epoch": 0.11147976111479761,
"grad_norm": 1.7683889355402804,
"learning_rate": 9.98868447681642e-06,
"loss": 1.2134,
"step": 42
},
{
"epoch": 0.11413404114134042,
"grad_norm": 1.8716677445605339,
"learning_rate": 9.987966494546873e-06,
"loss": 1.3081,
"step": 43
},
{
"epoch": 0.11678832116788321,
"grad_norm": 1.810504125507985,
"learning_rate": 9.987226456522884e-06,
"loss": 1.2789,
"step": 44
},
{
"epoch": 0.11944260119442601,
"grad_norm": 2.107999452852097,
"learning_rate": 9.986464366016743e-06,
"loss": 1.2965,
"step": 45
},
{
"epoch": 0.12209688122096882,
"grad_norm": 1.9463843496195974,
"learning_rate": 9.985680226398261e-06,
"loss": 1.2455,
"step": 46
},
{
"epoch": 0.12475116124751161,
"grad_norm": 1.8557616158057193,
"learning_rate": 9.984874041134738e-06,
"loss": 1.2432,
"step": 47
},
{
"epoch": 0.12740544127405443,
"grad_norm": 1.7060682110649106,
"learning_rate": 9.984045813790959e-06,
"loss": 1.1864,
"step": 48
},
{
"epoch": 0.1300597213005972,
"grad_norm": 1.9204675247056242,
"learning_rate": 9.983195548029173e-06,
"loss": 1.2525,
"step": 49
},
{
"epoch": 0.13271400132714,
"grad_norm": 1.8711916549013854,
"learning_rate": 9.98232324760908e-06,
"loss": 1.2836,
"step": 50
},
{
"epoch": 0.1353682813536828,
"grad_norm": 1.6842894937517436,
"learning_rate": 9.981428916387812e-06,
"loss": 1.183,
"step": 51
},
{
"epoch": 0.13802256138022562,
"grad_norm": 1.6306230130296617,
"learning_rate": 9.980512558319915e-06,
"loss": 1.2369,
"step": 52
},
{
"epoch": 0.14067684140676842,
"grad_norm": 1.566857719000752,
"learning_rate": 9.979574177457337e-06,
"loss": 1.1844,
"step": 53
},
{
"epoch": 0.14333112143331123,
"grad_norm": 2.0361026967903966,
"learning_rate": 9.978613777949401e-06,
"loss": 1.2064,
"step": 54
},
{
"epoch": 0.145985401459854,
"grad_norm": 1.8265620367248863,
"learning_rate": 9.977631364042796e-06,
"loss": 1.2432,
"step": 55
},
{
"epoch": 0.1486396814863968,
"grad_norm": 1.7838441388683621,
"learning_rate": 9.976626940081553e-06,
"loss": 1.2852,
"step": 56
},
{
"epoch": 0.1512939615129396,
"grad_norm": 1.7544005767854343,
"learning_rate": 9.975600510507025e-06,
"loss": 1.1735,
"step": 57
},
{
"epoch": 0.15394824153948242,
"grad_norm": 1.80732471729061,
"learning_rate": 9.974552079857873e-06,
"loss": 1.2198,
"step": 58
},
{
"epoch": 0.15660252156602522,
"grad_norm": 1.7204547120415132,
"learning_rate": 9.973481652770039e-06,
"loss": 1.2409,
"step": 59
},
{
"epoch": 0.15925680159256803,
"grad_norm": 1.7446267682486616,
"learning_rate": 9.972389233976729e-06,
"loss": 1.236,
"step": 60
},
{
"epoch": 0.1619110816191108,
"grad_norm": 1.8367615413386507,
"learning_rate": 9.971274828308396e-06,
"loss": 1.2333,
"step": 61
},
{
"epoch": 0.1645653616456536,
"grad_norm": 2.024298503175875,
"learning_rate": 9.970138440692706e-06,
"loss": 1.1798,
"step": 62
},
{
"epoch": 0.1672196416721964,
"grad_norm": 1.6639343310164172,
"learning_rate": 9.968980076154533e-06,
"loss": 1.2429,
"step": 63
},
{
"epoch": 0.16987392169873922,
"grad_norm": 1.7526879469365466,
"learning_rate": 9.967799739815925e-06,
"loss": 1.2448,
"step": 64
},
{
"epoch": 0.17252820172528202,
"grad_norm": 1.551089471898675,
"learning_rate": 9.966597436896085e-06,
"loss": 1.2221,
"step": 65
},
{
"epoch": 0.17518248175182483,
"grad_norm": 3.285115069392907,
"learning_rate": 9.965373172711343e-06,
"loss": 1.2576,
"step": 66
},
{
"epoch": 0.17783676177836763,
"grad_norm": 1.9429706042692902,
"learning_rate": 9.964126952675148e-06,
"loss": 1.2211,
"step": 67
},
{
"epoch": 0.1804910418049104,
"grad_norm": 1.6014290221743892,
"learning_rate": 9.962858782298023e-06,
"loss": 1.2105,
"step": 68
},
{
"epoch": 0.1831453218314532,
"grad_norm": 1.7597646377706344,
"learning_rate": 9.961568667187556e-06,
"loss": 1.2401,
"step": 69
},
{
"epoch": 0.18579960185799602,
"grad_norm": 1.6408652164338016,
"learning_rate": 9.960256613048367e-06,
"loss": 1.1577,
"step": 70
},
{
"epoch": 0.18845388188453882,
"grad_norm": 2.4349687274686334,
"learning_rate": 9.958922625682088e-06,
"loss": 1.193,
"step": 71
},
{
"epoch": 0.19110816191108163,
"grad_norm": 1.9037201335080784,
"learning_rate": 9.957566710987338e-06,
"loss": 1.1489,
"step": 72
},
{
"epoch": 0.19376244193762443,
"grad_norm": 2.1368452991743014,
"learning_rate": 9.956188874959686e-06,
"loss": 1.3215,
"step": 73
},
{
"epoch": 0.1964167219641672,
"grad_norm": 1.7342842074642177,
"learning_rate": 9.954789123691643e-06,
"loss": 1.2288,
"step": 74
},
{
"epoch": 0.19907100199071,
"grad_norm": 1.883985974459675,
"learning_rate": 9.953367463372615e-06,
"loss": 1.2349,
"step": 75
},
{
"epoch": 0.20172528201725282,
"grad_norm": 1.8151478617151462,
"learning_rate": 9.951923900288888e-06,
"loss": 1.1481,
"step": 76
},
{
"epoch": 0.20437956204379562,
"grad_norm": 1.8542475078063598,
"learning_rate": 9.950458440823602e-06,
"loss": 1.262,
"step": 77
},
{
"epoch": 0.20703384207033843,
"grad_norm": 1.77553753540162,
"learning_rate": 9.948971091456715e-06,
"loss": 1.1834,
"step": 78
},
{
"epoch": 0.20968812209688123,
"grad_norm": 1.8152497279053155,
"learning_rate": 9.947461858764978e-06,
"loss": 1.1749,
"step": 79
},
{
"epoch": 0.21234240212342403,
"grad_norm": 1.5929993680573362,
"learning_rate": 9.945930749421903e-06,
"loss": 1.2696,
"step": 80
},
{
"epoch": 0.2149966821499668,
"grad_norm": 2.1883175245684092,
"learning_rate": 9.944377770197741e-06,
"loss": 1.2375,
"step": 81
},
{
"epoch": 0.21765096217650962,
"grad_norm": 1.7556050567556294,
"learning_rate": 9.942802927959444e-06,
"loss": 1.2017,
"step": 82
},
{
"epoch": 0.22030524220305242,
"grad_norm": 1.7392066404895135,
"learning_rate": 9.941206229670634e-06,
"loss": 1.1788,
"step": 83
},
{
"epoch": 0.22295952222959523,
"grad_norm": 2.230563622877349,
"learning_rate": 9.939587682391587e-06,
"loss": 1.2629,
"step": 84
},
{
"epoch": 0.22561380225613803,
"grad_norm": 1.7257531276366218,
"learning_rate": 9.937947293279178e-06,
"loss": 1.1574,
"step": 85
},
{
"epoch": 0.22826808228268083,
"grad_norm": 3.7946522494948134,
"learning_rate": 9.93628506958687e-06,
"loss": 1.2539,
"step": 86
},
{
"epoch": 0.2309223623092236,
"grad_norm": 1.7746755901383093,
"learning_rate": 9.934601018664672e-06,
"loss": 1.1672,
"step": 87
},
{
"epoch": 0.23357664233576642,
"grad_norm": 1.6160184542663385,
"learning_rate": 9.932895147959106e-06,
"loss": 1.2047,
"step": 88
},
{
"epoch": 0.23623092236230922,
"grad_norm": 1.597818533914632,
"learning_rate": 9.931167465013182e-06,
"loss": 1.2087,
"step": 89
},
{
"epoch": 0.23888520238885202,
"grad_norm": 1.8200709191179871,
"learning_rate": 9.929417977466356e-06,
"loss": 1.2594,
"step": 90
},
{
"epoch": 0.24153948241539483,
"grad_norm": 1.5869876859286098,
"learning_rate": 9.927646693054498e-06,
"loss": 1.2923,
"step": 91
},
{
"epoch": 0.24419376244193763,
"grad_norm": 1.6678230174198274,
"learning_rate": 9.925853619609858e-06,
"loss": 1.1979,
"step": 92
},
{
"epoch": 0.2468480424684804,
"grad_norm": 1.7206885835934083,
"learning_rate": 9.924038765061042e-06,
"loss": 1.2248,
"step": 93
},
{
"epoch": 0.24950232249502322,
"grad_norm": 1.8965216866987153,
"learning_rate": 9.922202137432954e-06,
"loss": 1.1793,
"step": 94
},
{
"epoch": 0.252156602521566,
"grad_norm": 1.7827181222199764,
"learning_rate": 9.920343744846786e-06,
"loss": 1.2539,
"step": 95
},
{
"epoch": 0.25481088254810885,
"grad_norm": 1.704509646049322,
"learning_rate": 9.918463595519963e-06,
"loss": 1.1845,
"step": 96
},
{
"epoch": 0.25746516257465163,
"grad_norm": 1.8008684596562938,
"learning_rate": 9.916561697766114e-06,
"loss": 1.1873,
"step": 97
},
{
"epoch": 0.2601194426011944,
"grad_norm": 1.5268474470110187,
"learning_rate": 9.91463805999504e-06,
"loss": 1.1634,
"step": 98
},
{
"epoch": 0.26277372262773724,
"grad_norm": 1.69776670129652,
"learning_rate": 9.912692690712667e-06,
"loss": 1.2496,
"step": 99
},
{
"epoch": 0.26542800265428,
"grad_norm": 19.950768938401303,
"learning_rate": 9.910725598521014e-06,
"loss": 1.2266,
"step": 100
},
{
"epoch": 0.26808228268082285,
"grad_norm": 1.888189833523382,
"learning_rate": 9.908736792118157e-06,
"loss": 1.1783,
"step": 101
},
{
"epoch": 0.2707365627073656,
"grad_norm": 1.6553587447766995,
"learning_rate": 9.906726280298185e-06,
"loss": 1.1888,
"step": 102
},
{
"epoch": 0.2733908427339084,
"grad_norm": 1.7645503651144456,
"learning_rate": 9.904694071951167e-06,
"loss": 1.2331,
"step": 103
},
{
"epoch": 0.27604512276045123,
"grad_norm": 1.7561319773931536,
"learning_rate": 9.902640176063103e-06,
"loss": 1.2429,
"step": 104
},
{
"epoch": 0.278699402786994,
"grad_norm": 1.826781329048666,
"learning_rate": 9.900564601715898e-06,
"loss": 1.2053,
"step": 105
},
{
"epoch": 0.28135368281353684,
"grad_norm": 2.0178364653670777,
"learning_rate": 9.89846735808731e-06,
"loss": 1.1855,
"step": 106
},
{
"epoch": 0.2840079628400796,
"grad_norm": 2.1853732110604027,
"learning_rate": 9.896348454450918e-06,
"loss": 1.1514,
"step": 107
},
{
"epoch": 0.28666224286662245,
"grad_norm": 1.863102490412834,
"learning_rate": 9.894207900176074e-06,
"loss": 1.1582,
"step": 108
},
{
"epoch": 0.28931652289316523,
"grad_norm": 2.1558166021806504,
"learning_rate": 9.892045704727864e-06,
"loss": 1.2692,
"step": 109
},
{
"epoch": 0.291970802919708,
"grad_norm": 1.623855596114215,
"learning_rate": 9.889861877667071e-06,
"loss": 1.1406,
"step": 110
},
{
"epoch": 0.29462508294625084,
"grad_norm": 1.925359975573577,
"learning_rate": 9.887656428650123e-06,
"loss": 1.144,
"step": 111
},
{
"epoch": 0.2972793629727936,
"grad_norm": 2.0780064875742634,
"learning_rate": 9.885429367429062e-06,
"loss": 1.2095,
"step": 112
},
{
"epoch": 0.29993364299933645,
"grad_norm": 1.757836459981376,
"learning_rate": 9.883180703851488e-06,
"loss": 1.2129,
"step": 113
},
{
"epoch": 0.3025879230258792,
"grad_norm": 1.6497771335719753,
"learning_rate": 9.880910447860527e-06,
"loss": 1.1528,
"step": 114
},
{
"epoch": 0.30524220305242206,
"grad_norm": 1.9314161378924497,
"learning_rate": 9.878618609494781e-06,
"loss": 1.2038,
"step": 115
},
{
"epoch": 0.30789648307896483,
"grad_norm": 1.5945997988558909,
"learning_rate": 9.876305198888284e-06,
"loss": 1.1349,
"step": 116
},
{
"epoch": 0.3105507631055076,
"grad_norm": 1.7095400428823162,
"learning_rate": 9.873970226270458e-06,
"loss": 1.1543,
"step": 117
},
{
"epoch": 0.31320504313205044,
"grad_norm": 1.6150384960254696,
"learning_rate": 9.871613701966067e-06,
"loss": 1.1527,
"step": 118
},
{
"epoch": 0.3158593231585932,
"grad_norm": 3.288441610824325,
"learning_rate": 9.869235636395177e-06,
"loss": 1.2411,
"step": 119
},
{
"epoch": 0.31851360318513605,
"grad_norm": 1.6258023683537948,
"learning_rate": 9.866836040073099e-06,
"loss": 1.2002,
"step": 120
},
{
"epoch": 0.32116788321167883,
"grad_norm": 1.6467592688369062,
"learning_rate": 9.86441492361035e-06,
"loss": 1.2134,
"step": 121
},
{
"epoch": 0.3238221632382216,
"grad_norm": 1.5988307616959179,
"learning_rate": 9.861972297712606e-06,
"loss": 1.2259,
"step": 122
},
{
"epoch": 0.32647644326476444,
"grad_norm": 1.9915164437167947,
"learning_rate": 9.859508173180653e-06,
"loss": 1.2369,
"step": 123
},
{
"epoch": 0.3291307232913072,
"grad_norm": 1.751874113048822,
"learning_rate": 9.857022560910338e-06,
"loss": 1.1954,
"step": 124
},
{
"epoch": 0.33178500331785005,
"grad_norm": 1.589249974809787,
"learning_rate": 9.854515471892527e-06,
"loss": 1.1434,
"step": 125
},
{
"epoch": 0.3344392833443928,
"grad_norm": 1.6571603039493696,
"learning_rate": 9.851986917213044e-06,
"loss": 1.1276,
"step": 126
},
{
"epoch": 0.33709356337093566,
"grad_norm": 1.821753338127428,
"learning_rate": 9.849436908052636e-06,
"loss": 1.1889,
"step": 127
},
{
"epoch": 0.33974784339747843,
"grad_norm": 1.55634782143693,
"learning_rate": 9.846865455686915e-06,
"loss": 1.1833,
"step": 128
},
{
"epoch": 0.3424021234240212,
"grad_norm": 1.4585503339043484,
"learning_rate": 9.844272571486313e-06,
"loss": 1.1979,
"step": 129
},
{
"epoch": 0.34505640345056404,
"grad_norm": 1.6155812940652678,
"learning_rate": 9.84165826691602e-06,
"loss": 1.179,
"step": 130
},
{
"epoch": 0.3477106834771068,
"grad_norm": 1.609059975302855,
"learning_rate": 9.839022553535957e-06,
"loss": 1.2091,
"step": 131
},
{
"epoch": 0.35036496350364965,
"grad_norm": 1.6996931072095949,
"learning_rate": 9.836365443000697e-06,
"loss": 1.1223,
"step": 132
},
{
"epoch": 0.35301924353019243,
"grad_norm": 1.616355220759201,
"learning_rate": 9.833686947059436e-06,
"loss": 1.0918,
"step": 133
},
{
"epoch": 0.35567352355673526,
"grad_norm": 1.6096571582268207,
"learning_rate": 9.830987077555925e-06,
"loss": 1.1654,
"step": 134
},
{
"epoch": 0.35832780358327804,
"grad_norm": 1.565339921018465,
"learning_rate": 9.828265846428428e-06,
"loss": 1.1634,
"step": 135
},
{
"epoch": 0.3609820836098208,
"grad_norm": 1.546016830156871,
"learning_rate": 9.825523265709667e-06,
"loss": 1.1751,
"step": 136
},
{
"epoch": 0.36363636363636365,
"grad_norm": 2.0425258490960836,
"learning_rate": 9.822759347526766e-06,
"loss": 1.1841,
"step": 137
},
{
"epoch": 0.3662906436629064,
"grad_norm": 1.7147276492095496,
"learning_rate": 9.819974104101198e-06,
"loss": 1.2335,
"step": 138
},
{
"epoch": 0.36894492368944926,
"grad_norm": 1.5869898030324339,
"learning_rate": 9.817167547748729e-06,
"loss": 1.2584,
"step": 139
},
{
"epoch": 0.37159920371599203,
"grad_norm": 1.6478179096824475,
"learning_rate": 9.814339690879376e-06,
"loss": 1.1961,
"step": 140
},
{
"epoch": 0.37425348374253486,
"grad_norm": 1.5596319041645448,
"learning_rate": 9.811490545997331e-06,
"loss": 1.2046,
"step": 141
},
{
"epoch": 0.37690776376907764,
"grad_norm": 1.81548347845434,
"learning_rate": 9.808620125700925e-06,
"loss": 1.2137,
"step": 142
},
{
"epoch": 0.3795620437956204,
"grad_norm": 1.5700607431043994,
"learning_rate": 9.80572844268256e-06,
"loss": 1.2,
"step": 143
},
{
"epoch": 0.38221632382216325,
"grad_norm": 1.6383722320139935,
"learning_rate": 9.802815509728662e-06,
"loss": 1.1747,
"step": 144
},
{
"epoch": 0.384870603848706,
"grad_norm": 1.8125605110455933,
"learning_rate": 9.799881339719615e-06,
"loss": 1.1867,
"step": 145
},
{
"epoch": 0.38752488387524886,
"grad_norm": 1.7582804382886328,
"learning_rate": 9.796925945629711e-06,
"loss": 1.3162,
"step": 146
},
{
"epoch": 0.39017916390179164,
"grad_norm": 2.0734414221603665,
"learning_rate": 9.793949340527091e-06,
"loss": 1.234,
"step": 147
},
{
"epoch": 0.3928334439283344,
"grad_norm": 1.5379766795331946,
"learning_rate": 9.790951537573686e-06,
"loss": 1.1185,
"step": 148
},
{
"epoch": 0.39548772395487725,
"grad_norm": 1.6227118631483388,
"learning_rate": 9.787932550025158e-06,
"loss": 1.1523,
"step": 149
},
{
"epoch": 0.39814200398142,
"grad_norm": 1.5530246573576652,
"learning_rate": 9.784892391230847e-06,
"loss": 1.1405,
"step": 150
},
{
"epoch": 0.40079628400796286,
"grad_norm": 1.6380120481890832,
"learning_rate": 9.781831074633703e-06,
"loss": 1.2153,
"step": 151
},
{
"epoch": 0.40345056403450563,
"grad_norm": 1.9786402997029178,
"learning_rate": 9.778748613770234e-06,
"loss": 1.2213,
"step": 152
},
{
"epoch": 0.40610484406104846,
"grad_norm": 1.5864201917409944,
"learning_rate": 9.775645022270448e-06,
"loss": 1.1674,
"step": 153
},
{
"epoch": 0.40875912408759124,
"grad_norm": 1.6501859286504295,
"learning_rate": 9.772520313857777e-06,
"loss": 1.1805,
"step": 154
},
{
"epoch": 0.411413404114134,
"grad_norm": 1.5132303328319994,
"learning_rate": 9.769374502349038e-06,
"loss": 1.15,
"step": 155
},
{
"epoch": 0.41406768414067685,
"grad_norm": 1.6907615300646948,
"learning_rate": 9.766207601654356e-06,
"loss": 1.1848,
"step": 156
},
{
"epoch": 0.4167219641672196,
"grad_norm": 1.7001401085077357,
"learning_rate": 9.763019625777111e-06,
"loss": 1.2335,
"step": 157
},
{
"epoch": 0.41937624419376246,
"grad_norm": 1.7600942954439958,
"learning_rate": 9.759810588813872e-06,
"loss": 1.1743,
"step": 158
},
{
"epoch": 0.42203052422030524,
"grad_norm": 1.6515697771161784,
"learning_rate": 9.756580504954334e-06,
"loss": 1.2276,
"step": 159
},
{
"epoch": 0.42468480424684807,
"grad_norm": 1.6967375558942543,
"learning_rate": 9.753329388481261e-06,
"loss": 1.2082,
"step": 160
},
{
"epoch": 0.42733908427339085,
"grad_norm": 1.7152427713922846,
"learning_rate": 9.750057253770413e-06,
"loss": 1.1458,
"step": 161
},
{
"epoch": 0.4299933642999336,
"grad_norm": 1.625060781845651,
"learning_rate": 9.746764115290496e-06,
"loss": 1.2033,
"step": 162
},
{
"epoch": 0.43264764432647645,
"grad_norm": 1.7069397928194143,
"learning_rate": 9.743449987603082e-06,
"loss": 1.2342,
"step": 163
},
{
"epoch": 0.43530192435301923,
"grad_norm": 1.708629436384557,
"learning_rate": 9.740114885362562e-06,
"loss": 1.2442,
"step": 164
},
{
"epoch": 0.43795620437956206,
"grad_norm": 1.5857203714681123,
"learning_rate": 9.736758823316062e-06,
"loss": 1.2097,
"step": 165
},
{
"epoch": 0.44061048440610484,
"grad_norm": 1.657268677184339,
"learning_rate": 9.733381816303395e-06,
"loss": 1.1215,
"step": 166
},
{
"epoch": 0.4432647644326476,
"grad_norm": 1.4640436666626744,
"learning_rate": 9.729983879256988e-06,
"loss": 1.1646,
"step": 167
},
{
"epoch": 0.44591904445919045,
"grad_norm": 1.6268091054804499,
"learning_rate": 9.726565027201813e-06,
"loss": 1.2264,
"step": 168
},
{
"epoch": 0.4485733244857332,
"grad_norm": 1.5858930123997803,
"learning_rate": 9.723125275255325e-06,
"loss": 1.1661,
"step": 169
},
{
"epoch": 0.45122760451227606,
"grad_norm": 1.5759031230494174,
"learning_rate": 9.719664638627395e-06,
"loss": 1.1558,
"step": 170
},
{
"epoch": 0.45388188453881884,
"grad_norm": 1.7486351365651316,
"learning_rate": 9.716183132620242e-06,
"loss": 1.19,
"step": 171
},
{
"epoch": 0.45653616456536167,
"grad_norm": 1.7251231178841304,
"learning_rate": 9.712680772628365e-06,
"loss": 1.2261,
"step": 172
},
{
"epoch": 0.45919044459190445,
"grad_norm": 1.6118734678264717,
"learning_rate": 9.70915757413847e-06,
"loss": 1.2014,
"step": 173
},
{
"epoch": 0.4618447246184472,
"grad_norm": 1.5762577213086215,
"learning_rate": 9.705613552729416e-06,
"loss": 1.1487,
"step": 174
},
{
"epoch": 0.46449900464499005,
"grad_norm": 1.5672859542358526,
"learning_rate": 9.702048724072128e-06,
"loss": 1.1892,
"step": 175
},
{
"epoch": 0.46715328467153283,
"grad_norm": 1.616073022266597,
"learning_rate": 9.698463103929542e-06,
"loss": 1.1718,
"step": 176
},
{
"epoch": 0.46980756469807566,
"grad_norm": 1.605222482810264,
"learning_rate": 9.694856708156526e-06,
"loss": 1.1022,
"step": 177
},
{
"epoch": 0.47246184472461844,
"grad_norm": 1.483617625625729,
"learning_rate": 9.691229552699817e-06,
"loss": 1.1204,
"step": 178
},
{
"epoch": 0.4751161247511613,
"grad_norm": 1.6018473502205803,
"learning_rate": 9.68758165359794e-06,
"loss": 1.1816,
"step": 179
},
{
"epoch": 0.47777040477770405,
"grad_norm": 1.5779836150848479,
"learning_rate": 9.683913026981155e-06,
"loss": 1.1871,
"step": 180
},
{
"epoch": 0.4804246848042468,
"grad_norm": 1.6463102663610685,
"learning_rate": 9.680223689071364e-06,
"loss": 1.1139,
"step": 181
},
{
"epoch": 0.48307896483078966,
"grad_norm": 1.7091000919337074,
"learning_rate": 9.676513656182059e-06,
"loss": 1.1695,
"step": 182
},
{
"epoch": 0.48573324485733244,
"grad_norm": 1.633509337933534,
"learning_rate": 9.672782944718234e-06,
"loss": 1.1811,
"step": 183
},
{
"epoch": 0.48838752488387527,
"grad_norm": 1.5767561431519088,
"learning_rate": 9.669031571176322e-06,
"loss": 1.2062,
"step": 184
},
{
"epoch": 0.49104180491041804,
"grad_norm": 1.6306907003550404,
"learning_rate": 9.665259552144122e-06,
"loss": 1.1829,
"step": 185
},
{
"epoch": 0.4936960849369608,
"grad_norm": 1.517988528061533,
"learning_rate": 9.66146690430072e-06,
"loss": 1.2014,
"step": 186
},
{
"epoch": 0.49635036496350365,
"grad_norm": 1.598871387440831,
"learning_rate": 9.657653644416417e-06,
"loss": 1.1496,
"step": 187
},
{
"epoch": 0.49900464499004643,
"grad_norm": 2.400377785726973,
"learning_rate": 9.65381978935266e-06,
"loss": 1.1905,
"step": 188
},
{
"epoch": 0.5016589250165893,
"grad_norm": 1.5306038174802905,
"learning_rate": 9.649965356061961e-06,
"loss": 1.1225,
"step": 189
},
{
"epoch": 0.504313205043132,
"grad_norm": 1.7432637039460837,
"learning_rate": 9.646090361587828e-06,
"loss": 1.2338,
"step": 190
},
{
"epoch": 0.5069674850696748,
"grad_norm": 1.549174772320108,
"learning_rate": 9.642194823064679e-06,
"loss": 1.1395,
"step": 191
},
{
"epoch": 0.5096217650962177,
"grad_norm": 1.4556718082039433,
"learning_rate": 9.63827875771778e-06,
"loss": 1.1054,
"step": 192
},
{
"epoch": 0.5122760451227605,
"grad_norm": 1.546232476076245,
"learning_rate": 9.634342182863163e-06,
"loss": 1.1821,
"step": 193
},
{
"epoch": 0.5149303251493033,
"grad_norm": 1.6428065540768686,
"learning_rate": 9.630385115907545e-06,
"loss": 1.2078,
"step": 194
},
{
"epoch": 0.517584605175846,
"grad_norm": 1.5932949193165389,
"learning_rate": 9.626407574348258e-06,
"loss": 1.1646,
"step": 195
},
{
"epoch": 0.5202388852023888,
"grad_norm": 1.5803201555935116,
"learning_rate": 9.622409575773162e-06,
"loss": 1.166,
"step": 196
},
{
"epoch": 0.5228931652289317,
"grad_norm": 1.5292820314306055,
"learning_rate": 9.618391137860583e-06,
"loss": 1.2152,
"step": 197
},
{
"epoch": 0.5255474452554745,
"grad_norm": 1.3727930028186761,
"learning_rate": 9.614352278379217e-06,
"loss": 1.1402,
"step": 198
},
{
"epoch": 0.5282017252820173,
"grad_norm": 1.6819090165661312,
"learning_rate": 9.610293015188067e-06,
"loss": 1.1665,
"step": 199
},
{
"epoch": 0.53085600530856,
"grad_norm": 1.5006037012098021,
"learning_rate": 9.606213366236354e-06,
"loss": 1.1877,
"step": 200
},
{
"epoch": 0.5335102853351028,
"grad_norm": 1.6016624668408799,
"learning_rate": 9.60211334956344e-06,
"loss": 1.1498,
"step": 201
},
{
"epoch": 0.5361645653616457,
"grad_norm": 1.7368140744461305,
"learning_rate": 9.597992983298748e-06,
"loss": 1.1922,
"step": 202
},
{
"epoch": 0.5388188453881885,
"grad_norm": 1.6176251602621352,
"learning_rate": 9.593852285661684e-06,
"loss": 1.1459,
"step": 203
},
{
"epoch": 0.5414731254147312,
"grad_norm": 1.3750495471617235,
"learning_rate": 9.589691274961556e-06,
"loss": 1.0835,
"step": 204
},
{
"epoch": 0.544127405441274,
"grad_norm": 1.6906485599869903,
"learning_rate": 9.585509969597491e-06,
"loss": 1.22,
"step": 205
},
{
"epoch": 0.5467816854678168,
"grad_norm": 1.5439326128894457,
"learning_rate": 9.581308388058354e-06,
"loss": 1.1364,
"step": 206
},
{
"epoch": 0.5494359654943597,
"grad_norm": 1.5251041120197495,
"learning_rate": 9.577086548922671e-06,
"loss": 1.2201,
"step": 207
},
{
"epoch": 0.5520902455209025,
"grad_norm": 1.511712369802414,
"learning_rate": 9.572844470858537e-06,
"loss": 1.1091,
"step": 208
},
{
"epoch": 0.5547445255474452,
"grad_norm": 1.8573483808679467,
"learning_rate": 9.568582172623544e-06,
"loss": 1.2284,
"step": 209
},
{
"epoch": 0.557398805573988,
"grad_norm": 1.4309251806187955,
"learning_rate": 9.56429967306469e-06,
"loss": 1.1646,
"step": 210
},
{
"epoch": 0.5600530856005309,
"grad_norm": 1.6268260856080405,
"learning_rate": 9.559996991118304e-06,
"loss": 1.1812,
"step": 211
},
{
"epoch": 0.5627073656270737,
"grad_norm": 1.6752285964398912,
"learning_rate": 9.55567414580995e-06,
"loss": 1.19,
"step": 212
},
{
"epoch": 0.5653616456536165,
"grad_norm": 1.6202125494829664,
"learning_rate": 9.551331156254358e-06,
"loss": 1.2001,
"step": 213
},
{
"epoch": 0.5680159256801592,
"grad_norm": 1.4441208249265054,
"learning_rate": 9.546968041655326e-06,
"loss": 1.2011,
"step": 214
},
{
"epoch": 0.570670205706702,
"grad_norm": 1.4681168393734876,
"learning_rate": 9.542584821305643e-06,
"loss": 1.118,
"step": 215
},
{
"epoch": 0.5733244857332449,
"grad_norm": 1.67215223118757,
"learning_rate": 9.538181514587004e-06,
"loss": 1.1441,
"step": 216
},
{
"epoch": 0.5759787657597877,
"grad_norm": 1.840004210878956,
"learning_rate": 9.533758140969913e-06,
"loss": 1.1689,
"step": 217
},
{
"epoch": 0.5786330457863305,
"grad_norm": 2.0817799067244387,
"learning_rate": 9.529314720013618e-06,
"loss": 1.1879,
"step": 218
},
{
"epoch": 0.5812873258128732,
"grad_norm": 1.6384013753881452,
"learning_rate": 9.524851271366002e-06,
"loss": 1.1157,
"step": 219
},
{
"epoch": 0.583941605839416,
"grad_norm": 1.6847540459176993,
"learning_rate": 9.520367814763514e-06,
"loss": 1.1583,
"step": 220
},
{
"epoch": 0.5865958858659589,
"grad_norm": 1.553367758905212,
"learning_rate": 9.515864370031066e-06,
"loss": 1.0916,
"step": 221
},
{
"epoch": 0.5892501658925017,
"grad_norm": 1.6595661898312408,
"learning_rate": 9.511340957081957e-06,
"loss": 1.1912,
"step": 222
},
{
"epoch": 0.5919044459190445,
"grad_norm": 1.6816767854984012,
"learning_rate": 9.506797595917787e-06,
"loss": 1.1948,
"step": 223
},
{
"epoch": 0.5945587259455872,
"grad_norm": 1.4766094174812612,
"learning_rate": 9.502234306628354e-06,
"loss": 1.1607,
"step": 224
},
{
"epoch": 0.59721300597213,
"grad_norm": 1.5815513019760774,
"learning_rate": 9.49765110939158e-06,
"loss": 1.1248,
"step": 225
},
{
"epoch": 0.5998672859986729,
"grad_norm": 1.6485658910927394,
"learning_rate": 9.493048024473413e-06,
"loss": 1.2191,
"step": 226
},
{
"epoch": 0.6025215660252157,
"grad_norm": 1.424065848656427,
"learning_rate": 9.488425072227738e-06,
"loss": 1.2521,
"step": 227
},
{
"epoch": 0.6051758460517584,
"grad_norm": 1.4486333802405926,
"learning_rate": 9.483782273096295e-06,
"loss": 1.1734,
"step": 228
},
{
"epoch": 0.6078301260783012,
"grad_norm": 1.6817918601770532,
"learning_rate": 9.47911964760858e-06,
"loss": 1.1695,
"step": 229
},
{
"epoch": 0.6104844061048441,
"grad_norm": 1.6160290558732326,
"learning_rate": 9.474437216381756e-06,
"loss": 1.154,
"step": 230
},
{
"epoch": 0.6131386861313869,
"grad_norm": 1.4261795572898603,
"learning_rate": 9.469735000120564e-06,
"loss": 1.1544,
"step": 231
},
{
"epoch": 0.6157929661579297,
"grad_norm": 1.458151411666846,
"learning_rate": 9.46501301961723e-06,
"loss": 1.2065,
"step": 232
},
{
"epoch": 0.6184472461844724,
"grad_norm": 1.5627499060274408,
"learning_rate": 9.460271295751373e-06,
"loss": 1.1579,
"step": 233
},
{
"epoch": 0.6211015262110152,
"grad_norm": 1.86944032236805,
"learning_rate": 9.455509849489915e-06,
"loss": 1.1519,
"step": 234
},
{
"epoch": 0.6237558062375581,
"grad_norm": 1.979766174904363,
"learning_rate": 9.450728701886985e-06,
"loss": 1.2358,
"step": 235
},
{
"epoch": 0.6264100862641009,
"grad_norm": 1.5229843416844162,
"learning_rate": 9.445927874083825e-06,
"loss": 1.1207,
"step": 236
},
{
"epoch": 0.6290643662906437,
"grad_norm": 1.5916340950774943,
"learning_rate": 9.441107387308701e-06,
"loss": 1.2486,
"step": 237
},
{
"epoch": 0.6317186463171864,
"grad_norm": 1.4982052500691954,
"learning_rate": 9.436267262876808e-06,
"loss": 1.1445,
"step": 238
},
{
"epoch": 0.6343729263437292,
"grad_norm": 1.868028818978397,
"learning_rate": 9.431407522190176e-06,
"loss": 1.2215,
"step": 239
},
{
"epoch": 0.6370272063702721,
"grad_norm": 1.5000893386206633,
"learning_rate": 9.426528186737566e-06,
"loss": 1.1748,
"step": 240
},
{
"epoch": 0.6396814863968149,
"grad_norm": 1.6105517075622542,
"learning_rate": 9.421629278094394e-06,
"loss": 1.1444,
"step": 241
},
{
"epoch": 0.6423357664233577,
"grad_norm": 1.6245044582496362,
"learning_rate": 9.416710817922615e-06,
"loss": 1.2016,
"step": 242
},
{
"epoch": 0.6449900464499004,
"grad_norm": 1.582791773770731,
"learning_rate": 9.411772827970642e-06,
"loss": 1.1595,
"step": 243
},
{
"epoch": 0.6476443264764432,
"grad_norm": 1.5289298221123744,
"learning_rate": 9.406815330073244e-06,
"loss": 1.196,
"step": 244
},
{
"epoch": 0.6502986065029861,
"grad_norm": 1.494805179412693,
"learning_rate": 9.40183834615145e-06,
"loss": 1.119,
"step": 245
},
{
"epoch": 0.6529528865295289,
"grad_norm": 1.6857955705395817,
"learning_rate": 9.396841898212452e-06,
"loss": 1.1222,
"step": 246
},
{
"epoch": 0.6556071665560717,
"grad_norm": 1.465569644664737,
"learning_rate": 9.391826008349507e-06,
"loss": 1.1196,
"step": 247
},
{
"epoch": 0.6582614465826144,
"grad_norm": 1.6038700287536702,
"learning_rate": 9.38679069874184e-06,
"loss": 1.1596,
"step": 248
},
{
"epoch": 0.6609157266091573,
"grad_norm": 1.754259412074635,
"learning_rate": 9.381735991654547e-06,
"loss": 1.185,
"step": 249
},
{
"epoch": 0.6635700066357001,
"grad_norm": 1.5054976516017162,
"learning_rate": 9.376661909438496e-06,
"loss": 1.14,
"step": 250
},
{
"epoch": 0.6662242866622429,
"grad_norm": 1.6591136646331954,
"learning_rate": 9.371568474530228e-06,
"loss": 1.1453,
"step": 251
},
{
"epoch": 0.6688785666887856,
"grad_norm": 1.602614315373211,
"learning_rate": 9.366455709451857e-06,
"loss": 1.115,
"step": 252
},
{
"epoch": 0.6715328467153284,
"grad_norm": 1.3802344389470933,
"learning_rate": 9.36132363681097e-06,
"loss": 1.0926,
"step": 253
},
{
"epoch": 0.6741871267418713,
"grad_norm": 1.5028041314507699,
"learning_rate": 9.356172279300528e-06,
"loss": 1.1388,
"step": 254
},
{
"epoch": 0.6768414067684141,
"grad_norm": 1.4603385973006835,
"learning_rate": 9.35100165969877e-06,
"loss": 1.1261,
"step": 255
},
{
"epoch": 0.6794956867949569,
"grad_norm": 2.222737357031752,
"learning_rate": 9.3458118008691e-06,
"loss": 1.1181,
"step": 256
},
{
"epoch": 0.6821499668214996,
"grad_norm": 1.5628150576174966,
"learning_rate": 9.340602725760003e-06,
"loss": 1.1269,
"step": 257
},
{
"epoch": 0.6848042468480424,
"grad_norm": 1.7660936315398623,
"learning_rate": 9.335374457404928e-06,
"loss": 1.1567,
"step": 258
},
{
"epoch": 0.6874585268745853,
"grad_norm": 1.5095573241471834,
"learning_rate": 9.330127018922195e-06,
"loss": 1.1407,
"step": 259
},
{
"epoch": 0.6901128069011281,
"grad_norm": 1.4506359372228914,
"learning_rate": 9.324860433514888e-06,
"loss": 1.1668,
"step": 260
},
{
"epoch": 0.6927670869276709,
"grad_norm": 1.536882345986633,
"learning_rate": 9.319574724470756e-06,
"loss": 1.1581,
"step": 261
},
{
"epoch": 0.6954213669542136,
"grad_norm": 1.4356269422691534,
"learning_rate": 9.314269915162115e-06,
"loss": 1.1075,
"step": 262
},
{
"epoch": 0.6980756469807564,
"grad_norm": 1.373904876593965,
"learning_rate": 9.308946029045726e-06,
"loss": 1.1121,
"step": 263
},
{
"epoch": 0.7007299270072993,
"grad_norm": 1.5328812905843867,
"learning_rate": 9.303603089662717e-06,
"loss": 1.0921,
"step": 264
},
{
"epoch": 0.7033842070338421,
"grad_norm": 1.5072781837506157,
"learning_rate": 9.298241120638451e-06,
"loss": 1.1198,
"step": 265
},
{
"epoch": 0.7060384870603849,
"grad_norm": 1.5995295442728128,
"learning_rate": 9.292860145682451e-06,
"loss": 1.1472,
"step": 266
},
{
"epoch": 0.7086927670869276,
"grad_norm": 1.586589487215959,
"learning_rate": 9.287460188588272e-06,
"loss": 1.2081,
"step": 267
},
{
"epoch": 0.7113470471134705,
"grad_norm": 1.6738675413951511,
"learning_rate": 9.282041273233402e-06,
"loss": 1.1676,
"step": 268
},
{
"epoch": 0.7140013271400133,
"grad_norm": 1.5986869946296454,
"learning_rate": 9.276603423579164e-06,
"loss": 1.213,
"step": 269
},
{
"epoch": 0.7166556071665561,
"grad_norm": 1.5027119454217344,
"learning_rate": 9.271146663670605e-06,
"loss": 1.1622,
"step": 270
},
{
"epoch": 0.7193098871930989,
"grad_norm": 1.4752249291840163,
"learning_rate": 9.265671017636384e-06,
"loss": 1.0725,
"step": 271
},
{
"epoch": 0.7219641672196416,
"grad_norm": 1.6425492982199013,
"learning_rate": 9.260176509688673e-06,
"loss": 1.2088,
"step": 272
},
{
"epoch": 0.7246184472461845,
"grad_norm": 1.671119694405482,
"learning_rate": 9.254663164123052e-06,
"loss": 1.1584,
"step": 273
},
{
"epoch": 0.7272727272727273,
"grad_norm": 3.2839083971639016,
"learning_rate": 9.249131005318388e-06,
"loss": 1.0801,
"step": 274
},
{
"epoch": 0.7299270072992701,
"grad_norm": 1.590670276122513,
"learning_rate": 9.243580057736743e-06,
"loss": 1.1157,
"step": 275
},
{
"epoch": 0.7325812873258128,
"grad_norm": 1.4517652800533363,
"learning_rate": 9.238010345923257e-06,
"loss": 1.1446,
"step": 276
},
{
"epoch": 0.7352355673523556,
"grad_norm": 1.9696673043614277,
"learning_rate": 9.232421894506043e-06,
"loss": 1.1857,
"step": 277
},
{
"epoch": 0.7378898473788985,
"grad_norm": 1.4778960277561557,
"learning_rate": 9.226814728196072e-06,
"loss": 1.1397,
"step": 278
},
{
"epoch": 0.7405441274054413,
"grad_norm": 1.6498804570471097,
"learning_rate": 9.221188871787076e-06,
"loss": 1.1625,
"step": 279
},
{
"epoch": 0.7431984074319841,
"grad_norm": 1.5796993896804141,
"learning_rate": 9.215544350155423e-06,
"loss": 1.1459,
"step": 280
},
{
"epoch": 0.7458526874585268,
"grad_norm": 1.5226644568838132,
"learning_rate": 9.209881188260021e-06,
"loss": 1.1894,
"step": 281
},
{
"epoch": 0.7485069674850697,
"grad_norm": 1.6645552718061039,
"learning_rate": 9.204199411142196e-06,
"loss": 1.0811,
"step": 282
},
{
"epoch": 0.7511612475116125,
"grad_norm": 1.6581847965929961,
"learning_rate": 9.198499043925591e-06,
"loss": 1.1706,
"step": 283
},
{
"epoch": 0.7538155275381553,
"grad_norm": 1.5270964606037345,
"learning_rate": 9.192780111816048e-06,
"loss": 1.1009,
"step": 284
},
{
"epoch": 0.7564698075646981,
"grad_norm": 1.6698962782227256,
"learning_rate": 9.1870426401015e-06,
"loss": 1.1708,
"step": 285
},
{
"epoch": 0.7591240875912408,
"grad_norm": 1.7012646465038568,
"learning_rate": 9.18128665415186e-06,
"loss": 1.1728,
"step": 286
},
{
"epoch": 0.7617783676177837,
"grad_norm": 1.4354980241800914,
"learning_rate": 9.175512179418903e-06,
"loss": 1.1138,
"step": 287
},
{
"epoch": 0.7644326476443265,
"grad_norm": 1.5648924104277102,
"learning_rate": 9.169719241436162e-06,
"loss": 1.0936,
"step": 288
},
{
"epoch": 0.7670869276708693,
"grad_norm": 1.535950564272176,
"learning_rate": 9.163907865818806e-06,
"loss": 1.0884,
"step": 289
},
{
"epoch": 0.769741207697412,
"grad_norm": 1.4657493870841045,
"learning_rate": 9.158078078263536e-06,
"loss": 1.0962,
"step": 290
},
{
"epoch": 0.7723954877239548,
"grad_norm": 1.5960566218254721,
"learning_rate": 9.152229904548464e-06,
"loss": 1.1003,
"step": 291
},
{
"epoch": 0.7750497677504977,
"grad_norm": 1.5026317273526155,
"learning_rate": 9.146363370533004e-06,
"loss": 1.1334,
"step": 292
},
{
"epoch": 0.7777040477770405,
"grad_norm": 1.4667451034506551,
"learning_rate": 9.14047850215775e-06,
"loss": 1.188,
"step": 293
},
{
"epoch": 0.7803583278035833,
"grad_norm": 2.5527846830656773,
"learning_rate": 9.134575325444377e-06,
"loss": 1.1489,
"step": 294
},
{
"epoch": 0.783012607830126,
"grad_norm": 1.5656317760690617,
"learning_rate": 9.128653866495504e-06,
"loss": 1.1049,
"step": 295
},
{
"epoch": 0.7856668878566688,
"grad_norm": 1.4532042000319447,
"learning_rate": 9.122714151494599e-06,
"loss": 1.1156,
"step": 296
},
{
"epoch": 0.7883211678832117,
"grad_norm": 1.4759483242959985,
"learning_rate": 9.116756206705848e-06,
"loss": 1.1396,
"step": 297
},
{
"epoch": 0.7909754479097545,
"grad_norm": 1.4531099151254951,
"learning_rate": 9.110780058474052e-06,
"loss": 1.1011,
"step": 298
},
{
"epoch": 0.7936297279362973,
"grad_norm": 1.509245001105786,
"learning_rate": 9.104785733224498e-06,
"loss": 1.1052,
"step": 299
},
{
"epoch": 0.79628400796284,
"grad_norm": 1.4742686115404562,
"learning_rate": 9.09877325746285e-06,
"loss": 1.1627,
"step": 300
},
{
"epoch": 0.7989382879893829,
"grad_norm": 1.4451227706627736,
"learning_rate": 9.092742657775031e-06,
"loss": 1.1118,
"step": 301
},
{
"epoch": 0.8015925680159257,
"grad_norm": 1.575230566769605,
"learning_rate": 9.086693960827106e-06,
"loss": 1.1625,
"step": 302
},
{
"epoch": 0.8042468480424685,
"grad_norm": 1.6679637319120473,
"learning_rate": 9.080627193365155e-06,
"loss": 1.1452,
"step": 303
},
{
"epoch": 0.8069011280690113,
"grad_norm": 1.4072750238146392,
"learning_rate": 9.07454238221517e-06,
"loss": 1.1121,
"step": 304
},
{
"epoch": 0.809555408095554,
"grad_norm": 1.399645387242144,
"learning_rate": 9.068439554282924e-06,
"loss": 1.1101,
"step": 305
},
{
"epoch": 0.8122096881220969,
"grad_norm": 1.9740369624876526,
"learning_rate": 9.06231873655386e-06,
"loss": 1.0986,
"step": 306
},
{
"epoch": 0.8148639681486397,
"grad_norm": 1.4581046261229995,
"learning_rate": 9.056179956092961e-06,
"loss": 1.1228,
"step": 307
},
{
"epoch": 0.8175182481751825,
"grad_norm": 2.628430909687979,
"learning_rate": 9.050023240044649e-06,
"loss": 1.0783,
"step": 308
},
{
"epoch": 0.8201725282017253,
"grad_norm": 1.6691124773863195,
"learning_rate": 9.043848615632643e-06,
"loss": 1.167,
"step": 309
},
{
"epoch": 0.822826808228268,
"grad_norm": 1.7459906965590473,
"learning_rate": 9.03765611015985e-06,
"loss": 1.2287,
"step": 310
},
{
"epoch": 0.8254810882548109,
"grad_norm": 1.5373249007323673,
"learning_rate": 9.031445751008252e-06,
"loss": 1.1446,
"step": 311
},
{
"epoch": 0.8281353682813537,
"grad_norm": 1.526522854497616,
"learning_rate": 9.025217565638766e-06,
"loss": 1.1609,
"step": 312
},
{
"epoch": 0.8307896483078965,
"grad_norm": 1.3715974716678416,
"learning_rate": 9.018971581591141e-06,
"loss": 1.1761,
"step": 313
},
{
"epoch": 0.8334439283344393,
"grad_norm": 1.733161587991312,
"learning_rate": 9.012707826483823e-06,
"loss": 1.1241,
"step": 314
},
{
"epoch": 0.836098208360982,
"grad_norm": 1.5851407690090333,
"learning_rate": 9.006426328013838e-06,
"loss": 1.1898,
"step": 315
},
{
"epoch": 0.8387524883875249,
"grad_norm": 1.492565448115301,
"learning_rate": 9.000127113956673e-06,
"loss": 1.1281,
"step": 316
},
{
"epoch": 0.8414067684140677,
"grad_norm": 1.4675427619453145,
"learning_rate": 8.993810212166147e-06,
"loss": 1.1078,
"step": 317
},
{
"epoch": 0.8440610484406105,
"grad_norm": 1.7806802137808329,
"learning_rate": 8.987475650574289e-06,
"loss": 1.1113,
"step": 318
},
{
"epoch": 0.8467153284671532,
"grad_norm": 1.7957085592461643,
"learning_rate": 8.98112345719122e-06,
"loss": 1.0371,
"step": 319
},
{
"epoch": 0.8493696084936961,
"grad_norm": 1.6891739001445774,
"learning_rate": 8.974753660105023e-06,
"loss": 1.1939,
"step": 320
},
{
"epoch": 0.8520238885202389,
"grad_norm": 1.361414937851007,
"learning_rate": 8.968366287481621e-06,
"loss": 1.0606,
"step": 321
},
{
"epoch": 0.8546781685467817,
"grad_norm": 1.5477011631255944,
"learning_rate": 8.961961367564652e-06,
"loss": 1.1343,
"step": 322
},
{
"epoch": 0.8573324485733245,
"grad_norm": 1.398038196798421,
"learning_rate": 8.955538928675343e-06,
"loss": 1.0537,
"step": 323
},
{
"epoch": 0.8599867285998672,
"grad_norm": 1.4829616588106211,
"learning_rate": 8.94909899921239e-06,
"loss": 1.1244,
"step": 324
},
{
"epoch": 0.8626410086264101,
"grad_norm": 1.458234865181319,
"learning_rate": 8.94264160765183e-06,
"loss": 1.0945,
"step": 325
},
{
"epoch": 0.8652952886529529,
"grad_norm": 1.48674147774638,
"learning_rate": 8.936166782546907e-06,
"loss": 1.0698,
"step": 326
},
{
"epoch": 0.8679495686794957,
"grad_norm": 1.3468414140104497,
"learning_rate": 8.929674552527956e-06,
"loss": 1.0428,
"step": 327
},
{
"epoch": 0.8706038487060385,
"grad_norm": 1.4927690225590464,
"learning_rate": 8.923164946302274e-06,
"loss": 1.1367,
"step": 328
},
{
"epoch": 0.8732581287325812,
"grad_norm": 1.3780023302624582,
"learning_rate": 8.91663799265399e-06,
"loss": 1.1048,
"step": 329
},
{
"epoch": 0.8759124087591241,
"grad_norm": 1.8364949062401694,
"learning_rate": 8.910093720443945e-06,
"loss": 1.1962,
"step": 330
},
{
"epoch": 0.8785666887856669,
"grad_norm": 1.6803377587803117,
"learning_rate": 8.903532158609548e-06,
"loss": 1.1919,
"step": 331
},
{
"epoch": 0.8812209688122097,
"grad_norm": 1.5621810302199315,
"learning_rate": 8.89695333616467e-06,
"loss": 1.1177,
"step": 332
},
{
"epoch": 0.8838752488387525,
"grad_norm": 1.4462262022449852,
"learning_rate": 8.890357282199504e-06,
"loss": 1.1321,
"step": 333
},
{
"epoch": 0.8865295288652952,
"grad_norm": 1.4286236136174415,
"learning_rate": 8.883744025880429e-06,
"loss": 1.1717,
"step": 334
},
{
"epoch": 0.8891838088918381,
"grad_norm": 1.4484748876813012,
"learning_rate": 8.877113596449895e-06,
"loss": 1.1004,
"step": 335
},
{
"epoch": 0.8918380889183809,
"grad_norm": 1.4164984401949983,
"learning_rate": 8.87046602322629e-06,
"loss": 1.079,
"step": 336
},
{
"epoch": 0.8944923689449237,
"grad_norm": 1.3708607011124272,
"learning_rate": 8.863801335603802e-06,
"loss": 1.133,
"step": 337
},
{
"epoch": 0.8971466489714665,
"grad_norm": 1.3626382714893748,
"learning_rate": 8.857119563052301e-06,
"loss": 1.0734,
"step": 338
},
{
"epoch": 0.8998009289980093,
"grad_norm": 1.5082034601534042,
"learning_rate": 8.850420735117202e-06,
"loss": 1.1691,
"step": 339
},
{
"epoch": 0.9024552090245521,
"grad_norm": 1.3234730893075355,
"learning_rate": 8.843704881419333e-06,
"loss": 1.046,
"step": 340
},
{
"epoch": 0.9051094890510949,
"grad_norm": 1.4896833219647911,
"learning_rate": 8.836972031654807e-06,
"loss": 1.1586,
"step": 341
},
{
"epoch": 0.9077637690776377,
"grad_norm": 1.3697029850159739,
"learning_rate": 8.83022221559489e-06,
"loss": 1.0817,
"step": 342
},
{
"epoch": 0.9104180491041804,
"grad_norm": 1.747564979115208,
"learning_rate": 8.823455463085873e-06,
"loss": 1.0905,
"step": 343
},
{
"epoch": 0.9130723291307233,
"grad_norm": 1.5649272934153584,
"learning_rate": 8.816671804048933e-06,
"loss": 1.0434,
"step": 344
},
{
"epoch": 0.9157266091572661,
"grad_norm": 1.4823250348157,
"learning_rate": 8.809871268480004e-06,
"loss": 1.0895,
"step": 345
},
{
"epoch": 0.9183808891838089,
"grad_norm": 1.4264959835661182,
"learning_rate": 8.803053886449644e-06,
"loss": 1.1502,
"step": 346
},
{
"epoch": 0.9210351692103517,
"grad_norm": 1.5424239648407791,
"learning_rate": 8.796219688102906e-06,
"loss": 1.0734,
"step": 347
},
{
"epoch": 0.9236894492368944,
"grad_norm": 1.594778792432936,
"learning_rate": 8.789368703659199e-06,
"loss": 1.06,
"step": 348
},
{
"epoch": 0.9263437292634373,
"grad_norm": 1.425756455063989,
"learning_rate": 8.782500963412156e-06,
"loss": 1.1091,
"step": 349
},
{
"epoch": 0.9289980092899801,
"grad_norm": 1.4480941030784251,
"learning_rate": 8.775616497729502e-06,
"loss": 1.1146,
"step": 350
},
{
"epoch": 0.9316522893165229,
"grad_norm": 1.9595578470904635,
"learning_rate": 8.768715337052918e-06,
"loss": 1.1353,
"step": 351
},
{
"epoch": 0.9343065693430657,
"grad_norm": 1.6462295827570508,
"learning_rate": 8.761797511897907e-06,
"loss": 1.1376,
"step": 352
},
{
"epoch": 0.9369608493696084,
"grad_norm": 1.393588576405631,
"learning_rate": 8.754863052853658e-06,
"loss": 1.1317,
"step": 353
},
{
"epoch": 0.9396151293961513,
"grad_norm": 2.230474529090937,
"learning_rate": 8.747911990582912e-06,
"loss": 1.1086,
"step": 354
},
{
"epoch": 0.9422694094226941,
"grad_norm": 1.809443765521074,
"learning_rate": 8.740944355821827e-06,
"loss": 1.1018,
"step": 355
},
{
"epoch": 0.9449236894492369,
"grad_norm": 1.6826959358419462,
"learning_rate": 8.733960179379842e-06,
"loss": 1.1766,
"step": 356
},
{
"epoch": 0.9475779694757797,
"grad_norm": 1.429793323082417,
"learning_rate": 8.726959492139535e-06,
"loss": 1.062,
"step": 357
},
{
"epoch": 0.9502322495023225,
"grad_norm": 1.3304241051942485,
"learning_rate": 8.719942325056496e-06,
"loss": 1.0864,
"step": 358
},
{
"epoch": 0.9528865295288653,
"grad_norm": 1.838527760485716,
"learning_rate": 8.712908709159183e-06,
"loss": 1.08,
"step": 359
},
{
"epoch": 0.9555408095554081,
"grad_norm": 1.8095644142003555,
"learning_rate": 8.70585867554879e-06,
"loss": 1.0622,
"step": 360
},
{
"epoch": 0.9581950895819509,
"grad_norm": 1.3961944428914481,
"learning_rate": 8.698792255399104e-06,
"loss": 1.1279,
"step": 361
},
{
"epoch": 0.9608493696084937,
"grad_norm": 1.4265196608054989,
"learning_rate": 8.691709479956373e-06,
"loss": 1.0786,
"step": 362
},
{
"epoch": 0.9635036496350365,
"grad_norm": 1.5175948559199692,
"learning_rate": 8.68461038053916e-06,
"loss": 1.1046,
"step": 363
},
{
"epoch": 0.9661579296615793,
"grad_norm": 1.5709878342434411,
"learning_rate": 8.67749498853821e-06,
"loss": 1.0947,
"step": 364
},
{
"epoch": 0.9688122096881221,
"grad_norm": 1.5372734019009258,
"learning_rate": 8.670363335416319e-06,
"loss": 1.0346,
"step": 365
},
{
"epoch": 0.9714664897146649,
"grad_norm": 1.3550031766754063,
"learning_rate": 8.663215452708173e-06,
"loss": 1.0868,
"step": 366
},
{
"epoch": 0.9741207697412076,
"grad_norm": 1.5040356499297907,
"learning_rate": 8.656051372020232e-06,
"loss": 1.1083,
"step": 367
},
{
"epoch": 0.9767750497677505,
"grad_norm": 1.5264462091802162,
"learning_rate": 8.648871125030576e-06,
"loss": 1.1647,
"step": 368
},
{
"epoch": 0.9794293297942933,
"grad_norm": 1.8183949324824284,
"learning_rate": 8.64167474348877e-06,
"loss": 1.0809,
"step": 369
},
{
"epoch": 0.9820836098208361,
"grad_norm": 2.6148655405710874,
"learning_rate": 8.634462259215719e-06,
"loss": 1.1195,
"step": 370
},
{
"epoch": 0.9847378898473789,
"grad_norm": 1.5140959417993884,
"learning_rate": 8.627233704103538e-06,
"loss": 1.0768,
"step": 371
},
{
"epoch": 0.9873921698739216,
"grad_norm": 1.3953146864224168,
"learning_rate": 8.619989110115398e-06,
"loss": 1.0998,
"step": 372
},
{
"epoch": 0.9900464499004645,
"grad_norm": 1.5342377987936564,
"learning_rate": 8.612728509285395e-06,
"loss": 1.1627,
"step": 373
},
{
"epoch": 0.9927007299270073,
"grad_norm": 1.6359257997310512,
"learning_rate": 8.6054519337184e-06,
"loss": 1.0947,
"step": 374
},
{
"epoch": 0.9953550099535501,
"grad_norm": 1.468781234700457,
"learning_rate": 8.59815941558992e-06,
"loss": 1.0958,
"step": 375
},
{
"epoch": 0.9980092899800929,
"grad_norm": 1.8585796860334978,
"learning_rate": 8.590850987145964e-06,
"loss": 1.1439,
"step": 376
},
{
"epoch": 1.0006635700066357,
"grad_norm": 1.4419076627140608,
"learning_rate": 8.583526680702888e-06,
"loss": 1.1053,
"step": 377
},
{
"epoch": 1.0033178500331785,
"grad_norm": 1.9598302676254797,
"learning_rate": 8.576186528647253e-06,
"loss": 1.1538,
"step": 378
},
{
"epoch": 1.00199203187251,
"grad_norm": 2.6817151626469315,
"learning_rate": 8.568830563435695e-06,
"loss": 0.8784,
"step": 379
},
{
"epoch": 1.00464807436919,
"grad_norm": 2.3656261104141683,
"learning_rate": 8.561458817594767e-06,
"loss": 0.8496,
"step": 380
},
{
"epoch": 1.0073041168658698,
"grad_norm": 3.236925490140009,
"learning_rate": 8.554071323720802e-06,
"loss": 0.835,
"step": 381
},
{
"epoch": 1.0099601593625498,
"grad_norm": 3.843007571124674,
"learning_rate": 8.546668114479769e-06,
"loss": 0.8405,
"step": 382
},
{
"epoch": 1.0126162018592297,
"grad_norm": 1.9111030291143845,
"learning_rate": 8.53924922260712e-06,
"loss": 0.8939,
"step": 383
},
{
"epoch": 1.0152722443559097,
"grad_norm": 2.377637900130101,
"learning_rate": 8.531814680907664e-06,
"loss": 0.8582,
"step": 384
},
{
"epoch": 1.0179282868525896,
"grad_norm": 2.0107910554499444,
"learning_rate": 8.5243645222554e-06,
"loss": 0.8601,
"step": 385
},
{
"epoch": 1.0205843293492696,
"grad_norm": 1.6935928768441335,
"learning_rate": 8.51689877959339e-06,
"loss": 0.8809,
"step": 386
},
{
"epoch": 1.0232403718459495,
"grad_norm": 1.704066721140233,
"learning_rate": 8.509417485933598e-06,
"loss": 0.8165,
"step": 387
},
{
"epoch": 1.0258964143426295,
"grad_norm": 1.6573154345299554,
"learning_rate": 8.501920674356755e-06,
"loss": 0.775,
"step": 388
},
{
"epoch": 1.0285524568393094,
"grad_norm": 1.6237544976562632,
"learning_rate": 8.494408378012208e-06,
"loss": 0.8115,
"step": 389
},
{
"epoch": 1.0312084993359893,
"grad_norm": 1.7621516061992955,
"learning_rate": 8.48688063011778e-06,
"loss": 0.867,
"step": 390
},
{
"epoch": 1.0338645418326693,
"grad_norm": 1.656679120365536,
"learning_rate": 8.479337463959607e-06,
"loss": 0.8387,
"step": 391
},
{
"epoch": 1.0365205843293492,
"grad_norm": 1.8462379110802485,
"learning_rate": 8.471778912892008e-06,
"loss": 0.7986,
"step": 392
},
{
"epoch": 1.0391766268260292,
"grad_norm": 1.5346458154712161,
"learning_rate": 8.46420501033733e-06,
"loss": 0.7449,
"step": 393
},
{
"epoch": 1.0418326693227091,
"grad_norm": 1.6072720048731164,
"learning_rate": 8.456615789785804e-06,
"loss": 0.789,
"step": 394
},
{
"epoch": 1.044488711819389,
"grad_norm": 1.6542954637743557,
"learning_rate": 8.449011284795389e-06,
"loss": 0.8418,
"step": 395
},
{
"epoch": 1.047144754316069,
"grad_norm": 1.6140250309056903,
"learning_rate": 8.441391528991629e-06,
"loss": 0.787,
"step": 396
},
{
"epoch": 1.049800796812749,
"grad_norm": 1.6969918225387495,
"learning_rate": 8.433756556067506e-06,
"loss": 0.7224,
"step": 397
},
{
"epoch": 1.052456839309429,
"grad_norm": 1.6791557339222636,
"learning_rate": 8.42610639978329e-06,
"loss": 0.7711,
"step": 398
},
{
"epoch": 1.0551128818061088,
"grad_norm": 1.8496221919094664,
"learning_rate": 8.418441093966387e-06,
"loss": 0.8002,
"step": 399
},
{
"epoch": 1.0577689243027888,
"grad_norm": 1.6730347681300224,
"learning_rate": 8.410760672511188e-06,
"loss": 0.7967,
"step": 400
},
{
"epoch": 1.0604249667994687,
"grad_norm": 1.6351550046155985,
"learning_rate": 8.403065169378932e-06,
"loss": 0.7733,
"step": 401
},
{
"epoch": 1.0630810092961487,
"grad_norm": 1.7116450424772824,
"learning_rate": 8.395354618597533e-06,
"loss": 0.7989,
"step": 402
},
{
"epoch": 1.0657370517928286,
"grad_norm": 1.7530198918200703,
"learning_rate": 8.387629054261454e-06,
"loss": 0.8113,
"step": 403
},
{
"epoch": 1.0683930942895086,
"grad_norm": 1.7205729697156094,
"learning_rate": 8.379888510531536e-06,
"loss": 0.7841,
"step": 404
},
{
"epoch": 1.0710491367861885,
"grad_norm": 1.5855647522801817,
"learning_rate": 8.37213302163486e-06,
"loss": 0.7764,
"step": 405
},
{
"epoch": 1.0737051792828685,
"grad_norm": 1.7069540659005535,
"learning_rate": 8.364362621864595e-06,
"loss": 0.7622,
"step": 406
},
{
"epoch": 1.0763612217795484,
"grad_norm": 2.16867793043578,
"learning_rate": 8.356577345579836e-06,
"loss": 0.809,
"step": 407
},
{
"epoch": 1.0790172642762283,
"grad_norm": 1.6510857902080103,
"learning_rate": 8.348777227205462e-06,
"loss": 0.8271,
"step": 408
},
{
"epoch": 1.0816733067729083,
"grad_norm": 1.6345528498547808,
"learning_rate": 8.34096230123198e-06,
"loss": 0.8222,
"step": 409
},
{
"epoch": 1.0843293492695882,
"grad_norm": 1.6586084882710281,
"learning_rate": 8.333132602215374e-06,
"loss": 0.8207,
"step": 410
},
{
"epoch": 1.0869853917662682,
"grad_norm": 1.7037280491243554,
"learning_rate": 8.325288164776952e-06,
"loss": 0.8023,
"step": 411
},
{
"epoch": 1.0896414342629481,
"grad_norm": 1.7599459329779006,
"learning_rate": 8.31742902360319e-06,
"loss": 0.8487,
"step": 412
},
{
"epoch": 1.092297476759628,
"grad_norm": 1.6713698600025593,
"learning_rate": 8.309555213445583e-06,
"loss": 0.7517,
"step": 413
},
{
"epoch": 1.094953519256308,
"grad_norm": 1.9144563268361539,
"learning_rate": 8.301666769120488e-06,
"loss": 0.7743,
"step": 414
},
{
"epoch": 1.097609561752988,
"grad_norm": 1.803684622272647,
"learning_rate": 8.29376372550897e-06,
"loss": 0.8916,
"step": 415
},
{
"epoch": 1.100265604249668,
"grad_norm": 1.7244484243906384,
"learning_rate": 8.28584611755665e-06,
"loss": 0.8751,
"step": 416
},
{
"epoch": 1.1029216467463479,
"grad_norm": 1.571447077389202,
"learning_rate": 8.277913980273556e-06,
"loss": 0.792,
"step": 417
},
{
"epoch": 1.1055776892430278,
"grad_norm": 1.6265661139125767,
"learning_rate": 8.269967348733947e-06,
"loss": 0.8271,
"step": 418
},
{
"epoch": 1.1082337317397077,
"grad_norm": 1.4867715710827545,
"learning_rate": 8.262006258076187e-06,
"loss": 0.7518,
"step": 419
},
{
"epoch": 1.1108897742363877,
"grad_norm": 1.9065325615287738,
"learning_rate": 8.25403074350257e-06,
"loss": 0.8217,
"step": 420
},
{
"epoch": 1.1135458167330676,
"grad_norm": 1.4856711522583446,
"learning_rate": 8.246040840279165e-06,
"loss": 0.7575,
"step": 421
},
{
"epoch": 1.1162018592297476,
"grad_norm": 1.6522511921393792,
"learning_rate": 8.238036583735673e-06,
"loss": 0.8373,
"step": 422
},
{
"epoch": 1.1188579017264275,
"grad_norm": 1.6086804575482818,
"learning_rate": 8.230018009265255e-06,
"loss": 0.7999,
"step": 423
},
{
"epoch": 1.1215139442231075,
"grad_norm": 1.7684786767218659,
"learning_rate": 8.221985152324385e-06,
"loss": 0.8025,
"step": 424
},
{
"epoch": 1.1241699867197874,
"grad_norm": 1.6725764333554063,
"learning_rate": 8.213938048432697e-06,
"loss": 0.8117,
"step": 425
},
{
"epoch": 1.1268260292164674,
"grad_norm": 1.7827929534114502,
"learning_rate": 8.205876733172813e-06,
"loss": 0.8309,
"step": 426
},
{
"epoch": 1.1294820717131473,
"grad_norm": 1.698478383799961,
"learning_rate": 8.197801242190204e-06,
"loss": 0.8268,
"step": 427
},
{
"epoch": 1.1321381142098272,
"grad_norm": 1.8118203307554201,
"learning_rate": 8.189711611193012e-06,
"loss": 0.849,
"step": 428
},
{
"epoch": 1.1347941567065072,
"grad_norm": 2.070219778475728,
"learning_rate": 8.181607875951911e-06,
"loss": 0.7663,
"step": 429
},
{
"epoch": 1.1374501992031871,
"grad_norm": 1.6850427736472506,
"learning_rate": 8.17349007229994e-06,
"loss": 0.7611,
"step": 430
},
{
"epoch": 1.140106241699867,
"grad_norm": 1.6749598308998042,
"learning_rate": 8.165358236132347e-06,
"loss": 0.8187,
"step": 431
},
{
"epoch": 1.1427622841965472,
"grad_norm": 1.6288782095132437,
"learning_rate": 8.157212403406424e-06,
"loss": 0.8636,
"step": 432
},
{
"epoch": 1.1454183266932272,
"grad_norm": 4.442514317024235,
"learning_rate": 8.149052610141357e-06,
"loss": 0.7602,
"step": 433
},
{
"epoch": 1.1480743691899071,
"grad_norm": 1.8429556263296754,
"learning_rate": 8.14087889241806e-06,
"loss": 0.8371,
"step": 434
},
{
"epoch": 1.150730411686587,
"grad_norm": 1.7045658311688014,
"learning_rate": 8.132691286379022e-06,
"loss": 0.8294,
"step": 435
},
{
"epoch": 1.153386454183267,
"grad_norm": 1.6060394678635606,
"learning_rate": 8.124489828228136e-06,
"loss": 0.7894,
"step": 436
},
{
"epoch": 1.156042496679947,
"grad_norm": 1.6620423492382332,
"learning_rate": 8.116274554230557e-06,
"loss": 0.8314,
"step": 437
},
{
"epoch": 1.158698539176627,
"grad_norm": 1.6318153684467354,
"learning_rate": 8.108045500712518e-06,
"loss": 0.7925,
"step": 438
},
{
"epoch": 1.1613545816733069,
"grad_norm": 1.549105187586498,
"learning_rate": 8.099802704061194e-06,
"loss": 0.7802,
"step": 439
},
{
"epoch": 1.1640106241699868,
"grad_norm": 1.5295943417332036,
"learning_rate": 8.091546200724521e-06,
"loss": 0.783,
"step": 440
},
{
"epoch": 1.1666666666666667,
"grad_norm": 1.6920608032596305,
"learning_rate": 8.083276027211049e-06,
"loss": 0.8102,
"step": 441
},
{
"epoch": 1.1693227091633467,
"grad_norm": 1.6969436680242431,
"learning_rate": 8.07499222008977e-06,
"loss": 0.8349,
"step": 442
},
{
"epoch": 1.1719787516600266,
"grad_norm": 1.667367336891202,
"learning_rate": 8.066694815989961e-06,
"loss": 0.8588,
"step": 443
},
{
"epoch": 1.1746347941567066,
"grad_norm": 1.5443285933176338,
"learning_rate": 8.058383851601027e-06,
"loss": 0.8374,
"step": 444
},
{
"epoch": 1.1772908366533865,
"grad_norm": 1.6330085580242453,
"learning_rate": 8.05005936367233e-06,
"loss": 0.8429,
"step": 445
},
{
"epoch": 1.1799468791500665,
"grad_norm": 1.5343417288603696,
"learning_rate": 8.041721389013029e-06,
"loss": 0.7969,
"step": 446
},
{
"epoch": 1.1826029216467464,
"grad_norm": 1.6583884933795217,
"learning_rate": 8.033369964491924e-06,
"loss": 0.8525,
"step": 447
},
{
"epoch": 1.1852589641434264,
"grad_norm": 1.716175663053838,
"learning_rate": 8.025005127037282e-06,
"loss": 0.7671,
"step": 448
},
{
"epoch": 1.1879150066401063,
"grad_norm": 1.6417405981278685,
"learning_rate": 8.016626913636681e-06,
"loss": 0.7946,
"step": 449
},
{
"epoch": 1.1905710491367862,
"grad_norm": 1.7766412095191615,
"learning_rate": 8.008235361336845e-06,
"loss": 0.7745,
"step": 450
},
{
"epoch": 1.1932270916334662,
"grad_norm": 1.6563705890201488,
"learning_rate": 7.999830507243478e-06,
"loss": 0.8702,
"step": 451
},
{
"epoch": 1.1958831341301461,
"grad_norm": 1.5442276292341202,
"learning_rate": 7.991412388521108e-06,
"loss": 0.7552,
"step": 452
},
{
"epoch": 1.198539176626826,
"grad_norm": 1.6952272228903122,
"learning_rate": 7.982981042392907e-06,
"loss": 0.8314,
"step": 453
},
{
"epoch": 1.201195219123506,
"grad_norm": 1.5514245470035095,
"learning_rate": 7.974536506140546e-06,
"loss": 0.8379,
"step": 454
},
{
"epoch": 1.203851261620186,
"grad_norm": 1.5461371054169675,
"learning_rate": 7.966078817104012e-06,
"loss": 0.8277,
"step": 455
},
{
"epoch": 1.206507304116866,
"grad_norm": 1.537281161736465,
"learning_rate": 7.957608012681452e-06,
"loss": 0.7524,
"step": 456
},
{
"epoch": 1.2091633466135459,
"grad_norm": 1.6260111327687876,
"learning_rate": 7.94912413032901e-06,
"loss": 0.7796,
"step": 457
},
{
"epoch": 1.2118193891102258,
"grad_norm": 1.5526207364437223,
"learning_rate": 7.940627207560655e-06,
"loss": 0.7348,
"step": 458
},
{
"epoch": 1.2144754316069057,
"grad_norm": 1.5637447605556818,
"learning_rate": 7.932117281948021e-06,
"loss": 0.8037,
"step": 459
},
{
"epoch": 1.2171314741035857,
"grad_norm": 1.6178024259327493,
"learning_rate": 7.923594391120237e-06,
"loss": 0.8831,
"step": 460
},
{
"epoch": 1.2197875166002656,
"grad_norm": 1.850653096718784,
"learning_rate": 7.915058572763757e-06,
"loss": 0.7854,
"step": 461
},
{
"epoch": 1.2224435590969456,
"grad_norm": 1.7078288767363847,
"learning_rate": 7.906509864622202e-06,
"loss": 0.8495,
"step": 462
},
{
"epoch": 1.2250996015936255,
"grad_norm": 1.5513381210763566,
"learning_rate": 7.897948304496189e-06,
"loss": 0.7137,
"step": 463
},
{
"epoch": 1.2277556440903055,
"grad_norm": 1.6755385049155458,
"learning_rate": 7.889373930243166e-06,
"loss": 0.8259,
"step": 464
},
{
"epoch": 1.2304116865869854,
"grad_norm": 1.6856311582426975,
"learning_rate": 7.880786779777233e-06,
"loss": 0.7716,
"step": 465
},
{
"epoch": 1.2330677290836654,
"grad_norm": 1.7089109714412376,
"learning_rate": 7.872186891068997e-06,
"loss": 0.8888,
"step": 466
},
{
"epoch": 1.2357237715803453,
"grad_norm": 1.6298087069461054,
"learning_rate": 7.86357430214538e-06,
"loss": 0.7837,
"step": 467
},
{
"epoch": 1.2383798140770252,
"grad_norm": 1.5968984282075225,
"learning_rate": 7.854949051089467e-06,
"loss": 0.7803,
"step": 468
},
{
"epoch": 1.2410358565737052,
"grad_norm": 1.7290872403697073,
"learning_rate": 7.846311176040331e-06,
"loss": 0.7977,
"step": 469
},
{
"epoch": 1.2436918990703851,
"grad_norm": 1.6477947347063955,
"learning_rate": 7.837660715192867e-06,
"loss": 0.8181,
"step": 470
},
{
"epoch": 1.246347941567065,
"grad_norm": 1.7746140005250393,
"learning_rate": 7.82899770679762e-06,
"loss": 0.8288,
"step": 471
},
{
"epoch": 1.249003984063745,
"grad_norm": 1.7127466701681282,
"learning_rate": 7.820322189160618e-06,
"loss": 0.7727,
"step": 472
},
{
"epoch": 1.251660026560425,
"grad_norm": 1.9553401579042835,
"learning_rate": 7.811634200643202e-06,
"loss": 0.8062,
"step": 473
},
{
"epoch": 1.254316069057105,
"grad_norm": 1.7316246001340312,
"learning_rate": 7.80293377966186e-06,
"loss": 0.7913,
"step": 474
},
{
"epoch": 1.2569721115537849,
"grad_norm": 1.6106273608789883,
"learning_rate": 7.794220964688048e-06,
"loss": 0.7915,
"step": 475
},
{
"epoch": 1.2596281540504648,
"grad_norm": 1.6018273021238745,
"learning_rate": 7.78549579424803e-06,
"loss": 0.8312,
"step": 476
},
{
"epoch": 1.2622841965471447,
"grad_norm": 1.5745527953605545,
"learning_rate": 7.776758306922703e-06,
"loss": 0.8125,
"step": 477
},
{
"epoch": 1.2649402390438247,
"grad_norm": 1.7593012652879958,
"learning_rate": 7.768008541347423e-06,
"loss": 0.8354,
"step": 478
},
{
"epoch": 1.2675962815405046,
"grad_norm": 1.5641289543545533,
"learning_rate": 7.759246536211843e-06,
"loss": 0.7744,
"step": 479
},
{
"epoch": 1.2702523240371846,
"grad_norm": 1.6174503354343208,
"learning_rate": 7.750472330259735e-06,
"loss": 0.8251,
"step": 480
},
{
"epoch": 1.2729083665338645,
"grad_norm": 1.6488155603473844,
"learning_rate": 7.741685962288817e-06,
"loss": 0.8155,
"step": 481
},
{
"epoch": 1.2755644090305445,
"grad_norm": 1.7502255456864444,
"learning_rate": 7.732887471150589e-06,
"loss": 0.8199,
"step": 482
},
{
"epoch": 1.2782204515272244,
"grad_norm": 1.6325894467431792,
"learning_rate": 7.72407689575016e-06,
"loss": 0.7949,
"step": 483
},
{
"epoch": 1.2808764940239044,
"grad_norm": 1.5450834054808966,
"learning_rate": 7.715254275046062e-06,
"loss": 0.7488,
"step": 484
},
{
"epoch": 1.2835325365205843,
"grad_norm": 1.7564206935012117,
"learning_rate": 7.7064196480501e-06,
"loss": 0.7563,
"step": 485
},
{
"epoch": 1.2861885790172642,
"grad_norm": 1.4967916410847861,
"learning_rate": 7.697573053827163e-06,
"loss": 0.7613,
"step": 486
},
{
"epoch": 1.2888446215139442,
"grad_norm": 1.6339851679991706,
"learning_rate": 7.688714531495061e-06,
"loss": 0.8494,
"step": 487
},
{
"epoch": 1.2915006640106241,
"grad_norm": 1.5726472413180292,
"learning_rate": 7.67984412022434e-06,
"loss": 0.811,
"step": 488
},
{
"epoch": 1.294156706507304,
"grad_norm": 1.704796493237578,
"learning_rate": 7.670961859238124e-06,
"loss": 0.758,
"step": 489
},
{
"epoch": 1.296812749003984,
"grad_norm": 1.8152659977035888,
"learning_rate": 7.66206778781193e-06,
"loss": 0.7954,
"step": 490
},
{
"epoch": 1.299468791500664,
"grad_norm": 1.617282796396521,
"learning_rate": 7.653161945273497e-06,
"loss": 0.7816,
"step": 491
},
{
"epoch": 1.302124833997344,
"grad_norm": 1.8207616949669707,
"learning_rate": 7.644244371002619e-06,
"loss": 0.8187,
"step": 492
},
{
"epoch": 1.3047808764940239,
"grad_norm": 1.7889230655898831,
"learning_rate": 7.635315104430959e-06,
"loss": 0.7913,
"step": 493
},
{
"epoch": 1.3074369189907038,
"grad_norm": 1.6026212175311343,
"learning_rate": 7.626374185041887e-06,
"loss": 0.8469,
"step": 494
},
{
"epoch": 1.3100929614873837,
"grad_norm": 1.6034434747796849,
"learning_rate": 7.617421652370293e-06,
"loss": 0.8067,
"step": 495
},
{
"epoch": 1.3127490039840637,
"grad_norm": 1.7116713587335526,
"learning_rate": 7.608457546002423e-06,
"loss": 0.8237,
"step": 496
},
{
"epoch": 1.3154050464807436,
"grad_norm": 1.6713572165788912,
"learning_rate": 7.599481905575699e-06,
"loss": 0.8205,
"step": 497
},
{
"epoch": 1.3180610889774236,
"grad_norm": 1.6492136689563734,
"learning_rate": 7.5904947707785434e-06,
"loss": 0.8062,
"step": 498
},
{
"epoch": 1.3207171314741035,
"grad_norm": 1.8633234143797999,
"learning_rate": 7.581496181350203e-06,
"loss": 0.8574,
"step": 499
},
{
"epoch": 1.3233731739707835,
"grad_norm": 1.5222850331019526,
"learning_rate": 7.572486177080576e-06,
"loss": 0.8052,
"step": 500
},
{
"epoch": 1.3260292164674634,
"grad_norm": 1.5407209608086607,
"learning_rate": 7.563464797810038e-06,
"loss": 0.8536,
"step": 501
},
{
"epoch": 1.3286852589641434,
"grad_norm": 1.5919884093082233,
"learning_rate": 7.554432083429253e-06,
"loss": 0.7941,
"step": 502
},
{
"epoch": 1.3313413014608233,
"grad_norm": 1.6480739901612147,
"learning_rate": 7.545388073879018e-06,
"loss": 0.8236,
"step": 503
},
{
"epoch": 1.3339973439575032,
"grad_norm": 1.6210798461631093,
"learning_rate": 7.536332809150066e-06,
"loss": 0.823,
"step": 504
},
{
"epoch": 1.3366533864541832,
"grad_norm": 1.5948934469455742,
"learning_rate": 7.527266329282905e-06,
"loss": 0.7437,
"step": 505
},
{
"epoch": 1.3393094289508631,
"grad_norm": 1.6266868717112688,
"learning_rate": 7.518188674367628e-06,
"loss": 0.8009,
"step": 506
},
{
"epoch": 1.341965471447543,
"grad_norm": 1.7075965106124282,
"learning_rate": 7.509099884543745e-06,
"loss": 0.7933,
"step": 507
},
{
"epoch": 1.3446215139442232,
"grad_norm": 1.8851878790913914,
"learning_rate": 7.500000000000001e-06,
"loss": 0.8752,
"step": 508
},
{
"epoch": 1.3472775564409032,
"grad_norm": 1.802756747375581,
"learning_rate": 7.490889060974202e-06,
"loss": 0.8339,
"step": 509
},
{
"epoch": 1.3499335989375831,
"grad_norm": 1.6221335431584782,
"learning_rate": 7.4817671077530295e-06,
"loss": 0.8079,
"step": 510
},
{
"epoch": 1.352589641434263,
"grad_norm": 1.6762124149564765,
"learning_rate": 7.4726341806718735e-06,
"loss": 0.7527,
"step": 511
},
{
"epoch": 1.355245683930943,
"grad_norm": 1.817888811411582,
"learning_rate": 7.463490320114646e-06,
"loss": 0.8421,
"step": 512
},
{
"epoch": 1.357901726427623,
"grad_norm": 1.6277711511409727,
"learning_rate": 7.454335566513603e-06,
"loss": 0.8531,
"step": 513
},
{
"epoch": 1.360557768924303,
"grad_norm": 1.7798649541264224,
"learning_rate": 7.445169960349167e-06,
"loss": 0.8855,
"step": 514
},
{
"epoch": 1.3632138114209829,
"grad_norm": 1.6603027624169684,
"learning_rate": 7.435993542149751e-06,
"loss": 0.8034,
"step": 515
},
{
"epoch": 1.3658698539176628,
"grad_norm": 1.9257358203569979,
"learning_rate": 7.426806352491575e-06,
"loss": 0.7991,
"step": 516
},
{
"epoch": 1.3685258964143427,
"grad_norm": 1.6083096418618006,
"learning_rate": 7.417608431998487e-06,
"loss": 0.8302,
"step": 517
},
{
"epoch": 1.3711819389110227,
"grad_norm": 1.6367347889439254,
"learning_rate": 7.408399821341787e-06,
"loss": 0.7769,
"step": 518
},
{
"epoch": 1.3738379814077026,
"grad_norm": 1.7083783084901518,
"learning_rate": 7.399180561240044e-06,
"loss": 0.796,
"step": 519
},
{
"epoch": 1.3764940239043826,
"grad_norm": 1.5960672101189293,
"learning_rate": 7.389950692458916e-06,
"loss": 0.8736,
"step": 520
},
{
"epoch": 1.3791500664010625,
"grad_norm": 1.7397680807457505,
"learning_rate": 7.38071025581097e-06,
"loss": 0.8015,
"step": 521
},
{
"epoch": 1.3818061088977425,
"grad_norm": 1.6440955438519538,
"learning_rate": 7.371459292155501e-06,
"loss": 0.8266,
"step": 522
},
{
"epoch": 1.3844621513944224,
"grad_norm": 1.7177209541893135,
"learning_rate": 7.362197842398355e-06,
"loss": 0.7857,
"step": 523
},
{
"epoch": 1.3871181938911024,
"grad_norm": 1.6619729313815175,
"learning_rate": 7.3529259474917455e-06,
"loss": 0.7885,
"step": 524
},
{
"epoch": 1.3897742363877823,
"grad_norm": 1.7199503184792022,
"learning_rate": 7.34364364843407e-06,
"loss": 0.845,
"step": 525
},
{
"epoch": 1.3924302788844622,
"grad_norm": 1.8306223658893233,
"learning_rate": 7.3343509862697295e-06,
"loss": 0.8368,
"step": 526
},
{
"epoch": 1.3950863213811422,
"grad_norm": 1.5439578957272115,
"learning_rate": 7.325048002088955e-06,
"loss": 0.8093,
"step": 527
},
{
"epoch": 1.3977423638778221,
"grad_norm": 1.9216923568783284,
"learning_rate": 7.315734737027612e-06,
"loss": 0.8178,
"step": 528
},
{
"epoch": 1.400398406374502,
"grad_norm": 1.5591840729803677,
"learning_rate": 7.30641123226703e-06,
"loss": 0.7905,
"step": 529
},
{
"epoch": 1.403054448871182,
"grad_norm": 1.5800929847807523,
"learning_rate": 7.297077529033814e-06,
"loss": 0.8103,
"step": 530
},
{
"epoch": 1.405710491367862,
"grad_norm": 1.7493518294241435,
"learning_rate": 7.287733668599669e-06,
"loss": 0.8348,
"step": 531
},
{
"epoch": 1.408366533864542,
"grad_norm": 1.9535948089272044,
"learning_rate": 7.278379692281209e-06,
"loss": 0.8116,
"step": 532
},
{
"epoch": 1.4110225763612219,
"grad_norm": 1.7032888435426474,
"learning_rate": 7.2690156414397775e-06,
"loss": 0.7952,
"step": 533
},
{
"epoch": 1.4136786188579018,
"grad_norm": 1.7632789452727808,
"learning_rate": 7.2596415574812695e-06,
"loss": 0.8484,
"step": 534
},
{
"epoch": 1.4163346613545817,
"grad_norm": 1.9343554806946162,
"learning_rate": 7.250257481855941e-06,
"loss": 0.7913,
"step": 535
},
{
"epoch": 1.4189907038512617,
"grad_norm": 1.5626080706093446,
"learning_rate": 7.24086345605823e-06,
"loss": 0.8043,
"step": 536
},
{
"epoch": 1.4216467463479416,
"grad_norm": 1.6881053251561386,
"learning_rate": 7.231459521626574e-06,
"loss": 0.7897,
"step": 537
},
{
"epoch": 1.4243027888446216,
"grad_norm": 1.6829030745040456,
"learning_rate": 7.22204572014322e-06,
"loss": 0.7077,
"step": 538
},
{
"epoch": 1.4269588313413015,
"grad_norm": 1.7582319600473906,
"learning_rate": 7.212622093234049e-06,
"loss": 0.7394,
"step": 539
},
{
"epoch": 1.4296148738379815,
"grad_norm": 1.82729749435436,
"learning_rate": 7.20318868256839e-06,
"loss": 0.7831,
"step": 540
},
{
"epoch": 1.4322709163346614,
"grad_norm": 1.8895948305148935,
"learning_rate": 7.193745529858827e-06,
"loss": 0.8085,
"step": 541
},
{
"epoch": 1.4349269588313414,
"grad_norm": 1.5870695996483368,
"learning_rate": 7.184292676861024e-06,
"loss": 0.7976,
"step": 542
},
{
"epoch": 1.4375830013280213,
"grad_norm": 1.661068432134725,
"learning_rate": 7.174830165373542e-06,
"loss": 0.7795,
"step": 543
},
{
"epoch": 1.4402390438247012,
"grad_norm": 1.7637781271023871,
"learning_rate": 7.165358037237644e-06,
"loss": 0.7797,
"step": 544
},
{
"epoch": 1.4428950863213812,
"grad_norm": 1.7573078579380121,
"learning_rate": 7.155876334337119e-06,
"loss": 0.7881,
"step": 545
},
{
"epoch": 1.4455511288180611,
"grad_norm": 1.788971177497177,
"learning_rate": 7.146385098598092e-06,
"loss": 0.7926,
"step": 546
},
{
"epoch": 1.448207171314741,
"grad_norm": 1.483941017753339,
"learning_rate": 7.136884371988844e-06,
"loss": 0.7945,
"step": 547
},
{
"epoch": 1.450863213811421,
"grad_norm": 2.158603581584702,
"learning_rate": 7.127374196519616e-06,
"loss": 0.8339,
"step": 548
},
{
"epoch": 1.453519256308101,
"grad_norm": 1.5625619640257642,
"learning_rate": 7.117854614242434e-06,
"loss": 0.7366,
"step": 549
},
{
"epoch": 1.456175298804781,
"grad_norm": 1.797618930907081,
"learning_rate": 7.10832566725092e-06,
"loss": 0.8613,
"step": 550
},
{
"epoch": 1.4588313413014609,
"grad_norm": 1.7910995750400152,
"learning_rate": 7.098787397680104e-06,
"loss": 0.8439,
"step": 551
},
{
"epoch": 1.4614873837981408,
"grad_norm": 1.60673403396422,
"learning_rate": 7.0892398477062375e-06,
"loss": 0.8891,
"step": 552
},
{
"epoch": 1.4641434262948207,
"grad_norm": 1.6427911933197976,
"learning_rate": 7.079683059546607e-06,
"loss": 0.8271,
"step": 553
},
{
"epoch": 1.4667994687915007,
"grad_norm": 1.6285500237390729,
"learning_rate": 7.0701170754593516e-06,
"loss": 0.8588,
"step": 554
},
{
"epoch": 1.4694555112881806,
"grad_norm": 1.718883251928568,
"learning_rate": 7.060541937743269e-06,
"loss": 0.794,
"step": 555
},
{
"epoch": 1.4721115537848606,
"grad_norm": 1.5365027160461944,
"learning_rate": 7.0509576887376375e-06,
"loss": 0.7856,
"step": 556
},
{
"epoch": 1.4747675962815405,
"grad_norm": 1.5936955094419631,
"learning_rate": 7.041364370822017e-06,
"loss": 0.7704,
"step": 557
},
{
"epoch": 1.4774236387782205,
"grad_norm": 1.5986404122720763,
"learning_rate": 7.031762026416074e-06,
"loss": 0.784,
"step": 558
},
{
"epoch": 1.4800796812749004,
"grad_norm": 1.6290145069786532,
"learning_rate": 7.022150697979385e-06,
"loss": 0.8711,
"step": 559
},
{
"epoch": 1.4827357237715804,
"grad_norm": 1.646088871212861,
"learning_rate": 7.0125304280112546e-06,
"loss": 0.7449,
"step": 560
},
{
"epoch": 1.4853917662682603,
"grad_norm": 1.628406730851986,
"learning_rate": 7.002901259050523e-06,
"loss": 0.8154,
"step": 561
},
{
"epoch": 1.4880478087649402,
"grad_norm": 1.585585277641927,
"learning_rate": 6.99326323367538e-06,
"loss": 0.8304,
"step": 562
},
{
"epoch": 1.4907038512616202,
"grad_norm": 1.5731659024948346,
"learning_rate": 6.983616394503177e-06,
"loss": 0.7599,
"step": 563
},
{
"epoch": 1.4933598937583001,
"grad_norm": 1.8337268519091863,
"learning_rate": 6.9739607841902365e-06,
"loss": 0.8634,
"step": 564
},
{
"epoch": 1.49601593625498,
"grad_norm": 1.6324940281109015,
"learning_rate": 6.96429644543167e-06,
"loss": 0.7786,
"step": 565
},
{
"epoch": 1.49867197875166,
"grad_norm": 1.6269379425991606,
"learning_rate": 6.954623420961179e-06,
"loss": 0.7474,
"step": 566
},
{
"epoch": 1.50132802124834,
"grad_norm": 1.5892146882859803,
"learning_rate": 6.944941753550877e-06,
"loss": 0.8374,
"step": 567
},
{
"epoch": 1.50398406374502,
"grad_norm": 1.7162255540973645,
"learning_rate": 6.9352514860110876e-06,
"loss": 0.845,
"step": 568
},
{
"epoch": 1.5066401062416999,
"grad_norm": 1.6706147544781347,
"learning_rate": 6.925552661190166e-06,
"loss": 0.7899,
"step": 569
},
{
"epoch": 1.5092961487383798,
"grad_norm": 1.6566681467322444,
"learning_rate": 6.915845321974309e-06,
"loss": 0.8416,
"step": 570
},
{
"epoch": 1.5119521912350598,
"grad_norm": 1.6291748860885436,
"learning_rate": 6.906129511287358e-06,
"loss": 0.8324,
"step": 571
},
{
"epoch": 1.5146082337317397,
"grad_norm": 2.00417453104611,
"learning_rate": 6.8964052720906175e-06,
"loss": 0.7287,
"step": 572
},
{
"epoch": 1.5172642762284196,
"grad_norm": 1.7178352113474786,
"learning_rate": 6.886672647382653e-06,
"loss": 0.7881,
"step": 573
},
{
"epoch": 1.5199203187250996,
"grad_norm": 1.894127341764823,
"learning_rate": 6.876931680199121e-06,
"loss": 0.8068,
"step": 574
},
{
"epoch": 1.5225763612217795,
"grad_norm": 1.6002909924709852,
"learning_rate": 6.867182413612556e-06,
"loss": 0.7499,
"step": 575
},
{
"epoch": 1.5252324037184595,
"grad_norm": 1.6013025100948648,
"learning_rate": 6.857424890732195e-06,
"loss": 0.7948,
"step": 576
},
{
"epoch": 1.5278884462151394,
"grad_norm": 1.7428611130277234,
"learning_rate": 6.847659154703785e-06,
"loss": 0.8532,
"step": 577
},
{
"epoch": 1.5305444887118194,
"grad_norm": 1.5972099589901563,
"learning_rate": 6.837885248709386e-06,
"loss": 0.7441,
"step": 578
},
{
"epoch": 1.5332005312084993,
"grad_norm": 1.7281335555363582,
"learning_rate": 6.8281032159671865e-06,
"loss": 0.8236,
"step": 579
},
{
"epoch": 1.5358565737051793,
"grad_norm": 1.6497765487254892,
"learning_rate": 6.818313099731308e-06,
"loss": 0.846,
"step": 580
},
{
"epoch": 1.5385126162018592,
"grad_norm": 2.751709704124397,
"learning_rate": 6.8085149432916155e-06,
"loss": 0.752,
"step": 581
},
{
"epoch": 1.5411686586985391,
"grad_norm": 1.8663852239561056,
"learning_rate": 6.798708789973527e-06,
"loss": 0.77,
"step": 582
},
{
"epoch": 1.543824701195219,
"grad_norm": 1.6698060547997404,
"learning_rate": 6.788894683137822e-06,
"loss": 0.8077,
"step": 583
},
{
"epoch": 1.546480743691899,
"grad_norm": 1.6116361837634285,
"learning_rate": 6.779072666180447e-06,
"loss": 0.8133,
"step": 584
},
{
"epoch": 1.549136786188579,
"grad_norm": 1.542072654470527,
"learning_rate": 6.769242782532324e-06,
"loss": 0.7846,
"step": 585
},
{
"epoch": 1.551792828685259,
"grad_norm": 1.4898452945341245,
"learning_rate": 6.759405075659165e-06,
"loss": 0.8475,
"step": 586
},
{
"epoch": 1.5544488711819389,
"grad_norm": 1.5173495908692989,
"learning_rate": 6.749559589061273e-06,
"loss": 0.7964,
"step": 587
},
{
"epoch": 1.5571049136786188,
"grad_norm": 1.8749975291685148,
"learning_rate": 6.739706366273346e-06,
"loss": 0.8505,
"step": 588
},
{
"epoch": 1.5597609561752988,
"grad_norm": 1.827884336582995,
"learning_rate": 6.7298454508642945e-06,
"loss": 0.8439,
"step": 589
},
{
"epoch": 1.5624169986719787,
"grad_norm": 1.6275292009586355,
"learning_rate": 6.7199768864370455e-06,
"loss": 0.7982,
"step": 590
},
{
"epoch": 1.5650730411686586,
"grad_norm": 1.5747624399878266,
"learning_rate": 6.710100716628345e-06,
"loss": 0.797,
"step": 591
},
{
"epoch": 1.5677290836653386,
"grad_norm": 1.5774705921149774,
"learning_rate": 6.700216985108568e-06,
"loss": 0.8065,
"step": 592
},
{
"epoch": 1.5703851261620185,
"grad_norm": 1.5778429721001461,
"learning_rate": 6.690325735581532e-06,
"loss": 0.8202,
"step": 593
},
{
"epoch": 1.5730411686586985,
"grad_norm": 1.7604471332843126,
"learning_rate": 6.680427011784292e-06,
"loss": 0.7897,
"step": 594
},
{
"epoch": 1.5756972111553784,
"grad_norm": 1.686769635907997,
"learning_rate": 6.6705208574869504e-06,
"loss": 0.7761,
"step": 595
},
{
"epoch": 1.5783532536520584,
"grad_norm": 1.5053126110771002,
"learning_rate": 6.660607316492471e-06,
"loss": 0.8438,
"step": 596
},
{
"epoch": 1.5810092961487383,
"grad_norm": 1.6763577131611167,
"learning_rate": 6.65068643263648e-06,
"loss": 0.8336,
"step": 597
},
{
"epoch": 1.5836653386454183,
"grad_norm": 1.6119056861498604,
"learning_rate": 6.640758249787067e-06,
"loss": 0.866,
"step": 598
},
{
"epoch": 1.5863213811420982,
"grad_norm": 1.6391451248424211,
"learning_rate": 6.630822811844604e-06,
"loss": 0.7924,
"step": 599
},
{
"epoch": 1.5889774236387781,
"grad_norm": 1.5817186839646116,
"learning_rate": 6.620880162741534e-06,
"loss": 0.7637,
"step": 600
},
{
"epoch": 1.591633466135458,
"grad_norm": 1.6537509811359954,
"learning_rate": 6.610930346442198e-06,
"loss": 0.8418,
"step": 601
},
{
"epoch": 1.594289508632138,
"grad_norm": 1.6834866329043456,
"learning_rate": 6.600973406942617e-06,
"loss": 0.8283,
"step": 602
},
{
"epoch": 1.596945551128818,
"grad_norm": 1.5806755355796025,
"learning_rate": 6.591009388270315e-06,
"loss": 0.8051,
"step": 603
},
{
"epoch": 1.599601593625498,
"grad_norm": 1.6625638337455775,
"learning_rate": 6.58103833448412e-06,
"loss": 0.7581,
"step": 604
},
{
"epoch": 1.6022576361221779,
"grad_norm": 1.634181305057464,
"learning_rate": 6.571060289673966e-06,
"loss": 0.7476,
"step": 605
},
{
"epoch": 1.6049136786188578,
"grad_norm": 34.53829269757645,
"learning_rate": 6.5610752979607e-06,
"loss": 0.7517,
"step": 606
},
{
"epoch": 1.6075697211155378,
"grad_norm": 1.8445764149564496,
"learning_rate": 6.551083403495885e-06,
"loss": 0.8535,
"step": 607
},
{
"epoch": 1.6102257636122177,
"grad_norm": 1.7840446085100523,
"learning_rate": 6.54108465046161e-06,
"loss": 0.815,
"step": 608
},
{
"epoch": 1.6128818061088976,
"grad_norm": 1.5939129613902245,
"learning_rate": 6.531079083070289e-06,
"loss": 0.8234,
"step": 609
},
{
"epoch": 1.6155378486055776,
"grad_norm": 1.6112292642729917,
"learning_rate": 6.521066745564467e-06,
"loss": 0.7916,
"step": 610
},
{
"epoch": 1.6181938911022575,
"grad_norm": 1.5131144042037576,
"learning_rate": 6.511047682216628e-06,
"loss": 0.781,
"step": 611
},
{
"epoch": 1.6208499335989375,
"grad_norm": 1.6311550231780976,
"learning_rate": 6.501021937328992e-06,
"loss": 0.769,
"step": 612
},
{
"epoch": 1.6235059760956174,
"grad_norm": 1.5393946017610958,
"learning_rate": 6.490989555233328e-06,
"loss": 0.7904,
"step": 613
},
{
"epoch": 1.6261620185922974,
"grad_norm": 1.6268714117788674,
"learning_rate": 6.480950580290751e-06,
"loss": 0.8433,
"step": 614
},
{
"epoch": 1.6288180610889773,
"grad_norm": 1.6330202880257099,
"learning_rate": 6.470905056891526e-06,
"loss": 0.7714,
"step": 615
},
{
"epoch": 1.6314741035856573,
"grad_norm": 1.5562395275992014,
"learning_rate": 6.460853029454879e-06,
"loss": 0.7867,
"step": 616
},
{
"epoch": 1.6341301460823372,
"grad_norm": 2.217671570613303,
"learning_rate": 6.450794542428791e-06,
"loss": 0.8091,
"step": 617
},
{
"epoch": 1.6367861885790171,
"grad_norm": 1.544981928227297,
"learning_rate": 6.440729640289809e-06,
"loss": 0.7813,
"step": 618
},
{
"epoch": 1.639442231075697,
"grad_norm": 1.4740653481229222,
"learning_rate": 6.4306583675428435e-06,
"loss": 0.7833,
"step": 619
},
{
"epoch": 1.642098273572377,
"grad_norm": 1.6073920238722603,
"learning_rate": 6.420580768720977e-06,
"loss": 0.838,
"step": 620
},
{
"epoch": 1.644754316069057,
"grad_norm": 1.612548032767887,
"learning_rate": 6.410496888385266e-06,
"loss": 0.7618,
"step": 621
},
{
"epoch": 1.647410358565737,
"grad_norm": 1.6152490679947544,
"learning_rate": 6.4004067711245366e-06,
"loss": 0.8393,
"step": 622
},
{
"epoch": 1.6500664010624169,
"grad_norm": 1.6599220046956429,
"learning_rate": 6.3903104615551956e-06,
"loss": 0.8162,
"step": 623
},
{
"epoch": 1.6527224435590968,
"grad_norm": 1.4931102060723278,
"learning_rate": 6.380208004321037e-06,
"loss": 0.7566,
"step": 624
},
{
"epoch": 1.6553784860557768,
"grad_norm": 1.5929400984170048,
"learning_rate": 6.370099444093032e-06,
"loss": 0.7796,
"step": 625
},
{
"epoch": 1.6580345285524567,
"grad_norm": 1.727388116576355,
"learning_rate": 6.359984825569138e-06,
"loss": 0.7898,
"step": 626
},
{
"epoch": 1.6606905710491366,
"grad_norm": 1.8183131880092849,
"learning_rate": 6.349864193474104e-06,
"loss": 0.8609,
"step": 627
},
{
"epoch": 1.6633466135458166,
"grad_norm": 1.8676431191653386,
"learning_rate": 6.3397375925592675e-06,
"loss": 0.8784,
"step": 628
},
{
"epoch": 1.6660026560424965,
"grad_norm": 1.6708110546891144,
"learning_rate": 6.32960506760236e-06,
"loss": 0.7962,
"step": 629
},
{
"epoch": 1.6686586985391765,
"grad_norm": 1.6511789078182308,
"learning_rate": 6.319466663407309e-06,
"loss": 0.8078,
"step": 630
},
{
"epoch": 1.6713147410358564,
"grad_norm": 1.6698570509044466,
"learning_rate": 6.309322424804034e-06,
"loss": 0.7975,
"step": 631
},
{
"epoch": 1.6739707835325364,
"grad_norm": 1.9330575842850526,
"learning_rate": 6.29917239664826e-06,
"loss": 0.7993,
"step": 632
},
{
"epoch": 1.6766268260292163,
"grad_norm": 1.647084849034996,
"learning_rate": 6.289016623821308e-06,
"loss": 0.8084,
"step": 633
},
{
"epoch": 1.6792828685258963,
"grad_norm": 1.6897703069676664,
"learning_rate": 6.2788551512299014e-06,
"loss": 0.8016,
"step": 634
},
{
"epoch": 1.6819389110225762,
"grad_norm": 1.5440808150197218,
"learning_rate": 6.268688023805965e-06,
"loss": 0.7948,
"step": 635
},
{
"epoch": 1.6845949535192561,
"grad_norm": 2.7749014353276165,
"learning_rate": 6.25851528650643e-06,
"loss": 0.7971,
"step": 636
},
{
"epoch": 1.687250996015936,
"grad_norm": 1.7083444476431595,
"learning_rate": 6.248336984313035e-06,
"loss": 0.8095,
"step": 637
},
{
"epoch": 1.6899070385126163,
"grad_norm": 1.5870927607624352,
"learning_rate": 6.2381531622321234e-06,
"loss": 0.7709,
"step": 638
},
{
"epoch": 1.6925630810092962,
"grad_norm": 1.6015762357015875,
"learning_rate": 6.227963865294444e-06,
"loss": 0.8129,
"step": 639
},
{
"epoch": 1.6952191235059761,
"grad_norm": 1.65706125860874,
"learning_rate": 6.2177691385549595e-06,
"loss": 0.8378,
"step": 640
},
{
"epoch": 1.697875166002656,
"grad_norm": 1.604007060138125,
"learning_rate": 6.207569027092642e-06,
"loss": 0.8319,
"step": 641
},
{
"epoch": 1.700531208499336,
"grad_norm": 1.6045171117881705,
"learning_rate": 6.1973635760102645e-06,
"loss": 0.77,
"step": 642
},
{
"epoch": 1.703187250996016,
"grad_norm": 1.5733954742264373,
"learning_rate": 6.18715283043422e-06,
"loss": 0.8035,
"step": 643
},
{
"epoch": 1.705843293492696,
"grad_norm": 1.8117301973869768,
"learning_rate": 6.1769368355143125e-06,
"loss": 0.8834,
"step": 644
},
{
"epoch": 1.7084993359893759,
"grad_norm": 1.7588216405278665,
"learning_rate": 6.166715636423552e-06,
"loss": 0.8235,
"step": 645
},
{
"epoch": 1.7111553784860558,
"grad_norm": 1.6509712115249005,
"learning_rate": 6.156489278357967e-06,
"loss": 0.8207,
"step": 646
},
{
"epoch": 1.7138114209827358,
"grad_norm": 1.7572298275421452,
"learning_rate": 6.14625780653639e-06,
"loss": 0.7946,
"step": 647
},
{
"epoch": 1.7164674634794157,
"grad_norm": 1.538832182813809,
"learning_rate": 6.136021266200271e-06,
"loss": 0.7535,
"step": 648
},
{
"epoch": 1.7191235059760956,
"grad_norm": 1.8298709899462002,
"learning_rate": 6.125779702613471e-06,
"loss": 0.8351,
"step": 649
},
{
"epoch": 1.7217795484727756,
"grad_norm": 1.756099401566392,
"learning_rate": 6.115533161062062e-06,
"loss": 0.8398,
"step": 650
},
{
"epoch": 1.7244355909694555,
"grad_norm": 1.7062669920402536,
"learning_rate": 6.105281686854129e-06,
"loss": 0.816,
"step": 651
},
{
"epoch": 1.7270916334661355,
"grad_norm": 1.6150556400750276,
"learning_rate": 6.0950253253195656e-06,
"loss": 0.8606,
"step": 652
},
{
"epoch": 1.7297476759628154,
"grad_norm": 1.7332370774628512,
"learning_rate": 6.084764121809878e-06,
"loss": 0.821,
"step": 653
},
{
"epoch": 1.7324037184594954,
"grad_norm": 2.0279637900537955,
"learning_rate": 6.074498121697983e-06,
"loss": 0.8049,
"step": 654
},
{
"epoch": 1.7350597609561753,
"grad_norm": 1.6258653064667319,
"learning_rate": 6.064227370378007e-06,
"loss": 0.7857,
"step": 655
},
{
"epoch": 1.7377158034528553,
"grad_norm": 1.6293060217669535,
"learning_rate": 6.053951913265083e-06,
"loss": 0.8198,
"step": 656
},
{
"epoch": 1.7403718459495352,
"grad_norm": 1.6077770915178562,
"learning_rate": 6.043671795795152e-06,
"loss": 0.8127,
"step": 657
},
{
"epoch": 1.7430278884462151,
"grad_norm": 1.622591828340667,
"learning_rate": 6.033387063424765e-06,
"loss": 0.7998,
"step": 658
},
{
"epoch": 1.745683930942895,
"grad_norm": 1.6328037190371787,
"learning_rate": 6.023097761630879e-06,
"loss": 0.8009,
"step": 659
},
{
"epoch": 1.748339973439575,
"grad_norm": 1.6308555840386694,
"learning_rate": 6.012803935910655e-06,
"loss": 0.761,
"step": 660
},
{
"epoch": 1.750996015936255,
"grad_norm": 1.4971140185122018,
"learning_rate": 6.002505631781257e-06,
"loss": 0.7743,
"step": 661
},
{
"epoch": 1.753652058432935,
"grad_norm": 1.5208500640814875,
"learning_rate": 5.9922028947796495e-06,
"loss": 0.6865,
"step": 662
},
{
"epoch": 1.7563081009296149,
"grad_norm": 1.6739817122782799,
"learning_rate": 5.9818957704624046e-06,
"loss": 0.8353,
"step": 663
},
{
"epoch": 1.7589641434262948,
"grad_norm": 1.6866866619757162,
"learning_rate": 5.971584304405489e-06,
"loss": 0.8435,
"step": 664
},
{
"epoch": 1.7616201859229748,
"grad_norm": 1.8112833554197036,
"learning_rate": 5.96126854220407e-06,
"loss": 0.7812,
"step": 665
},
{
"epoch": 1.7642762284196547,
"grad_norm": 1.6679412962431888,
"learning_rate": 5.95094852947231e-06,
"loss": 0.7911,
"step": 666
},
{
"epoch": 1.7669322709163346,
"grad_norm": 1.6011378182746292,
"learning_rate": 5.94062431184317e-06,
"loss": 0.7617,
"step": 667
},
{
"epoch": 1.7695883134130146,
"grad_norm": 1.845248071116995,
"learning_rate": 5.930295934968197e-06,
"loss": 0.8711,
"step": 668
},
{
"epoch": 1.7722443559096945,
"grad_norm": 1.725596928754398,
"learning_rate": 5.919963444517338e-06,
"loss": 0.8052,
"step": 669
},
{
"epoch": 1.7749003984063745,
"grad_norm": 1.9163181126771034,
"learning_rate": 5.909626886178721e-06,
"loss": 0.8538,
"step": 670
},
{
"epoch": 1.7775564409030544,
"grad_norm": 1.7009303732171024,
"learning_rate": 5.899286305658468e-06,
"loss": 0.8197,
"step": 671
},
{
"epoch": 1.7802124833997344,
"grad_norm": 1.5247343576304235,
"learning_rate": 5.888941748680484e-06,
"loss": 0.8159,
"step": 672
},
{
"epoch": 1.7828685258964143,
"grad_norm": 2.044707850741483,
"learning_rate": 5.878593260986256e-06,
"loss": 0.8172,
"step": 673
},
{
"epoch": 1.7855245683930943,
"grad_norm": 1.7418155389848087,
"learning_rate": 5.8682408883346535e-06,
"loss": 0.8196,
"step": 674
},
{
"epoch": 1.7881806108897742,
"grad_norm": 1.4597724863288657,
"learning_rate": 5.857884676501721e-06,
"loss": 0.7647,
"step": 675
},
{
"epoch": 1.7908366533864541,
"grad_norm": 1.6458660814062083,
"learning_rate": 5.8475246712804845e-06,
"loss": 0.8045,
"step": 676
},
{
"epoch": 1.793492695883134,
"grad_norm": 1.6252499093239745,
"learning_rate": 5.83716091848074e-06,
"loss": 0.8091,
"step": 677
},
{
"epoch": 1.796148738379814,
"grad_norm": 1.6610545996985242,
"learning_rate": 5.8267934639288525e-06,
"loss": 0.8809,
"step": 678
},
{
"epoch": 1.798804780876494,
"grad_norm": 1.681945309800806,
"learning_rate": 5.816422353467562e-06,
"loss": 0.7976,
"step": 679
},
{
"epoch": 1.801460823373174,
"grad_norm": 1.7407405789509727,
"learning_rate": 5.80604763295577e-06,
"loss": 0.7653,
"step": 680
},
{
"epoch": 1.8041168658698539,
"grad_norm": 1.7577038472175832,
"learning_rate": 5.795669348268339e-06,
"loss": 0.7965,
"step": 681
},
{
"epoch": 1.8067729083665338,
"grad_norm": 1.7731919678676291,
"learning_rate": 5.785287545295895e-06,
"loss": 0.7594,
"step": 682
},
{
"epoch": 1.8094289508632138,
"grad_norm": 1.6141949293008162,
"learning_rate": 5.77490226994462e-06,
"loss": 0.801,
"step": 683
},
{
"epoch": 1.8120849933598937,
"grad_norm": 1.812626295914488,
"learning_rate": 5.76451356813605e-06,
"loss": 0.8163,
"step": 684
},
{
"epoch": 1.8147410358565739,
"grad_norm": 1.6162681395964988,
"learning_rate": 5.7541214858068705e-06,
"loss": 0.8397,
"step": 685
},
{
"epoch": 1.8173970783532538,
"grad_norm": 1.721558869882608,
"learning_rate": 5.743726068908717e-06,
"loss": 0.8043,
"step": 686
},
{
"epoch": 1.8200531208499338,
"grad_norm": 1.8032721689703735,
"learning_rate": 5.733327363407973e-06,
"loss": 0.7866,
"step": 687
},
{
"epoch": 1.8227091633466137,
"grad_norm": 1.6708160503371885,
"learning_rate": 5.722925415285555e-06,
"loss": 0.7858,
"step": 688
},
{
"epoch": 1.8253652058432936,
"grad_norm": 1.766193995933153,
"learning_rate": 5.712520270536723e-06,
"loss": 0.7798,
"step": 689
},
{
"epoch": 1.8280212483399736,
"grad_norm": 1.5356744931510473,
"learning_rate": 5.702111975170875e-06,
"loss": 0.7936,
"step": 690
},
{
"epoch": 1.8306772908366535,
"grad_norm": 1.732102227757507,
"learning_rate": 5.691700575211335e-06,
"loss": 0.788,
"step": 691
},
{
"epoch": 1.8333333333333335,
"grad_norm": 1.6592997418979596,
"learning_rate": 5.681286116695155e-06,
"loss": 0.8294,
"step": 692
},
{
"epoch": 1.8359893758300134,
"grad_norm": 1.9062250490573815,
"learning_rate": 5.670868645672916e-06,
"loss": 0.8372,
"step": 693
},
{
"epoch": 1.8386454183266934,
"grad_norm": 1.7172232578747761,
"learning_rate": 5.660448208208513e-06,
"loss": 0.8292,
"step": 694
},
{
"epoch": 1.8413014608233733,
"grad_norm": 1.8188926512319041,
"learning_rate": 5.650024850378964e-06,
"loss": 0.8221,
"step": 695
},
{
"epoch": 1.8439575033200533,
"grad_norm": 1.638642817866283,
"learning_rate": 5.6395986182741965e-06,
"loss": 0.8157,
"step": 696
},
{
"epoch": 1.8466135458167332,
"grad_norm": 1.617796881961262,
"learning_rate": 5.629169557996848e-06,
"loss": 0.7954,
"step": 697
},
{
"epoch": 1.8492695883134131,
"grad_norm": 1.7846585636780499,
"learning_rate": 5.618737715662067e-06,
"loss": 0.8045,
"step": 698
},
{
"epoch": 1.851925630810093,
"grad_norm": 1.52955631208176,
"learning_rate": 5.608303137397294e-06,
"loss": 0.7532,
"step": 699
},
{
"epoch": 1.854581673306773,
"grad_norm": 1.6372599632527154,
"learning_rate": 5.597865869342075e-06,
"loss": 0.7908,
"step": 700
},
{
"epoch": 1.857237715803453,
"grad_norm": 1.557033830346597,
"learning_rate": 5.5874259576478465e-06,
"loss": 0.8461,
"step": 701
},
{
"epoch": 1.859893758300133,
"grad_norm": 2.0636224479100647,
"learning_rate": 5.5769834484777344e-06,
"loss": 0.794,
"step": 702
},
{
"epoch": 1.8625498007968129,
"grad_norm": 1.5906609489633872,
"learning_rate": 5.566538388006351e-06,
"loss": 0.7641,
"step": 703
},
{
"epoch": 1.8652058432934928,
"grad_norm": 1.7691225304057758,
"learning_rate": 5.556090822419589e-06,
"loss": 0.8186,
"step": 704
},
{
"epoch": 1.8678618857901728,
"grad_norm": 1.6618983785480637,
"learning_rate": 5.54564079791442e-06,
"loss": 0.7933,
"step": 705
},
{
"epoch": 1.8705179282868527,
"grad_norm": 1.6226756802008044,
"learning_rate": 5.535188360698687e-06,
"loss": 0.8021,
"step": 706
},
{
"epoch": 1.8731739707835326,
"grad_norm": 2.178124602174882,
"learning_rate": 5.524733556990904e-06,
"loss": 0.8244,
"step": 707
},
{
"epoch": 1.8758300132802126,
"grad_norm": 1.7670614639798954,
"learning_rate": 5.514276433020044e-06,
"loss": 0.6896,
"step": 708
},
{
"epoch": 1.8784860557768925,
"grad_norm": 1.6798674141158796,
"learning_rate": 5.503817035025341e-06,
"loss": 0.8297,
"step": 709
},
{
"epoch": 1.8811420982735725,
"grad_norm": 1.6026510714619167,
"learning_rate": 5.493355409256091e-06,
"loss": 0.8391,
"step": 710
},
{
"epoch": 1.8837981407702524,
"grad_norm": 1.8164233494348743,
"learning_rate": 5.482891601971434e-06,
"loss": 0.8216,
"step": 711
},
{
"epoch": 1.8864541832669324,
"grad_norm": 2.225469822194458,
"learning_rate": 5.472425659440157e-06,
"loss": 0.838,
"step": 712
},
{
"epoch": 1.8891102257636123,
"grad_norm": 1.6986504060233345,
"learning_rate": 5.461957627940489e-06,
"loss": 0.7715,
"step": 713
},
{
"epoch": 1.8917662682602923,
"grad_norm": 1.853182152455261,
"learning_rate": 5.451487553759899e-06,
"loss": 0.7993,
"step": 714
},
{
"epoch": 1.8944223107569722,
"grad_norm": 1.6193780421833341,
"learning_rate": 5.441015483194883e-06,
"loss": 0.7837,
"step": 715
},
{
"epoch": 1.8970783532536521,
"grad_norm": 1.65935355108129,
"learning_rate": 5.43054146255077e-06,
"loss": 0.7968,
"step": 716
},
{
"epoch": 1.899734395750332,
"grad_norm": 1.7128452889705919,
"learning_rate": 5.420065538141507e-06,
"loss": 0.8091,
"step": 717
},
{
"epoch": 1.902390438247012,
"grad_norm": 1.7131843789521455,
"learning_rate": 5.409587756289462e-06,
"loss": 0.7841,
"step": 718
},
{
"epoch": 1.905046480743692,
"grad_norm": 1.8326844843438326,
"learning_rate": 5.399108163325217e-06,
"loss": 0.7998,
"step": 719
},
{
"epoch": 1.907702523240372,
"grad_norm": 1.5677093021811603,
"learning_rate": 5.388626805587361e-06,
"loss": 0.7657,
"step": 720
},
{
"epoch": 1.9103585657370519,
"grad_norm": 1.6608756530604105,
"learning_rate": 5.378143729422285e-06,
"loss": 0.8002,
"step": 721
},
{
"epoch": 1.9130146082337318,
"grad_norm": 1.5762455616412385,
"learning_rate": 5.367658981183979e-06,
"loss": 0.7799,
"step": 722
},
{
"epoch": 1.9156706507304118,
"grad_norm": 1.6223542306458065,
"learning_rate": 5.357172607233831e-06,
"loss": 0.7568,
"step": 723
},
{
"epoch": 1.9183266932270917,
"grad_norm": 1.6243360897116195,
"learning_rate": 5.346684653940408e-06,
"loss": 0.8361,
"step": 724
},
{
"epoch": 1.9209827357237717,
"grad_norm": 1.717966580725845,
"learning_rate": 5.3361951676792745e-06,
"loss": 0.8181,
"step": 725
},
{
"epoch": 1.9236387782204516,
"grad_norm": 1.6409767034053258,
"learning_rate": 5.325704194832759e-06,
"loss": 0.7991,
"step": 726
},
{
"epoch": 1.9262948207171315,
"grad_norm": 1.7093634730103173,
"learning_rate": 5.315211781789775e-06,
"loss": 0.7401,
"step": 727
},
{
"epoch": 1.9289508632138115,
"grad_norm": 1.8814933118849915,
"learning_rate": 5.304717974945596e-06,
"loss": 0.8212,
"step": 728
},
{
"epoch": 1.9316069057104914,
"grad_norm": 1.6022448134502887,
"learning_rate": 5.294222820701661e-06,
"loss": 0.7712,
"step": 729
},
{
"epoch": 1.9342629482071714,
"grad_norm": 1.9257961733760494,
"learning_rate": 5.2837263654653715e-06,
"loss": 0.7694,
"step": 730
},
{
"epoch": 1.9369189907038513,
"grad_norm": 1.55634864513684,
"learning_rate": 5.273228655649873e-06,
"loss": 0.7937,
"step": 731
},
{
"epoch": 1.9395750332005313,
"grad_norm": 1.65857937432083,
"learning_rate": 5.2627297376738674e-06,
"loss": 0.7309,
"step": 732
},
{
"epoch": 1.9422310756972112,
"grad_norm": 1.6472003637687052,
"learning_rate": 5.252229657961394e-06,
"loss": 0.8135,
"step": 733
},
{
"epoch": 1.9448871181938912,
"grad_norm": 1.6658259885988707,
"learning_rate": 5.24172846294163e-06,
"loss": 0.7817,
"step": 734
},
{
"epoch": 1.947543160690571,
"grad_norm": 1.7271975202258703,
"learning_rate": 5.231226199048682e-06,
"loss": 0.7704,
"step": 735
},
{
"epoch": 1.950199203187251,
"grad_norm": 1.5236007987102846,
"learning_rate": 5.2207229127213866e-06,
"loss": 0.8125,
"step": 736
},
{
"epoch": 1.952855245683931,
"grad_norm": 1.5827626707270386,
"learning_rate": 5.210218650403101e-06,
"loss": 0.8218,
"step": 737
},
{
"epoch": 1.955511288180611,
"grad_norm": 1.6150367316693914,
"learning_rate": 5.199713458541495e-06,
"loss": 0.7933,
"step": 738
},
{
"epoch": 1.9581673306772909,
"grad_norm": 1.6711992720608577,
"learning_rate": 5.189207383588353e-06,
"loss": 0.8075,
"step": 739
},
{
"epoch": 1.9608233731739708,
"grad_norm": 1.6593919600555247,
"learning_rate": 5.178700471999357e-06,
"loss": 0.7473,
"step": 740
},
{
"epoch": 1.9634794156706508,
"grad_norm": 1.7172984617619111,
"learning_rate": 5.168192770233901e-06,
"loss": 0.7953,
"step": 741
},
{
"epoch": 1.9661354581673307,
"grad_norm": 1.7140610596606283,
"learning_rate": 5.157684324754858e-06,
"loss": 0.883,
"step": 742
},
{
"epoch": 1.9687915006640107,
"grad_norm": 1.564534131388113,
"learning_rate": 5.1471751820284e-06,
"loss": 0.7756,
"step": 743
},
{
"epoch": 1.9714475431606906,
"grad_norm": 1.7708506721058221,
"learning_rate": 5.136665388523779e-06,
"loss": 0.7802,
"step": 744
},
{
"epoch": 1.9741035856573705,
"grad_norm": 1.7847297514234355,
"learning_rate": 5.126154990713123e-06,
"loss": 0.794,
"step": 745
},
{
"epoch": 1.9767596281540505,
"grad_norm": 1.6540427509570816,
"learning_rate": 5.115644035071234e-06,
"loss": 0.803,
"step": 746
},
{
"epoch": 1.9794156706507304,
"grad_norm": 1.5823166162412527,
"learning_rate": 5.1051325680753826e-06,
"loss": 0.6617,
"step": 747
},
{
"epoch": 1.9820717131474104,
"grad_norm": 1.6625490997353582,
"learning_rate": 5.094620636205096e-06,
"loss": 0.7835,
"step": 748
},
{
"epoch": 1.9847277556440903,
"grad_norm": 1.6216871637425778,
"learning_rate": 5.084108285941959e-06,
"loss": 0.816,
"step": 749
},
{
"epoch": 1.9873837981407703,
"grad_norm": 1.6590146206027645,
"learning_rate": 5.073595563769407e-06,
"loss": 0.8446,
"step": 750
},
{
"epoch": 1.9900398406374502,
"grad_norm": 1.614326585836819,
"learning_rate": 5.06308251617252e-06,
"loss": 0.7208,
"step": 751
},
{
"epoch": 1.9926958831341302,
"grad_norm": 1.6342275277779148,
"learning_rate": 5.052569189637813e-06,
"loss": 0.7641,
"step": 752
}
],
"logging_steps": 1,
"max_steps": 1504,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 376,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 716175260516352.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}