Safetensors
German
qwen2_audio
gerqwen-audio / trainer_state.json
flozi00's picture
Upload folder using huggingface_hub
3147114 verified
raw
history blame
47.7 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.942375227311212,
"eval_steps": 100000,
"global_step": 1000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.000942375227311212,
"grad_norm": 14.25,
"learning_rate": 1e-05,
"loss": 0.46586317,
"memory(GiB)": 64.76,
"step": 1,
"train_speed(iter/s)": 0.003324
},
{
"epoch": 0.00471187613655606,
"grad_norm": 2.46875,
"learning_rate": 9.999648647603774e-06,
"loss": 0.26192743,
"memory(GiB)": 75.3,
"step": 5,
"train_speed(iter/s)": 0.003362
},
{
"epoch": 0.00942375227311212,
"grad_norm": 1.15625,
"learning_rate": 9.998221363123425e-06,
"loss": 0.10271888,
"memory(GiB)": 75.3,
"step": 10,
"train_speed(iter/s)": 0.003359
},
{
"epoch": 0.01413562840966818,
"grad_norm": 1.1796875,
"learning_rate": 9.995696500215899e-06,
"loss": 0.09046092,
"memory(GiB)": 75.3,
"step": 15,
"train_speed(iter/s)": 0.003358
},
{
"epoch": 0.01884750454622424,
"grad_norm": 1.140625,
"learning_rate": 9.992074613325435e-06,
"loss": 0.08653282,
"memory(GiB)": 75.3,
"step": 20,
"train_speed(iter/s)": 0.00335
},
{
"epoch": 0.023559380682780302,
"grad_norm": 1.078125,
"learning_rate": 9.987356497795944e-06,
"loss": 0.08451628,
"memory(GiB)": 75.3,
"step": 25,
"train_speed(iter/s)": 0.003358
},
{
"epoch": 0.02827125681933636,
"grad_norm": 1.09375,
"learning_rate": 9.981543189696349e-06,
"loss": 0.0772208,
"memory(GiB)": 75.3,
"step": 30,
"train_speed(iter/s)": 0.003356
},
{
"epoch": 0.03298313295589242,
"grad_norm": 1.125,
"learning_rate": 9.97463596559307e-06,
"loss": 0.08322463,
"memory(GiB)": 75.3,
"step": 35,
"train_speed(iter/s)": 0.003356
},
{
"epoch": 0.03769500909244848,
"grad_norm": 1.09375,
"learning_rate": 9.966636342269706e-06,
"loss": 0.07725406,
"memory(GiB)": 75.3,
"step": 40,
"train_speed(iter/s)": 0.003355
},
{
"epoch": 0.04240688522900454,
"grad_norm": 1.15625,
"learning_rate": 9.957546076393944e-06,
"loss": 0.07683957,
"memory(GiB)": 75.3,
"step": 45,
"train_speed(iter/s)": 0.003356
},
{
"epoch": 0.047118761365560605,
"grad_norm": 1.1328125,
"learning_rate": 9.947367164131823e-06,
"loss": 0.07508552,
"memory(GiB)": 75.3,
"step": 50,
"train_speed(iter/s)": 0.003355
},
{
"epoch": 0.05183063750211667,
"grad_norm": 1.0703125,
"learning_rate": 9.936101840709373e-06,
"loss": 0.07236413,
"memory(GiB)": 75.3,
"step": 55,
"train_speed(iter/s)": 0.003353
},
{
"epoch": 0.05654251363867272,
"grad_norm": 1.0703125,
"learning_rate": 9.923752579921787e-06,
"loss": 0.07231579,
"memory(GiB)": 75.3,
"step": 60,
"train_speed(iter/s)": 0.003349
},
{
"epoch": 0.06125438977522878,
"grad_norm": 1.0234375,
"learning_rate": 9.910322093590177e-06,
"loss": 0.07145001,
"memory(GiB)": 75.3,
"step": 65,
"train_speed(iter/s)": 0.003348
},
{
"epoch": 0.06596626591178484,
"grad_norm": 1.0546875,
"learning_rate": 9.895813330966086e-06,
"loss": 0.07301619,
"memory(GiB)": 75.3,
"step": 70,
"train_speed(iter/s)": 0.003349
},
{
"epoch": 0.0706781420483409,
"grad_norm": 1.1015625,
"learning_rate": 9.880229478083849e-06,
"loss": 0.0724276,
"memory(GiB)": 75.3,
"step": 75,
"train_speed(iter/s)": 0.00335
},
{
"epoch": 0.07539001818489696,
"grad_norm": 1.0390625,
"learning_rate": 9.863573957060953e-06,
"loss": 0.06874905,
"memory(GiB)": 75.3,
"step": 80,
"train_speed(iter/s)": 0.003349
},
{
"epoch": 0.08010189432145302,
"grad_norm": 1.0859375,
"learning_rate": 9.845850425346563e-06,
"loss": 0.07212579,
"memory(GiB)": 75.3,
"step": 85,
"train_speed(iter/s)": 0.003347
},
{
"epoch": 0.08481377045800909,
"grad_norm": 1.1171875,
"learning_rate": 9.827062774918377e-06,
"loss": 0.07294501,
"memory(GiB)": 75.3,
"step": 90,
"train_speed(iter/s)": 0.003346
},
{
"epoch": 0.08952564659456515,
"grad_norm": 0.98828125,
"learning_rate": 9.807215131427966e-06,
"loss": 0.06517277,
"memory(GiB)": 75.3,
"step": 95,
"train_speed(iter/s)": 0.003345
},
{
"epoch": 0.09423752273112121,
"grad_norm": 0.984375,
"learning_rate": 9.786311853294799e-06,
"loss": 0.06962139,
"memory(GiB)": 75.3,
"step": 100,
"train_speed(iter/s)": 0.003346
},
{
"epoch": 0.09894939886767727,
"grad_norm": 0.98828125,
"learning_rate": 9.764357530749178e-06,
"loss": 0.06724482,
"memory(GiB)": 75.3,
"step": 105,
"train_speed(iter/s)": 0.003339
},
{
"epoch": 0.10366127500423333,
"grad_norm": 1.015625,
"learning_rate": 9.741356984824234e-06,
"loss": 0.06572815,
"memory(GiB)": 75.3,
"step": 110,
"train_speed(iter/s)": 0.003339
},
{
"epoch": 0.10837315114078938,
"grad_norm": 1.0390625,
"learning_rate": 9.717315266297277e-06,
"loss": 0.06739124,
"memory(GiB)": 75.3,
"step": 115,
"train_speed(iter/s)": 0.003342
},
{
"epoch": 0.11308502727734544,
"grad_norm": 0.9375,
"learning_rate": 9.692237654580658e-06,
"loss": 0.06834027,
"memory(GiB)": 75.3,
"step": 120,
"train_speed(iter/s)": 0.003342
},
{
"epoch": 0.1177969034139015,
"grad_norm": 1.078125,
"learning_rate": 9.66612965656245e-06,
"loss": 0.0658385,
"memory(GiB)": 75.3,
"step": 125,
"train_speed(iter/s)": 0.003343
},
{
"epoch": 0.12250877955045757,
"grad_norm": 1.0859375,
"learning_rate": 9.638997005397174e-06,
"loss": 0.0717117,
"memory(GiB)": 75.3,
"step": 130,
"train_speed(iter/s)": 0.003344
},
{
"epoch": 0.12722065568701363,
"grad_norm": 0.9765625,
"learning_rate": 9.610845659246833e-06,
"loss": 0.0667814,
"memory(GiB)": 75.3,
"step": 135,
"train_speed(iter/s)": 0.003344
},
{
"epoch": 0.13193253182356968,
"grad_norm": 0.9140625,
"learning_rate": 9.581681799972528e-06,
"loss": 0.06573244,
"memory(GiB)": 75.3,
"step": 140,
"train_speed(iter/s)": 0.003345
},
{
"epoch": 0.13664440796012575,
"grad_norm": 1.03125,
"learning_rate": 9.551511831776966e-06,
"loss": 0.06967602,
"memory(GiB)": 75.3,
"step": 145,
"train_speed(iter/s)": 0.003346
},
{
"epoch": 0.1413562840966818,
"grad_norm": 0.90625,
"learning_rate": 9.520342379798141e-06,
"loss": 0.06216406,
"memory(GiB)": 75.3,
"step": 150,
"train_speed(iter/s)": 0.003345
},
{
"epoch": 0.14606816023323788,
"grad_norm": 1.0390625,
"learning_rate": 9.488180288654485e-06,
"loss": 0.06460171,
"memory(GiB)": 75.3,
"step": 155,
"train_speed(iter/s)": 0.003346
},
{
"epoch": 0.15078003636979392,
"grad_norm": 1.078125,
"learning_rate": 9.45503262094184e-06,
"loss": 0.06467786,
"memory(GiB)": 75.3,
"step": 160,
"train_speed(iter/s)": 0.003346
},
{
"epoch": 0.15549191250635,
"grad_norm": 1.0625,
"learning_rate": 9.420906655682553e-06,
"loss": 0.06358048,
"memory(GiB)": 75.3,
"step": 165,
"train_speed(iter/s)": 0.003346
},
{
"epoch": 0.16020378864290605,
"grad_norm": 1.015625,
"learning_rate": 9.385809886727044e-06,
"loss": 0.06778824,
"memory(GiB)": 75.3,
"step": 170,
"train_speed(iter/s)": 0.003343
},
{
"epoch": 0.16491566477946212,
"grad_norm": 1.046875,
"learning_rate": 9.349750021108212e-06,
"loss": 0.06321884,
"memory(GiB)": 75.3,
"step": 175,
"train_speed(iter/s)": 0.003343
},
{
"epoch": 0.16962754091601817,
"grad_norm": 0.97265625,
"learning_rate": 9.31273497734901e-06,
"loss": 0.06310185,
"memory(GiB)": 75.3,
"step": 180,
"train_speed(iter/s)": 0.003344
},
{
"epoch": 0.17433941705257422,
"grad_norm": 0.9765625,
"learning_rate": 9.274772883723587e-06,
"loss": 0.06271737,
"memory(GiB)": 75.3,
"step": 185,
"train_speed(iter/s)": 0.003344
},
{
"epoch": 0.1790512931891303,
"grad_norm": 0.97265625,
"learning_rate": 9.235872076472378e-06,
"loss": 0.06393245,
"memory(GiB)": 75.3,
"step": 190,
"train_speed(iter/s)": 0.003345
},
{
"epoch": 0.18376316932568634,
"grad_norm": 1.03125,
"learning_rate": 9.196041097971509e-06,
"loss": 0.06558744,
"memory(GiB)": 75.3,
"step": 195,
"train_speed(iter/s)": 0.003346
},
{
"epoch": 0.18847504546224242,
"grad_norm": 0.98046875,
"learning_rate": 9.155288694856942e-06,
"loss": 0.06127087,
"memory(GiB)": 75.3,
"step": 200,
"train_speed(iter/s)": 0.003345
},
{
"epoch": 0.19318692159879847,
"grad_norm": 0.875,
"learning_rate": 9.113623816103775e-06,
"loss": 0.06313071,
"memory(GiB)": 75.3,
"step": 205,
"train_speed(iter/s)": 0.003343
},
{
"epoch": 0.19789879773535454,
"grad_norm": 1.0,
"learning_rate": 9.071055611061102e-06,
"loss": 0.06330621,
"memory(GiB)": 75.3,
"step": 210,
"train_speed(iter/s)": 0.003343
},
{
"epoch": 0.2026106738719106,
"grad_norm": 0.9453125,
"learning_rate": 9.027593427442867e-06,
"loss": 0.06415906,
"memory(GiB)": 75.3,
"step": 215,
"train_speed(iter/s)": 0.003343
},
{
"epoch": 0.20732255000846667,
"grad_norm": 0.94140625,
"learning_rate": 8.98324680927517e-06,
"loss": 0.06299359,
"memory(GiB)": 75.3,
"step": 220,
"train_speed(iter/s)": 0.003343
},
{
"epoch": 0.21203442614502271,
"grad_norm": 0.9609375,
"learning_rate": 8.938025494800454e-06,
"loss": 0.06004124,
"memory(GiB)": 75.3,
"step": 225,
"train_speed(iter/s)": 0.003343
},
{
"epoch": 0.21674630228157876,
"grad_norm": 0.97265625,
"learning_rate": 8.891939414339048e-06,
"loss": 0.06477681,
"memory(GiB)": 75.3,
"step": 230,
"train_speed(iter/s)": 0.003344
},
{
"epoch": 0.22145817841813484,
"grad_norm": 0.92578125,
"learning_rate": 8.844998688108535e-06,
"loss": 0.06010489,
"memory(GiB)": 75.3,
"step": 235,
"train_speed(iter/s)": 0.003344
},
{
"epoch": 0.22617005455469089,
"grad_norm": 0.9609375,
"learning_rate": 8.797213624001403e-06,
"loss": 0.05960445,
"memory(GiB)": 75.3,
"step": 240,
"train_speed(iter/s)": 0.003345
},
{
"epoch": 0.23088193069124696,
"grad_norm": 1.0,
"learning_rate": 8.748594715321512e-06,
"loss": 0.06301316,
"memory(GiB)": 75.3,
"step": 245,
"train_speed(iter/s)": 0.003345
},
{
"epoch": 0.235593806827803,
"grad_norm": 0.94921875,
"learning_rate": 8.699152638479817e-06,
"loss": 0.06120233,
"memory(GiB)": 75.3,
"step": 250,
"train_speed(iter/s)": 0.003345
},
{
"epoch": 0.24030568296435909,
"grad_norm": 0.97265625,
"learning_rate": 8.6488982506499e-06,
"loss": 0.06014684,
"memory(GiB)": 75.3,
"step": 255,
"train_speed(iter/s)": 0.003345
},
{
"epoch": 0.24501755910091513,
"grad_norm": 1.0,
"learning_rate": 8.597842587383797e-06,
"loss": 0.05922247,
"memory(GiB)": 75.3,
"step": 260,
"train_speed(iter/s)": 0.003345
},
{
"epoch": 0.24972943523747118,
"grad_norm": 0.97265625,
"learning_rate": 8.545996860188668e-06,
"loss": 0.05851297,
"memory(GiB)": 75.3,
"step": 265,
"train_speed(iter/s)": 0.003345
},
{
"epoch": 0.25444131137402726,
"grad_norm": 1.0625,
"learning_rate": 8.493372454064809e-06,
"loss": 0.05934198,
"memory(GiB)": 75.3,
"step": 270,
"train_speed(iter/s)": 0.003345
},
{
"epoch": 0.2591531875105833,
"grad_norm": 0.90234375,
"learning_rate": 8.439980925005587e-06,
"loss": 0.06134464,
"memory(GiB)": 75.3,
"step": 275,
"train_speed(iter/s)": 0.003346
},
{
"epoch": 0.26386506364713935,
"grad_norm": 0.90234375,
"learning_rate": 8.385833997459804e-06,
"loss": 0.05825667,
"memory(GiB)": 75.3,
"step": 280,
"train_speed(iter/s)": 0.003346
},
{
"epoch": 0.26857693978369546,
"grad_norm": 0.8828125,
"learning_rate": 8.330943561757092e-06,
"loss": 0.06092241,
"memory(GiB)": 75.3,
"step": 285,
"train_speed(iter/s)": 0.003346
},
{
"epoch": 0.2732888159202515,
"grad_norm": 0.91796875,
"learning_rate": 8.275321671496862e-06,
"loss": 0.05940055,
"memory(GiB)": 75.3,
"step": 290,
"train_speed(iter/s)": 0.003347
},
{
"epoch": 0.27800069205680755,
"grad_norm": 0.9140625,
"learning_rate": 8.218980540901417e-06,
"loss": 0.05920713,
"memory(GiB)": 75.3,
"step": 295,
"train_speed(iter/s)": 0.003347
},
{
"epoch": 0.2827125681933636,
"grad_norm": 0.92578125,
"learning_rate": 8.16193254213377e-06,
"loss": 0.05777416,
"memory(GiB)": 75.3,
"step": 300,
"train_speed(iter/s)": 0.003347
},
{
"epoch": 0.2874244443299197,
"grad_norm": 0.91015625,
"learning_rate": 8.104190202580811e-06,
"loss": 0.05302551,
"memory(GiB)": 75.3,
"step": 305,
"train_speed(iter/s)": 0.003346
},
{
"epoch": 0.29213632046647575,
"grad_norm": 0.91796875,
"learning_rate": 8.045766202102358e-06,
"loss": 0.05804279,
"memory(GiB)": 75.3,
"step": 310,
"train_speed(iter/s)": 0.003346
},
{
"epoch": 0.2968481966030318,
"grad_norm": 0.9375,
"learning_rate": 7.986673370246743e-06,
"loss": 0.05822692,
"memory(GiB)": 75.3,
"step": 315,
"train_speed(iter/s)": 0.003346
},
{
"epoch": 0.30156007273958785,
"grad_norm": 1.0078125,
"learning_rate": 7.926924683433523e-06,
"loss": 0.06007032,
"memory(GiB)": 75.3,
"step": 320,
"train_speed(iter/s)": 0.003346
},
{
"epoch": 0.3062719488761439,
"grad_norm": 0.921875,
"learning_rate": 7.866533262103937e-06,
"loss": 0.06018423,
"memory(GiB)": 75.3,
"step": 325,
"train_speed(iter/s)": 0.003346
},
{
"epoch": 0.3109838250127,
"grad_norm": 0.9375,
"learning_rate": 7.805512367839742e-06,
"loss": 0.05931915,
"memory(GiB)": 75.3,
"step": 330,
"train_speed(iter/s)": 0.003346
},
{
"epoch": 0.31569570114925605,
"grad_norm": 1.015625,
"learning_rate": 7.743875400451047e-06,
"loss": 0.0566447,
"memory(GiB)": 75.3,
"step": 335,
"train_speed(iter/s)": 0.003346
},
{
"epoch": 0.3204075772858121,
"grad_norm": 0.8203125,
"learning_rate": 7.681635895033798e-06,
"loss": 0.05161901,
"memory(GiB)": 75.3,
"step": 340,
"train_speed(iter/s)": 0.003346
},
{
"epoch": 0.32511945342236814,
"grad_norm": 1.0,
"learning_rate": 7.6188075189975644e-06,
"loss": 0.05694907,
"memory(GiB)": 75.3,
"step": 345,
"train_speed(iter/s)": 0.003346
},
{
"epoch": 0.32983132955892425,
"grad_norm": 1.0390625,
"learning_rate": 7.555404069064245e-06,
"loss": 0.05555046,
"memory(GiB)": 75.3,
"step": 350,
"train_speed(iter/s)": 0.003347
},
{
"epoch": 0.3345432056954803,
"grad_norm": 0.97265625,
"learning_rate": 7.491439468238404e-06,
"loss": 0.05587023,
"memory(GiB)": 75.3,
"step": 355,
"train_speed(iter/s)": 0.003347
},
{
"epoch": 0.33925508183203634,
"grad_norm": 0.96875,
"learning_rate": 7.426927762749867e-06,
"loss": 0.05913154,
"memory(GiB)": 75.3,
"step": 360,
"train_speed(iter/s)": 0.003347
},
{
"epoch": 0.3439669579685924,
"grad_norm": 0.89453125,
"learning_rate": 7.361883118969248e-06,
"loss": 0.05830712,
"memory(GiB)": 75.3,
"step": 365,
"train_speed(iter/s)": 0.003348
},
{
"epoch": 0.34867883410514844,
"grad_norm": 0.89453125,
"learning_rate": 7.2963198202971055e-06,
"loss": 0.05937972,
"memory(GiB)": 75.3,
"step": 370,
"train_speed(iter/s)": 0.003348
},
{
"epoch": 0.35339071024170454,
"grad_norm": 0.9375,
"learning_rate": 7.230252264027398e-06,
"loss": 0.0565136,
"memory(GiB)": 75.3,
"step": 375,
"train_speed(iter/s)": 0.003348
},
{
"epoch": 0.3581025863782606,
"grad_norm": 0.96875,
"learning_rate": 7.163694958185928e-06,
"loss": 0.05636386,
"memory(GiB)": 75.3,
"step": 380,
"train_speed(iter/s)": 0.003348
},
{
"epoch": 0.36281446251481664,
"grad_norm": 0.96875,
"learning_rate": 7.09666251834447e-06,
"loss": 0.06038175,
"memory(GiB)": 75.3,
"step": 385,
"train_speed(iter/s)": 0.003348
},
{
"epoch": 0.3675263386513727,
"grad_norm": 0.92578125,
"learning_rate": 7.0291696644112705e-06,
"loss": 0.05833557,
"memory(GiB)": 75.3,
"step": 390,
"train_speed(iter/s)": 0.003349
},
{
"epoch": 0.3722382147879288,
"grad_norm": 0.8359375,
"learning_rate": 6.9612312173986675e-06,
"loss": 0.05632974,
"memory(GiB)": 75.3,
"step": 395,
"train_speed(iter/s)": 0.003348
},
{
"epoch": 0.37695009092448484,
"grad_norm": 0.921875,
"learning_rate": 6.892862096168469e-06,
"loss": 0.05656151,
"memory(GiB)": 75.3,
"step": 400,
"train_speed(iter/s)": 0.003348
},
{
"epoch": 0.3816619670610409,
"grad_norm": 0.98828125,
"learning_rate": 6.824077314155877e-06,
"loss": 0.05432441,
"memory(GiB)": 75.3,
"step": 405,
"train_speed(iter/s)": 0.003347
},
{
"epoch": 0.38637384319759693,
"grad_norm": 0.9453125,
"learning_rate": 6.75489197607262e-06,
"loss": 0.05709869,
"memory(GiB)": 75.3,
"step": 410,
"train_speed(iter/s)": 0.003347
},
{
"epoch": 0.391085719334153,
"grad_norm": 1.0546875,
"learning_rate": 6.6853212745900585e-06,
"loss": 0.05979726,
"memory(GiB)": 75.3,
"step": 415,
"train_speed(iter/s)": 0.003348
},
{
"epoch": 0.3957975954707091,
"grad_norm": 0.9140625,
"learning_rate": 6.615380487002969e-06,
"loss": 0.0600209,
"memory(GiB)": 75.3,
"step": 420,
"train_speed(iter/s)": 0.003348
},
{
"epoch": 0.40050947160726513,
"grad_norm": 0.94140625,
"learning_rate": 6.545084971874738e-06,
"loss": 0.0563777,
"memory(GiB)": 75.3,
"step": 425,
"train_speed(iter/s)": 0.003348
},
{
"epoch": 0.4052213477438212,
"grad_norm": 0.91796875,
"learning_rate": 6.474450165664722e-06,
"loss": 0.05698464,
"memory(GiB)": 75.3,
"step": 430,
"train_speed(iter/s)": 0.003348
},
{
"epoch": 0.40993322388037723,
"grad_norm": 0.890625,
"learning_rate": 6.4034915793385e-06,
"loss": 0.05311573,
"memory(GiB)": 75.3,
"step": 435,
"train_speed(iter/s)": 0.003348
},
{
"epoch": 0.41464510001693333,
"grad_norm": 0.97265625,
"learning_rate": 6.332224794961752e-06,
"loss": 0.05458606,
"memory(GiB)": 75.3,
"step": 440,
"train_speed(iter/s)": 0.003348
},
{
"epoch": 0.4193569761534894,
"grad_norm": 0.95703125,
"learning_rate": 6.260665462278544e-06,
"loss": 0.05579169,
"memory(GiB)": 75.3,
"step": 445,
"train_speed(iter/s)": 0.003348
},
{
"epoch": 0.42406885229004543,
"grad_norm": 0.99609375,
"learning_rate": 6.18882929527473e-06,
"loss": 0.06002288,
"memory(GiB)": 75.3,
"step": 450,
"train_speed(iter/s)": 0.003348
},
{
"epoch": 0.4287807284266015,
"grad_norm": 0.94140625,
"learning_rate": 6.116732068727271e-06,
"loss": 0.05494517,
"memory(GiB)": 75.3,
"step": 455,
"train_speed(iter/s)": 0.003349
},
{
"epoch": 0.4334926045631575,
"grad_norm": 0.953125,
"learning_rate": 6.0443896147401856e-06,
"loss": 0.0547879,
"memory(GiB)": 75.3,
"step": 460,
"train_speed(iter/s)": 0.003349
},
{
"epoch": 0.4382044806997136,
"grad_norm": 0.82421875,
"learning_rate": 5.971817819267914e-06,
"loss": 0.05363967,
"memory(GiB)": 75.3,
"step": 465,
"train_speed(iter/s)": 0.003348
},
{
"epoch": 0.4429163568362697,
"grad_norm": 0.91796875,
"learning_rate": 5.8990326186268655e-06,
"loss": 0.056594,
"memory(GiB)": 75.3,
"step": 470,
"train_speed(iter/s)": 0.003349
},
{
"epoch": 0.4476282329728257,
"grad_norm": 0.9765625,
"learning_rate": 5.826049995995905e-06,
"loss": 0.05898719,
"memory(GiB)": 75.3,
"step": 475,
"train_speed(iter/s)": 0.003349
},
{
"epoch": 0.45234010910938177,
"grad_norm": 1.3671875,
"learning_rate": 5.752885977906539e-06,
"loss": 0.05439388,
"memory(GiB)": 75.3,
"step": 480,
"train_speed(iter/s)": 0.003349
},
{
"epoch": 0.4570519852459379,
"grad_norm": 1.0390625,
"learning_rate": 5.679556630723592e-06,
"loss": 0.05334362,
"memory(GiB)": 75.3,
"step": 485,
"train_speed(iter/s)": 0.003349
},
{
"epoch": 0.4617638613824939,
"grad_norm": 0.9765625,
"learning_rate": 5.606078057117136e-06,
"loss": 0.06019425,
"memory(GiB)": 75.3,
"step": 490,
"train_speed(iter/s)": 0.003349
},
{
"epoch": 0.46647573751904997,
"grad_norm": 0.95703125,
"learning_rate": 5.532466392526439e-06,
"loss": 0.05597678,
"memory(GiB)": 75.3,
"step": 495,
"train_speed(iter/s)": 0.00335
},
{
"epoch": 0.471187613655606,
"grad_norm": 0.86328125,
"learning_rate": 5.458737801616721e-06,
"loss": 0.05094014,
"memory(GiB)": 75.3,
"step": 500,
"train_speed(iter/s)": 0.003349
},
{
"epoch": 0.47589948979216207,
"grad_norm": 0.875,
"learning_rate": 5.384908474729501e-06,
"loss": 0.0548723,
"memory(GiB)": 75.3,
"step": 505,
"train_speed(iter/s)": 0.003348
},
{
"epoch": 0.48061136592871817,
"grad_norm": 0.8984375,
"learning_rate": 5.310994624327292e-06,
"loss": 0.05574841,
"memory(GiB)": 75.3,
"step": 510,
"train_speed(iter/s)": 0.003348
},
{
"epoch": 0.4853232420652742,
"grad_norm": 0.8671875,
"learning_rate": 5.23701248143345e-06,
"loss": 0.05651059,
"memory(GiB)": 75.3,
"step": 515,
"train_speed(iter/s)": 0.003348
},
{
"epoch": 0.49003511820183027,
"grad_norm": 0.921875,
"learning_rate": 5.162978292067933e-06,
"loss": 0.05878415,
"memory(GiB)": 75.3,
"step": 520,
"train_speed(iter/s)": 0.003348
},
{
"epoch": 0.4947469943383863,
"grad_norm": 1.0234375,
"learning_rate": 5.088908313679788e-06,
"loss": 0.05620171,
"memory(GiB)": 75.3,
"step": 525,
"train_speed(iter/s)": 0.003348
},
{
"epoch": 0.49945887047494236,
"grad_norm": 0.86328125,
"learning_rate": 5.014818811577104e-06,
"loss": 0.05407885,
"memory(GiB)": 75.3,
"step": 530,
"train_speed(iter/s)": 0.003348
},
{
"epoch": 0.5041707466114984,
"grad_norm": 0.84375,
"learning_rate": 4.940726055355259e-06,
"loss": 0.05323058,
"memory(GiB)": 75.3,
"step": 535,
"train_speed(iter/s)": 0.003349
},
{
"epoch": 0.5088826227480545,
"grad_norm": 0.83984375,
"learning_rate": 4.866646315324217e-06,
"loss": 0.05346375,
"memory(GiB)": 75.3,
"step": 540,
"train_speed(iter/s)": 0.003349
},
{
"epoch": 0.5135944988846106,
"grad_norm": 0.828125,
"learning_rate": 4.792595858935668e-06,
"loss": 0.05774211,
"memory(GiB)": 75.3,
"step": 545,
"train_speed(iter/s)": 0.003349
},
{
"epoch": 0.5183063750211666,
"grad_norm": 0.9921875,
"learning_rate": 4.718590947210788e-06,
"loss": 0.05547717,
"memory(GiB)": 75.3,
"step": 550,
"train_speed(iter/s)": 0.003349
},
{
"epoch": 0.5230182511577227,
"grad_norm": 0.8046875,
"learning_rate": 4.644647831169435e-06,
"loss": 0.05536319,
"memory(GiB)": 75.3,
"step": 555,
"train_speed(iter/s)": 0.003349
},
{
"epoch": 0.5277301272942787,
"grad_norm": 1.015625,
"learning_rate": 4.570782748261516e-06,
"loss": 0.05369086,
"memory(GiB)": 75.3,
"step": 560,
"train_speed(iter/s)": 0.003349
},
{
"epoch": 0.5324420034308348,
"grad_norm": 0.94140625,
"learning_rate": 4.497011918801347e-06,
"loss": 0.05471834,
"memory(GiB)": 75.3,
"step": 565,
"train_speed(iter/s)": 0.00335
},
{
"epoch": 0.5371538795673909,
"grad_norm": 0.9140625,
"learning_rate": 4.423351542405764e-06,
"loss": 0.05114409,
"memory(GiB)": 75.3,
"step": 570,
"train_speed(iter/s)": 0.00335
},
{
"epoch": 0.5418657557039469,
"grad_norm": 0.9765625,
"learning_rate": 4.349817794436805e-06,
"loss": 0.05673685,
"memory(GiB)": 75.3,
"step": 575,
"train_speed(iter/s)": 0.00335
},
{
"epoch": 0.546577631840503,
"grad_norm": 0.88671875,
"learning_rate": 4.276426822449682e-06,
"loss": 0.05527523,
"memory(GiB)": 75.3,
"step": 580,
"train_speed(iter/s)": 0.00335
},
{
"epoch": 0.551289507977059,
"grad_norm": 0.90625,
"learning_rate": 4.203194742646893e-06,
"loss": 0.05317973,
"memory(GiB)": 75.3,
"step": 585,
"train_speed(iter/s)": 0.00335
},
{
"epoch": 0.5560013841136151,
"grad_norm": 1.0078125,
"learning_rate": 4.130137636339191e-06,
"loss": 0.05449303,
"memory(GiB)": 75.3,
"step": 590,
"train_speed(iter/s)": 0.00335
},
{
"epoch": 0.5607132602501712,
"grad_norm": 0.89453125,
"learning_rate": 4.057271546414242e-06,
"loss": 0.05341119,
"memory(GiB)": 75.3,
"step": 595,
"train_speed(iter/s)": 0.00335
},
{
"epoch": 0.5654251363867272,
"grad_norm": 0.8515625,
"learning_rate": 3.984612473813689e-06,
"loss": 0.05254069,
"memory(GiB)": 75.3,
"step": 600,
"train_speed(iter/s)": 0.00335
},
{
"epoch": 0.5701370125232833,
"grad_norm": 0.8984375,
"learning_rate": 3.912176374019462e-06,
"loss": 0.05324795,
"memory(GiB)": 75.3,
"step": 605,
"train_speed(iter/s)": 0.00335
},
{
"epoch": 0.5748488886598394,
"grad_norm": 0.8671875,
"learning_rate": 3.839979153550039e-06,
"loss": 0.05177047,
"memory(GiB)": 75.3,
"step": 610,
"train_speed(iter/s)": 0.00335
},
{
"epoch": 0.5795607647963954,
"grad_norm": 0.82421875,
"learning_rate": 3.768036666467486e-06,
"loss": 0.05265539,
"memory(GiB)": 75.3,
"step": 615,
"train_speed(iter/s)": 0.00335
},
{
"epoch": 0.5842726409329515,
"grad_norm": 0.88671875,
"learning_rate": 3.6963647108959868e-06,
"loss": 0.05418316,
"memory(GiB)": 75.3,
"step": 620,
"train_speed(iter/s)": 0.00335
},
{
"epoch": 0.5889845170695075,
"grad_norm": 0.93359375,
"learning_rate": 3.6249790255526916e-06,
"loss": 0.05562772,
"memory(GiB)": 75.3,
"step": 625,
"train_speed(iter/s)": 0.00335
},
{
"epoch": 0.5936963932060636,
"grad_norm": 0.90234375,
"learning_rate": 3.553895286291577e-06,
"loss": 0.05445199,
"memory(GiB)": 75.3,
"step": 630,
"train_speed(iter/s)": 0.00335
},
{
"epoch": 0.5984082693426197,
"grad_norm": 0.90234375,
"learning_rate": 3.483129102661137e-06,
"loss": 0.05333483,
"memory(GiB)": 75.3,
"step": 635,
"train_speed(iter/s)": 0.00335
},
{
"epoch": 0.6031201454791757,
"grad_norm": 0.8515625,
"learning_rate": 3.4126960144766107e-06,
"loss": 0.05417204,
"memory(GiB)": 75.3,
"step": 640,
"train_speed(iter/s)": 0.00335
},
{
"epoch": 0.6078320216157318,
"grad_norm": 0.91015625,
"learning_rate": 3.3426114884075488e-06,
"loss": 0.05412987,
"memory(GiB)": 75.3,
"step": 645,
"train_speed(iter/s)": 0.00335
},
{
"epoch": 0.6125438977522878,
"grad_norm": 0.87109375,
"learning_rate": 3.272890914581417e-06,
"loss": 0.05388454,
"memory(GiB)": 75.3,
"step": 650,
"train_speed(iter/s)": 0.003349
},
{
"epoch": 0.6172557738888439,
"grad_norm": 0.85546875,
"learning_rate": 3.2035496032040303e-06,
"loss": 0.05097753,
"memory(GiB)": 75.3,
"step": 655,
"train_speed(iter/s)": 0.003349
},
{
"epoch": 0.6219676500254,
"grad_norm": 0.875,
"learning_rate": 3.134602781197515e-06,
"loss": 0.05341196,
"memory(GiB)": 75.3,
"step": 660,
"train_speed(iter/s)": 0.00335
},
{
"epoch": 0.626679526161956,
"grad_norm": 0.90625,
"learning_rate": 3.0660655888565827e-06,
"loss": 0.05016219,
"memory(GiB)": 75.3,
"step": 665,
"train_speed(iter/s)": 0.00335
},
{
"epoch": 0.6313914022985121,
"grad_norm": 0.95703125,
"learning_rate": 2.997953076523803e-06,
"loss": 0.05216441,
"memory(GiB)": 75.3,
"step": 670,
"train_speed(iter/s)": 0.00335
},
{
"epoch": 0.6361032784350681,
"grad_norm": 1.015625,
"learning_rate": 2.930280201284654e-06,
"loss": 0.05449665,
"memory(GiB)": 75.3,
"step": 675,
"train_speed(iter/s)": 0.00335
},
{
"epoch": 0.6408151545716242,
"grad_norm": 0.921875,
"learning_rate": 2.863061823683032e-06,
"loss": 0.05129569,
"memory(GiB)": 75.3,
"step": 680,
"train_speed(iter/s)": 0.00335
},
{
"epoch": 0.6455270307081803,
"grad_norm": 0.87890625,
"learning_rate": 2.7963127044579697e-06,
"loss": 0.05290835,
"memory(GiB)": 75.3,
"step": 685,
"train_speed(iter/s)": 0.00335
},
{
"epoch": 0.6502389068447363,
"grad_norm": 0.87109375,
"learning_rate": 2.7300475013022666e-06,
"loss": 0.0528672,
"memory(GiB)": 75.3,
"step": 690,
"train_speed(iter/s)": 0.00335
},
{
"epoch": 0.6549507829812924,
"grad_norm": 0.984375,
"learning_rate": 2.6642807656437565e-06,
"loss": 0.05229232,
"memory(GiB)": 75.3,
"step": 695,
"train_speed(iter/s)": 0.00335
},
{
"epoch": 0.6596626591178485,
"grad_norm": 0.9609375,
"learning_rate": 2.599026939449899e-06,
"loss": 0.05371115,
"memory(GiB)": 75.3,
"step": 700,
"train_speed(iter/s)": 0.00335
},
{
"epoch": 0.6643745352544045,
"grad_norm": 1.0703125,
"learning_rate": 2.534300352056416e-06,
"loss": 0.05234203,
"memory(GiB)": 75.3,
"step": 705,
"train_speed(iter/s)": 0.00335
},
{
"epoch": 0.6690864113909606,
"grad_norm": 0.98828125,
"learning_rate": 2.470115217020654e-06,
"loss": 0.05360326,
"memory(GiB)": 75.3,
"step": 710,
"train_speed(iter/s)": 0.00335
},
{
"epoch": 0.6737982875275166,
"grad_norm": 0.92578125,
"learning_rate": 2.4064856290003863e-06,
"loss": 0.05475932,
"memory(GiB)": 75.3,
"step": 715,
"train_speed(iter/s)": 0.00335
},
{
"epoch": 0.6785101636640727,
"grad_norm": 1.0703125,
"learning_rate": 2.3434255606586925e-06,
"loss": 0.05548735,
"memory(GiB)": 75.3,
"step": 720,
"train_speed(iter/s)": 0.00335
},
{
"epoch": 0.6832220398006288,
"grad_norm": 0.89453125,
"learning_rate": 2.2809488595956746e-06,
"loss": 0.05201564,
"memory(GiB)": 75.3,
"step": 725,
"train_speed(iter/s)": 0.00335
},
{
"epoch": 0.6879339159371848,
"grad_norm": 0.9140625,
"learning_rate": 2.219069245307589e-06,
"loss": 0.05408272,
"memory(GiB)": 75.3,
"step": 730,
"train_speed(iter/s)": 0.003351
},
{
"epoch": 0.6926457920737409,
"grad_norm": 1.1640625,
"learning_rate": 2.157800306174139e-06,
"loss": 0.05537663,
"memory(GiB)": 75.3,
"step": 735,
"train_speed(iter/s)": 0.003351
},
{
"epoch": 0.6973576682102969,
"grad_norm": 1.125,
"learning_rate": 2.0971554964745476e-06,
"loss": 0.05455139,
"memory(GiB)": 75.3,
"step": 740,
"train_speed(iter/s)": 0.003351
},
{
"epoch": 0.702069544346853,
"grad_norm": 0.87890625,
"learning_rate": 2.0371481334330913e-06,
"loss": 0.05394316,
"memory(GiB)": 75.3,
"step": 745,
"train_speed(iter/s)": 0.003351
},
{
"epoch": 0.7067814204834091,
"grad_norm": 0.828125,
"learning_rate": 1.9777913942946987e-06,
"loss": 0.05269849,
"memory(GiB)": 75.3,
"step": 750,
"train_speed(iter/s)": 0.003351
},
{
"epoch": 0.7114932966199651,
"grad_norm": 0.81640625,
"learning_rate": 1.919098313431335e-06,
"loss": 0.05057405,
"memory(GiB)": 75.3,
"step": 755,
"train_speed(iter/s)": 0.003351
},
{
"epoch": 0.7162051727565212,
"grad_norm": 0.9375,
"learning_rate": 1.8610817794797164e-06,
"loss": 0.05438253,
"memory(GiB)": 75.3,
"step": 760,
"train_speed(iter/s)": 0.003351
},
{
"epoch": 0.7209170488930772,
"grad_norm": 0.84375,
"learning_rate": 1.8037545325110506e-06,
"loss": 0.05222658,
"memory(GiB)": 75.3,
"step": 765,
"train_speed(iter/s)": 0.003351
},
{
"epoch": 0.7256289250296333,
"grad_norm": 0.88671875,
"learning_rate": 1.7471291612333997e-06,
"loss": 0.05131737,
"memory(GiB)": 75.3,
"step": 770,
"train_speed(iter/s)": 0.003351
},
{
"epoch": 0.7303408011661894,
"grad_norm": 0.93359375,
"learning_rate": 1.6912181002272714e-06,
"loss": 0.05391481,
"memory(GiB)": 75.3,
"step": 775,
"train_speed(iter/s)": 0.003351
},
{
"epoch": 0.7350526773027454,
"grad_norm": 0.8984375,
"learning_rate": 1.6360336272150684e-06,
"loss": 0.05078862,
"memory(GiB)": 75.3,
"step": 780,
"train_speed(iter/s)": 0.003351
},
{
"epoch": 0.7397645534393015,
"grad_norm": 0.953125,
"learning_rate": 1.581587860364977e-06,
"loss": 0.05192038,
"memory(GiB)": 75.3,
"step": 785,
"train_speed(iter/s)": 0.003351
},
{
"epoch": 0.7444764295758576,
"grad_norm": 0.96875,
"learning_rate": 1.52789275562988e-06,
"loss": 0.05364103,
"memory(GiB)": 75.3,
"step": 790,
"train_speed(iter/s)": 0.003351
},
{
"epoch": 0.7491883057124136,
"grad_norm": 1.0625,
"learning_rate": 1.4749601041219246e-06,
"loss": 0.0536845,
"memory(GiB)": 75.3,
"step": 795,
"train_speed(iter/s)": 0.003351
},
{
"epoch": 0.7539001818489697,
"grad_norm": 0.90234375,
"learning_rate": 1.4228015295232484e-06,
"loss": 0.05084696,
"memory(GiB)": 75.3,
"step": 800,
"train_speed(iter/s)": 0.003351
},
{
"epoch": 0.7586120579855257,
"grad_norm": 0.875,
"learning_rate": 1.371428485533498e-06,
"loss": 0.05773014,
"memory(GiB)": 75.3,
"step": 805,
"train_speed(iter/s)": 0.00335
},
{
"epoch": 0.7633239341220818,
"grad_norm": 0.91796875,
"learning_rate": 1.3208522533546748e-06,
"loss": 0.05219783,
"memory(GiB)": 75.3,
"step": 810,
"train_speed(iter/s)": 0.00335
},
{
"epoch": 0.7680358102586379,
"grad_norm": 0.96875,
"learning_rate": 1.2710839392138386e-06,
"loss": 0.05375321,
"memory(GiB)": 75.3,
"step": 815,
"train_speed(iter/s)": 0.00335
},
{
"epoch": 0.7727476863951939,
"grad_norm": 0.87890625,
"learning_rate": 1.222134471924259e-06,
"loss": 0.05204231,
"memory(GiB)": 75.3,
"step": 820,
"train_speed(iter/s)": 0.00335
},
{
"epoch": 0.77745956253175,
"grad_norm": 0.91796875,
"learning_rate": 1.1740146004855141e-06,
"loss": 0.0559127,
"memory(GiB)": 75.3,
"step": 825,
"train_speed(iter/s)": 0.003351
},
{
"epoch": 0.782171438668306,
"grad_norm": 0.89453125,
"learning_rate": 1.1267348917230737e-06,
"loss": 0.05298336,
"memory(GiB)": 75.3,
"step": 830,
"train_speed(iter/s)": 0.00335
},
{
"epoch": 0.7868833148048621,
"grad_norm": 0.90625,
"learning_rate": 1.080305727967893e-06,
"loss": 0.05347639,
"memory(GiB)": 75.3,
"step": 835,
"train_speed(iter/s)": 0.00335
},
{
"epoch": 0.7915951909414182,
"grad_norm": 0.84765625,
"learning_rate": 1.0347373047765202e-06,
"loss": 0.05329442,
"memory(GiB)": 75.3,
"step": 840,
"train_speed(iter/s)": 0.00335
},
{
"epoch": 0.7963070670779742,
"grad_norm": 0.8359375,
"learning_rate": 9.900396286922025e-07,
"loss": 0.0537856,
"memory(GiB)": 75.3,
"step": 845,
"train_speed(iter/s)": 0.00335
},
{
"epoch": 0.8010189432145303,
"grad_norm": 0.8125,
"learning_rate": 9.462225150475296e-07,
"loss": 0.05233877,
"memory(GiB)": 75.3,
"step": 850,
"train_speed(iter/s)": 0.00335
},
{
"epoch": 0.8057308193510863,
"grad_norm": 0.88671875,
"learning_rate": 9.032955858090319e-07,
"loss": 0.0549244,
"memory(GiB)": 75.3,
"step": 855,
"train_speed(iter/s)": 0.003351
},
{
"epoch": 0.8104426954876424,
"grad_norm": 0.9140625,
"learning_rate": 8.612682674642647e-07,
"loss": 0.04935811,
"memory(GiB)": 75.3,
"step": 860,
"train_speed(iter/s)": 0.003351
},
{
"epoch": 0.8151545716241985,
"grad_norm": 0.921875,
"learning_rate": 8.201497889518073e-07,
"loss": 0.05281691,
"memory(GiB)": 75.3,
"step": 865,
"train_speed(iter/s)": 0.003351
},
{
"epoch": 0.8198664477607545,
"grad_norm": 0.90625,
"learning_rate": 7.799491796346487e-07,
"loss": 0.05795277,
"memory(GiB)": 75.3,
"step": 870,
"train_speed(iter/s)": 0.003351
},
{
"epoch": 0.8245783238973106,
"grad_norm": 0.8046875,
"learning_rate": 7.406752673173851e-07,
"loss": 0.05225162,
"memory(GiB)": 75.3,
"step": 875,
"train_speed(iter/s)": 0.003351
},
{
"epoch": 0.8292902000338667,
"grad_norm": 0.87890625,
"learning_rate": 7.023366763077044e-07,
"loss": 0.0509973,
"memory(GiB)": 75.3,
"step": 880,
"train_speed(iter/s)": 0.003351
},
{
"epoch": 0.8340020761704227,
"grad_norm": 0.87109375,
"learning_rate": 6.649418255225298e-07,
"loss": 0.05142277,
"memory(GiB)": 75.3,
"step": 885,
"train_speed(iter/s)": 0.003351
},
{
"epoch": 0.8387139523069788,
"grad_norm": 0.95703125,
"learning_rate": 6.284989266392805e-07,
"loss": 0.05023923,
"memory(GiB)": 75.3,
"step": 890,
"train_speed(iter/s)": 0.003351
},
{
"epoch": 0.8434258284435348,
"grad_norm": 0.8828125,
"learning_rate": 5.930159822926407e-07,
"loss": 0.0534648,
"memory(GiB)": 75.3,
"step": 895,
"train_speed(iter/s)": 0.003351
},
{
"epoch": 0.8481377045800909,
"grad_norm": 0.84375,
"learning_rate": 5.585007843172286e-07,
"loss": 0.05155768,
"memory(GiB)": 75.3,
"step": 900,
"train_speed(iter/s)": 0.003351
},
{
"epoch": 0.852849580716647,
"grad_norm": 0.9453125,
"learning_rate": 5.249609120365579e-07,
"loss": 0.05368913,
"memory(GiB)": 75.3,
"step": 905,
"train_speed(iter/s)": 0.003351
},
{
"epoch": 0.857561456853203,
"grad_norm": 0.859375,
"learning_rate": 4.924037305986696e-07,
"loss": 0.05452033,
"memory(GiB)": 75.3,
"step": 910,
"train_speed(iter/s)": 0.003351
},
{
"epoch": 0.8622733329897591,
"grad_norm": 0.8515625,
"learning_rate": 4.6083638935878025e-07,
"loss": 0.05384221,
"memory(GiB)": 75.3,
"step": 915,
"train_speed(iter/s)": 0.003351
},
{
"epoch": 0.866985209126315,
"grad_norm": 0.828125,
"learning_rate": 4.302658203093418e-07,
"loss": 0.05272598,
"memory(GiB)": 75.3,
"step": 920,
"train_speed(iter/s)": 0.003351
},
{
"epoch": 0.8716970852628712,
"grad_norm": 0.8671875,
"learning_rate": 4.00698736557808e-07,
"loss": 0.05447989,
"memory(GiB)": 75.3,
"step": 925,
"train_speed(iter/s)": 0.003351
},
{
"epoch": 0.8764089613994273,
"grad_norm": 0.9453125,
"learning_rate": 3.721416308524839e-07,
"loss": 0.05123619,
"memory(GiB)": 75.3,
"step": 930,
"train_speed(iter/s)": 0.003351
},
{
"epoch": 0.8811208375359832,
"grad_norm": 0.8515625,
"learning_rate": 3.4460077415675473e-07,
"loss": 0.05347574,
"memory(GiB)": 75.3,
"step": 935,
"train_speed(iter/s)": 0.003351
},
{
"epoch": 0.8858327136725394,
"grad_norm": 0.7890625,
"learning_rate": 3.1808221427202636e-07,
"loss": 0.05334803,
"memory(GiB)": 75.3,
"step": 940,
"train_speed(iter/s)": 0.003351
},
{
"epoch": 0.8905445898090953,
"grad_norm": 0.94921875,
"learning_rate": 2.925917745096568e-07,
"loss": 0.05249671,
"memory(GiB)": 75.3,
"step": 945,
"train_speed(iter/s)": 0.003351
},
{
"epoch": 0.8952564659456514,
"grad_norm": 0.91015625,
"learning_rate": 2.681350524122045e-07,
"loss": 0.05494893,
"memory(GiB)": 75.3,
"step": 950,
"train_speed(iter/s)": 0.003351
},
{
"epoch": 0.8999683420822076,
"grad_norm": 0.828125,
"learning_rate": 2.447174185242324e-07,
"loss": 0.05149726,
"memory(GiB)": 75.3,
"step": 955,
"train_speed(iter/s)": 0.003351
},
{
"epoch": 0.9046802182187635,
"grad_norm": 1.0859375,
"learning_rate": 2.2234401521297576e-07,
"loss": 0.05425293,
"memory(GiB)": 75.3,
"step": 960,
"train_speed(iter/s)": 0.003351
},
{
"epoch": 0.9093920943553196,
"grad_norm": 0.87109375,
"learning_rate": 2.01019755539108e-07,
"loss": 0.0552171,
"memory(GiB)": 75.3,
"step": 965,
"train_speed(iter/s)": 0.003351
},
{
"epoch": 0.9141039704918758,
"grad_norm": 0.84375,
"learning_rate": 1.8074932217786445e-07,
"loss": 0.05237709,
"memory(GiB)": 75.3,
"step": 970,
"train_speed(iter/s)": 0.003351
},
{
"epoch": 0.9188158466284317,
"grad_norm": 0.86328125,
"learning_rate": 1.6153716639075223e-07,
"loss": 0.05221198,
"memory(GiB)": 75.3,
"step": 975,
"train_speed(iter/s)": 0.003351
},
{
"epoch": 0.9235277227649878,
"grad_norm": 0.8125,
"learning_rate": 1.433875070480878e-07,
"loss": 0.05134506,
"memory(GiB)": 75.3,
"step": 980,
"train_speed(iter/s)": 0.003352
},
{
"epoch": 0.9282395989015438,
"grad_norm": 0.890625,
"learning_rate": 1.2630432970255014e-07,
"loss": 0.05436495,
"memory(GiB)": 75.3,
"step": 985,
"train_speed(iter/s)": 0.003352
},
{
"epoch": 0.9329514750380999,
"grad_norm": 0.921875,
"learning_rate": 1.1029138571398645e-07,
"loss": 0.05440986,
"memory(GiB)": 75.3,
"step": 990,
"train_speed(iter/s)": 0.003352
},
{
"epoch": 0.937663351174656,
"grad_norm": 0.91796875,
"learning_rate": 9.535219142563168e-08,
"loss": 0.05418127,
"memory(GiB)": 75.3,
"step": 995,
"train_speed(iter/s)": 0.003352
},
{
"epoch": 0.942375227311212,
"grad_norm": 0.8984375,
"learning_rate": 8.149002739194222e-08,
"loss": 0.05519557,
"memory(GiB)": 75.3,
"step": 1000,
"train_speed(iter/s)": 0.003352
}
],
"logging_steps": 5,
"max_steps": 1061,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 5.440049406181114e+18,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}