|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.942375227311212, |
|
"eval_steps": 100000, |
|
"global_step": 1000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.000942375227311212, |
|
"grad_norm": 14.25, |
|
"learning_rate": 1e-05, |
|
"loss": 0.46586317, |
|
"memory(GiB)": 64.76, |
|
"step": 1, |
|
"train_speed(iter/s)": 0.003324 |
|
}, |
|
{ |
|
"epoch": 0.00471187613655606, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 9.999648647603774e-06, |
|
"loss": 0.26192743, |
|
"memory(GiB)": 75.3, |
|
"step": 5, |
|
"train_speed(iter/s)": 0.003362 |
|
}, |
|
{ |
|
"epoch": 0.00942375227311212, |
|
"grad_norm": 1.15625, |
|
"learning_rate": 9.998221363123425e-06, |
|
"loss": 0.10271888, |
|
"memory(GiB)": 75.3, |
|
"step": 10, |
|
"train_speed(iter/s)": 0.003359 |
|
}, |
|
{ |
|
"epoch": 0.01413562840966818, |
|
"grad_norm": 1.1796875, |
|
"learning_rate": 9.995696500215899e-06, |
|
"loss": 0.09046092, |
|
"memory(GiB)": 75.3, |
|
"step": 15, |
|
"train_speed(iter/s)": 0.003358 |
|
}, |
|
{ |
|
"epoch": 0.01884750454622424, |
|
"grad_norm": 1.140625, |
|
"learning_rate": 9.992074613325435e-06, |
|
"loss": 0.08653282, |
|
"memory(GiB)": 75.3, |
|
"step": 20, |
|
"train_speed(iter/s)": 0.00335 |
|
}, |
|
{ |
|
"epoch": 0.023559380682780302, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 9.987356497795944e-06, |
|
"loss": 0.08451628, |
|
"memory(GiB)": 75.3, |
|
"step": 25, |
|
"train_speed(iter/s)": 0.003358 |
|
}, |
|
{ |
|
"epoch": 0.02827125681933636, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 9.981543189696349e-06, |
|
"loss": 0.0772208, |
|
"memory(GiB)": 75.3, |
|
"step": 30, |
|
"train_speed(iter/s)": 0.003356 |
|
}, |
|
{ |
|
"epoch": 0.03298313295589242, |
|
"grad_norm": 1.125, |
|
"learning_rate": 9.97463596559307e-06, |
|
"loss": 0.08322463, |
|
"memory(GiB)": 75.3, |
|
"step": 35, |
|
"train_speed(iter/s)": 0.003356 |
|
}, |
|
{ |
|
"epoch": 0.03769500909244848, |
|
"grad_norm": 1.09375, |
|
"learning_rate": 9.966636342269706e-06, |
|
"loss": 0.07725406, |
|
"memory(GiB)": 75.3, |
|
"step": 40, |
|
"train_speed(iter/s)": 0.003355 |
|
}, |
|
{ |
|
"epoch": 0.04240688522900454, |
|
"grad_norm": 1.15625, |
|
"learning_rate": 9.957546076393944e-06, |
|
"loss": 0.07683957, |
|
"memory(GiB)": 75.3, |
|
"step": 45, |
|
"train_speed(iter/s)": 0.003356 |
|
}, |
|
{ |
|
"epoch": 0.047118761365560605, |
|
"grad_norm": 1.1328125, |
|
"learning_rate": 9.947367164131823e-06, |
|
"loss": 0.07508552, |
|
"memory(GiB)": 75.3, |
|
"step": 50, |
|
"train_speed(iter/s)": 0.003355 |
|
}, |
|
{ |
|
"epoch": 0.05183063750211667, |
|
"grad_norm": 1.0703125, |
|
"learning_rate": 9.936101840709373e-06, |
|
"loss": 0.07236413, |
|
"memory(GiB)": 75.3, |
|
"step": 55, |
|
"train_speed(iter/s)": 0.003353 |
|
}, |
|
{ |
|
"epoch": 0.05654251363867272, |
|
"grad_norm": 1.0703125, |
|
"learning_rate": 9.923752579921787e-06, |
|
"loss": 0.07231579, |
|
"memory(GiB)": 75.3, |
|
"step": 60, |
|
"train_speed(iter/s)": 0.003349 |
|
}, |
|
{ |
|
"epoch": 0.06125438977522878, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 9.910322093590177e-06, |
|
"loss": 0.07145001, |
|
"memory(GiB)": 75.3, |
|
"step": 65, |
|
"train_speed(iter/s)": 0.003348 |
|
}, |
|
{ |
|
"epoch": 0.06596626591178484, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 9.895813330966086e-06, |
|
"loss": 0.07301619, |
|
"memory(GiB)": 75.3, |
|
"step": 70, |
|
"train_speed(iter/s)": 0.003349 |
|
}, |
|
{ |
|
"epoch": 0.0706781420483409, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 9.880229478083849e-06, |
|
"loss": 0.0724276, |
|
"memory(GiB)": 75.3, |
|
"step": 75, |
|
"train_speed(iter/s)": 0.00335 |
|
}, |
|
{ |
|
"epoch": 0.07539001818489696, |
|
"grad_norm": 1.0390625, |
|
"learning_rate": 9.863573957060953e-06, |
|
"loss": 0.06874905, |
|
"memory(GiB)": 75.3, |
|
"step": 80, |
|
"train_speed(iter/s)": 0.003349 |
|
}, |
|
{ |
|
"epoch": 0.08010189432145302, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 9.845850425346563e-06, |
|
"loss": 0.07212579, |
|
"memory(GiB)": 75.3, |
|
"step": 85, |
|
"train_speed(iter/s)": 0.003347 |
|
}, |
|
{ |
|
"epoch": 0.08481377045800909, |
|
"grad_norm": 1.1171875, |
|
"learning_rate": 9.827062774918377e-06, |
|
"loss": 0.07294501, |
|
"memory(GiB)": 75.3, |
|
"step": 90, |
|
"train_speed(iter/s)": 0.003346 |
|
}, |
|
{ |
|
"epoch": 0.08952564659456515, |
|
"grad_norm": 0.98828125, |
|
"learning_rate": 9.807215131427966e-06, |
|
"loss": 0.06517277, |
|
"memory(GiB)": 75.3, |
|
"step": 95, |
|
"train_speed(iter/s)": 0.003345 |
|
}, |
|
{ |
|
"epoch": 0.09423752273112121, |
|
"grad_norm": 0.984375, |
|
"learning_rate": 9.786311853294799e-06, |
|
"loss": 0.06962139, |
|
"memory(GiB)": 75.3, |
|
"step": 100, |
|
"train_speed(iter/s)": 0.003346 |
|
}, |
|
{ |
|
"epoch": 0.09894939886767727, |
|
"grad_norm": 0.98828125, |
|
"learning_rate": 9.764357530749178e-06, |
|
"loss": 0.06724482, |
|
"memory(GiB)": 75.3, |
|
"step": 105, |
|
"train_speed(iter/s)": 0.003339 |
|
}, |
|
{ |
|
"epoch": 0.10366127500423333, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 9.741356984824234e-06, |
|
"loss": 0.06572815, |
|
"memory(GiB)": 75.3, |
|
"step": 110, |
|
"train_speed(iter/s)": 0.003339 |
|
}, |
|
{ |
|
"epoch": 0.10837315114078938, |
|
"grad_norm": 1.0390625, |
|
"learning_rate": 9.717315266297277e-06, |
|
"loss": 0.06739124, |
|
"memory(GiB)": 75.3, |
|
"step": 115, |
|
"train_speed(iter/s)": 0.003342 |
|
}, |
|
{ |
|
"epoch": 0.11308502727734544, |
|
"grad_norm": 0.9375, |
|
"learning_rate": 9.692237654580658e-06, |
|
"loss": 0.06834027, |
|
"memory(GiB)": 75.3, |
|
"step": 120, |
|
"train_speed(iter/s)": 0.003342 |
|
}, |
|
{ |
|
"epoch": 0.1177969034139015, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 9.66612965656245e-06, |
|
"loss": 0.0658385, |
|
"memory(GiB)": 75.3, |
|
"step": 125, |
|
"train_speed(iter/s)": 0.003343 |
|
}, |
|
{ |
|
"epoch": 0.12250877955045757, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 9.638997005397174e-06, |
|
"loss": 0.0717117, |
|
"memory(GiB)": 75.3, |
|
"step": 130, |
|
"train_speed(iter/s)": 0.003344 |
|
}, |
|
{ |
|
"epoch": 0.12722065568701363, |
|
"grad_norm": 0.9765625, |
|
"learning_rate": 9.610845659246833e-06, |
|
"loss": 0.0667814, |
|
"memory(GiB)": 75.3, |
|
"step": 135, |
|
"train_speed(iter/s)": 0.003344 |
|
}, |
|
{ |
|
"epoch": 0.13193253182356968, |
|
"grad_norm": 0.9140625, |
|
"learning_rate": 9.581681799972528e-06, |
|
"loss": 0.06573244, |
|
"memory(GiB)": 75.3, |
|
"step": 140, |
|
"train_speed(iter/s)": 0.003345 |
|
}, |
|
{ |
|
"epoch": 0.13664440796012575, |
|
"grad_norm": 1.03125, |
|
"learning_rate": 9.551511831776966e-06, |
|
"loss": 0.06967602, |
|
"memory(GiB)": 75.3, |
|
"step": 145, |
|
"train_speed(iter/s)": 0.003346 |
|
}, |
|
{ |
|
"epoch": 0.1413562840966818, |
|
"grad_norm": 0.90625, |
|
"learning_rate": 9.520342379798141e-06, |
|
"loss": 0.06216406, |
|
"memory(GiB)": 75.3, |
|
"step": 150, |
|
"train_speed(iter/s)": 0.003345 |
|
}, |
|
{ |
|
"epoch": 0.14606816023323788, |
|
"grad_norm": 1.0390625, |
|
"learning_rate": 9.488180288654485e-06, |
|
"loss": 0.06460171, |
|
"memory(GiB)": 75.3, |
|
"step": 155, |
|
"train_speed(iter/s)": 0.003346 |
|
}, |
|
{ |
|
"epoch": 0.15078003636979392, |
|
"grad_norm": 1.078125, |
|
"learning_rate": 9.45503262094184e-06, |
|
"loss": 0.06467786, |
|
"memory(GiB)": 75.3, |
|
"step": 160, |
|
"train_speed(iter/s)": 0.003346 |
|
}, |
|
{ |
|
"epoch": 0.15549191250635, |
|
"grad_norm": 1.0625, |
|
"learning_rate": 9.420906655682553e-06, |
|
"loss": 0.06358048, |
|
"memory(GiB)": 75.3, |
|
"step": 165, |
|
"train_speed(iter/s)": 0.003346 |
|
}, |
|
{ |
|
"epoch": 0.16020378864290605, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 9.385809886727044e-06, |
|
"loss": 0.06778824, |
|
"memory(GiB)": 75.3, |
|
"step": 170, |
|
"train_speed(iter/s)": 0.003343 |
|
}, |
|
{ |
|
"epoch": 0.16491566477946212, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 9.349750021108212e-06, |
|
"loss": 0.06321884, |
|
"memory(GiB)": 75.3, |
|
"step": 175, |
|
"train_speed(iter/s)": 0.003343 |
|
}, |
|
{ |
|
"epoch": 0.16962754091601817, |
|
"grad_norm": 0.97265625, |
|
"learning_rate": 9.31273497734901e-06, |
|
"loss": 0.06310185, |
|
"memory(GiB)": 75.3, |
|
"step": 180, |
|
"train_speed(iter/s)": 0.003344 |
|
}, |
|
{ |
|
"epoch": 0.17433941705257422, |
|
"grad_norm": 0.9765625, |
|
"learning_rate": 9.274772883723587e-06, |
|
"loss": 0.06271737, |
|
"memory(GiB)": 75.3, |
|
"step": 185, |
|
"train_speed(iter/s)": 0.003344 |
|
}, |
|
{ |
|
"epoch": 0.1790512931891303, |
|
"grad_norm": 0.97265625, |
|
"learning_rate": 9.235872076472378e-06, |
|
"loss": 0.06393245, |
|
"memory(GiB)": 75.3, |
|
"step": 190, |
|
"train_speed(iter/s)": 0.003345 |
|
}, |
|
{ |
|
"epoch": 0.18376316932568634, |
|
"grad_norm": 1.03125, |
|
"learning_rate": 9.196041097971509e-06, |
|
"loss": 0.06558744, |
|
"memory(GiB)": 75.3, |
|
"step": 195, |
|
"train_speed(iter/s)": 0.003346 |
|
}, |
|
{ |
|
"epoch": 0.18847504546224242, |
|
"grad_norm": 0.98046875, |
|
"learning_rate": 9.155288694856942e-06, |
|
"loss": 0.06127087, |
|
"memory(GiB)": 75.3, |
|
"step": 200, |
|
"train_speed(iter/s)": 0.003345 |
|
}, |
|
{ |
|
"epoch": 0.19318692159879847, |
|
"grad_norm": 0.875, |
|
"learning_rate": 9.113623816103775e-06, |
|
"loss": 0.06313071, |
|
"memory(GiB)": 75.3, |
|
"step": 205, |
|
"train_speed(iter/s)": 0.003343 |
|
}, |
|
{ |
|
"epoch": 0.19789879773535454, |
|
"grad_norm": 1.0, |
|
"learning_rate": 9.071055611061102e-06, |
|
"loss": 0.06330621, |
|
"memory(GiB)": 75.3, |
|
"step": 210, |
|
"train_speed(iter/s)": 0.003343 |
|
}, |
|
{ |
|
"epoch": 0.2026106738719106, |
|
"grad_norm": 0.9453125, |
|
"learning_rate": 9.027593427442867e-06, |
|
"loss": 0.06415906, |
|
"memory(GiB)": 75.3, |
|
"step": 215, |
|
"train_speed(iter/s)": 0.003343 |
|
}, |
|
{ |
|
"epoch": 0.20732255000846667, |
|
"grad_norm": 0.94140625, |
|
"learning_rate": 8.98324680927517e-06, |
|
"loss": 0.06299359, |
|
"memory(GiB)": 75.3, |
|
"step": 220, |
|
"train_speed(iter/s)": 0.003343 |
|
}, |
|
{ |
|
"epoch": 0.21203442614502271, |
|
"grad_norm": 0.9609375, |
|
"learning_rate": 8.938025494800454e-06, |
|
"loss": 0.06004124, |
|
"memory(GiB)": 75.3, |
|
"step": 225, |
|
"train_speed(iter/s)": 0.003343 |
|
}, |
|
{ |
|
"epoch": 0.21674630228157876, |
|
"grad_norm": 0.97265625, |
|
"learning_rate": 8.891939414339048e-06, |
|
"loss": 0.06477681, |
|
"memory(GiB)": 75.3, |
|
"step": 230, |
|
"train_speed(iter/s)": 0.003344 |
|
}, |
|
{ |
|
"epoch": 0.22145817841813484, |
|
"grad_norm": 0.92578125, |
|
"learning_rate": 8.844998688108535e-06, |
|
"loss": 0.06010489, |
|
"memory(GiB)": 75.3, |
|
"step": 235, |
|
"train_speed(iter/s)": 0.003344 |
|
}, |
|
{ |
|
"epoch": 0.22617005455469089, |
|
"grad_norm": 0.9609375, |
|
"learning_rate": 8.797213624001403e-06, |
|
"loss": 0.05960445, |
|
"memory(GiB)": 75.3, |
|
"step": 240, |
|
"train_speed(iter/s)": 0.003345 |
|
}, |
|
{ |
|
"epoch": 0.23088193069124696, |
|
"grad_norm": 1.0, |
|
"learning_rate": 8.748594715321512e-06, |
|
"loss": 0.06301316, |
|
"memory(GiB)": 75.3, |
|
"step": 245, |
|
"train_speed(iter/s)": 0.003345 |
|
}, |
|
{ |
|
"epoch": 0.235593806827803, |
|
"grad_norm": 0.94921875, |
|
"learning_rate": 8.699152638479817e-06, |
|
"loss": 0.06120233, |
|
"memory(GiB)": 75.3, |
|
"step": 250, |
|
"train_speed(iter/s)": 0.003345 |
|
}, |
|
{ |
|
"epoch": 0.24030568296435909, |
|
"grad_norm": 0.97265625, |
|
"learning_rate": 8.6488982506499e-06, |
|
"loss": 0.06014684, |
|
"memory(GiB)": 75.3, |
|
"step": 255, |
|
"train_speed(iter/s)": 0.003345 |
|
}, |
|
{ |
|
"epoch": 0.24501755910091513, |
|
"grad_norm": 1.0, |
|
"learning_rate": 8.597842587383797e-06, |
|
"loss": 0.05922247, |
|
"memory(GiB)": 75.3, |
|
"step": 260, |
|
"train_speed(iter/s)": 0.003345 |
|
}, |
|
{ |
|
"epoch": 0.24972943523747118, |
|
"grad_norm": 0.97265625, |
|
"learning_rate": 8.545996860188668e-06, |
|
"loss": 0.05851297, |
|
"memory(GiB)": 75.3, |
|
"step": 265, |
|
"train_speed(iter/s)": 0.003345 |
|
}, |
|
{ |
|
"epoch": 0.25444131137402726, |
|
"grad_norm": 1.0625, |
|
"learning_rate": 8.493372454064809e-06, |
|
"loss": 0.05934198, |
|
"memory(GiB)": 75.3, |
|
"step": 270, |
|
"train_speed(iter/s)": 0.003345 |
|
}, |
|
{ |
|
"epoch": 0.2591531875105833, |
|
"grad_norm": 0.90234375, |
|
"learning_rate": 8.439980925005587e-06, |
|
"loss": 0.06134464, |
|
"memory(GiB)": 75.3, |
|
"step": 275, |
|
"train_speed(iter/s)": 0.003346 |
|
}, |
|
{ |
|
"epoch": 0.26386506364713935, |
|
"grad_norm": 0.90234375, |
|
"learning_rate": 8.385833997459804e-06, |
|
"loss": 0.05825667, |
|
"memory(GiB)": 75.3, |
|
"step": 280, |
|
"train_speed(iter/s)": 0.003346 |
|
}, |
|
{ |
|
"epoch": 0.26857693978369546, |
|
"grad_norm": 0.8828125, |
|
"learning_rate": 8.330943561757092e-06, |
|
"loss": 0.06092241, |
|
"memory(GiB)": 75.3, |
|
"step": 285, |
|
"train_speed(iter/s)": 0.003346 |
|
}, |
|
{ |
|
"epoch": 0.2732888159202515, |
|
"grad_norm": 0.91796875, |
|
"learning_rate": 8.275321671496862e-06, |
|
"loss": 0.05940055, |
|
"memory(GiB)": 75.3, |
|
"step": 290, |
|
"train_speed(iter/s)": 0.003347 |
|
}, |
|
{ |
|
"epoch": 0.27800069205680755, |
|
"grad_norm": 0.9140625, |
|
"learning_rate": 8.218980540901417e-06, |
|
"loss": 0.05920713, |
|
"memory(GiB)": 75.3, |
|
"step": 295, |
|
"train_speed(iter/s)": 0.003347 |
|
}, |
|
{ |
|
"epoch": 0.2827125681933636, |
|
"grad_norm": 0.92578125, |
|
"learning_rate": 8.16193254213377e-06, |
|
"loss": 0.05777416, |
|
"memory(GiB)": 75.3, |
|
"step": 300, |
|
"train_speed(iter/s)": 0.003347 |
|
}, |
|
{ |
|
"epoch": 0.2874244443299197, |
|
"grad_norm": 0.91015625, |
|
"learning_rate": 8.104190202580811e-06, |
|
"loss": 0.05302551, |
|
"memory(GiB)": 75.3, |
|
"step": 305, |
|
"train_speed(iter/s)": 0.003346 |
|
}, |
|
{ |
|
"epoch": 0.29213632046647575, |
|
"grad_norm": 0.91796875, |
|
"learning_rate": 8.045766202102358e-06, |
|
"loss": 0.05804279, |
|
"memory(GiB)": 75.3, |
|
"step": 310, |
|
"train_speed(iter/s)": 0.003346 |
|
}, |
|
{ |
|
"epoch": 0.2968481966030318, |
|
"grad_norm": 0.9375, |
|
"learning_rate": 7.986673370246743e-06, |
|
"loss": 0.05822692, |
|
"memory(GiB)": 75.3, |
|
"step": 315, |
|
"train_speed(iter/s)": 0.003346 |
|
}, |
|
{ |
|
"epoch": 0.30156007273958785, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 7.926924683433523e-06, |
|
"loss": 0.06007032, |
|
"memory(GiB)": 75.3, |
|
"step": 320, |
|
"train_speed(iter/s)": 0.003346 |
|
}, |
|
{ |
|
"epoch": 0.3062719488761439, |
|
"grad_norm": 0.921875, |
|
"learning_rate": 7.866533262103937e-06, |
|
"loss": 0.06018423, |
|
"memory(GiB)": 75.3, |
|
"step": 325, |
|
"train_speed(iter/s)": 0.003346 |
|
}, |
|
{ |
|
"epoch": 0.3109838250127, |
|
"grad_norm": 0.9375, |
|
"learning_rate": 7.805512367839742e-06, |
|
"loss": 0.05931915, |
|
"memory(GiB)": 75.3, |
|
"step": 330, |
|
"train_speed(iter/s)": 0.003346 |
|
}, |
|
{ |
|
"epoch": 0.31569570114925605, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 7.743875400451047e-06, |
|
"loss": 0.0566447, |
|
"memory(GiB)": 75.3, |
|
"step": 335, |
|
"train_speed(iter/s)": 0.003346 |
|
}, |
|
{ |
|
"epoch": 0.3204075772858121, |
|
"grad_norm": 0.8203125, |
|
"learning_rate": 7.681635895033798e-06, |
|
"loss": 0.05161901, |
|
"memory(GiB)": 75.3, |
|
"step": 340, |
|
"train_speed(iter/s)": 0.003346 |
|
}, |
|
{ |
|
"epoch": 0.32511945342236814, |
|
"grad_norm": 1.0, |
|
"learning_rate": 7.6188075189975644e-06, |
|
"loss": 0.05694907, |
|
"memory(GiB)": 75.3, |
|
"step": 345, |
|
"train_speed(iter/s)": 0.003346 |
|
}, |
|
{ |
|
"epoch": 0.32983132955892425, |
|
"grad_norm": 1.0390625, |
|
"learning_rate": 7.555404069064245e-06, |
|
"loss": 0.05555046, |
|
"memory(GiB)": 75.3, |
|
"step": 350, |
|
"train_speed(iter/s)": 0.003347 |
|
}, |
|
{ |
|
"epoch": 0.3345432056954803, |
|
"grad_norm": 0.97265625, |
|
"learning_rate": 7.491439468238404e-06, |
|
"loss": 0.05587023, |
|
"memory(GiB)": 75.3, |
|
"step": 355, |
|
"train_speed(iter/s)": 0.003347 |
|
}, |
|
{ |
|
"epoch": 0.33925508183203634, |
|
"grad_norm": 0.96875, |
|
"learning_rate": 7.426927762749867e-06, |
|
"loss": 0.05913154, |
|
"memory(GiB)": 75.3, |
|
"step": 360, |
|
"train_speed(iter/s)": 0.003347 |
|
}, |
|
{ |
|
"epoch": 0.3439669579685924, |
|
"grad_norm": 0.89453125, |
|
"learning_rate": 7.361883118969248e-06, |
|
"loss": 0.05830712, |
|
"memory(GiB)": 75.3, |
|
"step": 365, |
|
"train_speed(iter/s)": 0.003348 |
|
}, |
|
{ |
|
"epoch": 0.34867883410514844, |
|
"grad_norm": 0.89453125, |
|
"learning_rate": 7.2963198202971055e-06, |
|
"loss": 0.05937972, |
|
"memory(GiB)": 75.3, |
|
"step": 370, |
|
"train_speed(iter/s)": 0.003348 |
|
}, |
|
{ |
|
"epoch": 0.35339071024170454, |
|
"grad_norm": 0.9375, |
|
"learning_rate": 7.230252264027398e-06, |
|
"loss": 0.0565136, |
|
"memory(GiB)": 75.3, |
|
"step": 375, |
|
"train_speed(iter/s)": 0.003348 |
|
}, |
|
{ |
|
"epoch": 0.3581025863782606, |
|
"grad_norm": 0.96875, |
|
"learning_rate": 7.163694958185928e-06, |
|
"loss": 0.05636386, |
|
"memory(GiB)": 75.3, |
|
"step": 380, |
|
"train_speed(iter/s)": 0.003348 |
|
}, |
|
{ |
|
"epoch": 0.36281446251481664, |
|
"grad_norm": 0.96875, |
|
"learning_rate": 7.09666251834447e-06, |
|
"loss": 0.06038175, |
|
"memory(GiB)": 75.3, |
|
"step": 385, |
|
"train_speed(iter/s)": 0.003348 |
|
}, |
|
{ |
|
"epoch": 0.3675263386513727, |
|
"grad_norm": 0.92578125, |
|
"learning_rate": 7.0291696644112705e-06, |
|
"loss": 0.05833557, |
|
"memory(GiB)": 75.3, |
|
"step": 390, |
|
"train_speed(iter/s)": 0.003349 |
|
}, |
|
{ |
|
"epoch": 0.3722382147879288, |
|
"grad_norm": 0.8359375, |
|
"learning_rate": 6.9612312173986675e-06, |
|
"loss": 0.05632974, |
|
"memory(GiB)": 75.3, |
|
"step": 395, |
|
"train_speed(iter/s)": 0.003348 |
|
}, |
|
{ |
|
"epoch": 0.37695009092448484, |
|
"grad_norm": 0.921875, |
|
"learning_rate": 6.892862096168469e-06, |
|
"loss": 0.05656151, |
|
"memory(GiB)": 75.3, |
|
"step": 400, |
|
"train_speed(iter/s)": 0.003348 |
|
}, |
|
{ |
|
"epoch": 0.3816619670610409, |
|
"grad_norm": 0.98828125, |
|
"learning_rate": 6.824077314155877e-06, |
|
"loss": 0.05432441, |
|
"memory(GiB)": 75.3, |
|
"step": 405, |
|
"train_speed(iter/s)": 0.003347 |
|
}, |
|
{ |
|
"epoch": 0.38637384319759693, |
|
"grad_norm": 0.9453125, |
|
"learning_rate": 6.75489197607262e-06, |
|
"loss": 0.05709869, |
|
"memory(GiB)": 75.3, |
|
"step": 410, |
|
"train_speed(iter/s)": 0.003347 |
|
}, |
|
{ |
|
"epoch": 0.391085719334153, |
|
"grad_norm": 1.0546875, |
|
"learning_rate": 6.6853212745900585e-06, |
|
"loss": 0.05979726, |
|
"memory(GiB)": 75.3, |
|
"step": 415, |
|
"train_speed(iter/s)": 0.003348 |
|
}, |
|
{ |
|
"epoch": 0.3957975954707091, |
|
"grad_norm": 0.9140625, |
|
"learning_rate": 6.615380487002969e-06, |
|
"loss": 0.0600209, |
|
"memory(GiB)": 75.3, |
|
"step": 420, |
|
"train_speed(iter/s)": 0.003348 |
|
}, |
|
{ |
|
"epoch": 0.40050947160726513, |
|
"grad_norm": 0.94140625, |
|
"learning_rate": 6.545084971874738e-06, |
|
"loss": 0.0563777, |
|
"memory(GiB)": 75.3, |
|
"step": 425, |
|
"train_speed(iter/s)": 0.003348 |
|
}, |
|
{ |
|
"epoch": 0.4052213477438212, |
|
"grad_norm": 0.91796875, |
|
"learning_rate": 6.474450165664722e-06, |
|
"loss": 0.05698464, |
|
"memory(GiB)": 75.3, |
|
"step": 430, |
|
"train_speed(iter/s)": 0.003348 |
|
}, |
|
{ |
|
"epoch": 0.40993322388037723, |
|
"grad_norm": 0.890625, |
|
"learning_rate": 6.4034915793385e-06, |
|
"loss": 0.05311573, |
|
"memory(GiB)": 75.3, |
|
"step": 435, |
|
"train_speed(iter/s)": 0.003348 |
|
}, |
|
{ |
|
"epoch": 0.41464510001693333, |
|
"grad_norm": 0.97265625, |
|
"learning_rate": 6.332224794961752e-06, |
|
"loss": 0.05458606, |
|
"memory(GiB)": 75.3, |
|
"step": 440, |
|
"train_speed(iter/s)": 0.003348 |
|
}, |
|
{ |
|
"epoch": 0.4193569761534894, |
|
"grad_norm": 0.95703125, |
|
"learning_rate": 6.260665462278544e-06, |
|
"loss": 0.05579169, |
|
"memory(GiB)": 75.3, |
|
"step": 445, |
|
"train_speed(iter/s)": 0.003348 |
|
}, |
|
{ |
|
"epoch": 0.42406885229004543, |
|
"grad_norm": 0.99609375, |
|
"learning_rate": 6.18882929527473e-06, |
|
"loss": 0.06002288, |
|
"memory(GiB)": 75.3, |
|
"step": 450, |
|
"train_speed(iter/s)": 0.003348 |
|
}, |
|
{ |
|
"epoch": 0.4287807284266015, |
|
"grad_norm": 0.94140625, |
|
"learning_rate": 6.116732068727271e-06, |
|
"loss": 0.05494517, |
|
"memory(GiB)": 75.3, |
|
"step": 455, |
|
"train_speed(iter/s)": 0.003349 |
|
}, |
|
{ |
|
"epoch": 0.4334926045631575, |
|
"grad_norm": 0.953125, |
|
"learning_rate": 6.0443896147401856e-06, |
|
"loss": 0.0547879, |
|
"memory(GiB)": 75.3, |
|
"step": 460, |
|
"train_speed(iter/s)": 0.003349 |
|
}, |
|
{ |
|
"epoch": 0.4382044806997136, |
|
"grad_norm": 0.82421875, |
|
"learning_rate": 5.971817819267914e-06, |
|
"loss": 0.05363967, |
|
"memory(GiB)": 75.3, |
|
"step": 465, |
|
"train_speed(iter/s)": 0.003348 |
|
}, |
|
{ |
|
"epoch": 0.4429163568362697, |
|
"grad_norm": 0.91796875, |
|
"learning_rate": 5.8990326186268655e-06, |
|
"loss": 0.056594, |
|
"memory(GiB)": 75.3, |
|
"step": 470, |
|
"train_speed(iter/s)": 0.003349 |
|
}, |
|
{ |
|
"epoch": 0.4476282329728257, |
|
"grad_norm": 0.9765625, |
|
"learning_rate": 5.826049995995905e-06, |
|
"loss": 0.05898719, |
|
"memory(GiB)": 75.3, |
|
"step": 475, |
|
"train_speed(iter/s)": 0.003349 |
|
}, |
|
{ |
|
"epoch": 0.45234010910938177, |
|
"grad_norm": 1.3671875, |
|
"learning_rate": 5.752885977906539e-06, |
|
"loss": 0.05439388, |
|
"memory(GiB)": 75.3, |
|
"step": 480, |
|
"train_speed(iter/s)": 0.003349 |
|
}, |
|
{ |
|
"epoch": 0.4570519852459379, |
|
"grad_norm": 1.0390625, |
|
"learning_rate": 5.679556630723592e-06, |
|
"loss": 0.05334362, |
|
"memory(GiB)": 75.3, |
|
"step": 485, |
|
"train_speed(iter/s)": 0.003349 |
|
}, |
|
{ |
|
"epoch": 0.4617638613824939, |
|
"grad_norm": 0.9765625, |
|
"learning_rate": 5.606078057117136e-06, |
|
"loss": 0.06019425, |
|
"memory(GiB)": 75.3, |
|
"step": 490, |
|
"train_speed(iter/s)": 0.003349 |
|
}, |
|
{ |
|
"epoch": 0.46647573751904997, |
|
"grad_norm": 0.95703125, |
|
"learning_rate": 5.532466392526439e-06, |
|
"loss": 0.05597678, |
|
"memory(GiB)": 75.3, |
|
"step": 495, |
|
"train_speed(iter/s)": 0.00335 |
|
}, |
|
{ |
|
"epoch": 0.471187613655606, |
|
"grad_norm": 0.86328125, |
|
"learning_rate": 5.458737801616721e-06, |
|
"loss": 0.05094014, |
|
"memory(GiB)": 75.3, |
|
"step": 500, |
|
"train_speed(iter/s)": 0.003349 |
|
}, |
|
{ |
|
"epoch": 0.47589948979216207, |
|
"grad_norm": 0.875, |
|
"learning_rate": 5.384908474729501e-06, |
|
"loss": 0.0548723, |
|
"memory(GiB)": 75.3, |
|
"step": 505, |
|
"train_speed(iter/s)": 0.003348 |
|
}, |
|
{ |
|
"epoch": 0.48061136592871817, |
|
"grad_norm": 0.8984375, |
|
"learning_rate": 5.310994624327292e-06, |
|
"loss": 0.05574841, |
|
"memory(GiB)": 75.3, |
|
"step": 510, |
|
"train_speed(iter/s)": 0.003348 |
|
}, |
|
{ |
|
"epoch": 0.4853232420652742, |
|
"grad_norm": 0.8671875, |
|
"learning_rate": 5.23701248143345e-06, |
|
"loss": 0.05651059, |
|
"memory(GiB)": 75.3, |
|
"step": 515, |
|
"train_speed(iter/s)": 0.003348 |
|
}, |
|
{ |
|
"epoch": 0.49003511820183027, |
|
"grad_norm": 0.921875, |
|
"learning_rate": 5.162978292067933e-06, |
|
"loss": 0.05878415, |
|
"memory(GiB)": 75.3, |
|
"step": 520, |
|
"train_speed(iter/s)": 0.003348 |
|
}, |
|
{ |
|
"epoch": 0.4947469943383863, |
|
"grad_norm": 1.0234375, |
|
"learning_rate": 5.088908313679788e-06, |
|
"loss": 0.05620171, |
|
"memory(GiB)": 75.3, |
|
"step": 525, |
|
"train_speed(iter/s)": 0.003348 |
|
}, |
|
{ |
|
"epoch": 0.49945887047494236, |
|
"grad_norm": 0.86328125, |
|
"learning_rate": 5.014818811577104e-06, |
|
"loss": 0.05407885, |
|
"memory(GiB)": 75.3, |
|
"step": 530, |
|
"train_speed(iter/s)": 0.003348 |
|
}, |
|
{ |
|
"epoch": 0.5041707466114984, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 4.940726055355259e-06, |
|
"loss": 0.05323058, |
|
"memory(GiB)": 75.3, |
|
"step": 535, |
|
"train_speed(iter/s)": 0.003349 |
|
}, |
|
{ |
|
"epoch": 0.5088826227480545, |
|
"grad_norm": 0.83984375, |
|
"learning_rate": 4.866646315324217e-06, |
|
"loss": 0.05346375, |
|
"memory(GiB)": 75.3, |
|
"step": 540, |
|
"train_speed(iter/s)": 0.003349 |
|
}, |
|
{ |
|
"epoch": 0.5135944988846106, |
|
"grad_norm": 0.828125, |
|
"learning_rate": 4.792595858935668e-06, |
|
"loss": 0.05774211, |
|
"memory(GiB)": 75.3, |
|
"step": 545, |
|
"train_speed(iter/s)": 0.003349 |
|
}, |
|
{ |
|
"epoch": 0.5183063750211666, |
|
"grad_norm": 0.9921875, |
|
"learning_rate": 4.718590947210788e-06, |
|
"loss": 0.05547717, |
|
"memory(GiB)": 75.3, |
|
"step": 550, |
|
"train_speed(iter/s)": 0.003349 |
|
}, |
|
{ |
|
"epoch": 0.5230182511577227, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 4.644647831169435e-06, |
|
"loss": 0.05536319, |
|
"memory(GiB)": 75.3, |
|
"step": 555, |
|
"train_speed(iter/s)": 0.003349 |
|
}, |
|
{ |
|
"epoch": 0.5277301272942787, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 4.570782748261516e-06, |
|
"loss": 0.05369086, |
|
"memory(GiB)": 75.3, |
|
"step": 560, |
|
"train_speed(iter/s)": 0.003349 |
|
}, |
|
{ |
|
"epoch": 0.5324420034308348, |
|
"grad_norm": 0.94140625, |
|
"learning_rate": 4.497011918801347e-06, |
|
"loss": 0.05471834, |
|
"memory(GiB)": 75.3, |
|
"step": 565, |
|
"train_speed(iter/s)": 0.00335 |
|
}, |
|
{ |
|
"epoch": 0.5371538795673909, |
|
"grad_norm": 0.9140625, |
|
"learning_rate": 4.423351542405764e-06, |
|
"loss": 0.05114409, |
|
"memory(GiB)": 75.3, |
|
"step": 570, |
|
"train_speed(iter/s)": 0.00335 |
|
}, |
|
{ |
|
"epoch": 0.5418657557039469, |
|
"grad_norm": 0.9765625, |
|
"learning_rate": 4.349817794436805e-06, |
|
"loss": 0.05673685, |
|
"memory(GiB)": 75.3, |
|
"step": 575, |
|
"train_speed(iter/s)": 0.00335 |
|
}, |
|
{ |
|
"epoch": 0.546577631840503, |
|
"grad_norm": 0.88671875, |
|
"learning_rate": 4.276426822449682e-06, |
|
"loss": 0.05527523, |
|
"memory(GiB)": 75.3, |
|
"step": 580, |
|
"train_speed(iter/s)": 0.00335 |
|
}, |
|
{ |
|
"epoch": 0.551289507977059, |
|
"grad_norm": 0.90625, |
|
"learning_rate": 4.203194742646893e-06, |
|
"loss": 0.05317973, |
|
"memory(GiB)": 75.3, |
|
"step": 585, |
|
"train_speed(iter/s)": 0.00335 |
|
}, |
|
{ |
|
"epoch": 0.5560013841136151, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 4.130137636339191e-06, |
|
"loss": 0.05449303, |
|
"memory(GiB)": 75.3, |
|
"step": 590, |
|
"train_speed(iter/s)": 0.00335 |
|
}, |
|
{ |
|
"epoch": 0.5607132602501712, |
|
"grad_norm": 0.89453125, |
|
"learning_rate": 4.057271546414242e-06, |
|
"loss": 0.05341119, |
|
"memory(GiB)": 75.3, |
|
"step": 595, |
|
"train_speed(iter/s)": 0.00335 |
|
}, |
|
{ |
|
"epoch": 0.5654251363867272, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 3.984612473813689e-06, |
|
"loss": 0.05254069, |
|
"memory(GiB)": 75.3, |
|
"step": 600, |
|
"train_speed(iter/s)": 0.00335 |
|
}, |
|
{ |
|
"epoch": 0.5701370125232833, |
|
"grad_norm": 0.8984375, |
|
"learning_rate": 3.912176374019462e-06, |
|
"loss": 0.05324795, |
|
"memory(GiB)": 75.3, |
|
"step": 605, |
|
"train_speed(iter/s)": 0.00335 |
|
}, |
|
{ |
|
"epoch": 0.5748488886598394, |
|
"grad_norm": 0.8671875, |
|
"learning_rate": 3.839979153550039e-06, |
|
"loss": 0.05177047, |
|
"memory(GiB)": 75.3, |
|
"step": 610, |
|
"train_speed(iter/s)": 0.00335 |
|
}, |
|
{ |
|
"epoch": 0.5795607647963954, |
|
"grad_norm": 0.82421875, |
|
"learning_rate": 3.768036666467486e-06, |
|
"loss": 0.05265539, |
|
"memory(GiB)": 75.3, |
|
"step": 615, |
|
"train_speed(iter/s)": 0.00335 |
|
}, |
|
{ |
|
"epoch": 0.5842726409329515, |
|
"grad_norm": 0.88671875, |
|
"learning_rate": 3.6963647108959868e-06, |
|
"loss": 0.05418316, |
|
"memory(GiB)": 75.3, |
|
"step": 620, |
|
"train_speed(iter/s)": 0.00335 |
|
}, |
|
{ |
|
"epoch": 0.5889845170695075, |
|
"grad_norm": 0.93359375, |
|
"learning_rate": 3.6249790255526916e-06, |
|
"loss": 0.05562772, |
|
"memory(GiB)": 75.3, |
|
"step": 625, |
|
"train_speed(iter/s)": 0.00335 |
|
}, |
|
{ |
|
"epoch": 0.5936963932060636, |
|
"grad_norm": 0.90234375, |
|
"learning_rate": 3.553895286291577e-06, |
|
"loss": 0.05445199, |
|
"memory(GiB)": 75.3, |
|
"step": 630, |
|
"train_speed(iter/s)": 0.00335 |
|
}, |
|
{ |
|
"epoch": 0.5984082693426197, |
|
"grad_norm": 0.90234375, |
|
"learning_rate": 3.483129102661137e-06, |
|
"loss": 0.05333483, |
|
"memory(GiB)": 75.3, |
|
"step": 635, |
|
"train_speed(iter/s)": 0.00335 |
|
}, |
|
{ |
|
"epoch": 0.6031201454791757, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 3.4126960144766107e-06, |
|
"loss": 0.05417204, |
|
"memory(GiB)": 75.3, |
|
"step": 640, |
|
"train_speed(iter/s)": 0.00335 |
|
}, |
|
{ |
|
"epoch": 0.6078320216157318, |
|
"grad_norm": 0.91015625, |
|
"learning_rate": 3.3426114884075488e-06, |
|
"loss": 0.05412987, |
|
"memory(GiB)": 75.3, |
|
"step": 645, |
|
"train_speed(iter/s)": 0.00335 |
|
}, |
|
{ |
|
"epoch": 0.6125438977522878, |
|
"grad_norm": 0.87109375, |
|
"learning_rate": 3.272890914581417e-06, |
|
"loss": 0.05388454, |
|
"memory(GiB)": 75.3, |
|
"step": 650, |
|
"train_speed(iter/s)": 0.003349 |
|
}, |
|
{ |
|
"epoch": 0.6172557738888439, |
|
"grad_norm": 0.85546875, |
|
"learning_rate": 3.2035496032040303e-06, |
|
"loss": 0.05097753, |
|
"memory(GiB)": 75.3, |
|
"step": 655, |
|
"train_speed(iter/s)": 0.003349 |
|
}, |
|
{ |
|
"epoch": 0.6219676500254, |
|
"grad_norm": 0.875, |
|
"learning_rate": 3.134602781197515e-06, |
|
"loss": 0.05341196, |
|
"memory(GiB)": 75.3, |
|
"step": 660, |
|
"train_speed(iter/s)": 0.00335 |
|
}, |
|
{ |
|
"epoch": 0.626679526161956, |
|
"grad_norm": 0.90625, |
|
"learning_rate": 3.0660655888565827e-06, |
|
"loss": 0.05016219, |
|
"memory(GiB)": 75.3, |
|
"step": 665, |
|
"train_speed(iter/s)": 0.00335 |
|
}, |
|
{ |
|
"epoch": 0.6313914022985121, |
|
"grad_norm": 0.95703125, |
|
"learning_rate": 2.997953076523803e-06, |
|
"loss": 0.05216441, |
|
"memory(GiB)": 75.3, |
|
"step": 670, |
|
"train_speed(iter/s)": 0.00335 |
|
}, |
|
{ |
|
"epoch": 0.6361032784350681, |
|
"grad_norm": 1.015625, |
|
"learning_rate": 2.930280201284654e-06, |
|
"loss": 0.05449665, |
|
"memory(GiB)": 75.3, |
|
"step": 675, |
|
"train_speed(iter/s)": 0.00335 |
|
}, |
|
{ |
|
"epoch": 0.6408151545716242, |
|
"grad_norm": 0.921875, |
|
"learning_rate": 2.863061823683032e-06, |
|
"loss": 0.05129569, |
|
"memory(GiB)": 75.3, |
|
"step": 680, |
|
"train_speed(iter/s)": 0.00335 |
|
}, |
|
{ |
|
"epoch": 0.6455270307081803, |
|
"grad_norm": 0.87890625, |
|
"learning_rate": 2.7963127044579697e-06, |
|
"loss": 0.05290835, |
|
"memory(GiB)": 75.3, |
|
"step": 685, |
|
"train_speed(iter/s)": 0.00335 |
|
}, |
|
{ |
|
"epoch": 0.6502389068447363, |
|
"grad_norm": 0.87109375, |
|
"learning_rate": 2.7300475013022666e-06, |
|
"loss": 0.0528672, |
|
"memory(GiB)": 75.3, |
|
"step": 690, |
|
"train_speed(iter/s)": 0.00335 |
|
}, |
|
{ |
|
"epoch": 0.6549507829812924, |
|
"grad_norm": 0.984375, |
|
"learning_rate": 2.6642807656437565e-06, |
|
"loss": 0.05229232, |
|
"memory(GiB)": 75.3, |
|
"step": 695, |
|
"train_speed(iter/s)": 0.00335 |
|
}, |
|
{ |
|
"epoch": 0.6596626591178485, |
|
"grad_norm": 0.9609375, |
|
"learning_rate": 2.599026939449899e-06, |
|
"loss": 0.05371115, |
|
"memory(GiB)": 75.3, |
|
"step": 700, |
|
"train_speed(iter/s)": 0.00335 |
|
}, |
|
{ |
|
"epoch": 0.6643745352544045, |
|
"grad_norm": 1.0703125, |
|
"learning_rate": 2.534300352056416e-06, |
|
"loss": 0.05234203, |
|
"memory(GiB)": 75.3, |
|
"step": 705, |
|
"train_speed(iter/s)": 0.00335 |
|
}, |
|
{ |
|
"epoch": 0.6690864113909606, |
|
"grad_norm": 0.98828125, |
|
"learning_rate": 2.470115217020654e-06, |
|
"loss": 0.05360326, |
|
"memory(GiB)": 75.3, |
|
"step": 710, |
|
"train_speed(iter/s)": 0.00335 |
|
}, |
|
{ |
|
"epoch": 0.6737982875275166, |
|
"grad_norm": 0.92578125, |
|
"learning_rate": 2.4064856290003863e-06, |
|
"loss": 0.05475932, |
|
"memory(GiB)": 75.3, |
|
"step": 715, |
|
"train_speed(iter/s)": 0.00335 |
|
}, |
|
{ |
|
"epoch": 0.6785101636640727, |
|
"grad_norm": 1.0703125, |
|
"learning_rate": 2.3434255606586925e-06, |
|
"loss": 0.05548735, |
|
"memory(GiB)": 75.3, |
|
"step": 720, |
|
"train_speed(iter/s)": 0.00335 |
|
}, |
|
{ |
|
"epoch": 0.6832220398006288, |
|
"grad_norm": 0.89453125, |
|
"learning_rate": 2.2809488595956746e-06, |
|
"loss": 0.05201564, |
|
"memory(GiB)": 75.3, |
|
"step": 725, |
|
"train_speed(iter/s)": 0.00335 |
|
}, |
|
{ |
|
"epoch": 0.6879339159371848, |
|
"grad_norm": 0.9140625, |
|
"learning_rate": 2.219069245307589e-06, |
|
"loss": 0.05408272, |
|
"memory(GiB)": 75.3, |
|
"step": 730, |
|
"train_speed(iter/s)": 0.003351 |
|
}, |
|
{ |
|
"epoch": 0.6926457920737409, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 2.157800306174139e-06, |
|
"loss": 0.05537663, |
|
"memory(GiB)": 75.3, |
|
"step": 735, |
|
"train_speed(iter/s)": 0.003351 |
|
}, |
|
{ |
|
"epoch": 0.6973576682102969, |
|
"grad_norm": 1.125, |
|
"learning_rate": 2.0971554964745476e-06, |
|
"loss": 0.05455139, |
|
"memory(GiB)": 75.3, |
|
"step": 740, |
|
"train_speed(iter/s)": 0.003351 |
|
}, |
|
{ |
|
"epoch": 0.702069544346853, |
|
"grad_norm": 0.87890625, |
|
"learning_rate": 2.0371481334330913e-06, |
|
"loss": 0.05394316, |
|
"memory(GiB)": 75.3, |
|
"step": 745, |
|
"train_speed(iter/s)": 0.003351 |
|
}, |
|
{ |
|
"epoch": 0.7067814204834091, |
|
"grad_norm": 0.828125, |
|
"learning_rate": 1.9777913942946987e-06, |
|
"loss": 0.05269849, |
|
"memory(GiB)": 75.3, |
|
"step": 750, |
|
"train_speed(iter/s)": 0.003351 |
|
}, |
|
{ |
|
"epoch": 0.7114932966199651, |
|
"grad_norm": 0.81640625, |
|
"learning_rate": 1.919098313431335e-06, |
|
"loss": 0.05057405, |
|
"memory(GiB)": 75.3, |
|
"step": 755, |
|
"train_speed(iter/s)": 0.003351 |
|
}, |
|
{ |
|
"epoch": 0.7162051727565212, |
|
"grad_norm": 0.9375, |
|
"learning_rate": 1.8610817794797164e-06, |
|
"loss": 0.05438253, |
|
"memory(GiB)": 75.3, |
|
"step": 760, |
|
"train_speed(iter/s)": 0.003351 |
|
}, |
|
{ |
|
"epoch": 0.7209170488930772, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 1.8037545325110506e-06, |
|
"loss": 0.05222658, |
|
"memory(GiB)": 75.3, |
|
"step": 765, |
|
"train_speed(iter/s)": 0.003351 |
|
}, |
|
{ |
|
"epoch": 0.7256289250296333, |
|
"grad_norm": 0.88671875, |
|
"learning_rate": 1.7471291612333997e-06, |
|
"loss": 0.05131737, |
|
"memory(GiB)": 75.3, |
|
"step": 770, |
|
"train_speed(iter/s)": 0.003351 |
|
}, |
|
{ |
|
"epoch": 0.7303408011661894, |
|
"grad_norm": 0.93359375, |
|
"learning_rate": 1.6912181002272714e-06, |
|
"loss": 0.05391481, |
|
"memory(GiB)": 75.3, |
|
"step": 775, |
|
"train_speed(iter/s)": 0.003351 |
|
}, |
|
{ |
|
"epoch": 0.7350526773027454, |
|
"grad_norm": 0.8984375, |
|
"learning_rate": 1.6360336272150684e-06, |
|
"loss": 0.05078862, |
|
"memory(GiB)": 75.3, |
|
"step": 780, |
|
"train_speed(iter/s)": 0.003351 |
|
}, |
|
{ |
|
"epoch": 0.7397645534393015, |
|
"grad_norm": 0.953125, |
|
"learning_rate": 1.581587860364977e-06, |
|
"loss": 0.05192038, |
|
"memory(GiB)": 75.3, |
|
"step": 785, |
|
"train_speed(iter/s)": 0.003351 |
|
}, |
|
{ |
|
"epoch": 0.7444764295758576, |
|
"grad_norm": 0.96875, |
|
"learning_rate": 1.52789275562988e-06, |
|
"loss": 0.05364103, |
|
"memory(GiB)": 75.3, |
|
"step": 790, |
|
"train_speed(iter/s)": 0.003351 |
|
}, |
|
{ |
|
"epoch": 0.7491883057124136, |
|
"grad_norm": 1.0625, |
|
"learning_rate": 1.4749601041219246e-06, |
|
"loss": 0.0536845, |
|
"memory(GiB)": 75.3, |
|
"step": 795, |
|
"train_speed(iter/s)": 0.003351 |
|
}, |
|
{ |
|
"epoch": 0.7539001818489697, |
|
"grad_norm": 0.90234375, |
|
"learning_rate": 1.4228015295232484e-06, |
|
"loss": 0.05084696, |
|
"memory(GiB)": 75.3, |
|
"step": 800, |
|
"train_speed(iter/s)": 0.003351 |
|
}, |
|
{ |
|
"epoch": 0.7586120579855257, |
|
"grad_norm": 0.875, |
|
"learning_rate": 1.371428485533498e-06, |
|
"loss": 0.05773014, |
|
"memory(GiB)": 75.3, |
|
"step": 805, |
|
"train_speed(iter/s)": 0.00335 |
|
}, |
|
{ |
|
"epoch": 0.7633239341220818, |
|
"grad_norm": 0.91796875, |
|
"learning_rate": 1.3208522533546748e-06, |
|
"loss": 0.05219783, |
|
"memory(GiB)": 75.3, |
|
"step": 810, |
|
"train_speed(iter/s)": 0.00335 |
|
}, |
|
{ |
|
"epoch": 0.7680358102586379, |
|
"grad_norm": 0.96875, |
|
"learning_rate": 1.2710839392138386e-06, |
|
"loss": 0.05375321, |
|
"memory(GiB)": 75.3, |
|
"step": 815, |
|
"train_speed(iter/s)": 0.00335 |
|
}, |
|
{ |
|
"epoch": 0.7727476863951939, |
|
"grad_norm": 0.87890625, |
|
"learning_rate": 1.222134471924259e-06, |
|
"loss": 0.05204231, |
|
"memory(GiB)": 75.3, |
|
"step": 820, |
|
"train_speed(iter/s)": 0.00335 |
|
}, |
|
{ |
|
"epoch": 0.77745956253175, |
|
"grad_norm": 0.91796875, |
|
"learning_rate": 1.1740146004855141e-06, |
|
"loss": 0.0559127, |
|
"memory(GiB)": 75.3, |
|
"step": 825, |
|
"train_speed(iter/s)": 0.003351 |
|
}, |
|
{ |
|
"epoch": 0.782171438668306, |
|
"grad_norm": 0.89453125, |
|
"learning_rate": 1.1267348917230737e-06, |
|
"loss": 0.05298336, |
|
"memory(GiB)": 75.3, |
|
"step": 830, |
|
"train_speed(iter/s)": 0.00335 |
|
}, |
|
{ |
|
"epoch": 0.7868833148048621, |
|
"grad_norm": 0.90625, |
|
"learning_rate": 1.080305727967893e-06, |
|
"loss": 0.05347639, |
|
"memory(GiB)": 75.3, |
|
"step": 835, |
|
"train_speed(iter/s)": 0.00335 |
|
}, |
|
{ |
|
"epoch": 0.7915951909414182, |
|
"grad_norm": 0.84765625, |
|
"learning_rate": 1.0347373047765202e-06, |
|
"loss": 0.05329442, |
|
"memory(GiB)": 75.3, |
|
"step": 840, |
|
"train_speed(iter/s)": 0.00335 |
|
}, |
|
{ |
|
"epoch": 0.7963070670779742, |
|
"grad_norm": 0.8359375, |
|
"learning_rate": 9.900396286922025e-07, |
|
"loss": 0.0537856, |
|
"memory(GiB)": 75.3, |
|
"step": 845, |
|
"train_speed(iter/s)": 0.00335 |
|
}, |
|
{ |
|
"epoch": 0.8010189432145303, |
|
"grad_norm": 0.8125, |
|
"learning_rate": 9.462225150475296e-07, |
|
"loss": 0.05233877, |
|
"memory(GiB)": 75.3, |
|
"step": 850, |
|
"train_speed(iter/s)": 0.00335 |
|
}, |
|
{ |
|
"epoch": 0.8057308193510863, |
|
"grad_norm": 0.88671875, |
|
"learning_rate": 9.032955858090319e-07, |
|
"loss": 0.0549244, |
|
"memory(GiB)": 75.3, |
|
"step": 855, |
|
"train_speed(iter/s)": 0.003351 |
|
}, |
|
{ |
|
"epoch": 0.8104426954876424, |
|
"grad_norm": 0.9140625, |
|
"learning_rate": 8.612682674642647e-07, |
|
"loss": 0.04935811, |
|
"memory(GiB)": 75.3, |
|
"step": 860, |
|
"train_speed(iter/s)": 0.003351 |
|
}, |
|
{ |
|
"epoch": 0.8151545716241985, |
|
"grad_norm": 0.921875, |
|
"learning_rate": 8.201497889518073e-07, |
|
"loss": 0.05281691, |
|
"memory(GiB)": 75.3, |
|
"step": 865, |
|
"train_speed(iter/s)": 0.003351 |
|
}, |
|
{ |
|
"epoch": 0.8198664477607545, |
|
"grad_norm": 0.90625, |
|
"learning_rate": 7.799491796346487e-07, |
|
"loss": 0.05795277, |
|
"memory(GiB)": 75.3, |
|
"step": 870, |
|
"train_speed(iter/s)": 0.003351 |
|
}, |
|
{ |
|
"epoch": 0.8245783238973106, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 7.406752673173851e-07, |
|
"loss": 0.05225162, |
|
"memory(GiB)": 75.3, |
|
"step": 875, |
|
"train_speed(iter/s)": 0.003351 |
|
}, |
|
{ |
|
"epoch": 0.8292902000338667, |
|
"grad_norm": 0.87890625, |
|
"learning_rate": 7.023366763077044e-07, |
|
"loss": 0.0509973, |
|
"memory(GiB)": 75.3, |
|
"step": 880, |
|
"train_speed(iter/s)": 0.003351 |
|
}, |
|
{ |
|
"epoch": 0.8340020761704227, |
|
"grad_norm": 0.87109375, |
|
"learning_rate": 6.649418255225298e-07, |
|
"loss": 0.05142277, |
|
"memory(GiB)": 75.3, |
|
"step": 885, |
|
"train_speed(iter/s)": 0.003351 |
|
}, |
|
{ |
|
"epoch": 0.8387139523069788, |
|
"grad_norm": 0.95703125, |
|
"learning_rate": 6.284989266392805e-07, |
|
"loss": 0.05023923, |
|
"memory(GiB)": 75.3, |
|
"step": 890, |
|
"train_speed(iter/s)": 0.003351 |
|
}, |
|
{ |
|
"epoch": 0.8434258284435348, |
|
"grad_norm": 0.8828125, |
|
"learning_rate": 5.930159822926407e-07, |
|
"loss": 0.0534648, |
|
"memory(GiB)": 75.3, |
|
"step": 895, |
|
"train_speed(iter/s)": 0.003351 |
|
}, |
|
{ |
|
"epoch": 0.8481377045800909, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 5.585007843172286e-07, |
|
"loss": 0.05155768, |
|
"memory(GiB)": 75.3, |
|
"step": 900, |
|
"train_speed(iter/s)": 0.003351 |
|
}, |
|
{ |
|
"epoch": 0.852849580716647, |
|
"grad_norm": 0.9453125, |
|
"learning_rate": 5.249609120365579e-07, |
|
"loss": 0.05368913, |
|
"memory(GiB)": 75.3, |
|
"step": 905, |
|
"train_speed(iter/s)": 0.003351 |
|
}, |
|
{ |
|
"epoch": 0.857561456853203, |
|
"grad_norm": 0.859375, |
|
"learning_rate": 4.924037305986696e-07, |
|
"loss": 0.05452033, |
|
"memory(GiB)": 75.3, |
|
"step": 910, |
|
"train_speed(iter/s)": 0.003351 |
|
}, |
|
{ |
|
"epoch": 0.8622733329897591, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 4.6083638935878025e-07, |
|
"loss": 0.05384221, |
|
"memory(GiB)": 75.3, |
|
"step": 915, |
|
"train_speed(iter/s)": 0.003351 |
|
}, |
|
{ |
|
"epoch": 0.866985209126315, |
|
"grad_norm": 0.828125, |
|
"learning_rate": 4.302658203093418e-07, |
|
"loss": 0.05272598, |
|
"memory(GiB)": 75.3, |
|
"step": 920, |
|
"train_speed(iter/s)": 0.003351 |
|
}, |
|
{ |
|
"epoch": 0.8716970852628712, |
|
"grad_norm": 0.8671875, |
|
"learning_rate": 4.00698736557808e-07, |
|
"loss": 0.05447989, |
|
"memory(GiB)": 75.3, |
|
"step": 925, |
|
"train_speed(iter/s)": 0.003351 |
|
}, |
|
{ |
|
"epoch": 0.8764089613994273, |
|
"grad_norm": 0.9453125, |
|
"learning_rate": 3.721416308524839e-07, |
|
"loss": 0.05123619, |
|
"memory(GiB)": 75.3, |
|
"step": 930, |
|
"train_speed(iter/s)": 0.003351 |
|
}, |
|
{ |
|
"epoch": 0.8811208375359832, |
|
"grad_norm": 0.8515625, |
|
"learning_rate": 3.4460077415675473e-07, |
|
"loss": 0.05347574, |
|
"memory(GiB)": 75.3, |
|
"step": 935, |
|
"train_speed(iter/s)": 0.003351 |
|
}, |
|
{ |
|
"epoch": 0.8858327136725394, |
|
"grad_norm": 0.7890625, |
|
"learning_rate": 3.1808221427202636e-07, |
|
"loss": 0.05334803, |
|
"memory(GiB)": 75.3, |
|
"step": 940, |
|
"train_speed(iter/s)": 0.003351 |
|
}, |
|
{ |
|
"epoch": 0.8905445898090953, |
|
"grad_norm": 0.94921875, |
|
"learning_rate": 2.925917745096568e-07, |
|
"loss": 0.05249671, |
|
"memory(GiB)": 75.3, |
|
"step": 945, |
|
"train_speed(iter/s)": 0.003351 |
|
}, |
|
{ |
|
"epoch": 0.8952564659456514, |
|
"grad_norm": 0.91015625, |
|
"learning_rate": 2.681350524122045e-07, |
|
"loss": 0.05494893, |
|
"memory(GiB)": 75.3, |
|
"step": 950, |
|
"train_speed(iter/s)": 0.003351 |
|
}, |
|
{ |
|
"epoch": 0.8999683420822076, |
|
"grad_norm": 0.828125, |
|
"learning_rate": 2.447174185242324e-07, |
|
"loss": 0.05149726, |
|
"memory(GiB)": 75.3, |
|
"step": 955, |
|
"train_speed(iter/s)": 0.003351 |
|
}, |
|
{ |
|
"epoch": 0.9046802182187635, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 2.2234401521297576e-07, |
|
"loss": 0.05425293, |
|
"memory(GiB)": 75.3, |
|
"step": 960, |
|
"train_speed(iter/s)": 0.003351 |
|
}, |
|
{ |
|
"epoch": 0.9093920943553196, |
|
"grad_norm": 0.87109375, |
|
"learning_rate": 2.01019755539108e-07, |
|
"loss": 0.0552171, |
|
"memory(GiB)": 75.3, |
|
"step": 965, |
|
"train_speed(iter/s)": 0.003351 |
|
}, |
|
{ |
|
"epoch": 0.9141039704918758, |
|
"grad_norm": 0.84375, |
|
"learning_rate": 1.8074932217786445e-07, |
|
"loss": 0.05237709, |
|
"memory(GiB)": 75.3, |
|
"step": 970, |
|
"train_speed(iter/s)": 0.003351 |
|
}, |
|
{ |
|
"epoch": 0.9188158466284317, |
|
"grad_norm": 0.86328125, |
|
"learning_rate": 1.6153716639075223e-07, |
|
"loss": 0.05221198, |
|
"memory(GiB)": 75.3, |
|
"step": 975, |
|
"train_speed(iter/s)": 0.003351 |
|
}, |
|
{ |
|
"epoch": 0.9235277227649878, |
|
"grad_norm": 0.8125, |
|
"learning_rate": 1.433875070480878e-07, |
|
"loss": 0.05134506, |
|
"memory(GiB)": 75.3, |
|
"step": 980, |
|
"train_speed(iter/s)": 0.003352 |
|
}, |
|
{ |
|
"epoch": 0.9282395989015438, |
|
"grad_norm": 0.890625, |
|
"learning_rate": 1.2630432970255014e-07, |
|
"loss": 0.05436495, |
|
"memory(GiB)": 75.3, |
|
"step": 985, |
|
"train_speed(iter/s)": 0.003352 |
|
}, |
|
{ |
|
"epoch": 0.9329514750380999, |
|
"grad_norm": 0.921875, |
|
"learning_rate": 1.1029138571398645e-07, |
|
"loss": 0.05440986, |
|
"memory(GiB)": 75.3, |
|
"step": 990, |
|
"train_speed(iter/s)": 0.003352 |
|
}, |
|
{ |
|
"epoch": 0.937663351174656, |
|
"grad_norm": 0.91796875, |
|
"learning_rate": 9.535219142563168e-08, |
|
"loss": 0.05418127, |
|
"memory(GiB)": 75.3, |
|
"step": 995, |
|
"train_speed(iter/s)": 0.003352 |
|
}, |
|
{ |
|
"epoch": 0.942375227311212, |
|
"grad_norm": 0.8984375, |
|
"learning_rate": 8.149002739194222e-08, |
|
"loss": 0.05519557, |
|
"memory(GiB)": 75.3, |
|
"step": 1000, |
|
"train_speed(iter/s)": 0.003352 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 1061, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 5.440049406181114e+18, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|