|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 100, |
|
"global_step": 1896, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.005274261603375527, |
|
"grad_norm": 43.75, |
|
"learning_rate": 1.4062816455696203e-05, |
|
"loss": 0.8954, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.010548523206751054, |
|
"grad_norm": 29.0, |
|
"learning_rate": 1.4025632911392405e-05, |
|
"loss": 0.7418, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.015822784810126583, |
|
"grad_norm": 27.5, |
|
"learning_rate": 1.3988449367088608e-05, |
|
"loss": 0.793, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.02109704641350211, |
|
"grad_norm": 31.25, |
|
"learning_rate": 1.395126582278481e-05, |
|
"loss": 0.6991, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.026371308016877638, |
|
"grad_norm": 26.625, |
|
"learning_rate": 1.3914082278481013e-05, |
|
"loss": 0.7284, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.03164556962025317, |
|
"grad_norm": 32.75, |
|
"learning_rate": 1.3876898734177215e-05, |
|
"loss": 0.7174, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.03691983122362869, |
|
"grad_norm": 25.25, |
|
"learning_rate": 1.3839715189873418e-05, |
|
"loss": 0.7091, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.04219409282700422, |
|
"grad_norm": 21.375, |
|
"learning_rate": 1.3802531645569622e-05, |
|
"loss": 0.6764, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.04746835443037975, |
|
"grad_norm": 23.75, |
|
"learning_rate": 1.3765348101265823e-05, |
|
"loss": 0.6988, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.052742616033755275, |
|
"grad_norm": 21.625, |
|
"learning_rate": 1.3728164556962027e-05, |
|
"loss": 0.6627, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.052742616033755275, |
|
"eval_accuracy": 0.6675191815856778, |
|
"eval_loss": 0.6305665969848633, |
|
"eval_runtime": 31.833, |
|
"eval_samples_per_second": 61.414, |
|
"eval_steps_per_second": 1.948, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.0580168776371308, |
|
"grad_norm": 22.125, |
|
"learning_rate": 1.3690981012658228e-05, |
|
"loss": 0.5756, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.06329113924050633, |
|
"grad_norm": 17.5, |
|
"learning_rate": 1.3653797468354432e-05, |
|
"loss": 0.5999, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.06856540084388185, |
|
"grad_norm": 24.125, |
|
"learning_rate": 1.3616613924050634e-05, |
|
"loss": 0.6658, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.07383966244725738, |
|
"grad_norm": 17.5, |
|
"learning_rate": 1.3579430379746835e-05, |
|
"loss": 0.5995, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.07911392405063292, |
|
"grad_norm": 18.0, |
|
"learning_rate": 1.354224683544304e-05, |
|
"loss": 0.5795, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.08438818565400844, |
|
"grad_norm": 14.75, |
|
"learning_rate": 1.350506329113924e-05, |
|
"loss": 0.5548, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.08966244725738397, |
|
"grad_norm": 16.125, |
|
"learning_rate": 1.3467879746835444e-05, |
|
"loss": 0.6347, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.0949367088607595, |
|
"grad_norm": 17.0, |
|
"learning_rate": 1.3430696202531645e-05, |
|
"loss": 0.5786, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.10021097046413502, |
|
"grad_norm": 16.625, |
|
"learning_rate": 1.3393512658227849e-05, |
|
"loss": 0.5929, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.10548523206751055, |
|
"grad_norm": 13.3125, |
|
"learning_rate": 1.3356329113924052e-05, |
|
"loss": 0.5604, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.10548523206751055, |
|
"eval_accuracy": 0.689002557544757, |
|
"eval_loss": 0.5953558087348938, |
|
"eval_runtime": 31.9307, |
|
"eval_samples_per_second": 61.226, |
|
"eval_steps_per_second": 1.942, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.11075949367088607, |
|
"grad_norm": 12.3125, |
|
"learning_rate": 1.3319145569620254e-05, |
|
"loss": 0.5708, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.1160337552742616, |
|
"grad_norm": 20.625, |
|
"learning_rate": 1.3281962025316456e-05, |
|
"loss": 0.6226, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.12130801687763713, |
|
"grad_norm": 10.625, |
|
"learning_rate": 1.3244778481012659e-05, |
|
"loss": 0.5384, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.12658227848101267, |
|
"grad_norm": 14.5625, |
|
"learning_rate": 1.3207594936708861e-05, |
|
"loss": 0.6306, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.13185654008438819, |
|
"grad_norm": 12.0, |
|
"learning_rate": 1.3170411392405064e-05, |
|
"loss": 0.556, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.1371308016877637, |
|
"grad_norm": 11.0625, |
|
"learning_rate": 1.3133227848101266e-05, |
|
"loss": 0.5659, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.14240506329113925, |
|
"grad_norm": 14.3125, |
|
"learning_rate": 1.3096044303797469e-05, |
|
"loss": 0.6133, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.14767932489451477, |
|
"grad_norm": 15.125, |
|
"learning_rate": 1.3058860759493671e-05, |
|
"loss": 0.59, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.1529535864978903, |
|
"grad_norm": 12.9375, |
|
"learning_rate": 1.3021677215189874e-05, |
|
"loss": 0.5685, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.15822784810126583, |
|
"grad_norm": 13.25, |
|
"learning_rate": 1.2984493670886076e-05, |
|
"loss": 0.5743, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.15822784810126583, |
|
"eval_accuracy": 0.6879795396419437, |
|
"eval_loss": 0.5773088932037354, |
|
"eval_runtime": 31.9222, |
|
"eval_samples_per_second": 61.243, |
|
"eval_steps_per_second": 1.942, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.16350210970464135, |
|
"grad_norm": 12.0, |
|
"learning_rate": 1.2947310126582279e-05, |
|
"loss": 0.5435, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.16877637130801687, |
|
"grad_norm": 12.1875, |
|
"learning_rate": 1.2910126582278483e-05, |
|
"loss": 0.5873, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.17405063291139242, |
|
"grad_norm": 13.125, |
|
"learning_rate": 1.2872943037974684e-05, |
|
"loss": 0.5687, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.17932489451476794, |
|
"grad_norm": 10.9375, |
|
"learning_rate": 1.2835759493670888e-05, |
|
"loss": 0.5496, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.18459915611814345, |
|
"grad_norm": 8.0625, |
|
"learning_rate": 1.2798575949367088e-05, |
|
"loss": 0.5872, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.189873417721519, |
|
"grad_norm": 8.6875, |
|
"learning_rate": 1.2761392405063293e-05, |
|
"loss": 0.557, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.19514767932489452, |
|
"grad_norm": 11.625, |
|
"learning_rate": 1.2724208860759493e-05, |
|
"loss": 0.5815, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.20042194092827004, |
|
"grad_norm": 10.5, |
|
"learning_rate": 1.2687025316455696e-05, |
|
"loss": 0.5662, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.20569620253164558, |
|
"grad_norm": 10.5, |
|
"learning_rate": 1.26498417721519e-05, |
|
"loss": 0.5674, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.2109704641350211, |
|
"grad_norm": 9.4375, |
|
"learning_rate": 1.26126582278481e-05, |
|
"loss": 0.573, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.2109704641350211, |
|
"eval_accuracy": 0.718158567774936, |
|
"eval_loss": 0.5407843589782715, |
|
"eval_runtime": 31.895, |
|
"eval_samples_per_second": 61.295, |
|
"eval_steps_per_second": 1.944, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.21624472573839662, |
|
"grad_norm": 8.375, |
|
"learning_rate": 1.2575474683544305e-05, |
|
"loss": 0.5875, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.22151898734177214, |
|
"grad_norm": 9.25, |
|
"learning_rate": 1.2538291139240506e-05, |
|
"loss": 0.5336, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.22679324894514769, |
|
"grad_norm": 9.875, |
|
"learning_rate": 1.250110759493671e-05, |
|
"loss": 0.509, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.2320675105485232, |
|
"grad_norm": 11.625, |
|
"learning_rate": 1.246392405063291e-05, |
|
"loss": 0.5688, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.23734177215189872, |
|
"grad_norm": 8.625, |
|
"learning_rate": 1.2426740506329115e-05, |
|
"loss": 0.5802, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.24261603375527427, |
|
"grad_norm": 8.75, |
|
"learning_rate": 1.2389556962025317e-05, |
|
"loss": 0.5117, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.2478902953586498, |
|
"grad_norm": 10.25, |
|
"learning_rate": 1.235237341772152e-05, |
|
"loss": 0.5687, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.25316455696202533, |
|
"grad_norm": 7.53125, |
|
"learning_rate": 1.2315189873417722e-05, |
|
"loss": 0.5465, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.25843881856540085, |
|
"grad_norm": 9.8125, |
|
"learning_rate": 1.2278006329113925e-05, |
|
"loss": 0.585, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.26371308016877637, |
|
"grad_norm": 9.1875, |
|
"learning_rate": 1.2240822784810127e-05, |
|
"loss": 0.5644, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.26371308016877637, |
|
"eval_accuracy": 0.7360613810741689, |
|
"eval_loss": 0.5284575819969177, |
|
"eval_runtime": 31.9441, |
|
"eval_samples_per_second": 61.201, |
|
"eval_steps_per_second": 1.941, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.2689873417721519, |
|
"grad_norm": 8.375, |
|
"learning_rate": 1.220363924050633e-05, |
|
"loss": 0.5357, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.2742616033755274, |
|
"grad_norm": 8.375, |
|
"learning_rate": 1.2166455696202532e-05, |
|
"loss": 0.5818, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.2795358649789029, |
|
"grad_norm": 9.8125, |
|
"learning_rate": 1.2129272151898735e-05, |
|
"loss": 0.5275, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.2848101265822785, |
|
"grad_norm": 9.0, |
|
"learning_rate": 1.2092088607594937e-05, |
|
"loss": 0.5201, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.290084388185654, |
|
"grad_norm": 9.75, |
|
"learning_rate": 1.205490506329114e-05, |
|
"loss": 0.5351, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.29535864978902954, |
|
"grad_norm": 10.625, |
|
"learning_rate": 1.2017721518987342e-05, |
|
"loss": 0.5406, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.30063291139240506, |
|
"grad_norm": 11.625, |
|
"learning_rate": 1.1980537974683544e-05, |
|
"loss": 0.5758, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.3059071729957806, |
|
"grad_norm": 9.4375, |
|
"learning_rate": 1.1943354430379749e-05, |
|
"loss": 0.5494, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.3111814345991561, |
|
"grad_norm": 7.4375, |
|
"learning_rate": 1.190617088607595e-05, |
|
"loss": 0.5516, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.31645569620253167, |
|
"grad_norm": 10.375, |
|
"learning_rate": 1.1868987341772153e-05, |
|
"loss": 0.5482, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.31645569620253167, |
|
"eval_accuracy": 0.7365728900255755, |
|
"eval_loss": 0.5250852704048157, |
|
"eval_runtime": 31.8662, |
|
"eval_samples_per_second": 61.35, |
|
"eval_steps_per_second": 1.946, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.3217299578059072, |
|
"grad_norm": 8.25, |
|
"learning_rate": 1.1831803797468354e-05, |
|
"loss": 0.5412, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.3270042194092827, |
|
"grad_norm": 8.25, |
|
"learning_rate": 1.1794620253164558e-05, |
|
"loss": 0.5614, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.3322784810126582, |
|
"grad_norm": 7.84375, |
|
"learning_rate": 1.175743670886076e-05, |
|
"loss": 0.5152, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.33755274261603374, |
|
"grad_norm": 8.3125, |
|
"learning_rate": 1.1720253164556962e-05, |
|
"loss": 0.5914, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.34282700421940926, |
|
"grad_norm": 7.71875, |
|
"learning_rate": 1.1683069620253166e-05, |
|
"loss": 0.5333, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.34810126582278483, |
|
"grad_norm": 7.5, |
|
"learning_rate": 1.1645886075949367e-05, |
|
"loss": 0.5196, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.35337552742616035, |
|
"grad_norm": 8.75, |
|
"learning_rate": 1.160870253164557e-05, |
|
"loss": 0.5901, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.35864978902953587, |
|
"grad_norm": 9.75, |
|
"learning_rate": 1.1571518987341771e-05, |
|
"loss": 0.5537, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.3639240506329114, |
|
"grad_norm": 10.4375, |
|
"learning_rate": 1.1534335443037976e-05, |
|
"loss": 0.5316, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.3691983122362869, |
|
"grad_norm": 8.375, |
|
"learning_rate": 1.1497151898734178e-05, |
|
"loss": 0.5673, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.3691983122362869, |
|
"eval_accuracy": 0.7278772378516624, |
|
"eval_loss": 0.5267060399055481, |
|
"eval_runtime": 31.9212, |
|
"eval_samples_per_second": 61.245, |
|
"eval_steps_per_second": 1.942, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.3744725738396624, |
|
"grad_norm": 9.4375, |
|
"learning_rate": 1.145996835443038e-05, |
|
"loss": 0.6081, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.379746835443038, |
|
"grad_norm": 8.5, |
|
"learning_rate": 1.1422784810126583e-05, |
|
"loss": 0.5328, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.3850210970464135, |
|
"grad_norm": 8.6875, |
|
"learning_rate": 1.1385601265822785e-05, |
|
"loss": 0.5353, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.39029535864978904, |
|
"grad_norm": 7.96875, |
|
"learning_rate": 1.1348417721518988e-05, |
|
"loss": 0.5502, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.39556962025316456, |
|
"grad_norm": 9.0625, |
|
"learning_rate": 1.1311234177215189e-05, |
|
"loss": 0.5072, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.4008438818565401, |
|
"grad_norm": 8.4375, |
|
"learning_rate": 1.1274050632911393e-05, |
|
"loss": 0.5366, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.4061181434599156, |
|
"grad_norm": 9.8125, |
|
"learning_rate": 1.1236867088607595e-05, |
|
"loss": 0.5221, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.41139240506329117, |
|
"grad_norm": 7.84375, |
|
"learning_rate": 1.1199683544303798e-05, |
|
"loss": 0.5226, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.4166666666666667, |
|
"grad_norm": 8.9375, |
|
"learning_rate": 1.11625e-05, |
|
"loss": 0.5562, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.4219409282700422, |
|
"grad_norm": 8.4375, |
|
"learning_rate": 1.1125316455696203e-05, |
|
"loss": 0.5701, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.4219409282700422, |
|
"eval_accuracy": 0.7452685421994885, |
|
"eval_loss": 0.5122529864311218, |
|
"eval_runtime": 31.8853, |
|
"eval_samples_per_second": 61.314, |
|
"eval_steps_per_second": 1.944, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.4272151898734177, |
|
"grad_norm": 8.875, |
|
"learning_rate": 1.1088132911392405e-05, |
|
"loss": 0.5108, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.43248945147679324, |
|
"grad_norm": 7.375, |
|
"learning_rate": 1.1050949367088608e-05, |
|
"loss": 0.5223, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.43776371308016876, |
|
"grad_norm": 9.0, |
|
"learning_rate": 1.101376582278481e-05, |
|
"loss": 0.5463, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.4430379746835443, |
|
"grad_norm": 7.09375, |
|
"learning_rate": 1.0976582278481014e-05, |
|
"loss": 0.5222, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.44831223628691985, |
|
"grad_norm": 7.5, |
|
"learning_rate": 1.0939398734177215e-05, |
|
"loss": 0.593, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.45358649789029537, |
|
"grad_norm": 10.9375, |
|
"learning_rate": 1.090221518987342e-05, |
|
"loss": 0.5828, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.4588607594936709, |
|
"grad_norm": 7.5625, |
|
"learning_rate": 1.086503164556962e-05, |
|
"loss": 0.5251, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.4641350210970464, |
|
"grad_norm": 9.0625, |
|
"learning_rate": 1.0827848101265822e-05, |
|
"loss": 0.5284, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.4694092827004219, |
|
"grad_norm": 7.25, |
|
"learning_rate": 1.0790664556962027e-05, |
|
"loss": 0.5502, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.47468354430379744, |
|
"grad_norm": 6.90625, |
|
"learning_rate": 1.0753481012658227e-05, |
|
"loss": 0.5199, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.47468354430379744, |
|
"eval_accuracy": 0.7375959079283887, |
|
"eval_loss": 0.514769971370697, |
|
"eval_runtime": 31.9299, |
|
"eval_samples_per_second": 61.228, |
|
"eval_steps_per_second": 1.942, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.479957805907173, |
|
"grad_norm": 8.0, |
|
"learning_rate": 1.0716297468354432e-05, |
|
"loss": 0.5431, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.48523206751054854, |
|
"grad_norm": 6.78125, |
|
"learning_rate": 1.0679113924050632e-05, |
|
"loss": 0.5744, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.49050632911392406, |
|
"grad_norm": 6.59375, |
|
"learning_rate": 1.0641930379746836e-05, |
|
"loss": 0.5749, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.4957805907172996, |
|
"grad_norm": 8.3125, |
|
"learning_rate": 1.0604746835443037e-05, |
|
"loss": 0.5595, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.5010548523206751, |
|
"grad_norm": 6.875, |
|
"learning_rate": 1.0567563291139241e-05, |
|
"loss": 0.5198, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.5063291139240507, |
|
"grad_norm": 10.5, |
|
"learning_rate": 1.0530379746835444e-05, |
|
"loss": 0.57, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.5116033755274262, |
|
"grad_norm": 7.03125, |
|
"learning_rate": 1.0493196202531646e-05, |
|
"loss": 0.5725, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.5168776371308017, |
|
"grad_norm": 8.125, |
|
"learning_rate": 1.0456012658227849e-05, |
|
"loss": 0.5162, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.5221518987341772, |
|
"grad_norm": 8.125, |
|
"learning_rate": 1.0418829113924051e-05, |
|
"loss": 0.4939, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.5274261603375527, |
|
"grad_norm": 7.5625, |
|
"learning_rate": 1.0381645569620254e-05, |
|
"loss": 0.5525, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.5274261603375527, |
|
"eval_accuracy": 0.7493606138107417, |
|
"eval_loss": 0.5132544040679932, |
|
"eval_runtime": 31.9116, |
|
"eval_samples_per_second": 61.263, |
|
"eval_steps_per_second": 1.943, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.5327004219409283, |
|
"grad_norm": 7.125, |
|
"learning_rate": 1.0344462025316456e-05, |
|
"loss": 0.5266, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.5379746835443038, |
|
"grad_norm": 5.90625, |
|
"learning_rate": 1.0307278481012659e-05, |
|
"loss": 0.5283, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.5432489451476793, |
|
"grad_norm": 7.0625, |
|
"learning_rate": 1.0270094936708861e-05, |
|
"loss": 0.5323, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.5485232067510548, |
|
"grad_norm": 7.0, |
|
"learning_rate": 1.0232911392405064e-05, |
|
"loss": 0.4994, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.5537974683544303, |
|
"grad_norm": 6.34375, |
|
"learning_rate": 1.0195727848101266e-05, |
|
"loss": 0.5333, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.5590717299578059, |
|
"grad_norm": 8.4375, |
|
"learning_rate": 1.0158544303797469e-05, |
|
"loss": 0.5548, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.5643459915611815, |
|
"grad_norm": 6.46875, |
|
"learning_rate": 1.0121360759493671e-05, |
|
"loss": 0.5212, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.569620253164557, |
|
"grad_norm": 7.28125, |
|
"learning_rate": 1.0084177215189875e-05, |
|
"loss": 0.5402, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.5748945147679325, |
|
"grad_norm": 8.625, |
|
"learning_rate": 1.0046993670886076e-05, |
|
"loss": 0.5508, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.580168776371308, |
|
"grad_norm": 7.21875, |
|
"learning_rate": 1.000981012658228e-05, |
|
"loss": 0.5197, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.580168776371308, |
|
"eval_accuracy": 0.7488491048593351, |
|
"eval_loss": 0.5085062980651855, |
|
"eval_runtime": 31.9268, |
|
"eval_samples_per_second": 61.234, |
|
"eval_steps_per_second": 1.942, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.5854430379746836, |
|
"grad_norm": 7.53125, |
|
"learning_rate": 9.97262658227848e-06, |
|
"loss": 0.5004, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.5907172995780591, |
|
"grad_norm": 8.75, |
|
"learning_rate": 9.935443037974685e-06, |
|
"loss": 0.5404, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.5959915611814346, |
|
"grad_norm": 10.9375, |
|
"learning_rate": 9.898259493670886e-06, |
|
"loss": 0.5352, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.6012658227848101, |
|
"grad_norm": 6.6875, |
|
"learning_rate": 9.861075949367088e-06, |
|
"loss": 0.4879, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.6065400843881856, |
|
"grad_norm": 9.5, |
|
"learning_rate": 9.823892405063292e-06, |
|
"loss": 0.5985, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.6118143459915611, |
|
"grad_norm": 6.28125, |
|
"learning_rate": 9.786708860759493e-06, |
|
"loss": 0.5321, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.6170886075949367, |
|
"grad_norm": 6.09375, |
|
"learning_rate": 9.749525316455697e-06, |
|
"loss": 0.5005, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.6223628691983122, |
|
"grad_norm": 6.625, |
|
"learning_rate": 9.712341772151898e-06, |
|
"loss": 0.4947, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.6276371308016878, |
|
"grad_norm": 7.78125, |
|
"learning_rate": 9.675158227848102e-06, |
|
"loss": 0.4745, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.6329113924050633, |
|
"grad_norm": 7.71875, |
|
"learning_rate": 9.637974683544305e-06, |
|
"loss": 0.4977, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.6329113924050633, |
|
"eval_accuracy": 0.7411764705882353, |
|
"eval_loss": 0.5146331787109375, |
|
"eval_runtime": 31.9288, |
|
"eval_samples_per_second": 61.23, |
|
"eval_steps_per_second": 1.942, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.6381856540084389, |
|
"grad_norm": 10.9375, |
|
"learning_rate": 9.600791139240507e-06, |
|
"loss": 0.5176, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.6434599156118144, |
|
"grad_norm": 8.3125, |
|
"learning_rate": 9.56360759493671e-06, |
|
"loss": 0.5472, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.6487341772151899, |
|
"grad_norm": 6.90625, |
|
"learning_rate": 9.526424050632912e-06, |
|
"loss": 0.4825, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.6540084388185654, |
|
"grad_norm": 7.40625, |
|
"learning_rate": 9.489240506329115e-06, |
|
"loss": 0.4956, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.6592827004219409, |
|
"grad_norm": 6.75, |
|
"learning_rate": 9.452056962025315e-06, |
|
"loss": 0.5199, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.6645569620253164, |
|
"grad_norm": 9.25, |
|
"learning_rate": 9.41487341772152e-06, |
|
"loss": 0.5871, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.669831223628692, |
|
"grad_norm": 7.75, |
|
"learning_rate": 9.377689873417722e-06, |
|
"loss": 0.5269, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 0.6751054852320675, |
|
"grad_norm": 7.71875, |
|
"learning_rate": 9.340506329113924e-06, |
|
"loss": 0.4983, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.680379746835443, |
|
"grad_norm": 7.5625, |
|
"learning_rate": 9.303322784810127e-06, |
|
"loss": 0.5544, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.6856540084388185, |
|
"grad_norm": 7.59375, |
|
"learning_rate": 9.26613924050633e-06, |
|
"loss": 0.492, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.6856540084388185, |
|
"eval_accuracy": 0.7416879795396419, |
|
"eval_loss": 0.511603593826294, |
|
"eval_runtime": 31.9424, |
|
"eval_samples_per_second": 61.204, |
|
"eval_steps_per_second": 1.941, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.6909282700421941, |
|
"grad_norm": 8.0, |
|
"learning_rate": 9.228955696202532e-06, |
|
"loss": 0.5052, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 0.6962025316455697, |
|
"grad_norm": 8.5625, |
|
"learning_rate": 9.191772151898734e-06, |
|
"loss": 0.5295, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.7014767932489452, |
|
"grad_norm": 9.0, |
|
"learning_rate": 9.154588607594937e-06, |
|
"loss": 0.5589, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 0.7067510548523207, |
|
"grad_norm": 7.1875, |
|
"learning_rate": 9.117405063291141e-06, |
|
"loss": 0.536, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.7120253164556962, |
|
"grad_norm": 6.3125, |
|
"learning_rate": 9.080221518987342e-06, |
|
"loss": 0.473, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.7172995780590717, |
|
"grad_norm": 9.25, |
|
"learning_rate": 9.043037974683546e-06, |
|
"loss": 0.503, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.7225738396624473, |
|
"grad_norm": 7.34375, |
|
"learning_rate": 9.005854430379747e-06, |
|
"loss": 0.4972, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 0.7278481012658228, |
|
"grad_norm": 8.875, |
|
"learning_rate": 8.968670886075949e-06, |
|
"loss": 0.5291, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.7331223628691983, |
|
"grad_norm": 7.625, |
|
"learning_rate": 8.931487341772152e-06, |
|
"loss": 0.5637, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 0.7383966244725738, |
|
"grad_norm": 7.25, |
|
"learning_rate": 8.894303797468354e-06, |
|
"loss": 0.5046, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.7383966244725738, |
|
"eval_accuracy": 0.7452685421994885, |
|
"eval_loss": 0.5069195628166199, |
|
"eval_runtime": 31.9183, |
|
"eval_samples_per_second": 61.25, |
|
"eval_steps_per_second": 1.942, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.7436708860759493, |
|
"grad_norm": 10.625, |
|
"learning_rate": 8.857120253164558e-06, |
|
"loss": 0.587, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 0.7489451476793249, |
|
"grad_norm": 7.28125, |
|
"learning_rate": 8.819936708860759e-06, |
|
"loss": 0.5255, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 0.7542194092827004, |
|
"grad_norm": 7.59375, |
|
"learning_rate": 8.782753164556963e-06, |
|
"loss": 0.532, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 0.759493670886076, |
|
"grad_norm": 7.96875, |
|
"learning_rate": 8.745569620253164e-06, |
|
"loss": 0.4791, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.7647679324894515, |
|
"grad_norm": 8.6875, |
|
"learning_rate": 8.708386075949368e-06, |
|
"loss": 0.5277, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.770042194092827, |
|
"grad_norm": 9.5, |
|
"learning_rate": 8.67120253164557e-06, |
|
"loss": 0.5335, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 0.7753164556962026, |
|
"grad_norm": 9.1875, |
|
"learning_rate": 8.634018987341773e-06, |
|
"loss": 0.5746, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 0.7805907172995781, |
|
"grad_norm": 10.125, |
|
"learning_rate": 8.596835443037975e-06, |
|
"loss": 0.5555, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 0.7858649789029536, |
|
"grad_norm": 7.84375, |
|
"learning_rate": 8.559651898734178e-06, |
|
"loss": 0.4913, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 0.7911392405063291, |
|
"grad_norm": 7.9375, |
|
"learning_rate": 8.52246835443038e-06, |
|
"loss": 0.5476, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.7911392405063291, |
|
"eval_accuracy": 0.7478260869565218, |
|
"eval_loss": 0.504403293132782, |
|
"eval_runtime": 31.9252, |
|
"eval_samples_per_second": 61.237, |
|
"eval_steps_per_second": 1.942, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.7964135021097046, |
|
"grad_norm": 8.1875, |
|
"learning_rate": 8.485284810126581e-06, |
|
"loss": 0.5078, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 0.8016877637130801, |
|
"grad_norm": 6.75, |
|
"learning_rate": 8.448101265822785e-06, |
|
"loss": 0.4789, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 0.8069620253164557, |
|
"grad_norm": 9.5, |
|
"learning_rate": 8.410917721518988e-06, |
|
"loss": 0.5122, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 0.8122362869198312, |
|
"grad_norm": 7.59375, |
|
"learning_rate": 8.37373417721519e-06, |
|
"loss": 0.5184, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 0.8175105485232067, |
|
"grad_norm": 8.6875, |
|
"learning_rate": 8.336550632911393e-06, |
|
"loss": 0.5303, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.8227848101265823, |
|
"grad_norm": 7.125, |
|
"learning_rate": 8.299367088607595e-06, |
|
"loss": 0.5199, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 0.8280590717299579, |
|
"grad_norm": 6.96875, |
|
"learning_rate": 8.262183544303798e-06, |
|
"loss": 0.4956, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 0.8333333333333334, |
|
"grad_norm": 7.84375, |
|
"learning_rate": 8.225e-06, |
|
"loss": 0.4543, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 0.8386075949367089, |
|
"grad_norm": 8.3125, |
|
"learning_rate": 8.187816455696202e-06, |
|
"loss": 0.5797, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 0.8438818565400844, |
|
"grad_norm": 6.53125, |
|
"learning_rate": 8.150632911392407e-06, |
|
"loss": 0.5247, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.8438818565400844, |
|
"eval_accuracy": 0.7468030690537084, |
|
"eval_loss": 0.5038452744483948, |
|
"eval_runtime": 31.9203, |
|
"eval_samples_per_second": 61.246, |
|
"eval_steps_per_second": 1.942, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.8491561181434599, |
|
"grad_norm": 7.625, |
|
"learning_rate": 8.113449367088607e-06, |
|
"loss": 0.5348, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 0.8544303797468354, |
|
"grad_norm": 6.3125, |
|
"learning_rate": 8.076265822784812e-06, |
|
"loss": 0.5507, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 0.859704641350211, |
|
"grad_norm": 6.03125, |
|
"learning_rate": 8.039082278481012e-06, |
|
"loss": 0.4819, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 0.8649789029535865, |
|
"grad_norm": 6.875, |
|
"learning_rate": 8.001898734177215e-06, |
|
"loss": 0.4581, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 0.870253164556962, |
|
"grad_norm": 7.53125, |
|
"learning_rate": 7.964715189873419e-06, |
|
"loss": 0.4928, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.8755274261603375, |
|
"grad_norm": 8.75, |
|
"learning_rate": 7.92753164556962e-06, |
|
"loss": 0.5144, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 0.880801687763713, |
|
"grad_norm": 9.0625, |
|
"learning_rate": 7.890348101265824e-06, |
|
"loss": 0.5475, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 0.8860759493670886, |
|
"grad_norm": 7.8125, |
|
"learning_rate": 7.853164556962025e-06, |
|
"loss": 0.5443, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 0.8913502109704642, |
|
"grad_norm": 5.5625, |
|
"learning_rate": 7.815981012658229e-06, |
|
"loss": 0.4987, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 0.8966244725738397, |
|
"grad_norm": 7.5625, |
|
"learning_rate": 7.77879746835443e-06, |
|
"loss": 0.5591, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.8966244725738397, |
|
"eval_accuracy": 0.7452685421994885, |
|
"eval_loss": 0.507918119430542, |
|
"eval_runtime": 31.9838, |
|
"eval_samples_per_second": 61.125, |
|
"eval_steps_per_second": 1.938, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.9018987341772152, |
|
"grad_norm": 8.3125, |
|
"learning_rate": 7.741613924050634e-06, |
|
"loss": 0.4957, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 0.9071729957805907, |
|
"grad_norm": 6.71875, |
|
"learning_rate": 7.704430379746836e-06, |
|
"loss": 0.5035, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 0.9124472573839663, |
|
"grad_norm": 8.1875, |
|
"learning_rate": 7.667246835443039e-06, |
|
"loss": 0.5108, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 0.9177215189873418, |
|
"grad_norm": 7.5, |
|
"learning_rate": 7.630063291139241e-06, |
|
"loss": 0.5288, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 0.9229957805907173, |
|
"grad_norm": 6.5625, |
|
"learning_rate": 7.592879746835443e-06, |
|
"loss": 0.4739, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.9282700421940928, |
|
"grad_norm": 7.28125, |
|
"learning_rate": 7.555696202531646e-06, |
|
"loss": 0.49, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 0.9335443037974683, |
|
"grad_norm": 6.75, |
|
"learning_rate": 7.518512658227848e-06, |
|
"loss": 0.4745, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 0.9388185654008439, |
|
"grad_norm": 8.375, |
|
"learning_rate": 7.481329113924051e-06, |
|
"loss": 0.4974, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 0.9440928270042194, |
|
"grad_norm": 6.90625, |
|
"learning_rate": 7.444145569620253e-06, |
|
"loss": 0.5397, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 0.9493670886075949, |
|
"grad_norm": 6.78125, |
|
"learning_rate": 7.406962025316456e-06, |
|
"loss": 0.5228, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.9493670886075949, |
|
"eval_accuracy": 0.7457800511508952, |
|
"eval_loss": 0.5040280222892761, |
|
"eval_runtime": 31.9049, |
|
"eval_samples_per_second": 61.276, |
|
"eval_steps_per_second": 1.943, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.9546413502109705, |
|
"grad_norm": 9.8125, |
|
"learning_rate": 7.369778481012658e-06, |
|
"loss": 0.4839, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 0.959915611814346, |
|
"grad_norm": 7.34375, |
|
"learning_rate": 7.332594936708862e-06, |
|
"loss": 0.4897, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 0.9651898734177216, |
|
"grad_norm": 9.0625, |
|
"learning_rate": 7.295411392405063e-06, |
|
"loss": 0.5778, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 0.9704641350210971, |
|
"grad_norm": 8.5, |
|
"learning_rate": 7.258227848101267e-06, |
|
"loss": 0.5402, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 0.9757383966244726, |
|
"grad_norm": 10.75, |
|
"learning_rate": 7.221044303797468e-06, |
|
"loss": 0.5665, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.9810126582278481, |
|
"grad_norm": 5.59375, |
|
"learning_rate": 7.1838607594936716e-06, |
|
"loss": 0.5238, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 0.9862869198312236, |
|
"grad_norm": 8.875, |
|
"learning_rate": 7.146677215189874e-06, |
|
"loss": 0.5707, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 0.9915611814345991, |
|
"grad_norm": 7.8125, |
|
"learning_rate": 7.109493670886076e-06, |
|
"loss": 0.5202, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 0.9968354430379747, |
|
"grad_norm": 8.0, |
|
"learning_rate": 7.072310126582279e-06, |
|
"loss": 0.5018, |
|
"step": 1890 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 3792, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|