{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9996830427892235,
  "eval_steps": 200,
  "global_step": 1577,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0006339144215530904,
      "grad_norm": 0.1844951284226507,
      "learning_rate": 1.2658227848101265e-06,
      "loss": 0.2445,
      "step": 1
    },
    {
      "epoch": 0.003169572107765452,
      "grad_norm": 0.4228474323435137,
      "learning_rate": 6.329113924050633e-06,
      "loss": 0.5751,
      "step": 5
    },
    {
      "epoch": 0.006339144215530904,
      "grad_norm": 0.44943947868261674,
      "learning_rate": 1.2658227848101267e-05,
      "loss": 0.6355,
      "step": 10
    },
    {
      "epoch": 0.009508716323296355,
      "grad_norm": 0.4388677072263688,
      "learning_rate": 1.89873417721519e-05,
      "loss": 0.5644,
      "step": 15
    },
    {
      "epoch": 0.012678288431061807,
      "grad_norm": 0.42271873762000256,
      "learning_rate": 2.5316455696202533e-05,
      "loss": 0.6074,
      "step": 20
    },
    {
      "epoch": 0.01584786053882726,
      "grad_norm": 0.394168038759961,
      "learning_rate": 3.1645569620253167e-05,
      "loss": 0.5618,
      "step": 25
    },
    {
      "epoch": 0.01901743264659271,
      "grad_norm": 0.3572253194331872,
      "learning_rate": 3.79746835443038e-05,
      "loss": 0.3782,
      "step": 30
    },
    {
      "epoch": 0.022187004754358162,
      "grad_norm": 0.412555556410598,
      "learning_rate": 4.430379746835443e-05,
      "loss": 0.5004,
      "step": 35
    },
    {
      "epoch": 0.025356576862123614,
      "grad_norm": 0.3794217410787016,
      "learning_rate": 5.0632911392405066e-05,
      "loss": 0.4466,
      "step": 40
    },
    {
      "epoch": 0.028526148969889066,
      "grad_norm": 0.35048440162730615,
      "learning_rate": 5.69620253164557e-05,
      "loss": 0.4673,
      "step": 45
    },
    {
      "epoch": 0.03169572107765452,
      "grad_norm": 0.5944470916433041,
      "learning_rate": 6.329113924050633e-05,
      "loss": 0.4321,
      "step": 50
    },
    {
      "epoch": 0.03486529318541997,
      "grad_norm": 0.42565747209046223,
      "learning_rate": 6.962025316455697e-05,
      "loss": 0.3458,
      "step": 55
    },
    {
      "epoch": 0.03803486529318542,
      "grad_norm": 0.4086754614850085,
      "learning_rate": 7.59493670886076e-05,
      "loss": 0.3853,
      "step": 60
    },
    {
      "epoch": 0.04120443740095087,
      "grad_norm": 0.4100402315211435,
      "learning_rate": 8.227848101265824e-05,
      "loss": 0.3248,
      "step": 65
    },
    {
      "epoch": 0.044374009508716325,
      "grad_norm": 0.439691902474589,
      "learning_rate": 8.860759493670887e-05,
      "loss": 0.3902,
      "step": 70
    },
    {
      "epoch": 0.04754358161648178,
      "grad_norm": 0.4789983229510449,
      "learning_rate": 9.493670886075949e-05,
      "loss": 0.4375,
      "step": 75
    },
    {
      "epoch": 0.05071315372424723,
      "grad_norm": 0.4326295423185557,
      "learning_rate": 0.00010126582278481013,
      "loss": 0.344,
      "step": 80
    },
    {
      "epoch": 0.05388272583201268,
      "grad_norm": 0.4082745764246507,
      "learning_rate": 0.00010759493670886076,
      "loss": 0.373,
      "step": 85
    },
    {
      "epoch": 0.05705229793977813,
      "grad_norm": 0.44917869809554406,
      "learning_rate": 0.0001139240506329114,
      "loss": 0.3366,
      "step": 90
    },
    {
      "epoch": 0.060221870047543584,
      "grad_norm": 0.4099898854278625,
      "learning_rate": 0.00012025316455696203,
      "loss": 0.3827,
      "step": 95
    },
    {
      "epoch": 0.06339144215530904,
      "grad_norm": 0.5173612120396457,
      "learning_rate": 0.00012658227848101267,
      "loss": 0.3913,
      "step": 100
    },
    {
      "epoch": 0.06656101426307448,
      "grad_norm": 0.4695908910723305,
      "learning_rate": 0.0001329113924050633,
      "loss": 0.3285,
      "step": 105
    },
    {
      "epoch": 0.06973058637083994,
      "grad_norm": 0.34610029250066504,
      "learning_rate": 0.00013924050632911395,
      "loss": 0.3542,
      "step": 110
    },
    {
      "epoch": 0.07290015847860538,
      "grad_norm": 0.3833563232036365,
      "learning_rate": 0.00014556962025316457,
      "loss": 0.3442,
      "step": 115
    },
    {
      "epoch": 0.07606973058637084,
      "grad_norm": 0.38597736664868315,
      "learning_rate": 0.0001518987341772152,
      "loss": 0.3499,
      "step": 120
    },
    {
      "epoch": 0.07923930269413629,
      "grad_norm": 0.4555824320300245,
      "learning_rate": 0.00015822784810126583,
      "loss": 0.3843,
      "step": 125
    },
    {
      "epoch": 0.08240887480190175,
      "grad_norm": 0.44058959604469733,
      "learning_rate": 0.00016455696202531648,
      "loss": 0.3321,
      "step": 130
    },
    {
      "epoch": 0.08557844690966719,
      "grad_norm": 0.37513672150754146,
      "learning_rate": 0.0001708860759493671,
      "loss": 0.3409,
      "step": 135
    },
    {
      "epoch": 0.08874801901743265,
      "grad_norm": 0.3532888739409051,
      "learning_rate": 0.00017721518987341773,
      "loss": 0.3388,
      "step": 140
    },
    {
      "epoch": 0.0919175911251981,
      "grad_norm": 0.31398944959900404,
      "learning_rate": 0.00018354430379746836,
      "loss": 0.3407,
      "step": 145
    },
    {
      "epoch": 0.09508716323296355,
      "grad_norm": 0.48473648286443866,
      "learning_rate": 0.00018987341772151899,
      "loss": 0.4109,
      "step": 150
    },
    {
      "epoch": 0.098256735340729,
      "grad_norm": 0.3832743712760423,
      "learning_rate": 0.00019620253164556964,
      "loss": 0.2894,
      "step": 155
    },
    {
      "epoch": 0.10142630744849446,
      "grad_norm": 0.3576599310136604,
      "learning_rate": 0.00019999901968817678,
      "loss": 0.3685,
      "step": 160
    },
    {
      "epoch": 0.1045958795562599,
      "grad_norm": 0.4041268184733326,
      "learning_rate": 0.0001999879914008964,
      "loss": 0.3103,
      "step": 165
    },
    {
      "epoch": 0.10776545166402536,
      "grad_norm": 0.348710082889974,
      "learning_rate": 0.00019996471079244477,
      "loss": 0.3686,
      "step": 170
    },
    {
      "epoch": 0.1109350237717908,
      "grad_norm": 0.3641139077278622,
      "learning_rate": 0.0001999291807155794,
      "loss": 0.3672,
      "step": 175
    },
    {
      "epoch": 0.11410459587955626,
      "grad_norm": 0.34875291735749603,
      "learning_rate": 0.0001998814055240823,
      "loss": 0.3289,
      "step": 180
    },
    {
      "epoch": 0.11727416798732171,
      "grad_norm": 0.35868082118594846,
      "learning_rate": 0.00019982139107222632,
      "loss": 0.3843,
      "step": 185
    },
    {
      "epoch": 0.12044374009508717,
      "grad_norm": 0.2975053354861811,
      "learning_rate": 0.000199749144714058,
      "loss": 0.3187,
      "step": 190
    },
    {
      "epoch": 0.12361331220285261,
      "grad_norm": 0.3926097041806586,
      "learning_rate": 0.00019966467530249627,
      "loss": 0.3711,
      "step": 195
    },
    {
      "epoch": 0.12678288431061807,
      "grad_norm": 0.39235636818547276,
      "learning_rate": 0.00019956799318824776,
      "loss": 0.3599,
      "step": 200
    },
    {
      "epoch": 0.12678288431061807,
      "eval_loss": 0.31717613339424133,
      "eval_runtime": 878.4135,
      "eval_samples_per_second": 4.554,
      "eval_steps_per_second": 0.569,
      "step": 200
    },
    {
      "epoch": 0.12995245641838352,
      "grad_norm": 0.32366959300654363,
      "learning_rate": 0.00019945911021853818,
      "loss": 0.2671,
      "step": 205
    },
    {
      "epoch": 0.13312202852614896,
      "grad_norm": 0.34183927553766114,
      "learning_rate": 0.00019933803973566102,
      "loss": 0.3491,
      "step": 210
    },
    {
      "epoch": 0.13629160063391443,
      "grad_norm": 0.355629049879592,
      "learning_rate": 0.0001992047965753422,
      "loss": 0.2778,
      "step": 215
    },
    {
      "epoch": 0.13946117274167988,
      "grad_norm": 0.31194706241410036,
      "learning_rate": 0.00019905939706492238,
      "loss": 0.3278,
      "step": 220
    },
    {
      "epoch": 0.14263074484944532,
      "grad_norm": 0.37190501088914274,
      "learning_rate": 0.0001989018590213561,
      "loss": 0.3757,
      "step": 225
    },
    {
      "epoch": 0.14580031695721077,
      "grad_norm": 0.30859177154159206,
      "learning_rate": 0.00019873220174902858,
      "loss": 0.2952,
      "step": 230
    },
    {
      "epoch": 0.14896988906497624,
      "grad_norm": 0.4072493051692793,
      "learning_rate": 0.0001985504460373903,
      "loss": 0.3576,
      "step": 235
    },
    {
      "epoch": 0.15213946117274169,
      "grad_norm": 0.3117614582623609,
      "learning_rate": 0.00019835661415840928,
      "loss": 0.3127,
      "step": 240
    },
    {
      "epoch": 0.15530903328050713,
      "grad_norm": 0.3433870206019631,
      "learning_rate": 0.00019815072986384218,
      "loss": 0.3424,
      "step": 245
    },
    {
      "epoch": 0.15847860538827258,
      "grad_norm": 0.3252374107324197,
      "learning_rate": 0.0001979328183823236,
      "loss": 0.3509,
      "step": 250
    },
    {
      "epoch": 0.16164817749603805,
      "grad_norm": 0.32574757253252834,
      "learning_rate": 0.00019770290641627468,
      "loss": 0.2913,
      "step": 255
    },
    {
      "epoch": 0.1648177496038035,
      "grad_norm": 0.37343408069668577,
      "learning_rate": 0.00019746102213863114,
      "loss": 0.3524,
      "step": 260
    },
    {
      "epoch": 0.16798732171156894,
      "grad_norm": 0.30197216412790706,
      "learning_rate": 0.00019720719518939083,
      "loss": 0.295,
      "step": 265
    },
    {
      "epoch": 0.17115689381933438,
      "grad_norm": 0.37750434171669517,
      "learning_rate": 0.00019694145667198195,
      "loss": 0.3215,
      "step": 270
    },
    {
      "epoch": 0.17432646592709986,
      "grad_norm": 0.3368196048030473,
      "learning_rate": 0.0001966638391494514,
      "loss": 0.35,
      "step": 275
    },
    {
      "epoch": 0.1774960380348653,
      "grad_norm": 0.3232595651729065,
      "learning_rate": 0.0001963743766404749,
      "loss": 0.2637,
      "step": 280
    },
    {
      "epoch": 0.18066561014263074,
      "grad_norm": 0.32199548202560035,
      "learning_rate": 0.00019607310461518818,
      "loss": 0.3262,
      "step": 285
    },
    {
      "epoch": 0.1838351822503962,
      "grad_norm": 0.29117926540088634,
      "learning_rate": 0.0001957600599908406,
      "loss": 0.3129,
      "step": 290
    },
    {
      "epoch": 0.18700475435816163,
      "grad_norm": 0.2836794081153409,
      "learning_rate": 0.00019543528112727146,
      "loss": 0.3207,
      "step": 295
    },
    {
      "epoch": 0.1901743264659271,
      "grad_norm": 0.37478385305484463,
      "learning_rate": 0.0001950988078222093,
      "loss": 0.3503,
      "step": 300
    },
    {
      "epoch": 0.19334389857369255,
      "grad_norm": 0.3323790483161259,
      "learning_rate": 0.00019475068130639543,
      "loss": 0.2873,
      "step": 305
    },
    {
      "epoch": 0.196513470681458,
      "grad_norm": 0.31045326503955184,
      "learning_rate": 0.0001943909442385313,
      "loss": 0.3379,
      "step": 310
    },
    {
      "epoch": 0.19968304278922344,
      "grad_norm": 0.295428110940092,
      "learning_rate": 0.00019401964070005144,
      "loss": 0.2913,
      "step": 315
    },
    {
      "epoch": 0.20285261489698891,
      "grad_norm": 0.31381749704770145,
      "learning_rate": 0.00019363681618972164,
      "loss": 0.3167,
      "step": 320
    },
    {
      "epoch": 0.20602218700475436,
      "grad_norm": 0.3799683908480184,
      "learning_rate": 0.00019324251761806374,
      "loss": 0.3203,
      "step": 325
    },
    {
      "epoch": 0.2091917591125198,
      "grad_norm": 0.25669447806119594,
      "learning_rate": 0.00019283679330160726,
      "loss": 0.2598,
      "step": 330
    },
    {
      "epoch": 0.21236133122028525,
      "grad_norm": 0.3253285501894849,
      "learning_rate": 0.00019241969295696879,
      "loss": 0.321,
      "step": 335
    },
    {
      "epoch": 0.21553090332805072,
      "grad_norm": 0.3015776648780859,
      "learning_rate": 0.0001919912676947598,
      "loss": 0.2912,
      "step": 340
    },
    {
      "epoch": 0.21870047543581617,
      "grad_norm": 0.3548152436637532,
      "learning_rate": 0.00019155157001332374,
      "loss": 0.3398,
      "step": 345
    },
    {
      "epoch": 0.2218700475435816,
      "grad_norm": 0.3562179525646546,
      "learning_rate": 0.00019110065379230289,
      "loss": 0.3575,
      "step": 350
    },
    {
      "epoch": 0.22503961965134706,
      "grad_norm": 0.33759944051182883,
      "learning_rate": 0.00019063857428603615,
      "loss": 0.2644,
      "step": 355
    },
    {
      "epoch": 0.22820919175911253,
      "grad_norm": 0.3478332359179607,
      "learning_rate": 0.00019016538811678823,
      "loss": 0.3421,
      "step": 360
    },
    {
      "epoch": 0.23137876386687797,
      "grad_norm": 0.3107602080624315,
      "learning_rate": 0.0001896811532678113,
      "loss": 0.262,
      "step": 365
    },
    {
      "epoch": 0.23454833597464342,
      "grad_norm": 0.26971775917740104,
      "learning_rate": 0.00018918592907623985,
      "loss": 0.3378,
      "step": 370
    },
    {
      "epoch": 0.23771790808240886,
      "grad_norm": 0.32413332448217697,
      "learning_rate": 0.00018867977622581957,
      "loss": 0.3316,
      "step": 375
    },
    {
      "epoch": 0.24088748019017434,
      "grad_norm": 0.3522975093101741,
      "learning_rate": 0.00018816275673947148,
      "loss": 0.2678,
      "step": 380
    },
    {
      "epoch": 0.24405705229793978,
      "grad_norm": 0.31661852350790726,
      "learning_rate": 0.00018763493397169146,
      "loss": 0.3275,
      "step": 385
    },
    {
      "epoch": 0.24722662440570523,
      "grad_norm": 0.27090727261610936,
      "learning_rate": 0.00018709637260078729,
      "loss": 0.2858,
      "step": 390
    },
    {
      "epoch": 0.25039619651347067,
      "grad_norm": 0.3143474617991223,
      "learning_rate": 0.0001865471386209527,
      "loss": 0.3317,
      "step": 395
    },
    {
      "epoch": 0.25356576862123614,
      "grad_norm": 0.48811153855723693,
      "learning_rate": 0.000185987299334181,
      "loss": 0.3295,
      "step": 400
    },
    {
      "epoch": 0.25356576862123614,
      "eval_loss": 0.29194891452789307,
      "eval_runtime": 872.9978,
      "eval_samples_per_second": 4.582,
      "eval_steps_per_second": 0.573,
      "step": 400
    },
    {
      "epoch": 0.25673534072900156,
      "grad_norm": 0.31755342222995686,
      "learning_rate": 0.00018541692334201771,
      "loss": 0.2643,
      "step": 405
    },
    {
      "epoch": 0.25990491283676703,
      "grad_norm": 0.34778059073770806,
      "learning_rate": 0.0001848360805371544,
      "loss": 0.3339,
      "step": 410
    },
    {
      "epoch": 0.2630744849445325,
      "grad_norm": 0.3183073063986642,
      "learning_rate": 0.00018424484209486416,
      "loss": 0.2673,
      "step": 415
    },
    {
      "epoch": 0.2662440570522979,
      "grad_norm": 0.2788199901083398,
      "learning_rate": 0.00018364328046428,
      "loss": 0.3272,
      "step": 420
    },
    {
      "epoch": 0.2694136291600634,
      "grad_norm": 0.3666143727147526,
      "learning_rate": 0.00018303146935951689,
      "loss": 0.3247,
      "step": 425
    },
    {
      "epoch": 0.27258320126782887,
      "grad_norm": 0.28586548327038175,
      "learning_rate": 0.00018240948375063926,
      "loss": 0.2792,
      "step": 430
    },
    {
      "epoch": 0.2757527733755943,
      "grad_norm": 0.9727255846044429,
      "learning_rate": 0.00018177739985447412,
      "loss": 0.3485,
      "step": 435
    },
    {
      "epoch": 0.27892234548335976,
      "grad_norm": 0.29065854553956355,
      "learning_rate": 0.0001811352951252717,
      "loss": 0.2729,
      "step": 440
    },
    {
      "epoch": 0.2820919175911252,
      "grad_norm": 0.320575993183303,
      "learning_rate": 0.0001804832482452142,
      "loss": 0.3354,
      "step": 445
    },
    {
      "epoch": 0.28526148969889065,
      "grad_norm": 0.34869737354697955,
      "learning_rate": 0.0001798213391147746,
      "loss": 0.3385,
      "step": 450
    },
    {
      "epoch": 0.2884310618066561,
      "grad_norm": 0.31478642211651564,
      "learning_rate": 0.00017914964884292544,
      "loss": 0.3133,
      "step": 455
    },
    {
      "epoch": 0.29160063391442154,
      "grad_norm": 0.36834278711947965,
      "learning_rate": 0.0001784682597372,
      "loss": 0.3593,
      "step": 460
    },
    {
      "epoch": 0.294770206022187,
      "grad_norm": 0.2791902388221146,
      "learning_rate": 0.00017777725529360676,
      "loss": 0.3005,
      "step": 465
    },
    {
      "epoch": 0.2979397781299525,
      "grad_norm": 0.30096452678752406,
      "learning_rate": 0.00017707672018639758,
      "loss": 0.3354,
      "step": 470
    },
    {
      "epoch": 0.3011093502377179,
      "grad_norm": 0.3708048891578612,
      "learning_rate": 0.00017636674025769215,
      "loss": 0.3147,
      "step": 475
    },
    {
      "epoch": 0.30427892234548337,
      "grad_norm": 0.305209122691005,
      "learning_rate": 0.00017564740250695904,
      "loss": 0.2713,
      "step": 480
    },
    {
      "epoch": 0.3074484944532488,
      "grad_norm": 0.3018873391630076,
      "learning_rate": 0.0001749187950803549,
      "loss": 0.3202,
      "step": 485
    },
    {
      "epoch": 0.31061806656101426,
      "grad_norm": 0.3464422287874134,
      "learning_rate": 0.00017418100725992316,
      "loss": 0.3042,
      "step": 490
    },
    {
      "epoch": 0.31378763866877973,
      "grad_norm": 0.31036543367721087,
      "learning_rate": 0.00017343412945265382,
      "loss": 0.3105,
      "step": 495
    },
    {
      "epoch": 0.31695721077654515,
      "grad_norm": 0.3090116757558095,
      "learning_rate": 0.00017267825317940493,
      "loss": 0.3086,
      "step": 500
    },
    {
      "epoch": 0.3201267828843106,
      "grad_norm": 0.32015559999952525,
      "learning_rate": 0.00017191347106368797,
      "loss": 0.2595,
      "step": 505
    },
    {
      "epoch": 0.3232963549920761,
      "grad_norm": 0.28242640929152685,
      "learning_rate": 0.0001711398768203178,
      "loss": 0.3171,
      "step": 510
    },
    {
      "epoch": 0.3264659270998415,
      "grad_norm": 0.3373697781712397,
      "learning_rate": 0.00017035756524392924,
      "loss": 0.2897,
      "step": 515
    },
    {
      "epoch": 0.329635499207607,
      "grad_norm": 0.3187883343723006,
      "learning_rate": 0.0001695666321973609,
      "loss": 0.303,
      "step": 520
    },
    {
      "epoch": 0.3328050713153724,
      "grad_norm": 0.4060972163443389,
      "learning_rate": 0.00016876717459990862,
      "loss": 0.3273,
      "step": 525
    },
    {
      "epoch": 0.3359746434231379,
      "grad_norm": 0.2709960074426642,
      "learning_rate": 0.0001679592904154489,
      "loss": 0.2629,
      "step": 530
    },
    {
      "epoch": 0.33914421553090335,
      "grad_norm": 0.2828719972128079,
      "learning_rate": 0.00016714307864043487,
      "loss": 0.2946,
      "step": 535
    },
    {
      "epoch": 0.34231378763866877,
      "grad_norm": 0.29485357171410065,
      "learning_rate": 0.00016631863929176524,
      "loss": 0.2704,
      "step": 540
    },
    {
      "epoch": 0.34548335974643424,
      "grad_norm": 0.3140677978027709,
      "learning_rate": 0.00016548607339452853,
      "loss": 0.3211,
      "step": 545
    },
    {
      "epoch": 0.3486529318541997,
      "grad_norm": 0.30224374704766904,
      "learning_rate": 0.00016464548296962373,
      "loss": 0.3289,
      "step": 550
    },
    {
      "epoch": 0.3518225039619651,
      "grad_norm": 0.3015178734291492,
      "learning_rate": 0.0001637969710212588,
      "loss": 0.262,
      "step": 555
    },
    {
      "epoch": 0.3549920760697306,
      "grad_norm": 0.3261808476280464,
      "learning_rate": 0.00016294064152432879,
      "loss": 0.3524,
      "step": 560
    },
    {
      "epoch": 0.358161648177496,
      "grad_norm": 0.30420040263110554,
      "learning_rate": 0.00016207659941167485,
      "loss": 0.2888,
      "step": 565
    },
    {
      "epoch": 0.3613312202852615,
      "grad_norm": 0.29855740633395794,
      "learning_rate": 0.00016120495056122622,
      "loss": 0.3075,
      "step": 570
    },
    {
      "epoch": 0.36450079239302696,
      "grad_norm": 0.3775755682614953,
      "learning_rate": 0.00016032580178302583,
      "loss": 0.3452,
      "step": 575
    },
    {
      "epoch": 0.3676703645007924,
      "grad_norm": 0.3189277602131783,
      "learning_rate": 0.00015943926080614235,
      "loss": 0.2643,
      "step": 580
    },
    {
      "epoch": 0.37083993660855785,
      "grad_norm": 0.32115548282274786,
      "learning_rate": 0.00015854543626546915,
      "loss": 0.3126,
      "step": 585
    },
    {
      "epoch": 0.37400950871632327,
      "grad_norm": 0.29230296850863174,
      "learning_rate": 0.00015764443768841234,
      "loss": 0.2949,
      "step": 590
    },
    {
      "epoch": 0.37717908082408874,
      "grad_norm": 0.32187057297721217,
      "learning_rate": 0.0001567363754814696,
      "loss": 0.3166,
      "step": 595
    },
    {
      "epoch": 0.3803486529318542,
      "grad_norm": 0.3766752931165212,
      "learning_rate": 0.0001558213609167012,
      "loss": 0.323,
      "step": 600
    },
    {
      "epoch": 0.3803486529318542,
      "eval_loss": 0.2788923680782318,
      "eval_runtime": 873.5171,
      "eval_samples_per_second": 4.579,
      "eval_steps_per_second": 0.572,
      "step": 600
    },
    {
      "epoch": 0.38351822503961963,
      "grad_norm": 0.31877960462977273,
      "learning_rate": 0.00015489950611809484,
      "loss": 0.2803,
      "step": 605
    },
    {
      "epoch": 0.3866877971473851,
      "grad_norm": 0.2903622851026156,
      "learning_rate": 0.00015397092404782642,
      "loss": 0.3178,
      "step": 610
    },
    {
      "epoch": 0.3898573692551506,
      "grad_norm": 0.2639727101749139,
      "learning_rate": 0.00015303572849241764,
      "loss": 0.2703,
      "step": 615
    },
    {
      "epoch": 0.393026941362916,
      "grad_norm": 0.3491709894849581,
      "learning_rate": 0.00015209403404879303,
      "loss": 0.3049,
      "step": 620
    },
    {
      "epoch": 0.39619651347068147,
      "grad_norm": 0.3651420024997032,
      "learning_rate": 0.00015114595611023744,
      "loss": 0.3265,
      "step": 625
    },
    {
      "epoch": 0.3993660855784469,
      "grad_norm": 0.3071330073578763,
      "learning_rate": 0.0001501916108522558,
      "loss": 0.2645,
      "step": 630
    },
    {
      "epoch": 0.40253565768621236,
      "grad_norm": 0.2739471545543727,
      "learning_rate": 0.00014923111521833758,
      "loss": 0.3035,
      "step": 635
    },
    {
      "epoch": 0.40570522979397783,
      "grad_norm": 0.30630113259525843,
      "learning_rate": 0.00014826458690562642,
      "loss": 0.2606,
      "step": 640
    },
    {
      "epoch": 0.40887480190174325,
      "grad_norm": 0.2988843883769528,
      "learning_rate": 0.00014729214435049793,
      "loss": 0.3111,
      "step": 645
    },
    {
      "epoch": 0.4120443740095087,
      "grad_norm": 0.3110979862585215,
      "learning_rate": 0.0001463139067140468,
      "loss": 0.2948,
      "step": 650
    },
    {
      "epoch": 0.4152139461172742,
      "grad_norm": 0.30767657253531316,
      "learning_rate": 0.0001453299938674849,
      "loss": 0.2638,
      "step": 655
    },
    {
      "epoch": 0.4183835182250396,
      "grad_norm": 0.27014842841388653,
      "learning_rate": 0.00014434052637745257,
      "loss": 0.2819,
      "step": 660
    },
    {
      "epoch": 0.4215530903328051,
      "grad_norm": 0.2739393681355767,
      "learning_rate": 0.00014334562549124467,
      "loss": 0.2466,
      "step": 665
    },
    {
      "epoch": 0.4247226624405705,
      "grad_norm": 0.31758998023523244,
      "learning_rate": 0.00014234541312195323,
      "loss": 0.2873,
      "step": 670
    },
    {
      "epoch": 0.42789223454833597,
      "grad_norm": 0.39847849128188423,
      "learning_rate": 0.00014134001183352832,
      "loss": 0.2979,
      "step": 675
    },
    {
      "epoch": 0.43106180665610144,
      "grad_norm": 0.30950118355401873,
      "learning_rate": 0.00014032954482575937,
      "loss": 0.2617,
      "step": 680
    },
    {
      "epoch": 0.43423137876386686,
      "grad_norm": 0.3260587574739946,
      "learning_rate": 0.0001393141359191787,
      "loss": 0.3109,
      "step": 685
    },
    {
      "epoch": 0.43740095087163233,
      "grad_norm": 0.3114375419997854,
      "learning_rate": 0.00013829390953988853,
      "loss": 0.2845,
      "step": 690
    },
    {
      "epoch": 0.4405705229793978,
      "grad_norm": 0.30019871836883555,
      "learning_rate": 0.00013726899070431423,
      "loss": 0.324,
      "step": 695
    },
    {
      "epoch": 0.4437400950871632,
      "grad_norm": 0.38021042516470643,
      "learning_rate": 0.00013623950500388506,
      "loss": 0.3269,
      "step": 700
    },
    {
      "epoch": 0.4469096671949287,
      "grad_norm": 0.3089060241706131,
      "learning_rate": 0.00013520557858964446,
      "loss": 0.2584,
      "step": 705
    },
    {
      "epoch": 0.4500792393026941,
      "grad_norm": 0.27984586622582663,
      "learning_rate": 0.00013416733815679166,
      "loss": 0.2909,
      "step": 710
    },
    {
      "epoch": 0.4532488114104596,
      "grad_norm": 0.2923559292409706,
      "learning_rate": 0.00013312491092915682,
      "loss": 0.2489,
      "step": 715
    },
    {
      "epoch": 0.45641838351822506,
      "grad_norm": 0.29223045315786345,
      "learning_rate": 0.00013207842464361125,
      "loss": 0.3135,
      "step": 720
    },
    {
      "epoch": 0.4595879556259905,
      "grad_norm": 0.33907899924090856,
      "learning_rate": 0.00013102800753441487,
      "loss": 0.3148,
      "step": 725
    },
    {
      "epoch": 0.46275752773375595,
      "grad_norm": 0.26110455456342696,
      "learning_rate": 0.00012997378831750242,
      "loss": 0.2505,
      "step": 730
    },
    {
      "epoch": 0.4659270998415214,
      "grad_norm": 0.2855563878095534,
      "learning_rate": 0.00012891589617471122,
      "loss": 0.322,
      "step": 735
    },
    {
      "epoch": 0.46909667194928684,
      "grad_norm": 0.27089962197787903,
      "learning_rate": 0.00012785446073795118,
      "loss": 0.2629,
      "step": 740
    },
    {
      "epoch": 0.4722662440570523,
      "grad_norm": 0.2787588891548799,
      "learning_rate": 0.00012678961207332015,
      "loss": 0.3071,
      "step": 745
    },
    {
      "epoch": 0.4754358161648177,
      "grad_norm": 0.35249049637057156,
      "learning_rate": 0.00012572148066516584,
      "loss": 0.3265,
      "step": 750
    },
    {
      "epoch": 0.4786053882725832,
      "grad_norm": 0.33307560406452336,
      "learning_rate": 0.00012465019740009662,
      "loss": 0.2403,
      "step": 755
    },
    {
      "epoch": 0.48177496038034867,
      "grad_norm": 0.3035753509057755,
      "learning_rate": 0.00012357589355094275,
      "loss": 0.3057,
      "step": 760
    },
    {
      "epoch": 0.4849445324881141,
      "grad_norm": 0.2950972689886197,
      "learning_rate": 0.00012249870076067067,
      "loss": 0.2637,
      "step": 765
    },
    {
      "epoch": 0.48811410459587956,
      "grad_norm": 0.2713040409786771,
      "learning_rate": 0.00012141875102625167,
      "loss": 0.3196,
      "step": 770
    },
    {
      "epoch": 0.49128367670364503,
      "grad_norm": 0.37005187803966516,
      "learning_rate": 0.00012033617668248723,
      "loss": 0.3265,
      "step": 775
    },
    {
      "epoch": 0.49445324881141045,
      "grad_norm": 0.3678796577106568,
      "learning_rate": 0.00011925111038579309,
      "loss": 0.2283,
      "step": 780
    },
    {
      "epoch": 0.4976228209191759,
      "grad_norm": 0.3021844529595635,
      "learning_rate": 0.00011816368509794364,
      "loss": 0.2967,
      "step": 785
    },
    {
      "epoch": 0.5007923930269413,
      "grad_norm": 0.3028161473676034,
      "learning_rate": 0.00011707403406977928,
      "loss": 0.2841,
      "step": 790
    },
    {
      "epoch": 0.5039619651347068,
      "grad_norm": 0.27418964538735746,
      "learning_rate": 0.00011598229082487784,
      "loss": 0.2803,
      "step": 795
    },
    {
      "epoch": 0.5071315372424723,
      "grad_norm": 0.3426638434156249,
      "learning_rate": 0.0001148885891431932,
      "loss": 0.3274,
      "step": 800
    },
    {
      "epoch": 0.5071315372424723,
      "eval_loss": 0.26855266094207764,
      "eval_runtime": 873.628,
      "eval_samples_per_second": 4.579,
      "eval_steps_per_second": 0.572,
      "step": 800
    },
    {
      "epoch": 0.5103011093502378,
      "grad_norm": 0.2681269338020656,
      "learning_rate": 0.00011379306304466198,
      "loss": 0.2381,
      "step": 805
    },
    {
      "epoch": 0.5134706814580031,
      "grad_norm": 0.2987060218422062,
      "learning_rate": 0.00011269584677278102,
      "loss": 0.3076,
      "step": 810
    },
    {
      "epoch": 0.5166402535657686,
      "grad_norm": 0.2804222341073312,
      "learning_rate": 0.00011159707477815755,
      "loss": 0.2395,
      "step": 815
    },
    {
      "epoch": 0.5198098256735341,
      "grad_norm": 0.25835895356413513,
      "learning_rate": 0.00011049688170203383,
      "loss": 0.3041,
      "step": 820
    },
    {
      "epoch": 0.5229793977812995,
      "grad_norm": 0.3313190058494361,
      "learning_rate": 0.00010939540235978845,
      "loss": 0.297,
      "step": 825
    },
    {
      "epoch": 0.526148969889065,
      "grad_norm": 0.2564972143294916,
      "learning_rate": 0.00010829277172441648,
      "loss": 0.2359,
      "step": 830
    },
    {
      "epoch": 0.5293185419968305,
      "grad_norm": 0.31632766018739716,
      "learning_rate": 0.00010718912490998991,
      "loss": 0.3112,
      "step": 835
    },
    {
      "epoch": 0.5324881141045958,
      "grad_norm": 0.2738970193614327,
      "learning_rate": 0.00010608459715510139,
      "loss": 0.2416,
      "step": 840
    },
    {
      "epoch": 0.5356576862123613,
      "grad_norm": 0.35306801364530893,
      "learning_rate": 0.00010497932380629207,
      "loss": 0.3334,
      "step": 845
    },
    {
      "epoch": 0.5388272583201268,
      "grad_norm": 0.3617753781992424,
      "learning_rate": 0.00010387344030146665,
      "loss": 0.3071,
      "step": 850
    },
    {
      "epoch": 0.5419968304278923,
      "grad_norm": 0.284695185318866,
      "learning_rate": 0.0001027670821532971,
      "loss": 0.2516,
      "step": 855
    },
    {
      "epoch": 0.5451664025356577,
      "grad_norm": 0.28641499966999695,
      "learning_rate": 0.00010166038493261722,
      "loss": 0.3268,
      "step": 860
    },
    {
      "epoch": 0.5483359746434231,
      "grad_norm": 0.29940254299061986,
      "learning_rate": 0.00010055348425181,
      "loss": 0.2667,
      "step": 865
    },
    {
      "epoch": 0.5515055467511886,
      "grad_norm": 0.33784906825030664,
      "learning_rate": 9.944651574819003e-05,
      "loss": 0.3006,
      "step": 870
    },
    {
      "epoch": 0.554675118858954,
      "grad_norm": 0.33800198210916443,
      "learning_rate": 9.83396150673828e-05,
      "loss": 0.3009,
      "step": 875
    },
    {
      "epoch": 0.5578446909667195,
      "grad_norm": 0.27814752259908526,
      "learning_rate": 9.72329178467029e-05,
      "loss": 0.25,
      "step": 880
    },
    {
      "epoch": 0.561014263074485,
      "grad_norm": 0.3120985607406773,
      "learning_rate": 9.612655969853336e-05,
      "loss": 0.3079,
      "step": 885
    },
    {
      "epoch": 0.5641838351822503,
      "grad_norm": 0.32270045792226343,
      "learning_rate": 9.502067619370794e-05,
      "loss": 0.2465,
      "step": 890
    },
    {
      "epoch": 0.5673534072900158,
      "grad_norm": 0.2522429392869884,
      "learning_rate": 9.391540284489862e-05,
      "loss": 0.3049,
      "step": 895
    },
    {
      "epoch": 0.5705229793977813,
      "grad_norm": 0.32479021947356745,
      "learning_rate": 9.281087509001011e-05,
      "loss": 0.3109,
      "step": 900
    },
    {
      "epoch": 0.5736925515055468,
      "grad_norm": 0.3071871099500722,
      "learning_rate": 9.170722827558358e-05,
      "loss": 0.2566,
      "step": 905
    },
    {
      "epoch": 0.5768621236133122,
      "grad_norm": 0.2808358292017096,
      "learning_rate": 9.060459764021156e-05,
      "loss": 0.2981,
      "step": 910
    },
    {
      "epoch": 0.5800316957210776,
      "grad_norm": 0.36613518181258947,
      "learning_rate": 8.950311829796619e-05,
      "loss": 0.2812,
      "step": 915
    },
    {
      "epoch": 0.5832012678288431,
      "grad_norm": 0.29120302112196544,
      "learning_rate": 8.840292522184247e-05,
      "loss": 0.2958,
      "step": 920
    },
    {
      "epoch": 0.5863708399366085,
      "grad_norm": 0.3008146054202439,
      "learning_rate": 8.730415322721897e-05,
      "loss": 0.3119,
      "step": 925
    },
    {
      "epoch": 0.589540412044374,
      "grad_norm": 0.30809505125548203,
      "learning_rate": 8.620693695533803e-05,
      "loss": 0.2603,
      "step": 930
    },
    {
      "epoch": 0.5927099841521395,
      "grad_norm": 0.3464042931932695,
      "learning_rate": 8.511141085680683e-05,
      "loss": 0.3217,
      "step": 935
    },
    {
      "epoch": 0.595879556259905,
      "grad_norm": 0.28395404105986655,
      "learning_rate": 8.401770917512221e-05,
      "loss": 0.2339,
      "step": 940
    },
    {
      "epoch": 0.5990491283676703,
      "grad_norm": 0.32456815689823176,
      "learning_rate": 8.292596593022075e-05,
      "loss": 0.2761,
      "step": 945
    },
    {
      "epoch": 0.6022187004754358,
      "grad_norm": 0.35814205267620147,
      "learning_rate": 8.183631490205637e-05,
      "loss": 0.3064,
      "step": 950
    },
    {
      "epoch": 0.6053882725832013,
      "grad_norm": 0.3307025804465351,
      "learning_rate": 8.074888961420695e-05,
      "loss": 0.2317,
      "step": 955
    },
    {
      "epoch": 0.6085578446909667,
      "grad_norm": 0.3035093202164917,
      "learning_rate": 7.966382331751277e-05,
      "loss": 0.3024,
      "step": 960
    },
    {
      "epoch": 0.6117274167987322,
      "grad_norm": 0.23483953416505404,
      "learning_rate": 7.858124897374837e-05,
      "loss": 0.2616,
      "step": 965
    },
    {
      "epoch": 0.6148969889064976,
      "grad_norm": 0.24795445024402282,
      "learning_rate": 7.750129923932939e-05,
      "loss": 0.2889,
      "step": 970
    },
    {
      "epoch": 0.618066561014263,
      "grad_norm": 0.39470726118892546,
      "learning_rate": 7.642410644905726e-05,
      "loss": 0.3255,
      "step": 975
    },
    {
      "epoch": 0.6212361331220285,
      "grad_norm": 0.28578857562483734,
      "learning_rate": 7.534980259990341e-05,
      "loss": 0.2177,
      "step": 980
    },
    {
      "epoch": 0.624405705229794,
      "grad_norm": 0.293120691065387,
      "learning_rate": 7.427851933483418e-05,
      "loss": 0.3008,
      "step": 985
    },
    {
      "epoch": 0.6275752773375595,
      "grad_norm": 0.28050824031198807,
      "learning_rate": 7.321038792667987e-05,
      "loss": 0.2617,
      "step": 990
    },
    {
      "epoch": 0.6307448494453248,
      "grad_norm": 0.3421819179459905,
      "learning_rate": 7.214553926204883e-05,
      "loss": 0.2827,
      "step": 995
    },
    {
      "epoch": 0.6339144215530903,
      "grad_norm": 0.3825000717076991,
      "learning_rate": 7.108410382528879e-05,
      "loss": 0.3171,
      "step": 1000
    },
    {
      "epoch": 0.6339144215530903,
      "eval_loss": 0.2597305178642273,
      "eval_runtime": 873.3574,
      "eval_samples_per_second": 4.58,
      "eval_steps_per_second": 0.573,
      "step": 1000
    },
    {
      "epoch": 0.6370839936608558,
      "grad_norm": 0.293460396656183,
      "learning_rate": 7.002621168249759e-05,
      "loss": 0.2297,
      "step": 1005
    },
    {
      "epoch": 0.6402535657686212,
      "grad_norm": 0.3006160040194,
      "learning_rate": 6.897199246558514e-05,
      "loss": 0.2956,
      "step": 1010
    },
    {
      "epoch": 0.6434231378763867,
      "grad_norm": 0.2791223126874652,
      "learning_rate": 6.792157535638874e-05,
      "loss": 0.2496,
      "step": 1015
    },
    {
      "epoch": 0.6465927099841522,
      "grad_norm": 0.2894662197144813,
      "learning_rate": 6.687508907084319e-05,
      "loss": 0.2866,
      "step": 1020
    },
    {
      "epoch": 0.6497622820919176,
      "grad_norm": 0.33156274133370534,
      "learning_rate": 6.583266184320836e-05,
      "loss": 0.32,
      "step": 1025
    },
    {
      "epoch": 0.652931854199683,
      "grad_norm": 0.3447301699746775,
      "learning_rate": 6.479442141035556e-05,
      "loss": 0.2555,
      "step": 1030
    },
    {
      "epoch": 0.6561014263074485,
      "grad_norm": 0.3019937172628048,
      "learning_rate": 6.376049499611496e-05,
      "loss": 0.2632,
      "step": 1035
    },
    {
      "epoch": 0.659270998415214,
      "grad_norm": 0.25047087286035274,
      "learning_rate": 6.273100929568578e-05,
      "loss": 0.2472,
      "step": 1040
    },
    {
      "epoch": 0.6624405705229794,
      "grad_norm": 0.31801398649186896,
      "learning_rate": 6.170609046011151e-05,
      "loss": 0.2793,
      "step": 1045
    },
    {
      "epoch": 0.6656101426307448,
      "grad_norm": 0.3464523898432614,
      "learning_rate": 6.068586408082133e-05,
      "loss": 0.3138,
      "step": 1050
    },
    {
      "epoch": 0.6687797147385103,
      "grad_norm": 0.2919062799416737,
      "learning_rate": 5.9670455174240614e-05,
      "loss": 0.2427,
      "step": 1055
    },
    {
      "epoch": 0.6719492868462758,
      "grad_norm": 0.29267872629520425,
      "learning_rate": 5.865998816647171e-05,
      "loss": 0.3038,
      "step": 1060
    },
    {
      "epoch": 0.6751188589540412,
      "grad_norm": 0.27361822239828004,
      "learning_rate": 5.765458687804679e-05,
      "loss": 0.2566,
      "step": 1065
    },
    {
      "epoch": 0.6782884310618067,
      "grad_norm": 0.3050132066017946,
      "learning_rate": 5.665437450875534e-05,
      "loss": 0.2752,
      "step": 1070
    },
    {
      "epoch": 0.6814580031695721,
      "grad_norm": 0.3580338711915158,
      "learning_rate": 5.565947362254746e-05,
      "loss": 0.3331,
      "step": 1075
    },
    {
      "epoch": 0.6846275752773375,
      "grad_norm": 0.26747930377415474,
      "learning_rate": 5.467000613251516e-05,
      "loss": 0.2429,
      "step": 1080
    },
    {
      "epoch": 0.687797147385103,
      "grad_norm": 0.32226567868782413,
      "learning_rate": 5.368609328595323e-05,
      "loss": 0.3208,
      "step": 1085
    },
    {
      "epoch": 0.6909667194928685,
      "grad_norm": 0.27314417996148593,
      "learning_rate": 5.270785564950208e-05,
      "loss": 0.2351,
      "step": 1090
    },
    {
      "epoch": 0.694136291600634,
      "grad_norm": 0.31179553442460595,
      "learning_rate": 5.1735413094373594e-05,
      "loss": 0.2791,
      "step": 1095
    },
    {
      "epoch": 0.6973058637083994,
      "grad_norm": 0.2983027582550753,
      "learning_rate": 5.0768884781662465e-05,
      "loss": 0.3123,
      "step": 1100
    },
    {
      "epoch": 0.7004754358161648,
      "grad_norm": 0.268619063810808,
      "learning_rate": 4.9808389147744195e-05,
      "loss": 0.2675,
      "step": 1105
    },
    {
      "epoch": 0.7036450079239303,
      "grad_norm": 0.34151620569667657,
      "learning_rate": 4.885404388976261e-05,
      "loss": 0.3171,
      "step": 1110
    },
    {
      "epoch": 0.7068145800316957,
      "grad_norm": 0.25963093128586956,
      "learning_rate": 4.790596595120699e-05,
      "loss": 0.2533,
      "step": 1115
    },
    {
      "epoch": 0.7099841521394612,
      "grad_norm": 0.3373621924020373,
      "learning_rate": 4.696427150758238e-05,
      "loss": 0.3017,
      "step": 1120
    },
    {
      "epoch": 0.7131537242472267,
      "grad_norm": 0.32633352666577314,
      "learning_rate": 4.6029075952173596e-05,
      "loss": 0.3052,
      "step": 1125
    },
    {
      "epoch": 0.716323296354992,
      "grad_norm": 0.24971258370165642,
      "learning_rate": 4.510049388190518e-05,
      "loss": 0.2044,
      "step": 1130
    },
    {
      "epoch": 0.7194928684627575,
      "grad_norm": 0.29602844393415106,
      "learning_rate": 4.417863908329884e-05,
      "loss": 0.2959,
      "step": 1135
    },
    {
      "epoch": 0.722662440570523,
      "grad_norm": 0.23146594836780063,
      "learning_rate": 4.32636245185304e-05,
      "loss": 0.2252,
      "step": 1140
    },
    {
      "epoch": 0.7258320126782885,
      "grad_norm": 0.2744736835188008,
      "learning_rate": 4.235556231158765e-05,
      "loss": 0.2884,
      "step": 1145
    },
    {
      "epoch": 0.7290015847860539,
      "grad_norm": 0.27538990975844047,
      "learning_rate": 4.145456373453087e-05,
      "loss": 0.2981,
      "step": 1150
    },
    {
      "epoch": 0.7321711568938193,
      "grad_norm": 0.3032208366026702,
      "learning_rate": 4.0560739193857625e-05,
      "loss": 0.2158,
      "step": 1155
    },
    {
      "epoch": 0.7353407290015848,
      "grad_norm": 0.27204457210068295,
      "learning_rate": 3.96741982169742e-05,
      "loss": 0.3028,
      "step": 1160
    },
    {
      "epoch": 0.7385103011093502,
      "grad_norm": 0.28301662262727184,
      "learning_rate": 3.8795049438773825e-05,
      "loss": 0.2946,
      "step": 1165
    },
    {
      "epoch": 0.7416798732171157,
      "grad_norm": 0.2884264535746388,
      "learning_rate": 3.7923400588325155e-05,
      "loss": 0.3015,
      "step": 1170
    },
    {
      "epoch": 0.7448494453248812,
      "grad_norm": 0.3186549926460967,
      "learning_rate": 3.7059358475671224e-05,
      "loss": 0.2773,
      "step": 1175
    },
    {
      "epoch": 0.7480190174326465,
      "grad_norm": 0.2997708530371057,
      "learning_rate": 3.6203028978741226e-05,
      "loss": 0.2469,
      "step": 1180
    },
    {
      "epoch": 0.751188589540412,
      "grad_norm": 0.32430776300917263,
      "learning_rate": 3.535451703037626e-05,
      "loss": 0.2726,
      "step": 1185
    },
    {
      "epoch": 0.7543581616481775,
      "grad_norm": 0.2946578935656507,
      "learning_rate": 3.45139266054715e-05,
      "loss": 0.2645,
      "step": 1190
    },
    {
      "epoch": 0.757527733755943,
      "grad_norm": 0.26638481808591286,
      "learning_rate": 3.368136070823478e-05,
      "loss": 0.2465,
      "step": 1195
    },
    {
      "epoch": 0.7606973058637084,
      "grad_norm": 0.3677636374426017,
      "learning_rate": 3.285692135956515e-05,
      "loss": 0.3034,
      "step": 1200
    },
    {
      "epoch": 0.7606973058637084,
      "eval_loss": 0.2539891302585602,
      "eval_runtime": 873.4669,
      "eval_samples_per_second": 4.579,
      "eval_steps_per_second": 0.572,
      "step": 1200
    },
    {
      "epoch": 0.7638668779714739,
      "grad_norm": 0.29762017072344943,
      "learning_rate": 3.2040709584551095e-05,
      "loss": 0.2547,
      "step": 1205
    },
    {
      "epoch": 0.7670364500792393,
      "grad_norm": 0.35066724794986226,
      "learning_rate": 3.123282540009139e-05,
      "loss": 0.3043,
      "step": 1210
    },
    {
      "epoch": 0.7702060221870047,
      "grad_norm": 0.27108651599825634,
      "learning_rate": 3.0433367802639112e-05,
      "loss": 0.2195,
      "step": 1215
    },
    {
      "epoch": 0.7733755942947702,
      "grad_norm": 0.24030479810127725,
      "learning_rate": 2.9642434756070793e-05,
      "loss": 0.2545,
      "step": 1220
    },
    {
      "epoch": 0.7765451664025357,
      "grad_norm": 0.288327556838552,
      "learning_rate": 2.8860123179682242e-05,
      "loss": 0.2942,
      "step": 1225
    },
    {
      "epoch": 0.7797147385103012,
      "grad_norm": 0.29997783643544385,
      "learning_rate": 2.8086528936312073e-05,
      "loss": 0.2407,
      "step": 1230
    },
    {
      "epoch": 0.7828843106180665,
      "grad_norm": 0.2665313932594352,
      "learning_rate": 2.7321746820595086e-05,
      "loss": 0.2863,
      "step": 1235
    },
    {
      "epoch": 0.786053882725832,
      "grad_norm": 0.24138106294481415,
      "learning_rate": 2.6565870547346196e-05,
      "loss": 0.2443,
      "step": 1240
    },
    {
      "epoch": 0.7892234548335975,
      "grad_norm": 0.27410565336257203,
      "learning_rate": 2.5818992740076873e-05,
      "loss": 0.2714,
      "step": 1245
    },
    {
      "epoch": 0.7923930269413629,
      "grad_norm": 0.3607807135248553,
      "learning_rate": 2.508120491964512e-05,
      "loss": 0.3131,
      "step": 1250
    },
    {
      "epoch": 0.7955625990491284,
      "grad_norm": 0.2752324746545014,
      "learning_rate": 2.435259749304096e-05,
      "loss": 0.2352,
      "step": 1255
    },
    {
      "epoch": 0.7987321711568938,
      "grad_norm": 0.33701412326580854,
      "learning_rate": 2.3633259742307844e-05,
      "loss": 0.3121,
      "step": 1260
    },
    {
      "epoch": 0.8019017432646592,
      "grad_norm": 0.2719696587030905,
      "learning_rate": 2.292327981360245e-05,
      "loss": 0.2569,
      "step": 1265
    },
    {
      "epoch": 0.8050713153724247,
      "grad_norm": 0.321470064394813,
      "learning_rate": 2.222274470639324e-05,
      "loss": 0.2903,
      "step": 1270
    },
    {
      "epoch": 0.8082408874801902,
      "grad_norm": 0.33376441935823614,
      "learning_rate": 2.1531740262800004e-05,
      "loss": 0.2712,
      "step": 1275
    },
    {
      "epoch": 0.8114104595879557,
      "grad_norm": 0.3559808478292093,
      "learning_rate": 2.0850351157074598e-05,
      "loss": 0.2485,
      "step": 1280
    },
    {
      "epoch": 0.8145800316957211,
      "grad_norm": 0.3006799560470683,
      "learning_rate": 2.017866088522541e-05,
      "loss": 0.2735,
      "step": 1285
    },
    {
      "epoch": 0.8177496038034865,
      "grad_norm": 0.27868991819615774,
      "learning_rate": 1.951675175478579e-05,
      "loss": 0.2479,
      "step": 1290
    },
    {
      "epoch": 0.820919175911252,
      "grad_norm": 0.30796745550467525,
      "learning_rate": 1.8864704874728346e-05,
      "loss": 0.2693,
      "step": 1295
    },
    {
      "epoch": 0.8240887480190174,
      "grad_norm": 0.327384705590186,
      "learning_rate": 1.822260014552587e-05,
      "loss": 0.2787,
      "step": 1300
    },
    {
      "epoch": 0.8272583201267829,
      "grad_norm": 0.2993843751525639,
      "learning_rate": 1.7590516249360754e-05,
      "loss": 0.2455,
      "step": 1305
    },
    {
      "epoch": 0.8304278922345484,
      "grad_norm": 0.2979918507317238,
      "learning_rate": 1.6968530640483127e-05,
      "loss": 0.2889,
      "step": 1310
    },
    {
      "epoch": 0.8335974643423137,
      "grad_norm": 0.2942240760065363,
      "learning_rate": 1.6356719535720056e-05,
      "loss": 0.2557,
      "step": 1315
    },
    {
      "epoch": 0.8367670364500792,
      "grad_norm": 0.31698805935759067,
      "learning_rate": 1.5755157905135843e-05,
      "loss": 0.2842,
      "step": 1320
    },
    {
      "epoch": 0.8399366085578447,
      "grad_norm": 0.3795639487558114,
      "learning_rate": 1.5163919462845622e-05,
      "loss": 0.2979,
      "step": 1325
    },
    {
      "epoch": 0.8431061806656102,
      "grad_norm": 0.2933950396246441,
      "learning_rate": 1.4583076657982297e-05,
      "loss": 0.2291,
      "step": 1330
    },
    {
      "epoch": 0.8462757527733756,
      "grad_norm": 0.25934135222761445,
      "learning_rate": 1.401270066581899e-05,
      "loss": 0.2981,
      "step": 1335
    },
    {
      "epoch": 0.849445324881141,
      "grad_norm": 0.2512793866151091,
      "learning_rate": 1.3452861379047287e-05,
      "loss": 0.2299,
      "step": 1340
    },
    {
      "epoch": 0.8526148969889065,
      "grad_norm": 0.27890392188122143,
      "learning_rate": 1.2903627399212747e-05,
      "loss": 0.2714,
      "step": 1345
    },
    {
      "epoch": 0.8557844690966719,
      "grad_norm": 0.3540435753559853,
      "learning_rate": 1.2365066028308547e-05,
      "loss": 0.3208,
      "step": 1350
    },
    {
      "epoch": 0.8589540412044374,
      "grad_norm": 0.3170188652169802,
      "learning_rate": 1.183724326052854e-05,
      "loss": 0.261,
      "step": 1355
    },
    {
      "epoch": 0.8621236133122029,
      "grad_norm": 0.287259110452561,
      "learning_rate": 1.1320223774180428e-05,
      "loss": 0.2918,
      "step": 1360
    },
    {
      "epoch": 0.8652931854199684,
      "grad_norm": 0.3145063929825825,
      "learning_rate": 1.0814070923760178e-05,
      "loss": 0.2562,
      "step": 1365
    },
    {
      "epoch": 0.8684627575277337,
      "grad_norm": 0.29883537499670176,
      "learning_rate": 1.0318846732188737e-05,
      "loss": 0.2585,
      "step": 1370
    },
    {
      "epoch": 0.8716323296354992,
      "grad_norm": 0.33602754178177113,
      "learning_rate": 9.834611883211797e-06,
      "loss": 0.303,
      "step": 1375
    },
    {
      "epoch": 0.8748019017432647,
      "grad_norm": 0.27917699955310804,
      "learning_rate": 9.361425713963878e-06,
      "loss": 0.2399,
      "step": 1380
    },
    {
      "epoch": 0.8779714738510301,
      "grad_norm": 0.29322424380757633,
      "learning_rate": 8.899346207697134e-06,
      "loss": 0.3192,
      "step": 1385
    },
    {
      "epoch": 0.8811410459587956,
      "grad_norm": 0.32716078301472046,
      "learning_rate": 8.448429986676298e-06,
      "loss": 0.256,
      "step": 1390
    },
    {
      "epoch": 0.884310618066561,
      "grad_norm": 0.28468261231564157,
      "learning_rate": 8.00873230524023e-06,
      "loss": 0.2864,
      "step": 1395
    },
    {
      "epoch": 0.8874801901743264,
      "grad_norm": 0.3481974787604397,
      "learning_rate": 7.580307043031232e-06,
      "loss": 0.265,
      "step": 1400
    },
    {
      "epoch": 0.8874801901743264,
      "eval_loss": 0.25099214911460876,
      "eval_runtime": 873.7854,
      "eval_samples_per_second": 4.578,
      "eval_steps_per_second": 0.572,
      "step": 1400
    },
    {
      "epoch": 0.8906497622820919,
      "grad_norm": 0.2756744352775957,
      "learning_rate": 7.163206698392744e-06,
      "loss": 0.2392,
      "step": 1405
    },
    {
      "epoch": 0.8938193343898574,
      "grad_norm": 0.3070714015760399,
      "learning_rate": 6.757482381936264e-06,
      "loss": 0.2722,
      "step": 1410
    },
    {
      "epoch": 0.8969889064976229,
      "grad_norm": 0.2719682030351016,
      "learning_rate": 6.36318381027835e-06,
      "loss": 0.2553,
      "step": 1415
    },
    {
      "epoch": 0.9001584786053882,
      "grad_norm": 0.30754515515844727,
      "learning_rate": 5.980359299948568e-06,
      "loss": 0.2763,
      "step": 1420
    },
    {
      "epoch": 0.9033280507131537,
      "grad_norm": 0.3599613866897873,
      "learning_rate": 5.609055761468707e-06,
      "loss": 0.2987,
      "step": 1425
    },
    {
      "epoch": 0.9064976228209192,
      "grad_norm": 0.26662442413818216,
      "learning_rate": 5.249318693604577e-06,
      "loss": 0.2632,
      "step": 1430
    },
    {
      "epoch": 0.9096671949286846,
      "grad_norm": 0.2965993748242227,
      "learning_rate": 4.901192177790692e-06,
      "loss": 0.2799,
      "step": 1435
    },
    {
      "epoch": 0.9128367670364501,
      "grad_norm": 0.2923839300339188,
      "learning_rate": 4.564718872728568e-06,
      "loss": 0.2464,
      "step": 1440
    },
    {
      "epoch": 0.9160063391442155,
      "grad_norm": 0.3004256474409844,
      "learning_rate": 4.2399400091594154e-06,
      "loss": 0.2775,
      "step": 1445
    },
    {
      "epoch": 0.919175911251981,
      "grad_norm": 0.30636844288189197,
      "learning_rate": 3.926895384811835e-06,
      "loss": 0.2917,
      "step": 1450
    },
    {
      "epoch": 0.9223454833597464,
      "grad_norm": 0.27018058178290905,
      "learning_rate": 3.625623359525099e-06,
      "loss": 0.2522,
      "step": 1455
    },
    {
      "epoch": 0.9255150554675119,
      "grad_norm": 0.3069766309513976,
      "learning_rate": 3.33616085054862e-06,
      "loss": 0.2722,
      "step": 1460
    },
    {
      "epoch": 0.9286846275752774,
      "grad_norm": 0.2673579253849767,
      "learning_rate": 3.0585433280180707e-06,
      "loss": 0.2561,
      "step": 1465
    },
    {
      "epoch": 0.9318541996830428,
      "grad_norm": 0.2688001276727079,
      "learning_rate": 2.792804810609173e-06,
      "loss": 0.2718,
      "step": 1470
    },
    {
      "epoch": 0.9350237717908082,
      "grad_norm": 0.3331860222359942,
      "learning_rate": 2.538977861368874e-06,
      "loss": 0.3163,
      "step": 1475
    },
    {
      "epoch": 0.9381933438985737,
      "grad_norm": 0.2668325932813764,
      "learning_rate": 2.2970935837253182e-06,
      "loss": 0.2393,
      "step": 1480
    },
    {
      "epoch": 0.9413629160063391,
      "grad_norm": 0.3285498156618503,
      "learning_rate": 2.0671816176764058e-06,
      "loss": 0.2862,
      "step": 1485
    },
    {
      "epoch": 0.9445324881141046,
      "grad_norm": 0.36573862269188245,
      "learning_rate": 1.8492701361578324e-06,
      "loss": 0.2447,
      "step": 1490
    },
    {
      "epoch": 0.9477020602218701,
      "grad_norm": 0.2864139944423568,
      "learning_rate": 1.6433858415907278e-06,
      "loss": 0.2777,
      "step": 1495
    },
    {
      "epoch": 0.9508716323296355,
      "grad_norm": 0.323741034773291,
      "learning_rate": 1.4495539626097288e-06,
      "loss": 0.3086,
      "step": 1500
    },
    {
      "epoch": 0.9540412044374009,
      "grad_norm": 0.2857388007026186,
      "learning_rate": 1.2677982509714415e-06,
      "loss": 0.2175,
      "step": 1505
    },
    {
      "epoch": 0.9572107765451664,
      "grad_norm": 0.2813011213045847,
      "learning_rate": 1.0981409786439355e-06,
      "loss": 0.2882,
      "step": 1510
    },
    {
      "epoch": 0.9603803486529319,
      "grad_norm": 0.27685594779071976,
      "learning_rate": 9.40602935077639e-07,
      "loss": 0.23,
      "step": 1515
    },
    {
      "epoch": 0.9635499207606973,
      "grad_norm": 0.278082958417837,
      "learning_rate": 7.952034246577977e-07,
      "loss": 0.2814,
      "step": 1520
    },
    {
      "epoch": 0.9667194928684627,
      "grad_norm": 0.332411253150925,
      "learning_rate": 6.619602643389899e-07,
      "loss": 0.2772,
      "step": 1525
    },
    {
      "epoch": 0.9698890649762282,
      "grad_norm": 0.28541188324654354,
      "learning_rate": 5.408897814618175e-07,
      "loss": 0.2456,
      "step": 1530
    },
    {
      "epoch": 0.9730586370839936,
      "grad_norm": 0.289051402982161,
      "learning_rate": 4.320068117522835e-07,
      "loss": 0.2659,
      "step": 1535
    },
    {
      "epoch": 0.9762282091917591,
      "grad_norm": 0.2896831822321737,
      "learning_rate": 3.35324697503725e-07,
      "loss": 0.2721,
      "step": 1540
    },
    {
      "epoch": 0.9793977812995246,
      "grad_norm": 0.31950029347694936,
      "learning_rate": 2.508552859419977e-07,
      "loss": 0.2622,
      "step": 1545
    },
    {
      "epoch": 0.9825673534072901,
      "grad_norm": 0.33661523682392047,
      "learning_rate": 1.7860892777367133e-07,
      "loss": 0.2731,
      "step": 1550
    },
    {
      "epoch": 0.9857369255150554,
      "grad_norm": 0.2522879758084615,
      "learning_rate": 1.1859447591769934e-07,
      "loss": 0.2291,
      "step": 1555
    },
    {
      "epoch": 0.9889064976228209,
      "grad_norm": 0.2923729662272973,
      "learning_rate": 7.081928442057573e-08,
      "loss": 0.2972,
      "step": 1560
    },
    {
      "epoch": 0.9920760697305864,
      "grad_norm": 0.24814229174923821,
      "learning_rate": 3.5289207555233573e-08,
      "loss": 0.2586,
      "step": 1565
    },
    {
      "epoch": 0.9952456418383518,
      "grad_norm": 0.24322900794711846,
      "learning_rate": 1.2008599103618956e-08,
      "loss": 0.2751,
      "step": 1570
    },
    {
      "epoch": 0.9984152139461173,
      "grad_norm": 0.4374899080765362,
      "learning_rate": 9.803118232398768e-10,
      "loss": 0.2981,
      "step": 1575
    },
    {
      "epoch": 0.9996830427892235,
      "step": 1577,
      "total_flos": 8013042675351552.0,
      "train_loss": 0.3010184336819827,
      "train_runtime": 18281.3669,
      "train_samples_per_second": 1.381,
      "train_steps_per_second": 0.086
    }
  ],
  "logging_steps": 5,
  "max_steps": 1577,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 200,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 8013042675351552.0,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}