{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.9946949602122017,
  "eval_steps": 500,
  "global_step": 282,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    { "epoch": 0.007073386383731211, "grad_norm": 0.4045802652835846, "learning_rate": 0.0, "loss": 2.592, "step": 1 },
    { "epoch": 0.014146772767462422, "grad_norm": 0.4087854325771332, "learning_rate": 0.00018927892607143717, "loss": 2.3663, "step": 2 },
    { "epoch": 0.021220159151193633, "grad_norm": 0.391991525888443, "learning_rate": 0.0003, "loss": 2.3427, "step": 3 },
    { "epoch": 0.028293545534924844, "grad_norm": 0.47497037053108215, "learning_rate": 0.0003, "loss": 2.4095, "step": 4 },
    { "epoch": 0.03536693191865606, "grad_norm": 0.3936399221420288, "learning_rate": 0.0003, "loss": 1.7048, "step": 5 },
    { "epoch": 0.042440318302387266, "grad_norm": 0.6155605316162109, "learning_rate": 0.0003, "loss": 1.8204, "step": 6 },
    { "epoch": 0.04951370468611848, "grad_norm": 0.49080851674079895, "learning_rate": 0.0003, "loss": 1.4646, "step": 7 },
    { "epoch": 0.05658709106984969, "grad_norm": 0.5759713053703308, "learning_rate": 0.0003, "loss": 1.4984, "step": 8 },
    { "epoch": 0.0636604774535809, "grad_norm": 0.5349287390708923, "learning_rate": 0.0003, "loss": 1.3691, "step": 9 },
    { "epoch": 0.07073386383731212, "grad_norm": 0.3948557674884796, "learning_rate": 0.0003, "loss": 1.4401, "step": 10 },
    { "epoch": 0.07780725022104333, "grad_norm": 0.37507522106170654, "learning_rate": 0.0003, "loss": 1.1852, "step": 11 },
    { "epoch": 0.08488063660477453, "grad_norm": 0.32405033707618713, "learning_rate": 0.0003, "loss": 1.051, "step": 12 },
    { "epoch": 0.09195402298850575, "grad_norm": 0.4525175392627716, "learning_rate": 0.0003, "loss": 1.2695, "step": 13 },
    { "epoch": 0.09902740937223696, "grad_norm": 0.42692625522613525, "learning_rate": 0.0003, "loss": 1.1057, "step": 14 },
    { "epoch": 0.10610079575596817, "grad_norm": 0.5049455761909485, "learning_rate": 0.0003, "loss": 1.6851, "step": 15 },
    { "epoch": 0.11317418213969938, "grad_norm": 0.38740119338035583, "learning_rate": 0.0003, "loss": 1.2632, "step": 16 },
    { "epoch": 0.12024756852343059, "grad_norm": 0.3729807138442993, "learning_rate": 0.0003, "loss": 1.2857, "step": 17 },
    { "epoch": 0.1273209549071618, "grad_norm": 0.4548921287059784, "learning_rate": 0.0003, "loss": 1.2233, "step": 18 },
    { "epoch": 0.134394341290893, "grad_norm": 0.4324336051940918, "learning_rate": 0.0003, "loss": 1.1058, "step": 19 },
    { "epoch": 0.14146772767462423, "grad_norm": 0.5775079727172852, "learning_rate": 0.0003, "loss": 1.0475, "step": 20 },
    { "epoch": 0.14854111405835543, "grad_norm": 0.40563157200813293, "learning_rate": 0.0003, "loss": 1.1364, "step": 21 },
    { "epoch": 0.15561450044208666, "grad_norm": 0.4697245657444, "learning_rate": 0.0003, "loss": 1.3599, "step": 22 },
    { "epoch": 0.16268788682581786, "grad_norm": 0.42879530787467957, "learning_rate": 0.0003, "loss": 1.1086, "step": 23 },
    { "epoch": 0.16976127320954906, "grad_norm": 0.42367979884147644, "learning_rate": 0.0003, "loss": 0.9705, "step": 24 },
    { "epoch": 0.1768346595932803, "grad_norm": 0.3987770080566406, "learning_rate": 0.0003, "loss": 1.0087, "step": 25 },
    { "epoch": 0.1839080459770115, "grad_norm": 0.3194337487220764, "learning_rate": 0.0003, "loss": 0.8143, "step": 26 },
    { "epoch": 0.1909814323607427, "grad_norm": 0.3626921474933624, "learning_rate": 0.0003, "loss": 0.9763, "step": 27 },
    { "epoch": 0.19805481874447392, "grad_norm": 0.38496437668800354, "learning_rate": 0.0003, "loss": 0.6315, "step": 28 },
    { "epoch": 0.20512820512820512, "grad_norm": 0.41984379291534424, "learning_rate": 0.0003, "loss": 1.0303, "step": 29 },
    { "epoch": 0.21220159151193635, "grad_norm": 0.4012935161590576, "learning_rate": 0.0003, "loss": 0.9862, "step": 30 },
    { "epoch": 0.21927497789566755, "grad_norm": 0.40578627586364746, "learning_rate": 0.0003, "loss": 1.0094, "step": 31 },
    { "epoch": 0.22634836427939875, "grad_norm": 0.41153454780578613, "learning_rate": 0.0003, "loss": 0.966, "step": 32 },
    { "epoch": 0.23342175066312998, "grad_norm": 0.3835723400115967, "learning_rate": 0.0003, "loss": 0.5704, "step": 33 },
    { "epoch": 0.24049513704686118, "grad_norm": 0.4588032066822052, "learning_rate": 0.0003, "loss": 0.8564, "step": 34 },
    { "epoch": 0.2475685234305924, "grad_norm": 0.42644572257995605, "learning_rate": 0.0003, "loss": 0.8448, "step": 35 },
    { "epoch": 0.2546419098143236, "grad_norm": 0.44491246342658997, "learning_rate": 0.0003, "loss": 1.1484, "step": 36 },
    { "epoch": 0.26171529619805484, "grad_norm": 0.44271302223205566, "learning_rate": 0.0003, "loss": 0.7746, "step": 37 },
    { "epoch": 0.268788682581786, "grad_norm": 0.4080619215965271, "learning_rate": 0.0003, "loss": 0.5377, "step": 38 },
    { "epoch": 0.27586206896551724, "grad_norm": 0.3697488605976105, "learning_rate": 0.0003, "loss": 0.9936, "step": 39 },
    { "epoch": 0.28293545534924847, "grad_norm": 0.37987953424453735, "learning_rate": 0.0003, "loss": 0.7066, "step": 40 },
    { "epoch": 0.29000884173297964, "grad_norm": 0.5652127861976624, "learning_rate": 0.0003, "loss": 0.8813, "step": 41 },
    { "epoch": 0.29708222811671087, "grad_norm": 0.45179855823516846, "learning_rate": 0.0003, "loss": 0.6442, "step": 42 },
    { "epoch": 0.3041556145004421, "grad_norm": 0.40251022577285767, "learning_rate": 0.0003, "loss": 0.6876, "step": 43 },
    { "epoch": 0.3112290008841733, "grad_norm": 0.3425946831703186, "learning_rate": 0.0003, "loss": 0.4759, "step": 44 },
    { "epoch": 0.3183023872679045, "grad_norm": 0.3156929016113281, "learning_rate": 0.0003, "loss": 0.5237, "step": 45 },
    { "epoch": 0.3253757736516357, "grad_norm": 0.5097647309303284, "learning_rate": 0.0003, "loss": 1.0965, "step": 46 },
    { "epoch": 0.33244916003536695, "grad_norm": 0.4245418906211853, "learning_rate": 0.0003, "loss": 0.717, "step": 47 },
    { "epoch": 0.3395225464190981, "grad_norm": 0.36271074414253235, "learning_rate": 0.0003, "loss": 0.925, "step": 48 },
    { "epoch": 0.34659593280282935, "grad_norm": 0.3543199300765991, "learning_rate": 0.0003, "loss": 0.52, "step": 49 },
    { "epoch": 0.3536693191865606, "grad_norm": 0.4760311245918274, "learning_rate": 0.0003, "loss": 0.6514, "step": 50 },
    { "epoch": 0.36074270557029176, "grad_norm": 0.36290043592453003, "learning_rate": 0.0003, "loss": 0.6391, "step": 51 },
    { "epoch": 0.367816091954023, "grad_norm": 0.4390805959701538, "learning_rate": 0.0003, "loss": 0.7822, "step": 52 },
    { "epoch": 0.3748894783377542, "grad_norm": 0.402041494846344, "learning_rate": 0.0003, "loss": 0.5967, "step": 53 },
    { "epoch": 0.3819628647214854, "grad_norm": 0.42580777406692505, "learning_rate": 0.0003, "loss": 0.7591, "step": 54 },
    { "epoch": 0.3890362511052166, "grad_norm": 0.4342993199825287, "learning_rate": 0.0003, "loss": 0.9428, "step": 55 },
    { "epoch": 0.39610963748894784, "grad_norm": 0.42949816584587097, "learning_rate": 0.0003, "loss": 0.6546, "step": 56 },
    { "epoch": 0.40318302387267907, "grad_norm": 0.44655221700668335, "learning_rate": 0.0003, "loss": 0.6999, "step": 57 },
    { "epoch": 0.41025641025641024, "grad_norm": 0.38236895203590393, "learning_rate": 0.0003, "loss": 0.5464, "step": 58 },
    { "epoch": 0.41732979664014147, "grad_norm": 0.39055347442626953, "learning_rate": 0.0003, "loss": 0.8726, "step": 59 },
    { "epoch": 0.4244031830238727, "grad_norm": 0.47743409872055054, "learning_rate": 0.0003, "loss": 0.6839, "step": 60 },
    { "epoch": 0.43147656940760387, "grad_norm": 0.5571391582489014, "learning_rate": 0.0003, "loss": 0.6384, "step": 61 },
    { "epoch": 0.4385499557913351, "grad_norm": 0.4612606465816498, "learning_rate": 0.0003, "loss": 0.8187, "step": 62 },
    { "epoch": 0.44562334217506633, "grad_norm": 0.3999072313308716, "learning_rate": 0.0003, "loss": 0.6792, "step": 63 },
    { "epoch": 0.4526967285587975, "grad_norm": 0.4889736771583557, "learning_rate": 0.0003, "loss": 0.7837, "step": 64 },
    { "epoch": 0.45977011494252873, "grad_norm": 0.4411163628101349, "learning_rate": 0.0003, "loss": 0.7325, "step": 65 },
    { "epoch": 0.46684350132625996, "grad_norm": 0.4137038588523865, "learning_rate": 0.0003, "loss": 0.5974, "step": 66 },
    { "epoch": 0.4739168877099912, "grad_norm": 0.4226423501968384, "learning_rate": 0.0003, "loss": 0.6251, "step": 67 },
    { "epoch": 0.48099027409372236, "grad_norm": 0.4461803734302521, "learning_rate": 0.0003, "loss": 0.5721, "step": 68 },
    { "epoch": 0.4880636604774536, "grad_norm": 0.4135233461856842, "learning_rate": 0.0003, "loss": 0.708, "step": 69 },
    { "epoch": 0.4951370468611848, "grad_norm": 0.40338656306266785, "learning_rate": 0.0003, "loss": 0.6943, "step": 70 },
    { "epoch": 0.502210433244916, "grad_norm": 0.47266095876693726, "learning_rate": 0.0003, "loss": 0.6883, "step": 71 },
    { "epoch": 0.5092838196286472, "grad_norm": 0.45008358359336853, "learning_rate": 0.0003, "loss": 0.6347, "step": 72 },
    { "epoch": 0.5163572060123784, "grad_norm": 0.36589792370796204, "learning_rate": 0.0003, "loss": 0.746, "step": 73 },
    { "epoch": 0.5234305923961097, "grad_norm": 0.36300450563430786, "learning_rate": 0.0003, "loss": 0.7846, "step": 74 },
    { "epoch": 0.5305039787798409, "grad_norm": 0.42305129766464233, "learning_rate": 0.0003, "loss": 0.7909, "step": 75 },
    { "epoch": 0.537577365163572, "grad_norm": 0.36807361245155334, "learning_rate": 0.0003, "loss": 0.578, "step": 76 },
    { "epoch": 0.5446507515473032, "grad_norm": 0.3479249179363251, "learning_rate": 0.0003, "loss": 0.4358, "step": 77 },
    { "epoch": 0.5517241379310345, "grad_norm": 0.4373302161693573, "learning_rate": 0.0003, "loss": 0.8263, "step": 78 },
    { "epoch": 0.5587975243147657, "grad_norm": 0.5427613854408264, "learning_rate": 0.0003, "loss": 0.7728, "step": 79 },
    { "epoch": 0.5658709106984969, "grad_norm": 0.4510067403316498, "learning_rate": 0.0003, "loss": 0.7188, "step": 80 },
    { "epoch": 0.5729442970822282, "grad_norm": 0.3964546322822571, "learning_rate": 0.0003, "loss": 0.6707, "step": 81 },
    { "epoch": 0.5800176834659593, "grad_norm": 0.40177956223487854, "learning_rate": 0.0003, "loss": 0.7056, "step": 82 },
    { "epoch": 0.5870910698496905, "grad_norm": 0.4081084728240967, "learning_rate": 0.0003, "loss": 0.6588, "step": 83 },
    { "epoch": 0.5941644562334217, "grad_norm": 0.3595137298107147, "learning_rate": 0.0003, "loss": 0.6469, "step": 84 },
    { "epoch": 0.601237842617153, "grad_norm": 0.40407031774520874, "learning_rate": 0.0003, "loss": 0.6954, "step": 85 },
    { "epoch": 0.6083112290008842, "grad_norm": 0.47531482577323914, "learning_rate": 0.0003, "loss": 0.5842, "step": 86 },
    { "epoch": 0.6153846153846154, "grad_norm": 0.3669019639492035, "learning_rate": 0.0003, "loss": 0.6278, "step": 87 },
    { "epoch": 0.6224580017683466, "grad_norm": 0.3638778030872345, "learning_rate": 0.0003, "loss": 0.4731, "step": 88 },
    { "epoch": 0.6295313881520778, "grad_norm": 0.39883217215538025, "learning_rate": 0.0003, "loss": 0.6891, "step": 89 },
    { "epoch": 0.636604774535809, "grad_norm": 0.627139687538147, "learning_rate": 0.0003, "loss": 0.58, "step": 90 },
    { "epoch": 0.6436781609195402, "grad_norm": 0.5339258313179016, "learning_rate": 0.0003, "loss": 0.6198, "step": 91 },
    { "epoch": 0.6507515473032714, "grad_norm": 0.4699147939682007, "learning_rate": 0.0003, "loss": 0.7175, "step": 92 },
    { "epoch": 0.6578249336870027, "grad_norm": 0.3144320249557495, "learning_rate": 0.0003, "loss": 0.4438, "step": 93 },
    { "epoch": 0.6648983200707339, "grad_norm": 0.47343114018440247, "learning_rate": 0.0003, "loss": 0.7511, "step": 94 },
    { "epoch": 0.671971706454465, "grad_norm": 0.43690529465675354, "learning_rate": 0.0003, "loss": 0.4847, "step": 95 },
    { "epoch": 0.6790450928381963, "grad_norm": 0.5092759728431702, "learning_rate": 0.0003, "loss": 0.6703, "step": 96 },
    { "epoch": 0.6861184792219275, "grad_norm": 0.7045844793319702, "learning_rate": 0.0003, "loss": 0.717, "step": 97 },
    { "epoch": 0.6931918656056587, "grad_norm": 0.34709087014198303, "learning_rate": 0.0003, "loss": 0.5597, "step": 98 },
    { "epoch": 0.7002652519893899, "grad_norm": 0.39407986402511597, "learning_rate": 0.0003, "loss": 0.5079, "step": 99 },
    { "epoch": 0.7073386383731212, "grad_norm": 0.6836314797401428, "learning_rate": 0.0003, "loss": 0.5947, "step": 100 },
    { "epoch": 0.7144120247568524, "grad_norm": 0.4487530291080475, "learning_rate": 0.0003, "loss": 0.5638, "step": 101 },
    { "epoch": 0.7214854111405835, "grad_norm": 0.34299322962760925, "learning_rate": 0.0003, "loss": 0.4268, "step": 102 },
    { "epoch": 0.7285587975243147, "grad_norm": 0.4325425624847412, "learning_rate": 0.0003, "loss": 0.7195, "step": 103 },
    { "epoch": 0.735632183908046, "grad_norm": 0.3857167959213257, "learning_rate": 0.0003, "loss": 0.5525, "step": 104 },
    { "epoch": 0.7427055702917772, "grad_norm": 0.5439281463623047, "learning_rate": 0.0003, "loss": 0.8488, "step": 105 },
    { "epoch": 0.7497789566755084, "grad_norm": 0.5054299831390381, "learning_rate": 0.0003, "loss": 0.5801, "step": 106 },
    { "epoch": 0.7568523430592397, "grad_norm": 0.5152317881584167, "learning_rate": 0.0003, "loss": 0.6918, "step": 107 },
    { "epoch": 0.7639257294429708, "grad_norm": 0.32669249176979065, "learning_rate": 0.0003, "loss": 0.5322, "step": 108 },
    { "epoch": 0.770999115826702, "grad_norm": 0.4302417039871216, "learning_rate": 0.0003, "loss": 0.6439, "step": 109 },
    { "epoch": 0.7780725022104332, "grad_norm": 0.4388223886489868, "learning_rate": 0.0003, "loss": 0.6196, "step": 110 },
    { "epoch": 0.7851458885941645, "grad_norm": 0.42924442887306213, "learning_rate": 0.0003, "loss": 0.5175, "step": 111 },
    { "epoch": 0.7922192749778957, "grad_norm": 0.4361798167228699, "learning_rate": 0.0003, "loss": 0.5342, "step": 112 },
    { "epoch": 0.7992926613616269, "grad_norm": 0.4133489429950714, "learning_rate": 0.0003, "loss": 0.5639, "step": 113 },
    { "epoch": 0.8063660477453581, "grad_norm": 0.34224194288253784, "learning_rate": 0.0003, "loss": 0.4695, "step": 114 },
    { "epoch": 0.8134394341290893, "grad_norm": 0.4219891428947449, "learning_rate": 0.0003, "loss": 0.6307, "step": 115 },
    { "epoch": 0.8205128205128205, "grad_norm": 0.44273802638053894, "learning_rate": 0.0003, "loss": 0.5475, "step": 116 },
    { "epoch": 0.8275862068965517, "grad_norm": 0.42054426670074463, "learning_rate": 0.0003, "loss": 0.827, "step": 117 },
    { "epoch": 0.8346595932802829, "grad_norm": 0.4792965054512024, "learning_rate": 0.0003, "loss": 0.6, "step": 118 },
    { "epoch": 0.8417329796640142, "grad_norm": 0.5182773470878601, "learning_rate": 0.0003, "loss": 0.8832, "step": 119 },
    { "epoch": 0.8488063660477454, "grad_norm": 0.41087284684181213, "learning_rate": 0.0003, "loss": 0.5825, "step": 120 },
    { "epoch": 0.8558797524314765, "grad_norm": 0.36328765749931335, "learning_rate": 0.0003, "loss": 0.4198, "step": 121 },
    { "epoch": 0.8629531388152077, "grad_norm": 0.43922775983810425, "learning_rate": 0.0003, "loss": 0.5495, "step": 122 },
    { "epoch": 0.870026525198939, "grad_norm": 0.5079771876335144, "learning_rate": 0.0003, "loss": 0.6814, "step": 123 },
    { "epoch": 0.8770999115826702, "grad_norm": 0.3167728781700134, "learning_rate": 0.0003, "loss": 0.5706, "step": 124 },
    { "epoch": 0.8841732979664014, "grad_norm": 0.45660603046417236, "learning_rate": 0.0003, "loss": 0.7102, "step": 125 },
    { "epoch": 0.8912466843501327, "grad_norm": 0.42243629693984985, "learning_rate": 0.0003, "loss": 0.5449, "step": 126 },
    { "epoch": 0.8983200707338639, "grad_norm": 0.32169416546821594, "learning_rate": 0.0003, "loss": 0.3933, "step": 127 },
    { "epoch": 0.905393457117595, "grad_norm": 0.32228872179985046, "learning_rate": 0.0003, "loss": 0.6444, "step": 128 },
    { "epoch": 0.9124668435013262, "grad_norm": 0.47969621419906616, "learning_rate": 0.0003, "loss": 0.7959, "step": 129 },
    { "epoch": 0.9195402298850575, "grad_norm": 0.35543474555015564, "learning_rate": 0.0003, "loss": 0.6535, "step": 130 },
    { "epoch": 0.9266136162687887, "grad_norm": 0.4273511469364166, "learning_rate": 0.0003, "loss": 0.6058, "step": 131 },
    { "epoch": 0.9336870026525199, "grad_norm": 0.3400624692440033, "learning_rate": 0.0003, "loss": 0.6066, "step": 132 },
    { "epoch": 0.9407603890362511, "grad_norm": 0.3195785582065582, "learning_rate": 0.0003, "loss": 0.5878, "step": 133 },
    { "epoch": 0.9478337754199824, "grad_norm": 0.34657567739486694, "learning_rate": 0.0003, "loss": 0.6462, "step": 134 },
    { "epoch": 0.9549071618037135, "grad_norm": 0.4706454873085022, "learning_rate": 0.0003, "loss": 0.8299, "step": 135 },
    { "epoch": 0.9619805481874447, "grad_norm": 0.41353291273117065, "learning_rate": 0.0003, "loss": 0.6372, "step": 136 },
    { "epoch": 0.969053934571176, "grad_norm": 0.34282562136650085, "learning_rate": 0.0003, "loss": 0.5901, "step": 137 },
    { "epoch": 0.9761273209549072, "grad_norm": 0.4154914617538452, "learning_rate": 0.0003, "loss": 0.6213, "step": 138 },
    { "epoch": 0.9832007073386384, "grad_norm": 0.2933409810066223, "learning_rate": 0.0003, "loss": 0.4435, "step": 139 },
    { "epoch": 0.9902740937223696, "grad_norm": 0.3763149082660675, "learning_rate": 0.0003, "loss": 0.4754, "step": 140 },
    { "epoch": 0.9973474801061007, "grad_norm": 0.4369047284126282, "learning_rate": 0.0003, "loss": 0.6313, "step": 141 },
    { "epoch": 1.004420866489832, "grad_norm": 0.40332600474357605, "learning_rate": 0.0003, "loss": 0.4778, "step": 142 },
    { "epoch": 1.0114942528735633, "grad_norm": 0.31336432695388794, "learning_rate": 0.0003, "loss": 0.4599, "step": 143 },
    { "epoch": 1.0185676392572944, "grad_norm": 0.3116231858730316, "learning_rate": 0.0003, "loss": 0.3823, "step": 144 },
    { "epoch": 1.0256410256410255, "grad_norm": 0.47887638211250305, "learning_rate": 0.0003, "loss": 0.4838, "step": 145 },
    { "epoch": 1.032714412024757, "grad_norm": 0.3979848325252533, "learning_rate": 0.0003, "loss": 0.3765, "step": 146 },
    { "epoch": 1.039787798408488, "grad_norm": 0.3911687433719635, "learning_rate": 0.0003, "loss": 0.379, "step": 147 },
    { "epoch": 1.0468611847922193, "grad_norm": 0.41035008430480957, "learning_rate": 0.0003, "loss": 0.4544, "step": 148 },
    { "epoch": 1.0539345711759505, "grad_norm": 0.3448046147823334, "learning_rate": 0.0003, "loss": 0.3809, "step": 149 },
    { "epoch": 1.0610079575596818, "grad_norm": 0.3258429765701294, "learning_rate": 0.0003, "loss": 0.3027, "step": 150 },
    { "epoch": 1.068081343943413, "grad_norm": 0.4393693208694458, "learning_rate": 0.0003, "loss": 0.4825, "step": 151 },
    { "epoch": 1.075154730327144, "grad_norm": 0.29749980568885803, "learning_rate": 0.0003, "loss": 0.2696, "step": 152 },
    { "epoch": 1.0822281167108754, "grad_norm": 0.3464600741863251, "learning_rate": 0.0003, "loss": 0.2812, "step": 153 },
    { "epoch": 1.0893015030946065, "grad_norm": 0.3517362177371979, "learning_rate": 0.0003, "loss": 0.4352, "step": 154 },
    { "epoch": 1.0963748894783378, "grad_norm": 0.3475998640060425, "learning_rate": 0.0003, "loss": 0.3298, "step": 155 },
    { "epoch": 1.103448275862069, "grad_norm": 0.41514718532562256, "learning_rate": 0.0003, "loss": 0.2779, "step": 156 },
    { "epoch": 1.1105216622458003, "grad_norm": 0.38064250349998474, "learning_rate": 0.0003, "loss": 0.3552, "step": 157 },
    { "epoch": 1.1175950486295314, "grad_norm": 0.48406025767326355, "learning_rate": 0.0003, "loss": 0.4691, "step": 158 },
    { "epoch": 1.1246684350132625, "grad_norm": 0.3856564462184906, "learning_rate": 0.0003, "loss": 0.3817, "step": 159 },
    { "epoch": 1.1317418213969939, "grad_norm": 0.40879660844802856, "learning_rate": 0.0003, "loss": 0.3555, "step": 160 },
    { "epoch": 1.138815207780725, "grad_norm": 0.4073532223701477, "learning_rate": 0.0003, "loss": 0.3218, "step": 161 },
    { "epoch": 1.1458885941644563, "grad_norm": 0.5433499217033386, "learning_rate": 0.0003, "loss": 0.4749, "step": 162 },
    { "epoch": 1.1529619805481874, "grad_norm": 0.47047749161720276, "learning_rate": 0.0003, "loss": 0.3945, "step": 163 },
    { "epoch": 1.1600353669319188, "grad_norm": 0.3000759184360504, "learning_rate": 0.0003, "loss": 0.3944, "step": 164 },
    { "epoch": 1.16710875331565, "grad_norm": 0.38655105233192444, "learning_rate": 0.0003, "loss": 0.458, "step": 165 },
    { "epoch": 1.174182139699381, "grad_norm": 0.3441111743450165, "learning_rate": 0.0003, "loss": 0.3388, "step": 166 },
    { "epoch": 1.1812555260831124, "grad_norm": 0.5380314588546753, "learning_rate": 0.0003, "loss": 0.5506, "step": 167 },
    { "epoch": 1.1883289124668435, "grad_norm": 0.2528212070465088, "learning_rate": 0.0003, "loss": 0.3144, "step": 168 },
    { "epoch": 1.1954022988505748, "grad_norm": 0.3783420920372009, "learning_rate": 0.0003, "loss": 0.5596, "step": 169 },
    { "epoch": 1.202475685234306, "grad_norm": 0.3812076449394226, "learning_rate": 0.0003, "loss": 0.42, "step": 170 },
    { "epoch": 1.209549071618037, "grad_norm": 0.43172749876976013, "learning_rate": 0.0003, "loss": 0.4931, "step": 171 },
    { "epoch": 1.2166224580017684, "grad_norm": 0.41426223516464233, "learning_rate": 0.0003, "loss": 0.2998, "step": 172 },
    { "epoch": 1.2236958443854995, "grad_norm": 0.35829058289527893, "learning_rate": 0.0003, "loss": 0.4243, "step": 173 },
    { "epoch": 1.2307692307692308, "grad_norm": 0.4014543294906616, "learning_rate": 0.0003, "loss": 0.3049, "step": 174 },
    { "epoch": 1.237842617152962, "grad_norm": 0.3007238507270813, "learning_rate": 0.0003, "loss": 0.2005, "step": 175 },
    { "epoch": 1.244916003536693, "grad_norm": 0.3595844507217407, "learning_rate": 0.0003, "loss": 0.344, "step": 176 },
    { "epoch": 1.2519893899204244, "grad_norm": 0.34730204939842224, "learning_rate": 0.0003, "loss": 0.2573, "step": 177 },
    { "epoch": 1.2590627763041558, "grad_norm": 0.39390042424201965, "learning_rate": 0.0003, "loss": 0.3177, "step": 178 },
    { "epoch": 1.2661361626878869, "grad_norm": 0.41631364822387695, "learning_rate": 0.0003, "loss": 0.4541, "step": 179 },
    { "epoch": 1.273209549071618, "grad_norm": 0.4117166996002197, "learning_rate": 0.0003, "loss": 0.4597, "step": 180 },
    { "epoch": 1.2802829354553493, "grad_norm": 0.46357792615890503, "learning_rate": 0.0003, "loss": 0.3166, "step": 181 },
    { "epoch": 1.2873563218390804, "grad_norm": 0.31492120027542114, "learning_rate": 0.0003, "loss": 0.2183, "step": 182 },
    { "epoch": 1.2944297082228116, "grad_norm": 0.31738027930259705, "learning_rate": 0.0003, "loss": 0.3114, "step": 183 },
    { "epoch": 1.301503094606543, "grad_norm": 0.37768757343292236, "learning_rate": 0.0003, "loss": 0.2977, "step": 184 },
    { "epoch": 1.308576480990274, "grad_norm": 0.45224347710609436, "learning_rate": 0.0003, "loss": 0.3788, "step": 185 },
    { "epoch": 1.3156498673740054, "grad_norm": 0.42707428336143494, "learning_rate": 0.0003, "loss": 0.3065, "step": 186 },
    { "epoch": 1.3227232537577365, "grad_norm": 0.359110027551651, "learning_rate": 0.0003, "loss": 0.3916, "step": 187 },
    { "epoch": 1.3297966401414678, "grad_norm": 0.4212663173675537, "learning_rate": 0.0003, "loss": 0.592, "step": 188 },
    { "epoch": 1.336870026525199, "grad_norm": 0.4227355122566223, "learning_rate": 0.0003, "loss": 0.4278, "step": 189 },
    { "epoch": 1.34394341290893, "grad_norm": 0.45795100927352905, "learning_rate": 0.0003, "loss": 0.4068, "step": 190 },
    { "epoch": 1.3510167992926614, "grad_norm": 0.47883355617523193, "learning_rate": 0.0003, "loss": 0.5285, "step": 191 },
    { "epoch": 1.3580901856763925, "grad_norm": 0.36151745915412903, "learning_rate": 0.0003, "loss": 0.365, "step": 192 },
    { "epoch": 1.3651635720601238, "grad_norm": 0.38841187953948975, "learning_rate": 0.0003, "loss": 0.4783, "step": 193 },
    { "epoch": 1.372236958443855, "grad_norm": 0.3572918772697449, "learning_rate": 0.0003, "loss": 0.4407, "step": 194 },
    { "epoch": 1.3793103448275863, "grad_norm": 0.36447620391845703, "learning_rate": 0.0003, "loss": 0.3111, "step": 195 },
    { "epoch": 1.3863837312113174, "grad_norm": 0.31043165922164917, "learning_rate": 0.0003, "loss": 0.3809, "step": 196 },
    { "epoch": 1.3934571175950485, "grad_norm": 0.4331524670124054, "learning_rate": 0.0003, "loss": 0.3464, "step": 197 },
    { "epoch": 1.4005305039787799, "grad_norm": 0.5187276005744934, "learning_rate": 0.0003, "loss": 0.4041, "step": 198 },
    { "epoch": 1.407603890362511, "grad_norm": 0.3016161322593689, "learning_rate": 0.0003, "loss": 0.1315, "step": 199 },
    { "epoch": 1.4146772767462423, "grad_norm": 0.3778589069843292, "learning_rate": 0.0003, "loss": 0.2563, "step": 200 },
    { "epoch": 1.4217506631299734, "grad_norm": 0.4542739987373352, "learning_rate": 0.0003, "loss": 0.3676, "step": 201 },
    { "epoch": 1.4288240495137048, "grad_norm": 0.37201106548309326, "learning_rate": 0.0003, "loss": 0.4023, "step": 202 },
    { "epoch": 1.435897435897436, "grad_norm": 0.3098253607749939, "learning_rate": 0.0003, "loss": 0.2013, "step": 203 },
    { "epoch": 1.442970822281167, "grad_norm": 0.41762611269950867, "learning_rate": 0.0003, "loss": 0.2562, "step": 204 },
    { "epoch": 1.4500442086648984, "grad_norm": 0.3805309534072876, "learning_rate": 0.0003, "loss": 0.2091, "step": 205 },
    { "epoch": 1.4571175950486295, "grad_norm": 0.30562469363212585, "learning_rate": 0.0003, "loss": 0.3204, "step": 206 },
    { "epoch": 1.4641909814323608, "grad_norm": 0.40833625197410583, "learning_rate": 0.0003, "loss": 0.3828, "step": 207 },
    { "epoch": 1.471264367816092, "grad_norm": 0.44443726539611816, "learning_rate": 0.0003, "loss": 0.3023, "step": 208 },
    { "epoch": 1.4783377541998233, "grad_norm": 0.3216983675956726, "learning_rate": 0.0003, "loss": 0.148, "step": 209 },
    { "epoch": 1.4854111405835544, "grad_norm": 0.49379777908325195, "learning_rate": 0.0003, "loss": 0.3597, "step": 210 },
    { "epoch": 1.4924845269672855, "grad_norm": 0.41881895065307617, "learning_rate": 0.0003, "loss": 0.3724, "step": 211 },
    { "epoch": 1.4995579133510168, "grad_norm": 0.37855106592178345, "learning_rate": 0.0003, "loss": 0.2177, "step": 212 },
    { "epoch": 1.506631299734748, "grad_norm": 0.4481782615184784, "learning_rate": 0.0003, "loss": 0.4668, "step": 213 },
    { "epoch": 1.513704686118479, "grad_norm": 0.45132726430892944, "learning_rate": 0.0003, "loss": 0.5844, "step": 214 },
    { "epoch": 1.5207780725022104, "grad_norm": 0.4039032459259033, "learning_rate": 0.0003, "loss": 0.411, "step": 215 },
    { "epoch": 1.5278514588859418, "grad_norm": 0.3423170745372772, "learning_rate": 0.0003, "loss": 0.3069, "step": 216 },
    { "epoch": 1.5349248452696729, "grad_norm": 0.3927661180496216, "learning_rate": 0.0003, "loss": 0.5008, "step": 217 },
    { "epoch": 1.541998231653404, "grad_norm": 0.43571972846984863, "learning_rate": 0.0003, "loss": 0.4626, "step": 218 },
    { "epoch": 1.5490716180371353, "grad_norm": 0.370449423789978, "learning_rate": 0.0003, "loss": 0.2882, "step": 219 },
    { "epoch": 1.5561450044208665, "grad_norm": 0.3305343687534332, "learning_rate": 0.0003, "loss": 0.2781, "step": 220 },
    { "epoch": 1.5632183908045976, "grad_norm": 0.40083616971969604, "learning_rate": 0.0003, "loss": 0.2652, "step": 221 },
    { "epoch": 1.570291777188329, "grad_norm": 0.38695937395095825, "learning_rate": 0.0003, "loss": 0.4565, "step": 222 },
    { "epoch": 1.5773651635720602, "grad_norm": 0.5376386046409607, "learning_rate": 0.0003, "loss": 0.4184, "step": 223 },
    { "epoch": 1.5844385499557914, "grad_norm": 0.5290461182594299, "learning_rate": 0.0003, "loss": 0.3836, "step": 224 },
    { "epoch": 1.5915119363395225, "grad_norm": 0.39294925332069397, "learning_rate": 0.0003, "loss": 0.446, "step": 225 },
    { "epoch": 1.5985853227232538, "grad_norm": 0.3946995139122009, "learning_rate": 0.0003, "loss": 0.3433, "step": 226 },
    { "epoch": 1.605658709106985, "grad_norm": 0.3850666880607605, "learning_rate": 0.0003, "loss": 0.515, "step": 227 },
    { "epoch": 1.612732095490716, "grad_norm": 0.3812507688999176, "learning_rate": 0.0003, "loss": 0.4666, "step": 228 },
    { "epoch": 1.6198054818744474, "grad_norm": 0.34343773126602173, "learning_rate": 0.0003, "loss": 0.3437, "step": 229 },
    { "epoch": 1.6268788682581787, "grad_norm": 0.42423132061958313, "learning_rate": 0.0003, "loss": 0.2998, "step": 230 },
    { "epoch": 1.6339522546419099, "grad_norm": 0.36676838994026184, "learning_rate": 0.0003, "loss": 0.381, "step": 231 },
    { "epoch": 1.641025641025641, "grad_norm": 0.45891061425209045, "learning_rate": 0.0003, "loss": 0.4426, "step": 232 },
    { "epoch": 1.6480990274093723, "grad_norm": 0.4290439188480377, "learning_rate": 0.0003, "loss": 0.3475, "step": 233 },
    { "epoch": 1.6551724137931034, "grad_norm": 0.3556974232196808, "learning_rate": 0.0003, "loss": 0.328, "step": 234 },
    { "epoch": 1.6622458001768345, "grad_norm": 0.30578428506851196, "learning_rate": 0.0003, "loss": 0.2591, "step": 235 },
    { "epoch": 1.6693191865605659, "grad_norm": 0.3522488474845886, "learning_rate": 0.0003, "loss": 0.416, "step": 236 },
    { "epoch": 1.6763925729442972, "grad_norm": 0.3940620720386505, "learning_rate": 0.0003, "loss": 0.548, "step": 237 },
    { "epoch": 1.6834659593280283, "grad_norm": 0.4076889455318451, "learning_rate": 0.0003, "loss": 0.5044, "step": 238 },
    { "epoch": 1.6905393457117595, "grad_norm": 0.49337613582611084, "learning_rate": 0.0003, "loss": 0.4355, "step": 239 },
    { "epoch": 1.6976127320954908, "grad_norm": 0.37077927589416504, "learning_rate": 0.0003, "loss": 0.4739, "step": 240 },
    { "epoch": 1.704686118479222, "grad_norm": 0.4110550880432129, "learning_rate": 0.0003, "loss": 0.428, "step": 241 },
    { "epoch": 1.711759504862953, "grad_norm": 0.49631252884864807, "learning_rate": 0.0003, "loss": 0.4227, "step": 242 },
    { "epoch": 1.7188328912466844, "grad_norm": 0.3230995535850525, "learning_rate": 0.0003, "loss": 0.3451, "step": 243 },
    { "epoch": 1.7259062776304157, "grad_norm": 0.36575183272361755, "learning_rate": 0.0003, "loss": 0.2817, "step": 244 },
    { "epoch": 1.7329796640141468, "grad_norm": 0.4187852740287781, "learning_rate": 0.0003, "loss": 0.319, "step": 245 },
    { "epoch": 1.740053050397878, "grad_norm": 0.3224227726459503, "learning_rate": 0.0003, "loss": 0.3406, "step": 246 },
    { "epoch": 1.7471264367816093, "grad_norm": 0.379561185836792, "learning_rate": 0.0003, "loss": 0.3817, "step": 247 },
    { "epoch": 1.7541998231653404, "grad_norm": 0.44703027606010437, "learning_rate": 0.0003, "loss": 0.3879, "step": 248 },
    { "epoch": 1.7612732095490715, "grad_norm": 0.34053027629852295, "learning_rate": 0.0003, "loss": 0.2767, "step": 249 },
    { "epoch": 1.7683465959328029, "grad_norm": 0.48519593477249146, "learning_rate": 0.0003, "loss": 0.5043, "step": 250 },
    { "epoch": 1.7754199823165342, "grad_norm": 0.3466756045818329, "learning_rate": 0.0003, "loss": 0.2593, "step": 251 },
    { "epoch": 1.782493368700265, "grad_norm": 0.5155137777328491, "learning_rate": 0.0003, "loss": 0.3529, "step": 252 },
    { "epoch": 1.7895667550839964, "grad_norm": 0.4184979796409607, "learning_rate": 0.0003, "loss": 0.535, "step": 253 },
    { "epoch": 1.7966401414677278, "grad_norm": 0.3188352882862091, "learning_rate": 0.0003, "loss": 0.2358, "step": 254 },
    { "epoch": 1.8037135278514589, "grad_norm": 0.42813432216644287, "learning_rate": 0.0003, "loss": 0.374, "step": 255 },
    { "epoch": 1.81078691423519, "grad_norm": 0.40070992708206177, "learning_rate": 0.0003, "loss": 0.4326, "step": 256 },
    { "epoch": 1.8178603006189213, "grad_norm": 0.45408982038497925, "learning_rate": 0.0003, "loss": 0.4945, "step": 257 },
    { "epoch": 1.8249336870026527, "grad_norm": 0.42870137095451355, "learning_rate": 0.0003, "loss": 0.4528, "step": 258 },
    { "epoch": 1.8320070733863836, "grad_norm": 0.3272749185562134, "learning_rate": 0.0003, "loss": 0.2587, "step": 259 },
    { "epoch": 1.839080459770115, "grad_norm": 0.4601209759712219, "learning_rate": 0.0003, "loss": 0.5043, "step": 260 },
    { "epoch": 1.8461538461538463, "grad_norm": 0.48971623182296753, "learning_rate": 0.0003, "loss": 0.4837, "step": 261 },
    { "epoch": 1.8532272325375774, "grad_norm": 0.37702813744544983, "learning_rate": 0.0003, "loss": 0.421, "step": 262 },
    { "epoch": 1.8603006189213085, "grad_norm": 0.37648722529411316, "learning_rate": 0.0003, "loss": 0.2666, "step": 263 },
    { "epoch": 1.8673740053050398, "grad_norm": 0.5787553787231445, "learning_rate": 0.0003, "loss": 0.2987, "step": 264 },
    { "epoch": 1.874447391688771, "grad_norm": 0.4249975085258484, "learning_rate": 0.0003, "loss": 0.5577, "step": 265 },
    { "epoch": 1.881520778072502, "grad_norm": 0.3846690356731415, "learning_rate": 0.0003, "loss": 0.3106, "step": 266 },
    { "epoch": 1.8885941644562334, "grad_norm": 0.37595272064208984, "learning_rate": 0.0003, "loss": 0.3638, "step": 267 },
    { "epoch": 1.8956675508399647, "grad_norm": 0.4609120190143585, "learning_rate": 0.0003, "loss": 0.4356, "step": 268 },
    { "epoch": 1.9027409372236959, "grad_norm": 0.3405689299106598, "learning_rate": 0.0003, "loss": 0.3113, "step": 269 },
    { "epoch": 1.909814323607427, "grad_norm": 0.30769774317741394, "learning_rate": 0.0003, "loss": 0.2626, "step": 270 },
    { "epoch": 1.9168877099911583, "grad_norm": 0.36806437373161316, "learning_rate": 0.0003, "loss": 0.401, "step": 271 },
    { "epoch": 1.9239610963748894, "grad_norm": 0.45491501688957214, "learning_rate": 0.0003, "loss": 0.4295, "step": 272 },
    { "epoch": 1.9310344827586206, "grad_norm": 0.3272283971309662, "learning_rate": 0.0003, "loss": 0.3143, "step": 273 },
    { "epoch": 1.938107869142352, "grad_norm": 0.32763826847076416, "learning_rate": 0.0003, "loss": 0.246, "step": 274 },
    { "epoch": 1.9451812555260832, "grad_norm": 0.43065381050109863, "learning_rate": 0.0003, "loss": 0.3338, "step": 275 },
    { "epoch": 1.9522546419098143, "grad_norm": 0.43713968992233276, "learning_rate": 0.0003, "loss": 0.3136, "step": 276 },
    { "epoch": 1.9593280282935455, "grad_norm": 0.2735891342163086, "learning_rate": 0.0003, "loss": 0.2381, "step": 277 },
    { "epoch": 1.9664014146772768, "grad_norm": 0.3156580626964569, "learning_rate": 0.0003, "loss": 0.3336, "step": 278 },
    { "epoch": 1.973474801061008, "grad_norm": 0.4958134591579437, "learning_rate": 0.0003, "loss": 0.5279, "step": 279 },
    { "epoch": 1.980548187444739, "grad_norm": 0.41325512528419495, "learning_rate": 0.0003, "loss": 0.3997, "step": 280 },
    { "epoch": 1.9876215738284704, "grad_norm": 0.29986992478370667, "learning_rate": 0.0003, "loss": 0.2996, "step": 281 },
    { "epoch": 1.9946949602122017, "grad_norm": 0.3219819962978363, "learning_rate": 0.0003, "loss": 0.2875, "step": 282 },
    { "epoch": 1.9946949602122017, "step": 282, "total_flos": 1.061363392708608e+16, "train_loss": 0.5953954255327265, "train_runtime": 9564.3104, "train_samples_per_second": 0.473, "train_steps_per_second": 0.029 }
  ],
  "logging_steps": 1.0,
  "max_steps": 282,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 1000,
  "total_flos": 1.061363392708608e+16,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}