|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.9996828417380272, |
|
"eval_steps": 500, |
|
"global_step": 1576, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0006343165239454488, |
|
"grad_norm": 4.072216572643748, |
|
"learning_rate": 6.329113924050633e-06, |
|
"loss": 3.2618, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.003171582619727244, |
|
"grad_norm": 3.6569951400637444, |
|
"learning_rate": 3.1645569620253167e-05, |
|
"loss": 3.3026, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.006343165239454488, |
|
"grad_norm": 2.7101736291338634, |
|
"learning_rate": 6.329113924050633e-05, |
|
"loss": 3.1879, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.009514747859181731, |
|
"grad_norm": 0.8292602783930105, |
|
"learning_rate": 9.49367088607595e-05, |
|
"loss": 2.9431, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.012686330478908976, |
|
"grad_norm": 0.4014812685185947, |
|
"learning_rate": 0.00012658227848101267, |
|
"loss": 2.7946, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.01585791309863622, |
|
"grad_norm": 0.3318705203128404, |
|
"learning_rate": 0.00015822784810126583, |
|
"loss": 2.6345, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.019029495718363463, |
|
"grad_norm": 0.4987722149702972, |
|
"learning_rate": 0.000189873417721519, |
|
"loss": 2.547, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.022201078338090707, |
|
"grad_norm": 0.3868479554032603, |
|
"learning_rate": 0.00022151898734177215, |
|
"loss": 2.462, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.02537266095781795, |
|
"grad_norm": 0.3440357488186946, |
|
"learning_rate": 0.00025316455696202533, |
|
"loss": 2.4119, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.028544243577545196, |
|
"grad_norm": 0.2542091158949379, |
|
"learning_rate": 0.0002848101265822785, |
|
"loss": 2.3337, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.03171582619727244, |
|
"grad_norm": 0.5028112177822042, |
|
"learning_rate": 0.00031645569620253165, |
|
"loss": 2.3079, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.034887408816999685, |
|
"grad_norm": 0.3647071858886548, |
|
"learning_rate": 0.00034810126582278487, |
|
"loss": 2.2772, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.038058991436726926, |
|
"grad_norm": 0.3365279113242041, |
|
"learning_rate": 0.000379746835443038, |
|
"loss": 2.2643, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.041230574056454174, |
|
"grad_norm": 0.3437920074360608, |
|
"learning_rate": 0.0004113924050632912, |
|
"loss": 2.2467, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.044402156676181415, |
|
"grad_norm": 0.2180989181512006, |
|
"learning_rate": 0.0004430379746835443, |
|
"loss": 2.2192, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.047573739295908656, |
|
"grad_norm": 0.3740832044447792, |
|
"learning_rate": 0.00047468354430379745, |
|
"loss": 2.2202, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.0507453219156359, |
|
"grad_norm": 0.4042428788012064, |
|
"learning_rate": 0.0005063291139240507, |
|
"loss": 2.206, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.053916904535363144, |
|
"grad_norm": 0.5502810429404877, |
|
"learning_rate": 0.0005379746835443038, |
|
"loss": 2.1792, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.05708848715509039, |
|
"grad_norm": 0.9725610364599878, |
|
"learning_rate": 0.000569620253164557, |
|
"loss": 2.1717, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.06026006977481763, |
|
"grad_norm": 0.4008236462318082, |
|
"learning_rate": 0.0006012658227848101, |
|
"loss": 2.1581, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.06343165239454487, |
|
"grad_norm": 1.1094243654374898, |
|
"learning_rate": 0.0006329113924050633, |
|
"loss": 2.1497, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.06660323501427212, |
|
"grad_norm": 0.257873202758346, |
|
"learning_rate": 0.0006645569620253165, |
|
"loss": 2.1357, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.06977481763399937, |
|
"grad_norm": 0.4530053303085577, |
|
"learning_rate": 0.0006962025316455697, |
|
"loss": 2.1319, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.0729464002537266, |
|
"grad_norm": 0.237617906265262, |
|
"learning_rate": 0.0007278481012658228, |
|
"loss": 2.114, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.07611798287345385, |
|
"grad_norm": 1.309988153168323, |
|
"learning_rate": 0.000759493670886076, |
|
"loss": 2.101, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.0792895654931811, |
|
"grad_norm": 0.33147972360135136, |
|
"learning_rate": 0.0007911392405063291, |
|
"loss": 2.0983, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.08246114811290835, |
|
"grad_norm": 0.6923953247791184, |
|
"learning_rate": 0.0008227848101265824, |
|
"loss": 2.0775, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.08563273073263558, |
|
"grad_norm": 0.6108397955993198, |
|
"learning_rate": 0.0008544303797468354, |
|
"loss": 2.076, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.08880431335236283, |
|
"grad_norm": 0.25451367249745316, |
|
"learning_rate": 0.0008860759493670886, |
|
"loss": 2.0613, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.09197589597209008, |
|
"grad_norm": 0.5696237036165042, |
|
"learning_rate": 0.0009177215189873418, |
|
"loss": 2.0571, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.09514747859181731, |
|
"grad_norm": 0.47600183429795934, |
|
"learning_rate": 0.0009493670886075949, |
|
"loss": 2.0597, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.09831906121154456, |
|
"grad_norm": 0.49360558045014563, |
|
"learning_rate": 0.0009810126582278482, |
|
"loss": 2.0276, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.1014906438312718, |
|
"grad_norm": 0.2619769061768294, |
|
"learning_rate": 0.0009999950915251159, |
|
"loss": 2.049, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.10466222645099905, |
|
"grad_norm": 0.25933552218187667, |
|
"learning_rate": 0.0009999398722894419, |
|
"loss": 2.0304, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.10783380907072629, |
|
"grad_norm": 0.334897031747453, |
|
"learning_rate": 0.0009998233050230736, |
|
"loss": 2.0144, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.11100539169045354, |
|
"grad_norm": 0.31275160877263003, |
|
"learning_rate": 0.0009996454040300758, |
|
"loss": 1.9773, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.11417697431018078, |
|
"grad_norm": 0.5307747748580993, |
|
"learning_rate": 0.0009994061911408245, |
|
"loss": 1.9863, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.11734855692990802, |
|
"grad_norm": 0.4066591897491239, |
|
"learning_rate": 0.0009991056957093295, |
|
"loss": 1.9812, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.12052013954963527, |
|
"grad_norm": 0.40898061678871916, |
|
"learning_rate": 0.0009987439546096308, |
|
"loss": 1.9983, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.12369172216936251, |
|
"grad_norm": 0.6281156992876311, |
|
"learning_rate": 0.0009983210122312745, |
|
"loss": 1.9663, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.12686330478908975, |
|
"grad_norm": 0.418957878268899, |
|
"learning_rate": 0.000997836920473866, |
|
"loss": 1.9443, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.130034887408817, |
|
"grad_norm": 0.2967784254205414, |
|
"learning_rate": 0.000997291738740701, |
|
"loss": 1.9496, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.13320647002854424, |
|
"grad_norm": 0.32659030796363414, |
|
"learning_rate": 0.0009966855339314756, |
|
"loss": 1.9394, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.1363780526482715, |
|
"grad_norm": 0.44336971944165454, |
|
"learning_rate": 0.0009960183804340781, |
|
"loss": 1.9274, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.13954963526799874, |
|
"grad_norm": 0.7513841493295751, |
|
"learning_rate": 0.0009952903601154596, |
|
"loss": 1.937, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.142721217887726, |
|
"grad_norm": 0.8840919710762163, |
|
"learning_rate": 0.0009945015623115897, |
|
"loss": 1.9222, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.1458928005074532, |
|
"grad_norm": 0.3088103069995084, |
|
"learning_rate": 0.000993652083816491, |
|
"loss": 1.9272, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.14906438312718046, |
|
"grad_norm": 0.3205509173648325, |
|
"learning_rate": 0.0009927420288703658, |
|
"loss": 1.9282, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.1522359657469077, |
|
"grad_norm": 0.5725328480987164, |
|
"learning_rate": 0.0009917715091467998, |
|
"loss": 1.9092, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.15540754836663495, |
|
"grad_norm": 0.6881811737021382, |
|
"learning_rate": 0.000990740643739063, |
|
"loss": 1.9257, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.1585791309863622, |
|
"grad_norm": 0.3928160747689014, |
|
"learning_rate": 0.000989649559145493, |
|
"loss": 1.9075, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.16175071360608945, |
|
"grad_norm": 0.3457762887590973, |
|
"learning_rate": 0.000988498389253972, |
|
"loss": 1.8954, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.1649222962258167, |
|
"grad_norm": 0.6130106535523941, |
|
"learning_rate": 0.0009872872753254995, |
|
"loss": 1.8869, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.16809387884554391, |
|
"grad_norm": 0.2006935276789736, |
|
"learning_rate": 0.0009860163659768566, |
|
"loss": 1.8764, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.17126546146527116, |
|
"grad_norm": 0.2519600708036085, |
|
"learning_rate": 0.0009846858171623687, |
|
"loss": 1.8592, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.1744370440849984, |
|
"grad_norm": 0.21430434382675823, |
|
"learning_rate": 0.0009832957921547696, |
|
"loss": 1.8588, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.17760862670472566, |
|
"grad_norm": 0.7316176065198735, |
|
"learning_rate": 0.000981846461525165, |
|
"loss": 1.8442, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.1807802093244529, |
|
"grad_norm": 0.5438158046657656, |
|
"learning_rate": 0.0009803380031221018, |
|
"loss": 1.8681, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.18395179194418015, |
|
"grad_norm": 0.22290789589006946, |
|
"learning_rate": 0.000978770602049745, |
|
"loss": 1.8342, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.1871233745639074, |
|
"grad_norm": 0.2561355818352734, |
|
"learning_rate": 0.0009771444506451621, |
|
"loss": 1.8408, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.19029495718363462, |
|
"grad_norm": 0.3381776052738623, |
|
"learning_rate": 0.0009754597484547223, |
|
"loss": 1.829, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.19346653980336187, |
|
"grad_norm": 0.2267569653346989, |
|
"learning_rate": 0.0009737167022096094, |
|
"loss": 1.8283, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.19663812242308912, |
|
"grad_norm": 0.23165580428938548, |
|
"learning_rate": 0.0009719155258004541, |
|
"loss": 1.8071, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.19980970504281637, |
|
"grad_norm": 0.25586494282771377, |
|
"learning_rate": 0.0009700564402510871, |
|
"loss": 1.8145, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.2029812876625436, |
|
"grad_norm": 0.2540371949506308, |
|
"learning_rate": 0.0009681396736914168, |
|
"loss": 1.8015, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.20615287028227086, |
|
"grad_norm": 0.6388348558478815, |
|
"learning_rate": 0.0009661654613294355, |
|
"loss": 1.8127, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.2093244529019981, |
|
"grad_norm": 0.3864015903258655, |
|
"learning_rate": 0.0009641340454223575, |
|
"loss": 1.7935, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.21249603552172533, |
|
"grad_norm": 0.2751489116810319, |
|
"learning_rate": 0.0009620456752468903, |
|
"loss": 1.8058, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.21566761814145258, |
|
"grad_norm": 0.7422837235361598, |
|
"learning_rate": 0.0009599006070686467, |
|
"loss": 1.7927, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.21883920076117983, |
|
"grad_norm": 0.4534291854575538, |
|
"learning_rate": 0.0009576991041106973, |
|
"loss": 1.7927, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.22201078338090707, |
|
"grad_norm": 0.3583139746684532, |
|
"learning_rate": 0.0009554414365212709, |
|
"loss": 1.7883, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.22518236600063432, |
|
"grad_norm": 0.20634161577455162, |
|
"learning_rate": 0.0009531278813406046, |
|
"loss": 1.7637, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.22835394862036157, |
|
"grad_norm": 0.5462716024749192, |
|
"learning_rate": 0.000950758722466947, |
|
"loss": 1.7823, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.23152553124008882, |
|
"grad_norm": 0.20847302955466993, |
|
"learning_rate": 0.0009483342506217214, |
|
"loss": 1.7736, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.23469711385981604, |
|
"grad_norm": 0.21809684764751344, |
|
"learning_rate": 0.0009458547633138515, |
|
"loss": 1.7636, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.23786869647954328, |
|
"grad_norm": 0.19220401784144317, |
|
"learning_rate": 0.0009433205648032528, |
|
"loss": 1.7509, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.24104027909927053, |
|
"grad_norm": 0.273271874095809, |
|
"learning_rate": 0.0009407319660634979, |
|
"loss": 1.7488, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.24421186171899778, |
|
"grad_norm": 0.31458786826276625, |
|
"learning_rate": 0.0009380892847436555, |
|
"loss": 1.7342, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.24738344433872503, |
|
"grad_norm": 0.19789392284188642, |
|
"learning_rate": 0.0009353928451293121, |
|
"loss": 1.743, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.2505550269584523, |
|
"grad_norm": 0.24499888411428472, |
|
"learning_rate": 0.0009326429781027789, |
|
"loss": 1.7193, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.2537266095781795, |
|
"grad_norm": 0.33173879702411524, |
|
"learning_rate": 0.0009298400211024877, |
|
"loss": 1.729, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.25689819219790677, |
|
"grad_norm": 0.34879621136110506, |
|
"learning_rate": 0.0009269843180815853, |
|
"loss": 1.7241, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.260069774817634, |
|
"grad_norm": 0.20572899222991778, |
|
"learning_rate": 0.0009240762194657253, |
|
"loss": 1.7229, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.26324135743736127, |
|
"grad_norm": 0.21561079654661294, |
|
"learning_rate": 0.0009211160821100679, |
|
"loss": 1.7155, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.2664129400570885, |
|
"grad_norm": 0.443393553505543, |
|
"learning_rate": 0.0009181042692554893, |
|
"loss": 1.7111, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.2695845226768157, |
|
"grad_norm": 0.2266683853424827, |
|
"learning_rate": 0.0009150411504840086, |
|
"loss": 1.7009, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.272756105296543, |
|
"grad_norm": 0.3582735361142464, |
|
"learning_rate": 0.000911927101673436, |
|
"loss": 1.7016, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.2759276879162702, |
|
"grad_norm": 0.43342116834945776, |
|
"learning_rate": 0.0009087625049512488, |
|
"loss": 1.7037, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.2790992705359975, |
|
"grad_norm": 0.3295875571024751, |
|
"learning_rate": 0.0009055477486476991, |
|
"loss": 1.682, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.2822708531557247, |
|
"grad_norm": 0.1891978276034803, |
|
"learning_rate": 0.0009022832272481627, |
|
"loss": 1.6899, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.285442435775452, |
|
"grad_norm": 0.26615608448970285, |
|
"learning_rate": 0.000898969341344731, |
|
"loss": 1.6909, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.2886140183951792, |
|
"grad_norm": 0.26554406802462666, |
|
"learning_rate": 0.0008956064975870544, |
|
"loss": 1.6764, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.2917856010149064, |
|
"grad_norm": 0.20008546513645153, |
|
"learning_rate": 0.0008921951086324411, |
|
"loss": 1.6571, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.2949571836346337, |
|
"grad_norm": 0.25575390463894654, |
|
"learning_rate": 0.0008887355930952202, |
|
"loss": 1.6636, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.2981287662543609, |
|
"grad_norm": 0.3501161922386378, |
|
"learning_rate": 0.0008852283754953732, |
|
"loss": 1.657, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.3013003488740882, |
|
"grad_norm": 0.20707308621635875, |
|
"learning_rate": 0.0008816738862064412, |
|
"loss": 1.6659, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.3044719314938154, |
|
"grad_norm": 0.2572060719794171, |
|
"learning_rate": 0.0008780725614027123, |
|
"loss": 1.6521, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.3076435141135427, |
|
"grad_norm": 0.2773641851176988, |
|
"learning_rate": 0.000874424843005699, |
|
"loss": 1.6545, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.3108150967332699, |
|
"grad_norm": 0.5151669199508683, |
|
"learning_rate": 0.0008707311786299099, |
|
"loss": 1.6512, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.3139866793529971, |
|
"grad_norm": 0.35976330322294225, |
|
"learning_rate": 0.0008669920215279222, |
|
"loss": 1.6489, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.3171582619727244, |
|
"grad_norm": 0.18626964833018503, |
|
"learning_rate": 0.0008632078305347623, |
|
"loss": 1.6292, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.3203298445924516, |
|
"grad_norm": 0.26834931489718644, |
|
"learning_rate": 0.0008593790700116029, |
|
"loss": 1.6244, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 0.3235014272121789, |
|
"grad_norm": 0.24263398553664595, |
|
"learning_rate": 0.0008555062097887796, |
|
"loss": 1.6173, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.3266730098319061, |
|
"grad_norm": 0.20406088273168801, |
|
"learning_rate": 0.0008515897251081384, |
|
"loss": 1.6273, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 0.3298445924516334, |
|
"grad_norm": 0.18611145873310309, |
|
"learning_rate": 0.0008476300965647186, |
|
"loss": 1.5954, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.3330161750713606, |
|
"grad_norm": 0.2455194958653317, |
|
"learning_rate": 0.0008436278100477775, |
|
"loss": 1.6284, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.33618775769108783, |
|
"grad_norm": 0.3089123130431731, |
|
"learning_rate": 0.0008395833566811676, |
|
"loss": 1.6043, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.3393593403108151, |
|
"grad_norm": 0.278226103442385, |
|
"learning_rate": 0.0008354972327630705, |
|
"loss": 1.5991, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 0.3425309229305423, |
|
"grad_norm": 0.40283564646452896, |
|
"learning_rate": 0.000831369939705094, |
|
"loss": 1.5942, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.3457025055502696, |
|
"grad_norm": 0.667879925495927, |
|
"learning_rate": 0.0008272019839707461, |
|
"loss": 1.5968, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 0.3488740881699968, |
|
"grad_norm": 0.34787949832129605, |
|
"learning_rate": 0.0008229938770132843, |
|
"loss": 1.5815, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.3520456707897241, |
|
"grad_norm": 0.2584482438633063, |
|
"learning_rate": 0.0008187461352129555, |
|
"loss": 1.5884, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 0.3552172534094513, |
|
"grad_norm": 0.22742176340374293, |
|
"learning_rate": 0.0008144592798136309, |
|
"loss": 1.5919, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.35838883602917854, |
|
"grad_norm": 0.42255520990843093, |
|
"learning_rate": 0.0008101338368588436, |
|
"loss": 1.5913, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 0.3615604186489058, |
|
"grad_norm": 0.3824113474293145, |
|
"learning_rate": 0.0008057703371272366, |
|
"loss": 1.5611, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.36473200126863303, |
|
"grad_norm": 0.2091017723989841, |
|
"learning_rate": 0.0008013693160674316, |
|
"loss": 1.5626, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 0.3679035838883603, |
|
"grad_norm": 0.23814734243563393, |
|
"learning_rate": 0.0007969313137323229, |
|
"loss": 1.5656, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.37107516650808753, |
|
"grad_norm": 0.2597004458168679, |
|
"learning_rate": 0.0007924568747128076, |
|
"loss": 1.5624, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 0.3742467491278148, |
|
"grad_norm": 0.2949069544402481, |
|
"learning_rate": 0.0007879465480709576, |
|
"loss": 1.5516, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.377418331747542, |
|
"grad_norm": 0.21263382790898516, |
|
"learning_rate": 0.0007834008872726453, |
|
"loss": 1.5409, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 0.38058991436726924, |
|
"grad_norm": 0.27681275720229476, |
|
"learning_rate": 0.0007788204501196254, |
|
"loss": 1.5507, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.3837614969869965, |
|
"grad_norm": 0.5196324383707882, |
|
"learning_rate": 0.000774205798681088, |
|
"loss": 1.5435, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 0.38693307960672374, |
|
"grad_norm": 0.3397151418636398, |
|
"learning_rate": 0.000769557499224686, |
|
"loss": 1.5292, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.390104662226451, |
|
"grad_norm": 0.21757261564984298, |
|
"learning_rate": 0.0007648761221470481, |
|
"loss": 1.5342, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 0.39327624484617824, |
|
"grad_norm": 0.23799713493080946, |
|
"learning_rate": 0.000760162241903785, |
|
"loss": 1.5314, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.3964478274659055, |
|
"grad_norm": 0.20955913102047505, |
|
"learning_rate": 0.0007554164369389975, |
|
"loss": 1.5149, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 0.39961941008563273, |
|
"grad_norm": 0.19465193626198848, |
|
"learning_rate": 0.0007506392896142951, |
|
"loss": 1.514, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.40279099270535995, |
|
"grad_norm": 0.37370015455345407, |
|
"learning_rate": 0.0007458313861373336, |
|
"loss": 1.5138, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 0.4059625753250872, |
|
"grad_norm": 0.2112845859224254, |
|
"learning_rate": 0.0007409933164898818, |
|
"loss": 1.5024, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.40913415794481445, |
|
"grad_norm": 0.24626397881644146, |
|
"learning_rate": 0.0007361256743554241, |
|
"loss": 1.519, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 0.4123057405645417, |
|
"grad_norm": 0.3216374157044185, |
|
"learning_rate": 0.0007312290570463083, |
|
"loss": 1.5039, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.41547732318426894, |
|
"grad_norm": 0.22302629969432056, |
|
"learning_rate": 0.0007263040654304502, |
|
"loss": 1.494, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 0.4186489058039962, |
|
"grad_norm": 0.2675317557830398, |
|
"learning_rate": 0.0007213513038575998, |
|
"loss": 1.4884, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.42182048842372344, |
|
"grad_norm": 0.2905992631967741, |
|
"learning_rate": 0.0007163713800851811, |
|
"loss": 1.4851, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 0.42499207104345066, |
|
"grad_norm": 0.20033257450058217, |
|
"learning_rate": 0.0007113649052037139, |
|
"loss": 1.475, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.42816365366317793, |
|
"grad_norm": 0.24204591478150614, |
|
"learning_rate": 0.0007063324935618264, |
|
"loss": 1.4854, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 0.43133523628290515, |
|
"grad_norm": 0.2121430223132248, |
|
"learning_rate": 0.0007012747626908679, |
|
"loss": 1.4867, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.43450681890263243, |
|
"grad_norm": 0.22539040426730952, |
|
"learning_rate": 0.0006961923332291309, |
|
"loss": 1.467, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 0.43767840152235965, |
|
"grad_norm": 0.22695514383581045, |
|
"learning_rate": 0.0006910858288456921, |
|
"loss": 1.4657, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.4408499841420869, |
|
"grad_norm": 0.22184474318285244, |
|
"learning_rate": 0.0006859558761638819, |
|
"loss": 1.4423, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 0.44402156676181415, |
|
"grad_norm": 0.29575466467956124, |
|
"learning_rate": 0.0006808031046843901, |
|
"loss": 1.4485, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.44719314938154137, |
|
"grad_norm": 0.21488007270980458, |
|
"learning_rate": 0.0006756281467080205, |
|
"loss": 1.4508, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 0.45036473200126864, |
|
"grad_norm": 0.38667469007287536, |
|
"learning_rate": 0.0006704316372580989, |
|
"loss": 1.4459, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.45353631462099586, |
|
"grad_norm": 0.5234661249173684, |
|
"learning_rate": 0.0006652142140025517, |
|
"loss": 1.435, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 0.45670789724072314, |
|
"grad_norm": 0.37462414488518325, |
|
"learning_rate": 0.0006599765171756538, |
|
"loss": 1.4379, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.45987947986045036, |
|
"grad_norm": 0.3040640640559466, |
|
"learning_rate": 0.0006547191894994679, |
|
"loss": 1.4341, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 0.46305106248017763, |
|
"grad_norm": 0.28687145037107376, |
|
"learning_rate": 0.0006494428761049736, |
|
"loss": 1.4297, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.46622264509990485, |
|
"grad_norm": 0.20728566940658133, |
|
"learning_rate": 0.0006441482244529037, |
|
"loss": 1.4124, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 0.4693942277196321, |
|
"grad_norm": 0.21956352378213645, |
|
"learning_rate": 0.0006388358842542938, |
|
"loss": 1.4162, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.47256581033935935, |
|
"grad_norm": 0.20961137895482168, |
|
"learning_rate": 0.0006335065073907551, |
|
"loss": 1.4055, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 0.47573739295908657, |
|
"grad_norm": 0.21979161995613117, |
|
"learning_rate": 0.0006281607478344823, |
|
"loss": 1.4112, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.47890897557881384, |
|
"grad_norm": 0.2707420648256881, |
|
"learning_rate": 0.0006227992615680033, |
|
"loss": 1.4127, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 0.48208055819854106, |
|
"grad_norm": 0.2829526420993808, |
|
"learning_rate": 0.000617422706503684, |
|
"loss": 1.3905, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.48525214081826834, |
|
"grad_norm": 0.2988909342739172, |
|
"learning_rate": 0.0006120317424029943, |
|
"loss": 1.3941, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 0.48842372343799556, |
|
"grad_norm": 0.2787477024270155, |
|
"learning_rate": 0.0006066270307955492, |
|
"loss": 1.404, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.4915953060577228, |
|
"grad_norm": 0.22142539860110755, |
|
"learning_rate": 0.000601209234897931, |
|
"loss": 1.3886, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 0.49476688867745006, |
|
"grad_norm": 0.2777507592841434, |
|
"learning_rate": 0.0005957790195323064, |
|
"loss": 1.3896, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.4979384712971773, |
|
"grad_norm": 0.2552429702914411, |
|
"learning_rate": 0.0005903370510448447, |
|
"loss": 1.3779, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 0.5011100539169046, |
|
"grad_norm": 0.268362490404369, |
|
"learning_rate": 0.0005848839972239511, |
|
"loss": 1.3732, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.5042816365366318, |
|
"grad_norm": 0.24326700144489616, |
|
"learning_rate": 0.0005794205272183205, |
|
"loss": 1.3748, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 0.507453219156359, |
|
"grad_norm": 0.36345823855975784, |
|
"learning_rate": 0.0005739473114548266, |
|
"loss": 1.3755, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.5106248017760863, |
|
"grad_norm": 0.4843512449452226, |
|
"learning_rate": 0.000568465021556253, |
|
"loss": 1.3638, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 0.5137963843958135, |
|
"grad_norm": 0.28627979209509896, |
|
"learning_rate": 0.0005629743302588779, |
|
"loss": 1.3514, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.5169679670155407, |
|
"grad_norm": 0.25100043970170133, |
|
"learning_rate": 0.0005574759113299217, |
|
"loss": 1.341, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 0.520139549635268, |
|
"grad_norm": 0.294501664896367, |
|
"learning_rate": 0.0005519704394848692, |
|
"loss": 1.3323, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.5233111322549953, |
|
"grad_norm": 0.2382026692611784, |
|
"learning_rate": 0.0005464585903046744, |
|
"loss": 1.3483, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 0.5264827148747225, |
|
"grad_norm": 0.3233277006590301, |
|
"learning_rate": 0.0005409410401528587, |
|
"loss": 1.3275, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.5296542974944497, |
|
"grad_norm": 0.27343633383605254, |
|
"learning_rate": 0.0005354184660925148, |
|
"loss": 1.3379, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 0.532825880114177, |
|
"grad_norm": 0.23005655087591242, |
|
"learning_rate": 0.0005298915458032233, |
|
"loss": 1.3213, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.5359974627339043, |
|
"grad_norm": 0.22247227894179622, |
|
"learning_rate": 0.0005243609574978941, |
|
"loss": 1.3295, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 0.5391690453536314, |
|
"grad_norm": 0.30014284045451645, |
|
"learning_rate": 0.0005188273798395424, |
|
"loss": 1.3214, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.5423406279733587, |
|
"grad_norm": 0.3132385853038301, |
|
"learning_rate": 0.0005132914918580093, |
|
"loss": 1.3172, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 0.545512210593086, |
|
"grad_norm": 0.33728113255378034, |
|
"learning_rate": 0.0005077539728666374, |
|
"loss": 1.3218, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.5486837932128132, |
|
"grad_norm": 0.25874007616270794, |
|
"learning_rate": 0.0005022155023789121, |
|
"loss": 1.2957, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 0.5518553758325404, |
|
"grad_norm": 0.24203527103405384, |
|
"learning_rate": 0.0004966767600250775, |
|
"loss": 1.3035, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.5550269584522677, |
|
"grad_norm": 0.21381303917505012, |
|
"learning_rate": 0.0004911384254687388, |
|
"loss": 1.2995, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 0.558198541071995, |
|
"grad_norm": 0.24304896972645837, |
|
"learning_rate": 0.00048560117832345984, |
|
"loss": 1.2824, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.5613701236917221, |
|
"grad_norm": 0.3080366389357483, |
|
"learning_rate": 0.0004800656980693674, |
|
"loss": 1.2898, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 0.5645417063114494, |
|
"grad_norm": 0.262557897437375, |
|
"learning_rate": 0.00047453266396977174, |
|
"loss": 1.2779, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.5677132889311767, |
|
"grad_norm": 0.31083455423034645, |
|
"learning_rate": 0.00046900275498781347, |
|
"loss": 1.2806, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 0.570884871550904, |
|
"grad_norm": 0.21597926838814396, |
|
"learning_rate": 0.00046347664970314723, |
|
"loss": 1.274, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.5740564541706311, |
|
"grad_norm": 0.22596235970597578, |
|
"learning_rate": 0.0004579550262286731, |
|
"loss": 1.2666, |
|
"step": 905 |
|
}, |
|
{ |
|
"epoch": 0.5772280367903584, |
|
"grad_norm": 0.22827094422158484, |
|
"learning_rate": 0.0004524385621273246, |
|
"loss": 1.2583, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.5803996194100857, |
|
"grad_norm": 0.24853325436526866, |
|
"learning_rate": 0.00044692793432892387, |
|
"loss": 1.2693, |
|
"step": 915 |
|
}, |
|
{ |
|
"epoch": 0.5835712020298128, |
|
"grad_norm": 0.2765479869012326, |
|
"learning_rate": 0.00044142381904711624, |
|
"loss": 1.26, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.5867427846495401, |
|
"grad_norm": 0.27285996236330706, |
|
"learning_rate": 0.00043592689169639034, |
|
"loss": 1.246, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 0.5899143672692674, |
|
"grad_norm": 0.28781941328826144, |
|
"learning_rate": 0.0004304378268091982, |
|
"loss": 1.249, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.5930859498889947, |
|
"grad_norm": 0.240504157977766, |
|
"learning_rate": 0.0004249572979531822, |
|
"loss": 1.2534, |
|
"step": 935 |
|
}, |
|
{ |
|
"epoch": 0.5962575325087218, |
|
"grad_norm": 0.341483100362183, |
|
"learning_rate": 0.0004194859776485216, |
|
"loss": 1.2376, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.5994291151284491, |
|
"grad_norm": 0.27130765824409686, |
|
"learning_rate": 0.0004140245372854065, |
|
"loss": 1.2426, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 0.6026006977481764, |
|
"grad_norm": 0.28496801994375115, |
|
"learning_rate": 0.0004085736470416516, |
|
"loss": 1.2347, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.6057722803679035, |
|
"grad_norm": 0.33820479660283104, |
|
"learning_rate": 0.00040313397580045765, |
|
"loss": 1.2397, |
|
"step": 955 |
|
}, |
|
{ |
|
"epoch": 0.6089438629876308, |
|
"grad_norm": 0.2537502852561033, |
|
"learning_rate": 0.0003977061910683325, |
|
"loss": 1.2319, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.6121154456073581, |
|
"grad_norm": 0.2543562572422921, |
|
"learning_rate": 0.0003922909588931808, |
|
"loss": 1.2221, |
|
"step": 965 |
|
}, |
|
{ |
|
"epoch": 0.6152870282270854, |
|
"grad_norm": 0.28194628415561285, |
|
"learning_rate": 0.0003868889437825724, |
|
"loss": 1.2213, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.6184586108468125, |
|
"grad_norm": 0.26751445743912233, |
|
"learning_rate": 0.0003815008086222007, |
|
"loss": 1.211, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 0.6216301934665398, |
|
"grad_norm": 0.22966413613029274, |
|
"learning_rate": 0.0003761272145945388, |
|
"loss": 1.2058, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.6248017760862671, |
|
"grad_norm": 0.24668142278345986, |
|
"learning_rate": 0.0003707688210977055, |
|
"loss": 1.2223, |
|
"step": 985 |
|
}, |
|
{ |
|
"epoch": 0.6279733587059942, |
|
"grad_norm": 0.23811743937781157, |
|
"learning_rate": 0.00036542628566455025, |
|
"loss": 1.2024, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.6311449413257215, |
|
"grad_norm": 0.2901121774163334, |
|
"learning_rate": 0.0003601002638819665, |
|
"loss": 1.2036, |
|
"step": 995 |
|
}, |
|
{ |
|
"epoch": 0.6343165239454488, |
|
"grad_norm": 0.2600410825499236, |
|
"learning_rate": 0.0003547914093104439, |
|
"loss": 1.2012, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.6374881065651761, |
|
"grad_norm": 0.352563938838776, |
|
"learning_rate": 0.0003495003734038697, |
|
"loss": 1.1751, |
|
"step": 1005 |
|
}, |
|
{ |
|
"epoch": 0.6406596891849032, |
|
"grad_norm": 0.26125000772161344, |
|
"learning_rate": 0.00034422780542958827, |
|
"loss": 1.1919, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.6438312718046305, |
|
"grad_norm": 0.2640437019043301, |
|
"learning_rate": 0.00033897435238872874, |
|
"loss": 1.1781, |
|
"step": 1015 |
|
}, |
|
{ |
|
"epoch": 0.6470028544243578, |
|
"grad_norm": 0.2782272386225361, |
|
"learning_rate": 0.00033374065893681127, |
|
"loss": 1.1821, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.650174437044085, |
|
"grad_norm": 0.24555576527657738, |
|
"learning_rate": 0.0003285273673046409, |
|
"loss": 1.1721, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 0.6533460196638122, |
|
"grad_norm": 0.40556770599075914, |
|
"learning_rate": 0.00032333511721949817, |
|
"loss": 1.1679, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.6565176022835395, |
|
"grad_norm": 0.25906084663363754, |
|
"learning_rate": 0.00031816454582663856, |
|
"loss": 1.1567, |
|
"step": 1035 |
|
}, |
|
{ |
|
"epoch": 0.6596891849032668, |
|
"grad_norm": 0.27183164743159954, |
|
"learning_rate": 0.0003130162876111074, |
|
"loss": 1.1596, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.6628607675229939, |
|
"grad_norm": 0.24394077297020256, |
|
"learning_rate": 0.0003078909743198817, |
|
"loss": 1.1487, |
|
"step": 1045 |
|
}, |
|
{ |
|
"epoch": 0.6660323501427212, |
|
"grad_norm": 0.23339037702881532, |
|
"learning_rate": 0.000302789234884348, |
|
"loss": 1.1636, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.6692039327624485, |
|
"grad_norm": 0.2651227300122355, |
|
"learning_rate": 0.00029771169534312583, |
|
"loss": 1.1475, |
|
"step": 1055 |
|
}, |
|
{ |
|
"epoch": 0.6723755153821757, |
|
"grad_norm": 0.23719809453094406, |
|
"learning_rate": 0.000292658978765246, |
|
"loss": 1.1496, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.6755470980019029, |
|
"grad_norm": 0.31466276172538943, |
|
"learning_rate": 0.000287631705173693, |
|
"loss": 1.1404, |
|
"step": 1065 |
|
}, |
|
{ |
|
"epoch": 0.6787186806216302, |
|
"grad_norm": 0.2657362830496313, |
|
"learning_rate": 0.00028263049146932153, |
|
"loss": 1.156, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.6818902632413575, |
|
"grad_norm": 0.2353420135393821, |
|
"learning_rate": 0.00027765595135515673, |
|
"loss": 1.1382, |
|
"step": 1075 |
|
}, |
|
{ |
|
"epoch": 0.6850618458610847, |
|
"grad_norm": 0.29180017450918116, |
|
"learning_rate": 0.00027270869526108506, |
|
"loss": 1.1403, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.6882334284808119, |
|
"grad_norm": 0.28381426741820764, |
|
"learning_rate": 0.000267789330268949, |
|
"loss": 1.1351, |
|
"step": 1085 |
|
}, |
|
{ |
|
"epoch": 0.6914050111005392, |
|
"grad_norm": 0.2368326399732858, |
|
"learning_rate": 0.00026289846003805075, |
|
"loss": 1.1264, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.6945765937202664, |
|
"grad_norm": 0.24260754741892487, |
|
"learning_rate": 0.0002580366847310774, |
|
"loss": 1.1318, |
|
"step": 1095 |
|
}, |
|
{ |
|
"epoch": 0.6977481763399936, |
|
"grad_norm": 0.33032504483698477, |
|
"learning_rate": 0.0002532046009404537, |
|
"loss": 1.123, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.7009197589597209, |
|
"grad_norm": 0.2626626593890248, |
|
"learning_rate": 0.00024840280161513446, |
|
"loss": 1.1147, |
|
"step": 1105 |
|
}, |
|
{ |
|
"epoch": 0.7040913415794482, |
|
"grad_norm": 0.24734490639888912, |
|
"learning_rate": 0.0002436318759878432, |
|
"loss": 1.1141, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.7072629241991754, |
|
"grad_norm": 0.25777344330608626, |
|
"learning_rate": 0.00023889240950276602, |
|
"loss": 1.1069, |
|
"step": 1115 |
|
}, |
|
{ |
|
"epoch": 0.7104345068189026, |
|
"grad_norm": 0.24965316567346824, |
|
"learning_rate": 0.00023418498374371268, |
|
"loss": 1.0961, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.7136060894386299, |
|
"grad_norm": 0.2588175173420704, |
|
"learning_rate": 0.0002295101763627483, |
|
"loss": 1.1062, |
|
"step": 1125 |
|
}, |
|
{ |
|
"epoch": 0.7167776720583571, |
|
"grad_norm": 0.2617691894820057, |
|
"learning_rate": 0.00022486856100931146, |
|
"loss": 1.0949, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.7199492546780843, |
|
"grad_norm": 0.24640261475787326, |
|
"learning_rate": 0.00022026070725981867, |
|
"loss": 1.1024, |
|
"step": 1135 |
|
}, |
|
{ |
|
"epoch": 0.7231208372978116, |
|
"grad_norm": 0.25512789636052857, |
|
"learning_rate": 0.0002156871805477732, |
|
"loss": 1.0981, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.7262924199175389, |
|
"grad_norm": 0.2380744076277497, |
|
"learning_rate": 0.00021114854209437889, |
|
"loss": 1.0803, |
|
"step": 1145 |
|
}, |
|
{ |
|
"epoch": 0.7294640025372661, |
|
"grad_norm": 0.26280691701304987, |
|
"learning_rate": 0.00020664534883967311, |
|
"loss": 1.0851, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.7326355851569933, |
|
"grad_norm": 0.2565430586115982, |
|
"learning_rate": 0.00020217815337418427, |
|
"loss": 1.076, |
|
"step": 1155 |
|
}, |
|
{ |
|
"epoch": 0.7358071677767206, |
|
"grad_norm": 0.2533752282987412, |
|
"learning_rate": 0.00019774750387112174, |
|
"loss": 1.0826, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.7389787503964478, |
|
"grad_norm": 0.28126577459530283, |
|
"learning_rate": 0.00019335394401911082, |
|
"loss": 1.0719, |
|
"step": 1165 |
|
}, |
|
{ |
|
"epoch": 0.7421503330161751, |
|
"grad_norm": 0.2650849332335025, |
|
"learning_rate": 0.00018899801295547476, |
|
"loss": 1.0742, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.7453219156359023, |
|
"grad_norm": 0.2603829852111257, |
|
"learning_rate": 0.00018468024520007764, |
|
"loss": 1.0772, |
|
"step": 1175 |
|
}, |
|
{ |
|
"epoch": 0.7484934982556296, |
|
"grad_norm": 0.2527087543783394, |
|
"learning_rate": 0.00018040117058973316, |
|
"loss": 1.0595, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.7516650808753568, |
|
"grad_norm": 0.24678722431639855, |
|
"learning_rate": 0.0001761613142131867, |
|
"loss": 1.0469, |
|
"step": 1185 |
|
}, |
|
{ |
|
"epoch": 0.754836663495084, |
|
"grad_norm": 0.25910814410326344, |
|
"learning_rate": 0.00017196119634668293, |
|
"loss": 1.0627, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.7580082461148113, |
|
"grad_norm": 0.26173306347429054, |
|
"learning_rate": 0.00016780133239012075, |
|
"loss": 1.0607, |
|
"step": 1195 |
|
}, |
|
{ |
|
"epoch": 0.7611798287345385, |
|
"grad_norm": 0.24651016867032868, |
|
"learning_rate": 0.0001636822328038095, |
|
"loss": 1.0546, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.7643514113542658, |
|
"grad_norm": 0.28020957707447064, |
|
"learning_rate": 0.00015960440304582858, |
|
"loss": 1.0579, |
|
"step": 1205 |
|
}, |
|
{ |
|
"epoch": 0.767522993973993, |
|
"grad_norm": 0.26371476098524943, |
|
"learning_rate": 0.00015556834351000354, |
|
"loss": 1.0537, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.7706945765937203, |
|
"grad_norm": 0.24623457163199874, |
|
"learning_rate": 0.0001515745494645019, |
|
"loss": 1.045, |
|
"step": 1215 |
|
}, |
|
{ |
|
"epoch": 0.7738661592134475, |
|
"grad_norm": 0.3155558400199454, |
|
"learning_rate": 0.0001476235109910576, |
|
"loss": 1.0405, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.7770377418331748, |
|
"grad_norm": 0.2789622570986826, |
|
"learning_rate": 0.00014371571292483393, |
|
"loss": 1.0381, |
|
"step": 1225 |
|
}, |
|
{ |
|
"epoch": 0.780209324452902, |
|
"grad_norm": 0.2409114498230053, |
|
"learning_rate": 0.0001398516347949284, |
|
"loss": 1.0394, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.7833809070726292, |
|
"grad_norm": 0.267235707838601, |
|
"learning_rate": 0.0001360317507655293, |
|
"loss": 1.0278, |
|
"step": 1235 |
|
}, |
|
{ |
|
"epoch": 0.7865524896923565, |
|
"grad_norm": 0.28458374786381546, |
|
"learning_rate": 0.00013225652957773044, |
|
"loss": 1.0326, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.7897240723120837, |
|
"grad_norm": 0.25695712786415686, |
|
"learning_rate": 0.00012852643449201212, |
|
"loss": 1.023, |
|
"step": 1245 |
|
}, |
|
{ |
|
"epoch": 0.792895654931811, |
|
"grad_norm": 0.2590457354954553, |
|
"learning_rate": 0.0001248419232313938, |
|
"loss": 1.0232, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.7960672375515382, |
|
"grad_norm": 0.2715843775728456, |
|
"learning_rate": 0.000121203447925266, |
|
"loss": 1.0287, |
|
"step": 1255 |
|
}, |
|
{ |
|
"epoch": 0.7992388201712655, |
|
"grad_norm": 0.2398511137856279, |
|
"learning_rate": 0.00011761145505391024, |
|
"loss": 1.0186, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.8024104027909927, |
|
"grad_norm": 0.27281371167245233, |
|
"learning_rate": 0.00011406638539370979, |
|
"loss": 1.0224, |
|
"step": 1265 |
|
}, |
|
{ |
|
"epoch": 0.8055819854107199, |
|
"grad_norm": 0.3188098317762095, |
|
"learning_rate": 0.00011056867396306292, |
|
"loss": 1.0092, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 0.8087535680304472, |
|
"grad_norm": 0.3265540754130617, |
|
"learning_rate": 0.00010711874996900023, |
|
"loss": 1.0104, |
|
"step": 1275 |
|
}, |
|
{ |
|
"epoch": 0.8119251506501745, |
|
"grad_norm": 0.2607401452606644, |
|
"learning_rate": 0.00010371703675451733, |
|
"loss": 1.0114, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.8150967332699017, |
|
"grad_norm": 0.2883090335939891, |
|
"learning_rate": 0.0001003639517466256, |
|
"loss": 1.0093, |
|
"step": 1285 |
|
}, |
|
{ |
|
"epoch": 0.8182683158896289, |
|
"grad_norm": 0.25562259108760305, |
|
"learning_rate": 9.705990640512907e-05, |
|
"loss": 0.9938, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.8214398985093562, |
|
"grad_norm": 0.2753573095600564, |
|
"learning_rate": 9.380530617213456e-05, |
|
"loss": 1.0114, |
|
"step": 1295 |
|
}, |
|
{ |
|
"epoch": 0.8246114811290834, |
|
"grad_norm": 0.23998112779723507, |
|
"learning_rate": 9.060055042229881e-05, |
|
"loss": 1.0089, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.8277830637488106, |
|
"grad_norm": 0.2524204007518801, |
|
"learning_rate": 8.74460324138216e-05, |
|
"loss": 1.007, |
|
"step": 1305 |
|
}, |
|
{ |
|
"epoch": 0.8309546463685379, |
|
"grad_norm": 0.2526736715480949, |
|
"learning_rate": 8.434213924018835e-05, |
|
"loss": 1.0, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 0.8341262289882652, |
|
"grad_norm": 0.2503643121035892, |
|
"learning_rate": 8.128925178266927e-05, |
|
"loss": 0.9965, |
|
"step": 1315 |
|
}, |
|
{ |
|
"epoch": 0.8372978116079924, |
|
"grad_norm": 0.23244697445873563, |
|
"learning_rate": 7.828774466358179e-05, |
|
"loss": 0.9988, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.8404693942277196, |
|
"grad_norm": 0.2560633353876119, |
|
"learning_rate": 7.53379862003195e-05, |
|
"loss": 1.0048, |
|
"step": 1325 |
|
}, |
|
{ |
|
"epoch": 0.8436409768474469, |
|
"grad_norm": 0.2421969390657872, |
|
"learning_rate": 7.244033836015695e-05, |
|
"loss": 0.9844, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 0.8468125594671742, |
|
"grad_norm": 0.2607793031238756, |
|
"learning_rate": 6.95951567158305e-05, |
|
"loss": 0.9778, |
|
"step": 1335 |
|
}, |
|
{ |
|
"epoch": 0.8499841420869013, |
|
"grad_norm": 0.2698408656759126, |
|
"learning_rate": 6.680279040190746e-05, |
|
"loss": 0.9828, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.8531557247066286, |
|
"grad_norm": 0.23841421968692497, |
|
"learning_rate": 6.406358207194224e-05, |
|
"loss": 0.9991, |
|
"step": 1345 |
|
}, |
|
{ |
|
"epoch": 0.8563273073263559, |
|
"grad_norm": 0.28084531889088754, |
|
"learning_rate": 6.137786785642985e-05, |
|
"loss": 0.9855, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.8594988899460831, |
|
"grad_norm": 0.24806562901660065, |
|
"learning_rate": 5.8745977321558786e-05, |
|
"loss": 0.9747, |
|
"step": 1355 |
|
}, |
|
{ |
|
"epoch": 0.8626704725658103, |
|
"grad_norm": 0.2511271472454086, |
|
"learning_rate": 5.616823342876931e-05, |
|
"loss": 0.9758, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.8658420551855376, |
|
"grad_norm": 0.24241834259427655, |
|
"learning_rate": 5.364495249512336e-05, |
|
"loss": 0.9765, |
|
"step": 1365 |
|
}, |
|
{ |
|
"epoch": 0.8690136378052649, |
|
"grad_norm": 0.23883983545121304, |
|
"learning_rate": 5.11764441544883e-05, |
|
"loss": 0.9808, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 0.872185220424992, |
|
"grad_norm": 0.24435079120648112, |
|
"learning_rate": 4.8763011319542025e-05, |
|
"loss": 0.9777, |
|
"step": 1375 |
|
}, |
|
{ |
|
"epoch": 0.8753568030447193, |
|
"grad_norm": 0.2629272614210174, |
|
"learning_rate": 4.6404950144602e-05, |
|
"loss": 0.9819, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.8785283856644466, |
|
"grad_norm": 0.2663926565440222, |
|
"learning_rate": 4.4102549989283756e-05, |
|
"loss": 0.9675, |
|
"step": 1385 |
|
}, |
|
{ |
|
"epoch": 0.8816999682841739, |
|
"grad_norm": 0.2431315966754426, |
|
"learning_rate": 4.1856093382994067e-05, |
|
"loss": 0.9617, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 0.884871550903901, |
|
"grad_norm": 0.2615985135082817, |
|
"learning_rate": 3.966585599026051e-05, |
|
"loss": 0.9705, |
|
"step": 1395 |
|
}, |
|
{ |
|
"epoch": 0.8880431335236283, |
|
"grad_norm": 0.2437571495377507, |
|
"learning_rate": 3.753210657690537e-05, |
|
"loss": 0.9637, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.8912147161433556, |
|
"grad_norm": 0.2867683320851404, |
|
"learning_rate": 3.5455106977064555e-05, |
|
"loss": 0.9813, |
|
"step": 1405 |
|
}, |
|
{ |
|
"epoch": 0.8943862987630827, |
|
"grad_norm": 0.23596873266775822, |
|
"learning_rate": 3.343511206105804e-05, |
|
"loss": 0.9654, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 0.89755788138281, |
|
"grad_norm": 0.2691994741352151, |
|
"learning_rate": 3.147236970411449e-05, |
|
"loss": 0.9559, |
|
"step": 1415 |
|
}, |
|
{ |
|
"epoch": 0.9007294640025373, |
|
"grad_norm": 0.25848494724563204, |
|
"learning_rate": 2.9567120755953858e-05, |
|
"loss": 0.9631, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 0.9039010466222646, |
|
"grad_norm": 0.2329147999513356, |
|
"learning_rate": 2.7719599011233333e-05, |
|
"loss": 0.9654, |
|
"step": 1425 |
|
}, |
|
{ |
|
"epoch": 0.9070726292419917, |
|
"grad_norm": 0.24897956262619586, |
|
"learning_rate": 2.593003118085746e-05, |
|
"loss": 0.9686, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 0.910244211861719, |
|
"grad_norm": 0.24140747398754628, |
|
"learning_rate": 2.4198636864158684e-05, |
|
"loss": 0.9709, |
|
"step": 1435 |
|
}, |
|
{ |
|
"epoch": 0.9134157944814463, |
|
"grad_norm": 0.23859470642150962, |
|
"learning_rate": 2.2525628521949837e-05, |
|
"loss": 0.9723, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.9165873771011734, |
|
"grad_norm": 0.23614657553272203, |
|
"learning_rate": 2.091121145045327e-05, |
|
"loss": 0.96, |
|
"step": 1445 |
|
}, |
|
{ |
|
"epoch": 0.9197589597209007, |
|
"grad_norm": 0.2305582486421353, |
|
"learning_rate": 1.9355583756108407e-05, |
|
"loss": 0.9622, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.922930542340628, |
|
"grad_norm": 0.23818404259591164, |
|
"learning_rate": 1.7858936331262122e-05, |
|
"loss": 0.9612, |
|
"step": 1455 |
|
}, |
|
{ |
|
"epoch": 0.9261021249603553, |
|
"grad_norm": 0.2378653986328734, |
|
"learning_rate": 1.6421452830744365e-05, |
|
"loss": 0.9716, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 0.9292737075800824, |
|
"grad_norm": 0.2329586634857264, |
|
"learning_rate": 1.5043309649331205e-05, |
|
"loss": 0.9558, |
|
"step": 1465 |
|
}, |
|
{ |
|
"epoch": 0.9324452901998097, |
|
"grad_norm": 0.4063284757979328, |
|
"learning_rate": 1.3724675900099959e-05, |
|
"loss": 0.9654, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 0.935616872819537, |
|
"grad_norm": 0.24883789905797735, |
|
"learning_rate": 1.246571339367658e-05, |
|
"loss": 0.9603, |
|
"step": 1475 |
|
}, |
|
{ |
|
"epoch": 0.9387884554392641, |
|
"grad_norm": 0.2484198570399255, |
|
"learning_rate": 1.1266576618380097e-05, |
|
"loss": 0.9579, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 0.9419600380589914, |
|
"grad_norm": 0.24729636213521072, |
|
"learning_rate": 1.0127412721265218e-05, |
|
"loss": 0.9675, |
|
"step": 1485 |
|
}, |
|
{ |
|
"epoch": 0.9451316206787187, |
|
"grad_norm": 0.2505398676306425, |
|
"learning_rate": 9.048361490065548e-06, |
|
"loss": 0.9526, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 0.948303203298446, |
|
"grad_norm": 0.26466516239326676, |
|
"learning_rate": 8.029555336040383e-06, |
|
"loss": 0.9661, |
|
"step": 1495 |
|
}, |
|
{ |
|
"epoch": 0.9514747859181731, |
|
"grad_norm": 0.24164934386910944, |
|
"learning_rate": 7.071119277726301e-06, |
|
"loss": 0.9577, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.9546463685379004, |
|
"grad_norm": 0.23732953651039135, |
|
"learning_rate": 6.17317092559605e-06, |
|
"loss": 0.9562, |
|
"step": 1505 |
|
}, |
|
{ |
|
"epoch": 0.9578179511576277, |
|
"grad_norm": 0.23388339171880254, |
|
"learning_rate": 5.335820467626485e-06, |
|
"loss": 0.973, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 0.9609895337773549, |
|
"grad_norm": 0.2318159543809297, |
|
"learning_rate": 4.559170655777267e-06, |
|
"loss": 0.9478, |
|
"step": 1515 |
|
}, |
|
{ |
|
"epoch": 0.9641611163970821, |
|
"grad_norm": 0.24710746850544488, |
|
"learning_rate": 3.843316793382123e-06, |
|
"loss": 0.9707, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 0.9673326990168094, |
|
"grad_norm": 0.2862292656525582, |
|
"learning_rate": 3.188346723454083e-06, |
|
"loss": 0.9643, |
|
"step": 1525 |
|
}, |
|
{ |
|
"epoch": 0.9705042816365367, |
|
"grad_norm": 0.2380821089662019, |
|
"learning_rate": 2.594340817906271e-06, |
|
"loss": 0.9624, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 0.9736758642562638, |
|
"grad_norm": 0.23933390384190942, |
|
"learning_rate": 2.0613719676891853e-06, |
|
"loss": 0.9599, |
|
"step": 1535 |
|
}, |
|
{ |
|
"epoch": 0.9768474468759911, |
|
"grad_norm": 0.24854365220185978, |
|
"learning_rate": 1.5895055738465169e-06, |
|
"loss": 0.9592, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 0.9800190294957184, |
|
"grad_norm": 0.24336558825284596, |
|
"learning_rate": 1.1787995394893502e-06, |
|
"loss": 0.962, |
|
"step": 1545 |
|
}, |
|
{ |
|
"epoch": 0.9831906121154456, |
|
"grad_norm": 0.22749162744166387, |
|
"learning_rate": 8.293042626912328e-07, |
|
"loss": 0.9573, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.9863621947351728, |
|
"grad_norm": 0.2390082283281969, |
|
"learning_rate": 5.410626303034017e-07, |
|
"loss": 0.9556, |
|
"step": 1555 |
|
}, |
|
{ |
|
"epoch": 0.9895337773549001, |
|
"grad_norm": 0.23360184137608606, |
|
"learning_rate": 3.141100126923813e-07, |
|
"loss": 0.9571, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 0.9927053599746274, |
|
"grad_norm": 0.2367998742996814, |
|
"learning_rate": 1.4847425939956693e-07, |
|
"loss": 0.9495, |
|
"step": 1565 |
|
}, |
|
{ |
|
"epoch": 0.9958769425943546, |
|
"grad_norm": 0.2269345672235601, |
|
"learning_rate": 4.417569572368052e-08, |
|
"loss": 0.9499, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 0.9990485252140818, |
|
"grad_norm": 0.2484799432231265, |
|
"learning_rate": 1.2271202268210324e-09, |
|
"loss": 0.956, |
|
"step": 1575 |
|
}, |
|
{ |
|
"epoch": 0.9996828417380272, |
|
"eval_loss": 2.245941638946533, |
|
"eval_runtime": 8.442, |
|
"eval_samples_per_second": 46.316, |
|
"eval_steps_per_second": 5.804, |
|
"step": 1576 |
|
}, |
|
{ |
|
"epoch": 0.9996828417380272, |
|
"step": 1576, |
|
"total_flos": 38663670988800.0, |
|
"train_loss": 1.4483577938854393, |
|
"train_runtime": 3409.163, |
|
"train_samples_per_second": 14.797, |
|
"train_steps_per_second": 0.462 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 1576, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": false, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 38663670988800.0, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|