|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.941747572815534, |
|
"eval_steps": 500, |
|
"global_step": 3000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.032362459546925564, |
|
"grad_norm": 0.19843709468841553, |
|
"learning_rate": 4.999994725208513e-06, |
|
"loss": 1.6354, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.06472491909385113, |
|
"grad_norm": 0.19520695507526398, |
|
"learning_rate": 4.998353859072719e-06, |
|
"loss": 1.5795, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.0970873786407767, |
|
"grad_norm": 0.15560713410377502, |
|
"learning_rate": 4.9937847678082505e-06, |
|
"loss": 1.4644, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.12944983818770225, |
|
"grad_norm": 0.113382488489151, |
|
"learning_rate": 4.986292806672453e-06, |
|
"loss": 1.3852, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.16181229773462782, |
|
"grad_norm": 0.1354118436574936, |
|
"learning_rate": 4.975886756706802e-06, |
|
"loss": 1.3513, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.1941747572815534, |
|
"grad_norm": 0.13614092767238617, |
|
"learning_rate": 4.962578814444986e-06, |
|
"loss": 1.3174, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.22653721682847897, |
|
"grad_norm": 0.11831384897232056, |
|
"learning_rate": 4.946384577617804e-06, |
|
"loss": 1.2983, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.2588996763754045, |
|
"grad_norm": 0.11915043741464615, |
|
"learning_rate": 4.927323026871671e-06, |
|
"loss": 1.3172, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.2912621359223301, |
|
"grad_norm": 0.12973643839359283, |
|
"learning_rate": 4.905416503522124e-06, |
|
"loss": 1.2763, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.32362459546925565, |
|
"grad_norm": 0.15487492084503174, |
|
"learning_rate": 4.880690683368418e-06, |
|
"loss": 1.2858, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.3559870550161812, |
|
"grad_norm": 0.1625741720199585, |
|
"learning_rate": 4.853174546599903e-06, |
|
"loss": 1.3051, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.3883495145631068, |
|
"grad_norm": 0.15691585838794708, |
|
"learning_rate": 4.822900343829452e-06, |
|
"loss": 1.2487, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.42071197411003236, |
|
"grad_norm": 0.15258367359638214, |
|
"learning_rate": 4.78990355829375e-06, |
|
"loss": 1.2935, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.45307443365695793, |
|
"grad_norm": 0.149098739027977, |
|
"learning_rate": 4.754222864264748e-06, |
|
"loss": 1.2484, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.4854368932038835, |
|
"grad_norm": 0.13359490036964417, |
|
"learning_rate": 4.715900081721021e-06, |
|
"loss": 1.2698, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.517799352750809, |
|
"grad_norm": 0.1649194359779358, |
|
"learning_rate": 4.674980127332175e-06, |
|
"loss": 1.2839, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.5501618122977346, |
|
"grad_norm": 0.17222905158996582, |
|
"learning_rate": 4.631510961813731e-06, |
|
"loss": 1.2659, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.5825242718446602, |
|
"grad_norm": 0.17811669409275055, |
|
"learning_rate": 4.585543533714202e-06, |
|
"loss": 1.282, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.6148867313915858, |
|
"grad_norm": 0.18572162091732025, |
|
"learning_rate": 4.537131719700248e-06, |
|
"loss": 1.2637, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.6472491909385113, |
|
"grad_norm": 0.18194597959518433, |
|
"learning_rate": 4.48633226140989e-06, |
|
"loss": 1.2783, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.6796116504854369, |
|
"grad_norm": 0.18696360290050507, |
|
"learning_rate": 4.433204698947805e-06, |
|
"loss": 1.2744, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.7119741100323624, |
|
"grad_norm": 0.1837579309940338, |
|
"learning_rate": 4.377811301100642e-06, |
|
"loss": 1.273, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.7443365695792881, |
|
"grad_norm": 0.1855243444442749, |
|
"learning_rate": 4.320216992354158e-06, |
|
"loss": 1.2584, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.7766990291262136, |
|
"grad_norm": 0.29544222354888916, |
|
"learning_rate": 4.260489276797709e-06, |
|
"loss": 1.287, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.8090614886731392, |
|
"grad_norm": 0.1854172646999359, |
|
"learning_rate": 4.198698159005278e-06, |
|
"loss": 1.2711, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.8414239482200647, |
|
"grad_norm": 0.1921299248933792, |
|
"learning_rate": 4.134916061985798e-06, |
|
"loss": 1.2735, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.8737864077669902, |
|
"grad_norm": 0.19444109499454498, |
|
"learning_rate": 4.069217742298901e-06, |
|
"loss": 1.2659, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.9061488673139159, |
|
"grad_norm": 0.1953461915254593, |
|
"learning_rate": 4.001680202435617e-06, |
|
"loss": 1.2917, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.9385113268608414, |
|
"grad_norm": 0.17386198043823242, |
|
"learning_rate": 3.932382600566691e-06, |
|
"loss": 1.2704, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.970873786407767, |
|
"grad_norm": 0.18356619775295258, |
|
"learning_rate": 3.861406157764314e-06, |
|
"loss": 1.2775, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 1.0032362459546926, |
|
"grad_norm": 0.21802374720573425, |
|
"learning_rate": 3.7888340628060083e-06, |
|
"loss": 1.3152, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 1.035598705501618, |
|
"grad_norm": 0.205328568816185, |
|
"learning_rate": 3.714751374672243e-06, |
|
"loss": 1.2973, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 1.0679611650485437, |
|
"grad_norm": 0.19713380932807922, |
|
"learning_rate": 3.6392449228520456e-06, |
|
"loss": 1.28, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 1.1003236245954693, |
|
"grad_norm": 0.20696976780891418, |
|
"learning_rate": 3.562403205573474e-06, |
|
"loss": 1.2812, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 1.132686084142395, |
|
"grad_norm": 0.1874680519104004, |
|
"learning_rate": 3.484316286078233e-06, |
|
"loss": 1.2657, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 1.1650485436893203, |
|
"grad_norm": 0.17423813045024872, |
|
"learning_rate": 3.4050756870619785e-06, |
|
"loss": 1.2336, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 1.197411003236246, |
|
"grad_norm": 0.2038557380437851, |
|
"learning_rate": 3.324774283404065e-06, |
|
"loss": 1.2637, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 1.2297734627831716, |
|
"grad_norm": 0.22192490100860596, |
|
"learning_rate": 3.243506193312452e-06, |
|
"loss": 1.2474, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 1.262135922330097, |
|
"grad_norm": 0.21069851517677307, |
|
"learning_rate": 3.1613666680113398e-06, |
|
"loss": 1.2604, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 1.2944983818770226, |
|
"grad_norm": 0.20909801125526428, |
|
"learning_rate": 3.0784519801008546e-06, |
|
"loss": 1.2867, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 1.3268608414239482, |
|
"grad_norm": 0.20224529504776, |
|
"learning_rate": 2.9948593107196044e-06, |
|
"loss": 1.2946, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 1.3592233009708738, |
|
"grad_norm": 0.2082245945930481, |
|
"learning_rate": 2.9106866356423845e-06, |
|
"loss": 1.3071, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 1.3915857605177995, |
|
"grad_norm": 0.1968870759010315, |
|
"learning_rate": 2.8260326104465147e-06, |
|
"loss": 1.2709, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 1.4239482200647249, |
|
"grad_norm": 0.20573362708091736, |
|
"learning_rate": 2.740996454881397e-06, |
|
"loss": 1.245, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 1.4563106796116505, |
|
"grad_norm": 0.2424907386302948, |
|
"learning_rate": 2.6556778365768467e-06, |
|
"loss": 1.2665, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 1.4886731391585761, |
|
"grad_norm": 0.2233143001794815, |
|
"learning_rate": 2.5701767542264663e-06, |
|
"loss": 1.2871, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 1.5210355987055015, |
|
"grad_norm": 0.3519347012042999, |
|
"learning_rate": 2.4845934203829954e-06, |
|
"loss": 1.2889, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 1.5533980582524272, |
|
"grad_norm": 0.22095517814159393, |
|
"learning_rate": 2.3990281440030106e-06, |
|
"loss": 1.2895, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 1.5857605177993528, |
|
"grad_norm": 0.22900894284248352, |
|
"learning_rate": 2.3135812128786207e-06, |
|
"loss": 1.2986, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 1.6181229773462782, |
|
"grad_norm": 0.22317421436309814, |
|
"learning_rate": 2.228352776093982e-06, |
|
"loss": 1.2647, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 1.650485436893204, |
|
"grad_norm": 0.21420355141162872, |
|
"learning_rate": 2.143442726644385e-06, |
|
"loss": 1.2726, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 1.6828478964401294, |
|
"grad_norm": 0.23652972280979156, |
|
"learning_rate": 2.05895058435548e-06, |
|
"loss": 1.3003, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 1.715210355987055, |
|
"grad_norm": 0.21717728674411774, |
|
"learning_rate": 1.9749753792399e-06, |
|
"loss": 1.2808, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 1.7475728155339807, |
|
"grad_norm": 0.22195585072040558, |
|
"learning_rate": 1.8916155354279652e-06, |
|
"loss": 1.2856, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 1.779935275080906, |
|
"grad_norm": 0.25054916739463806, |
|
"learning_rate": 1.8089687558085122e-06, |
|
"loss": 1.2958, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 1.8122977346278317, |
|
"grad_norm": 0.2313874512910843, |
|
"learning_rate": 1.7271319075150786e-06, |
|
"loss": 1.2985, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 1.8446601941747574, |
|
"grad_norm": 0.2183278501033783, |
|
"learning_rate": 1.6462009083916275e-06, |
|
"loss": 1.288, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 1.8770226537216828, |
|
"grad_norm": 0.2164868861436844, |
|
"learning_rate": 1.5662706145709128e-06, |
|
"loss": 1.2852, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 1.9093851132686084, |
|
"grad_norm": 0.2470543533563614, |
|
"learning_rate": 1.4874347092972285e-06, |
|
"loss": 1.2768, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 1.941747572815534, |
|
"grad_norm": 0.21252723038196564, |
|
"learning_rate": 1.4097855931238516e-06, |
|
"loss": 1.2506, |
|
"step": 3000 |
|
} |
|
], |
|
"logging_steps": 50, |
|
"max_steps": 4635, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 8.453766865339023e+17, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|