|
{ |
|
"best_metric": 0.3398433029651642, |
|
"best_model_checkpoint": "mikhail-panzo/zlm_b128_le4_s4000/checkpoint-3500", |
|
"epoch": 5.863874345549738, |
|
"eval_steps": 500, |
|
"global_step": 3500, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.08376963350785341, |
|
"grad_norm": 2.9717624187469482, |
|
"learning_rate": 2.4500000000000003e-06, |
|
"loss": 1.0424, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.16753926701570682, |
|
"grad_norm": 2.9720630645751953, |
|
"learning_rate": 4.950000000000001e-06, |
|
"loss": 0.8474, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.2513089005235602, |
|
"grad_norm": 2.445929765701294, |
|
"learning_rate": 7.45e-06, |
|
"loss": 0.7336, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.33507853403141363, |
|
"grad_norm": 5.502955913543701, |
|
"learning_rate": 9.950000000000001e-06, |
|
"loss": 0.6492, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.418848167539267, |
|
"grad_norm": 2.3356130123138428, |
|
"learning_rate": 1.2450000000000001e-05, |
|
"loss": 0.6133, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.5026178010471204, |
|
"grad_norm": 1.937270164489746, |
|
"learning_rate": 1.4950000000000001e-05, |
|
"loss": 0.5889, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.5863874345549738, |
|
"grad_norm": 2.392244338989258, |
|
"learning_rate": 1.745e-05, |
|
"loss": 0.5694, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.6701570680628273, |
|
"grad_norm": 7.3209919929504395, |
|
"learning_rate": 1.995e-05, |
|
"loss": 0.5477, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.7539267015706806, |
|
"grad_norm": 3.415917158126831, |
|
"learning_rate": 2.245e-05, |
|
"loss": 0.5329, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.837696335078534, |
|
"grad_norm": 3.0256705284118652, |
|
"learning_rate": 2.495e-05, |
|
"loss": 0.5173, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.837696335078534, |
|
"eval_loss": 0.4566049873828888, |
|
"eval_runtime": 261.3511, |
|
"eval_samples_per_second": 32.481, |
|
"eval_steps_per_second": 4.063, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.9214659685863874, |
|
"grad_norm": 1.9436837434768677, |
|
"learning_rate": 2.7450000000000003e-05, |
|
"loss": 0.5079, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 1.0052356020942408, |
|
"grad_norm": 1.819956660270691, |
|
"learning_rate": 2.995e-05, |
|
"loss": 0.4969, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.0890052356020943, |
|
"grad_norm": 5.457251071929932, |
|
"learning_rate": 3.245e-05, |
|
"loss": 0.4977, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 1.1727748691099475, |
|
"grad_norm": 3.183980703353882, |
|
"learning_rate": 3.495e-05, |
|
"loss": 0.4923, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.256544502617801, |
|
"grad_norm": 7.1660051345825195, |
|
"learning_rate": 3.745e-05, |
|
"loss": 0.4802, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 1.3403141361256545, |
|
"grad_norm": 5.499026775360107, |
|
"learning_rate": 3.995e-05, |
|
"loss": 0.4754, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.4240837696335078, |
|
"grad_norm": 2.8053908348083496, |
|
"learning_rate": 4.245e-05, |
|
"loss": 0.4669, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 1.5078534031413613, |
|
"grad_norm": 3.017005443572998, |
|
"learning_rate": 4.495e-05, |
|
"loss": 0.4604, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.5916230366492146, |
|
"grad_norm": 2.7971177101135254, |
|
"learning_rate": 4.745e-05, |
|
"loss": 0.4565, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 1.675392670157068, |
|
"grad_norm": 3.1588356494903564, |
|
"learning_rate": 4.995e-05, |
|
"loss": 0.455, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.675392670157068, |
|
"eval_loss": 0.40312233567237854, |
|
"eval_runtime": 256.4334, |
|
"eval_samples_per_second": 33.104, |
|
"eval_steps_per_second": 4.141, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.7591623036649215, |
|
"grad_norm": 2.2053232192993164, |
|
"learning_rate": 5.245e-05, |
|
"loss": 0.4543, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 1.8429319371727748, |
|
"grad_norm": 2.0562164783477783, |
|
"learning_rate": 5.495e-05, |
|
"loss": 0.4456, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 1.9267015706806283, |
|
"grad_norm": 2.730119466781616, |
|
"learning_rate": 5.745e-05, |
|
"loss": 0.4355, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 2.0104712041884816, |
|
"grad_norm": 1.7484283447265625, |
|
"learning_rate": 5.995000000000001e-05, |
|
"loss": 0.4299, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 2.094240837696335, |
|
"grad_norm": 1.1786061525344849, |
|
"learning_rate": 6.245000000000001e-05, |
|
"loss": 0.4305, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 2.1780104712041886, |
|
"grad_norm": 1.98978590965271, |
|
"learning_rate": 6.494999999999999e-05, |
|
"loss": 0.4295, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 2.261780104712042, |
|
"grad_norm": 2.818659782409668, |
|
"learning_rate": 6.745e-05, |
|
"loss": 0.4235, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 2.345549738219895, |
|
"grad_norm": 2.3864262104034424, |
|
"learning_rate": 6.995e-05, |
|
"loss": 0.4271, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 2.4293193717277486, |
|
"grad_norm": 1.3647903203964233, |
|
"learning_rate": 7.245000000000001e-05, |
|
"loss": 0.4208, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 2.513089005235602, |
|
"grad_norm": 2.2144172191619873, |
|
"learning_rate": 7.495e-05, |
|
"loss": 0.4175, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 2.513089005235602, |
|
"eval_loss": 0.3777858018875122, |
|
"eval_runtime": 260.5025, |
|
"eval_samples_per_second": 32.587, |
|
"eval_steps_per_second": 4.077, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 2.5968586387434556, |
|
"grad_norm": 1.6483193635940552, |
|
"learning_rate": 7.745e-05, |
|
"loss": 0.414, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 2.680628272251309, |
|
"grad_norm": 1.7688554525375366, |
|
"learning_rate": 7.995e-05, |
|
"loss": 0.4153, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 2.7643979057591626, |
|
"grad_norm": 1.2314317226409912, |
|
"learning_rate": 8.245e-05, |
|
"loss": 0.4089, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 2.8481675392670156, |
|
"grad_norm": 1.6623793840408325, |
|
"learning_rate": 8.495e-05, |
|
"loss": 0.4124, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 2.931937172774869, |
|
"grad_norm": 3.812507390975952, |
|
"learning_rate": 8.745000000000001e-05, |
|
"loss": 0.4112, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 3.0157068062827226, |
|
"grad_norm": 2.141019821166992, |
|
"learning_rate": 8.995e-05, |
|
"loss": 0.4081, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 3.099476439790576, |
|
"grad_norm": 1.8928133249282837, |
|
"learning_rate": 9.245e-05, |
|
"loss": 0.4067, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 3.183246073298429, |
|
"grad_norm": 2.322817087173462, |
|
"learning_rate": 9.495e-05, |
|
"loss": 0.4088, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 3.2670157068062826, |
|
"grad_norm": 2.1984918117523193, |
|
"learning_rate": 9.745000000000001e-05, |
|
"loss": 0.3976, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 3.350785340314136, |
|
"grad_norm": 2.0455121994018555, |
|
"learning_rate": 9.995e-05, |
|
"loss": 0.4022, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 3.350785340314136, |
|
"eval_loss": 0.3677983582019806, |
|
"eval_runtime": 257.3111, |
|
"eval_samples_per_second": 32.991, |
|
"eval_steps_per_second": 4.127, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 3.4345549738219896, |
|
"grad_norm": 2.0729174613952637, |
|
"learning_rate": 9.755000000000001e-05, |
|
"loss": 0.4017, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 3.518324607329843, |
|
"grad_norm": 1.4496002197265625, |
|
"learning_rate": 9.505e-05, |
|
"loss": 0.3999, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 3.6020942408376966, |
|
"grad_norm": 1.313783884048462, |
|
"learning_rate": 9.255e-05, |
|
"loss": 0.3971, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 3.6858638743455496, |
|
"grad_norm": 2.4938676357269287, |
|
"learning_rate": 9.005000000000001e-05, |
|
"loss": 0.3958, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 3.769633507853403, |
|
"grad_norm": 1.6081351041793823, |
|
"learning_rate": 8.755e-05, |
|
"loss": 0.389, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 3.8534031413612566, |
|
"grad_norm": 1.5322096347808838, |
|
"learning_rate": 8.505000000000001e-05, |
|
"loss": 0.3887, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 3.93717277486911, |
|
"grad_norm": 2.3951408863067627, |
|
"learning_rate": 8.26e-05, |
|
"loss": 0.392, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 4.020942408376963, |
|
"grad_norm": 1.0958179235458374, |
|
"learning_rate": 8.010000000000001e-05, |
|
"loss": 0.3846, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 4.104712041884817, |
|
"grad_norm": 1.2315720319747925, |
|
"learning_rate": 7.76e-05, |
|
"loss": 0.3867, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 4.18848167539267, |
|
"grad_norm": 1.3186556100845337, |
|
"learning_rate": 7.510000000000001e-05, |
|
"loss": 0.3848, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 4.18848167539267, |
|
"eval_loss": 0.3522670865058899, |
|
"eval_runtime": 269.3867, |
|
"eval_samples_per_second": 31.512, |
|
"eval_steps_per_second": 3.942, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 4.272251308900524, |
|
"grad_norm": 1.6223663091659546, |
|
"learning_rate": 7.26e-05, |
|
"loss": 0.3868, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 4.356020942408377, |
|
"grad_norm": 1.2300375699996948, |
|
"learning_rate": 7.01e-05, |
|
"loss": 0.3834, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 4.439790575916231, |
|
"grad_norm": 1.930428147315979, |
|
"learning_rate": 6.76e-05, |
|
"loss": 0.3789, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 4.523560209424084, |
|
"grad_norm": 1.2785884141921997, |
|
"learning_rate": 6.510000000000001e-05, |
|
"loss": 0.3821, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 4.607329842931938, |
|
"grad_norm": 2.007714033126831, |
|
"learning_rate": 6.26e-05, |
|
"loss": 0.3777, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 4.69109947643979, |
|
"grad_norm": 1.490657925605774, |
|
"learning_rate": 6.0100000000000004e-05, |
|
"loss": 0.3817, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 4.774869109947644, |
|
"grad_norm": 1.7175956964492798, |
|
"learning_rate": 5.76e-05, |
|
"loss": 0.3778, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 4.858638743455497, |
|
"grad_norm": 1.9603019952774048, |
|
"learning_rate": 5.5100000000000004e-05, |
|
"loss": 0.3787, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 4.942408376963351, |
|
"grad_norm": 0.9075130224227905, |
|
"learning_rate": 5.2600000000000005e-05, |
|
"loss": 0.3775, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 5.026178010471204, |
|
"grad_norm": 1.1054623126983643, |
|
"learning_rate": 5.0100000000000005e-05, |
|
"loss": 0.3763, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 5.026178010471204, |
|
"eval_loss": 0.34259819984436035, |
|
"eval_runtime": 256.7226, |
|
"eval_samples_per_second": 33.067, |
|
"eval_steps_per_second": 4.137, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 5.109947643979058, |
|
"grad_norm": 0.8834495544433594, |
|
"learning_rate": 4.76e-05, |
|
"loss": 0.3718, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 5.193717277486911, |
|
"grad_norm": 1.585207223892212, |
|
"learning_rate": 4.5100000000000005e-05, |
|
"loss": 0.3732, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 5.277486910994765, |
|
"grad_norm": 1.8974605798721313, |
|
"learning_rate": 4.26e-05, |
|
"loss": 0.3714, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 5.361256544502618, |
|
"grad_norm": 1.280151128768921, |
|
"learning_rate": 4.0100000000000006e-05, |
|
"loss": 0.3703, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 5.445026178010472, |
|
"grad_norm": 1.453099012374878, |
|
"learning_rate": 3.76e-05, |
|
"loss": 0.3733, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 5.528795811518324, |
|
"grad_norm": 1.4233230352401733, |
|
"learning_rate": 3.51e-05, |
|
"loss": 0.3725, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 5.612565445026178, |
|
"grad_norm": 0.9149217009544373, |
|
"learning_rate": 3.26e-05, |
|
"loss": 0.3674, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 5.696335078534031, |
|
"grad_norm": 0.9855162501335144, |
|
"learning_rate": 3.01e-05, |
|
"loss": 0.3685, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 5.780104712041885, |
|
"grad_norm": 1.0689723491668701, |
|
"learning_rate": 2.7600000000000003e-05, |
|
"loss": 0.3684, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 5.863874345549738, |
|
"grad_norm": 1.527082920074463, |
|
"learning_rate": 2.51e-05, |
|
"loss": 0.3665, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 5.863874345549738, |
|
"eval_loss": 0.3398433029651642, |
|
"eval_runtime": 259.8568, |
|
"eval_samples_per_second": 32.668, |
|
"eval_steps_per_second": 4.087, |
|
"step": 3500 |
|
} |
|
], |
|
"logging_steps": 50, |
|
"max_steps": 4000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 7, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 6.269975786347891e+16, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|