|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.26501766784452296, |
|
"eval_steps": 9, |
|
"global_step": 75, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0035335689045936395, |
|
"grad_norm": 3.620260000228882, |
|
"learning_rate": 2.0000000000000003e-06, |
|
"loss": 4.8054, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0035335689045936395, |
|
"eval_loss": 4.703144550323486, |
|
"eval_runtime": 7.764, |
|
"eval_samples_per_second": 30.783, |
|
"eval_steps_per_second": 3.864, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.007067137809187279, |
|
"grad_norm": 3.2298903465270996, |
|
"learning_rate": 4.000000000000001e-06, |
|
"loss": 4.9204, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.01060070671378092, |
|
"grad_norm": 3.3276522159576416, |
|
"learning_rate": 6e-06, |
|
"loss": 4.9672, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.014134275618374558, |
|
"grad_norm": 3.1870436668395996, |
|
"learning_rate": 8.000000000000001e-06, |
|
"loss": 4.4393, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.0176678445229682, |
|
"grad_norm": 3.3669509887695312, |
|
"learning_rate": 1e-05, |
|
"loss": 4.8962, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.02120141342756184, |
|
"grad_norm": 3.7465147972106934, |
|
"learning_rate": 1.2e-05, |
|
"loss": 4.7028, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.024734982332155476, |
|
"grad_norm": 3.447338342666626, |
|
"learning_rate": 1.4e-05, |
|
"loss": 4.7122, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.028268551236749116, |
|
"grad_norm": 3.4820032119750977, |
|
"learning_rate": 1.6000000000000003e-05, |
|
"loss": 4.8886, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.03180212014134275, |
|
"grad_norm": 3.2590105533599854, |
|
"learning_rate": 1.8e-05, |
|
"loss": 4.5295, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.03180212014134275, |
|
"eval_loss": 4.659517765045166, |
|
"eval_runtime": 7.1618, |
|
"eval_samples_per_second": 33.372, |
|
"eval_steps_per_second": 4.189, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.0353356890459364, |
|
"grad_norm": 4.073524475097656, |
|
"learning_rate": 2e-05, |
|
"loss": 5.081, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.038869257950530034, |
|
"grad_norm": 3.5995664596557617, |
|
"learning_rate": 1.999390827019096e-05, |
|
"loss": 4.5881, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.04240282685512368, |
|
"grad_norm": 3.6033053398132324, |
|
"learning_rate": 1.9975640502598243e-05, |
|
"loss": 4.5807, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.045936395759717315, |
|
"grad_norm": 3.8522286415100098, |
|
"learning_rate": 1.9945218953682736e-05, |
|
"loss": 5.0651, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.04946996466431095, |
|
"grad_norm": 3.7190427780151367, |
|
"learning_rate": 1.9902680687415704e-05, |
|
"loss": 4.7079, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.053003533568904596, |
|
"grad_norm": 3.25559663772583, |
|
"learning_rate": 1.9848077530122083e-05, |
|
"loss": 4.6699, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.05653710247349823, |
|
"grad_norm": 3.60125732421875, |
|
"learning_rate": 1.9781476007338058e-05, |
|
"loss": 4.69, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.06007067137809187, |
|
"grad_norm": 3.6664018630981445, |
|
"learning_rate": 1.9702957262759964e-05, |
|
"loss": 4.2972, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.0636042402826855, |
|
"grad_norm": 4.067839622497559, |
|
"learning_rate": 1.961261695938319e-05, |
|
"loss": 4.4407, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.0636042402826855, |
|
"eval_loss": 4.355165004730225, |
|
"eval_runtime": 7.1612, |
|
"eval_samples_per_second": 33.374, |
|
"eval_steps_per_second": 4.189, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.06713780918727916, |
|
"grad_norm": 3.291872024536133, |
|
"learning_rate": 1.9510565162951538e-05, |
|
"loss": 4.2238, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.0706713780918728, |
|
"grad_norm": 3.1632275581359863, |
|
"learning_rate": 1.9396926207859085e-05, |
|
"loss": 4.1458, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.07420494699646643, |
|
"grad_norm": 3.739023208618164, |
|
"learning_rate": 1.9271838545667876e-05, |
|
"loss": 4.302, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.07773851590106007, |
|
"grad_norm": 3.6874678134918213, |
|
"learning_rate": 1.913545457642601e-05, |
|
"loss": 4.3061, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.0812720848056537, |
|
"grad_norm": 3.9015374183654785, |
|
"learning_rate": 1.8987940462991673e-05, |
|
"loss": 4.5023, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.08480565371024736, |
|
"grad_norm": 3.3470852375030518, |
|
"learning_rate": 1.8829475928589272e-05, |
|
"loss": 4.0806, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.08833922261484099, |
|
"grad_norm": 3.786510944366455, |
|
"learning_rate": 1.866025403784439e-05, |
|
"loss": 4.1044, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.09187279151943463, |
|
"grad_norm": 3.945117712020874, |
|
"learning_rate": 1.848048096156426e-05, |
|
"loss": 4.1434, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.09540636042402827, |
|
"grad_norm": 3.6443867683410645, |
|
"learning_rate": 1.8290375725550417e-05, |
|
"loss": 4.0989, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.09540636042402827, |
|
"eval_loss": 3.9746689796447754, |
|
"eval_runtime": 7.1709, |
|
"eval_samples_per_second": 33.329, |
|
"eval_steps_per_second": 4.184, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.0989399293286219, |
|
"grad_norm": 3.7102622985839844, |
|
"learning_rate": 1.8090169943749477e-05, |
|
"loss": 4.301, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.10247349823321555, |
|
"grad_norm": 3.6173665523529053, |
|
"learning_rate": 1.788010753606722e-05, |
|
"loss": 4.2593, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.10600706713780919, |
|
"grad_norm": 4.246015548706055, |
|
"learning_rate": 1.766044443118978e-05, |
|
"loss": 4.1935, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.10954063604240283, |
|
"grad_norm": 4.163142681121826, |
|
"learning_rate": 1.7431448254773943e-05, |
|
"loss": 4.0349, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.11307420494699646, |
|
"grad_norm": 4.6858696937561035, |
|
"learning_rate": 1.7193398003386514e-05, |
|
"loss": 4.0095, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.1166077738515901, |
|
"grad_norm": 3.9068796634674072, |
|
"learning_rate": 1.6946583704589973e-05, |
|
"loss": 3.6553, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.12014134275618374, |
|
"grad_norm": 4.144792556762695, |
|
"learning_rate": 1.6691306063588583e-05, |
|
"loss": 3.6503, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.12367491166077739, |
|
"grad_norm": 3.6011180877685547, |
|
"learning_rate": 1.6427876096865394e-05, |
|
"loss": 3.4449, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.127208480565371, |
|
"grad_norm": 4.0844407081604, |
|
"learning_rate": 1.6156614753256583e-05, |
|
"loss": 3.7941, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.127208480565371, |
|
"eval_loss": 3.6412363052368164, |
|
"eval_runtime": 7.1728, |
|
"eval_samples_per_second": 33.32, |
|
"eval_steps_per_second": 4.182, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.13074204946996468, |
|
"grad_norm": 3.8344345092773438, |
|
"learning_rate": 1.5877852522924733e-05, |
|
"loss": 4.0356, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.13427561837455831, |
|
"grad_norm": 4.485532760620117, |
|
"learning_rate": 1.5591929034707468e-05, |
|
"loss": 3.733, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.13780918727915195, |
|
"grad_norm": 4.423894882202148, |
|
"learning_rate": 1.529919264233205e-05, |
|
"loss": 3.6177, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.1413427561837456, |
|
"grad_norm": 3.844165325164795, |
|
"learning_rate": 1.5000000000000002e-05, |
|
"loss": 3.6761, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.14487632508833923, |
|
"grad_norm": 3.5136537551879883, |
|
"learning_rate": 1.469471562785891e-05, |
|
"loss": 3.6134, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.14840989399293286, |
|
"grad_norm": 3.5087645053863525, |
|
"learning_rate": 1.4383711467890776e-05, |
|
"loss": 3.557, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.1519434628975265, |
|
"grad_norm": 3.902492046356201, |
|
"learning_rate": 1.4067366430758004e-05, |
|
"loss": 3.6009, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.15547703180212014, |
|
"grad_norm": 3.4406399726867676, |
|
"learning_rate": 1.3746065934159123e-05, |
|
"loss": 3.4341, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.15901060070671377, |
|
"grad_norm": 3.278090715408325, |
|
"learning_rate": 1.342020143325669e-05, |
|
"loss": 3.1567, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.15901060070671377, |
|
"eval_loss": 3.398796796798706, |
|
"eval_runtime": 7.1641, |
|
"eval_samples_per_second": 33.361, |
|
"eval_steps_per_second": 4.188, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.1625441696113074, |
|
"grad_norm": 3.7867252826690674, |
|
"learning_rate": 1.3090169943749475e-05, |
|
"loss": 3.3983, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.16607773851590105, |
|
"grad_norm": 4.200196743011475, |
|
"learning_rate": 1.2756373558169992e-05, |
|
"loss": 3.6074, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.1696113074204947, |
|
"grad_norm": 3.7975761890411377, |
|
"learning_rate": 1.2419218955996677e-05, |
|
"loss": 3.6216, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.17314487632508835, |
|
"grad_norm": 3.228562355041504, |
|
"learning_rate": 1.2079116908177592e-05, |
|
"loss": 3.2195, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.17667844522968199, |
|
"grad_norm": 3.666325092315674, |
|
"learning_rate": 1.1736481776669307e-05, |
|
"loss": 3.2573, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.18021201413427562, |
|
"grad_norm": 3.6461217403411865, |
|
"learning_rate": 1.1391731009600655e-05, |
|
"loss": 3.2947, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.18374558303886926, |
|
"grad_norm": 3.643479347229004, |
|
"learning_rate": 1.1045284632676535e-05, |
|
"loss": 3.2125, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.1872791519434629, |
|
"grad_norm": 3.4164750576019287, |
|
"learning_rate": 1.0697564737441254e-05, |
|
"loss": 3.3438, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.19081272084805653, |
|
"grad_norm": 3.781705856323242, |
|
"learning_rate": 1.0348994967025012e-05, |
|
"loss": 3.251, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.19081272084805653, |
|
"eval_loss": 3.2505455017089844, |
|
"eval_runtime": 7.1694, |
|
"eval_samples_per_second": 33.336, |
|
"eval_steps_per_second": 4.184, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.19434628975265017, |
|
"grad_norm": 3.5788426399230957, |
|
"learning_rate": 1e-05, |
|
"loss": 3.3719, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.1978798586572438, |
|
"grad_norm": 3.601278305053711, |
|
"learning_rate": 9.651005032974994e-06, |
|
"loss": 3.2865, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.20141342756183744, |
|
"grad_norm": 3.5564422607421875, |
|
"learning_rate": 9.302435262558748e-06, |
|
"loss": 3.2639, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.2049469964664311, |
|
"grad_norm": 3.8267316818237305, |
|
"learning_rate": 8.954715367323468e-06, |
|
"loss": 3.5431, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.20848056537102475, |
|
"grad_norm": 3.9242968559265137, |
|
"learning_rate": 8.60826899039935e-06, |
|
"loss": 3.1809, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.21201413427561838, |
|
"grad_norm": 3.4202167987823486, |
|
"learning_rate": 8.263518223330698e-06, |
|
"loss": 3.4313, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.21554770318021202, |
|
"grad_norm": 3.4609107971191406, |
|
"learning_rate": 7.92088309182241e-06, |
|
"loss": 3.0355, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.21908127208480566, |
|
"grad_norm": 3.4130520820617676, |
|
"learning_rate": 7.580781044003324e-06, |
|
"loss": 3.4006, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.2226148409893993, |
|
"grad_norm": 3.4137520790100098, |
|
"learning_rate": 7.243626441830009e-06, |
|
"loss": 3.4577, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.2226148409893993, |
|
"eval_loss": 3.1747097969055176, |
|
"eval_runtime": 7.17, |
|
"eval_samples_per_second": 33.333, |
|
"eval_steps_per_second": 4.184, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.22614840989399293, |
|
"grad_norm": 3.42524790763855, |
|
"learning_rate": 6.909830056250527e-06, |
|
"loss": 3.074, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.22968197879858657, |
|
"grad_norm": 3.5621337890625, |
|
"learning_rate": 6.579798566743314e-06, |
|
"loss": 3.0764, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.2332155477031802, |
|
"grad_norm": 3.954556465148926, |
|
"learning_rate": 6.25393406584088e-06, |
|
"loss": 3.2832, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.23674911660777384, |
|
"grad_norm": 3.518817663192749, |
|
"learning_rate": 5.932633569242e-06, |
|
"loss": 3.0194, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.24028268551236748, |
|
"grad_norm": 3.8170769214630127, |
|
"learning_rate": 5.616288532109225e-06, |
|
"loss": 3.2752, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.24381625441696114, |
|
"grad_norm": 3.3504433631896973, |
|
"learning_rate": 5.305284372141095e-06, |
|
"loss": 3.2076, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.24734982332155478, |
|
"grad_norm": 3.452834129333496, |
|
"learning_rate": 5.000000000000003e-06, |
|
"loss": 3.356, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.2508833922261484, |
|
"grad_norm": 4.365915775299072, |
|
"learning_rate": 4.700807357667953e-06, |
|
"loss": 3.3366, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.254416961130742, |
|
"grad_norm": 3.3894336223602295, |
|
"learning_rate": 4.408070965292534e-06, |
|
"loss": 3.413, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.254416961130742, |
|
"eval_loss": 3.1335721015930176, |
|
"eval_runtime": 7.1737, |
|
"eval_samples_per_second": 33.316, |
|
"eval_steps_per_second": 4.182, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.2579505300353357, |
|
"grad_norm": 3.383213996887207, |
|
"learning_rate": 4.12214747707527e-06, |
|
"loss": 3.1959, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.26148409893992935, |
|
"grad_norm": 3.5894997119903564, |
|
"learning_rate": 3.8433852467434175e-06, |
|
"loss": 3.3013, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.26501766784452296, |
|
"grad_norm": 3.3693082332611084, |
|
"learning_rate": 3.5721239031346067e-06, |
|
"loss": 3.1231, |
|
"step": 75 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 100, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 25, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 7257919271731200.0, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|