|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.9960474308300395, |
|
"eval_steps": 16, |
|
"global_step": 63, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.015810276679841896, |
|
"grad_norm": 3.106086254119873, |
|
"learning_rate": 2.0000000000000003e-06, |
|
"loss": 0.2762, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.015810276679841896, |
|
"eval_loss": 0.34248441457748413, |
|
"eval_runtime": 9.4978, |
|
"eval_samples_per_second": 11.266, |
|
"eval_steps_per_second": 2.843, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.03162055335968379, |
|
"grad_norm": 0.6217677593231201, |
|
"learning_rate": 4.000000000000001e-06, |
|
"loss": 0.2623, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.04743083003952569, |
|
"grad_norm": 1.1419365406036377, |
|
"learning_rate": 6e-06, |
|
"loss": 0.3943, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.06324110671936758, |
|
"grad_norm": 0.8881447315216064, |
|
"learning_rate": 8.000000000000001e-06, |
|
"loss": 0.1572, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.07905138339920949, |
|
"grad_norm": 0.7020868062973022, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1578, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.09486166007905138, |
|
"grad_norm": 1.2612448930740356, |
|
"learning_rate": 1.2e-05, |
|
"loss": 0.2245, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.11067193675889328, |
|
"grad_norm": 1.2954490184783936, |
|
"learning_rate": 1.4000000000000001e-05, |
|
"loss": 0.2101, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.12648221343873517, |
|
"grad_norm": 0.8434045314788818, |
|
"learning_rate": 1.6000000000000003e-05, |
|
"loss": 0.1529, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.1422924901185771, |
|
"grad_norm": 0.9214808940887451, |
|
"learning_rate": 1.8e-05, |
|
"loss": 0.1522, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.15810276679841898, |
|
"grad_norm": 1.3638683557510376, |
|
"learning_rate": 2e-05, |
|
"loss": 0.3282, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.17391304347826086, |
|
"grad_norm": 2.2487895488739014, |
|
"learning_rate": 2.2000000000000003e-05, |
|
"loss": 0.472, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.18972332015810275, |
|
"grad_norm": 1.98398756980896, |
|
"learning_rate": 2.4e-05, |
|
"loss": 0.4099, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.20553359683794467, |
|
"grad_norm": 1.796846866607666, |
|
"learning_rate": 2.6000000000000002e-05, |
|
"loss": 0.3184, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.22134387351778656, |
|
"grad_norm": 1.6363037824630737, |
|
"learning_rate": 2.8000000000000003e-05, |
|
"loss": 0.3551, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.23715415019762845, |
|
"grad_norm": 1.927720308303833, |
|
"learning_rate": 3e-05, |
|
"loss": 0.405, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.25296442687747034, |
|
"grad_norm": 1.0266072750091553, |
|
"learning_rate": 3.2000000000000005e-05, |
|
"loss": 0.2991, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.25296442687747034, |
|
"eval_loss": 0.21815715730190277, |
|
"eval_runtime": 8.042, |
|
"eval_samples_per_second": 13.305, |
|
"eval_steps_per_second": 3.357, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.26877470355731226, |
|
"grad_norm": 0.4768655002117157, |
|
"learning_rate": 3.4000000000000007e-05, |
|
"loss": 0.2191, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.2845849802371542, |
|
"grad_norm": 0.9923710823059082, |
|
"learning_rate": 3.6e-05, |
|
"loss": 0.384, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.30039525691699603, |
|
"grad_norm": 0.5063422322273254, |
|
"learning_rate": 3.8e-05, |
|
"loss": 0.1493, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.31620553359683795, |
|
"grad_norm": 0.5010712742805481, |
|
"learning_rate": 4e-05, |
|
"loss": 0.0675, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.33201581027667987, |
|
"grad_norm": 0.48852574825286865, |
|
"learning_rate": 4.2e-05, |
|
"loss": 0.0739, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.34782608695652173, |
|
"grad_norm": 0.4387320578098297, |
|
"learning_rate": 4.4000000000000006e-05, |
|
"loss": 0.0751, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.36363636363636365, |
|
"grad_norm": 0.3455885648727417, |
|
"learning_rate": 4.600000000000001e-05, |
|
"loss": 0.0516, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.3794466403162055, |
|
"grad_norm": 0.29778942465782166, |
|
"learning_rate": 4.8e-05, |
|
"loss": 0.0421, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.3952569169960474, |
|
"grad_norm": 0.3956562578678131, |
|
"learning_rate": 5e-05, |
|
"loss": 0.057, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.41106719367588934, |
|
"grad_norm": 0.8842503428459167, |
|
"learning_rate": 5.2000000000000004e-05, |
|
"loss": 0.1188, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.4268774703557312, |
|
"grad_norm": 0.9197725653648376, |
|
"learning_rate": 5.4000000000000005e-05, |
|
"loss": 0.1128, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.4426877470355731, |
|
"grad_norm": 0.9175456762313843, |
|
"learning_rate": 5.6000000000000006e-05, |
|
"loss": 0.0628, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.45849802371541504, |
|
"grad_norm": 0.5987579822540283, |
|
"learning_rate": 5.8e-05, |
|
"loss": 0.0499, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.4743083003952569, |
|
"grad_norm": 0.8026472330093384, |
|
"learning_rate": 6e-05, |
|
"loss": 0.0619, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.4901185770750988, |
|
"grad_norm": 0.5789671540260315, |
|
"learning_rate": 6.2e-05, |
|
"loss": 0.0394, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.5059288537549407, |
|
"grad_norm": 0.7335872054100037, |
|
"learning_rate": 6.400000000000001e-05, |
|
"loss": 0.2007, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.5059288537549407, |
|
"eval_loss": 0.050875596702098846, |
|
"eval_runtime": 8.2485, |
|
"eval_samples_per_second": 12.972, |
|
"eval_steps_per_second": 3.273, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.5217391304347826, |
|
"grad_norm": 0.6524657011032104, |
|
"learning_rate": 6.6e-05, |
|
"loss": 0.1999, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.5375494071146245, |
|
"grad_norm": 1.1696100234985352, |
|
"learning_rate": 6.800000000000001e-05, |
|
"loss": 0.3284, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.5533596837944664, |
|
"grad_norm": 0.16617901623249054, |
|
"learning_rate": 7e-05, |
|
"loss": 0.0075, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.5691699604743083, |
|
"grad_norm": 0.34399452805519104, |
|
"learning_rate": 7.2e-05, |
|
"loss": 0.0108, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.5849802371541502, |
|
"grad_norm": 0.39466115832328796, |
|
"learning_rate": 7.4e-05, |
|
"loss": 0.0283, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.6007905138339921, |
|
"grad_norm": 0.31488093733787537, |
|
"learning_rate": 7.6e-05, |
|
"loss": 0.0178, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.616600790513834, |
|
"grad_norm": 0.32184290885925293, |
|
"learning_rate": 7.800000000000001e-05, |
|
"loss": 0.011, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.6324110671936759, |
|
"grad_norm": 0.395563006401062, |
|
"learning_rate": 8e-05, |
|
"loss": 0.0096, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.6482213438735178, |
|
"grad_norm": 0.22120489180088043, |
|
"learning_rate": 8.2e-05, |
|
"loss": 0.005, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.6640316205533597, |
|
"grad_norm": 0.3320792317390442, |
|
"learning_rate": 8.4e-05, |
|
"loss": 0.008, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.6798418972332015, |
|
"grad_norm": 0.28633660078048706, |
|
"learning_rate": 8.6e-05, |
|
"loss": 0.005, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.6956521739130435, |
|
"grad_norm": 0.4111138582229614, |
|
"learning_rate": 8.800000000000001e-05, |
|
"loss": 0.0084, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.7114624505928854, |
|
"grad_norm": 0.06865093857049942, |
|
"learning_rate": 9e-05, |
|
"loss": 0.0019, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.7272727272727273, |
|
"grad_norm": 0.05319277197122574, |
|
"learning_rate": 9.200000000000001e-05, |
|
"loss": 0.0016, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.7430830039525692, |
|
"grad_norm": 0.6792236566543579, |
|
"learning_rate": 9.4e-05, |
|
"loss": 0.1265, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.758893280632411, |
|
"grad_norm": 0.5738235712051392, |
|
"learning_rate": 9.6e-05, |
|
"loss": 0.0992, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.758893280632411, |
|
"eval_loss": 0.025439240038394928, |
|
"eval_runtime": 8.0993, |
|
"eval_samples_per_second": 13.211, |
|
"eval_steps_per_second": 3.334, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.7747035573122529, |
|
"grad_norm": 0.8376194834709167, |
|
"learning_rate": 9.8e-05, |
|
"loss": 0.1525, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.7905138339920948, |
|
"grad_norm": 0.18561908602714539, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0172, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.8063241106719368, |
|
"grad_norm": 0.31320735812187195, |
|
"learning_rate": 9.85470908713026e-05, |
|
"loss": 0.0038, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.8221343873517787, |
|
"grad_norm": 0.4068452715873718, |
|
"learning_rate": 9.42728012826605e-05, |
|
"loss": 0.0178, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.8379446640316206, |
|
"grad_norm": 0.16684125363826752, |
|
"learning_rate": 8.742553740855506e-05, |
|
"loss": 0.0087, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.8537549407114624, |
|
"grad_norm": 0.05175252631306648, |
|
"learning_rate": 7.840323733655778e-05, |
|
"loss": 0.0011, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.8695652173913043, |
|
"grad_norm": 0.018029799684882164, |
|
"learning_rate": 6.773024435212678e-05, |
|
"loss": 0.0006, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.8853754940711462, |
|
"grad_norm": 0.03412799909710884, |
|
"learning_rate": 5.602683401276615e-05, |
|
"loss": 0.0009, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.9011857707509882, |
|
"grad_norm": 0.06215568631887436, |
|
"learning_rate": 4.397316598723385e-05, |
|
"loss": 0.0011, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.9169960474308301, |
|
"grad_norm": 0.05936681851744652, |
|
"learning_rate": 3.226975564787322e-05, |
|
"loss": 0.0011, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.932806324110672, |
|
"grad_norm": 0.045637015253305435, |
|
"learning_rate": 2.1596762663442218e-05, |
|
"loss": 0.0011, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.9486166007905138, |
|
"grad_norm": 0.03515447676181793, |
|
"learning_rate": 1.257446259144494e-05, |
|
"loss": 0.001, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.9644268774703557, |
|
"grad_norm": 0.04813205078244209, |
|
"learning_rate": 5.727198717339511e-06, |
|
"loss": 0.0011, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.9802371541501976, |
|
"grad_norm": 0.05862165987491608, |
|
"learning_rate": 1.4529091286973995e-06, |
|
"loss": 0.0012, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.9960474308300395, |
|
"grad_norm": 0.07167425751686096, |
|
"learning_rate": 0.0, |
|
"loss": 0.0096, |
|
"step": 63 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 63, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 16, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 9264114014617600.0, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|