{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9960474308300395, "eval_steps": 16, "global_step": 63, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.015810276679841896, "grad_norm": 3.106086254119873, "learning_rate": 2.0000000000000003e-06, "loss": 0.2762, "step": 1 }, { "epoch": 0.015810276679841896, "eval_loss": 0.34248441457748413, "eval_runtime": 9.4978, "eval_samples_per_second": 11.266, "eval_steps_per_second": 2.843, "step": 1 }, { "epoch": 0.03162055335968379, "grad_norm": 0.6217677593231201, "learning_rate": 4.000000000000001e-06, "loss": 0.2623, "step": 2 }, { "epoch": 0.04743083003952569, "grad_norm": 1.1419365406036377, "learning_rate": 6e-06, "loss": 0.3943, "step": 3 }, { "epoch": 0.06324110671936758, "grad_norm": 0.8881447315216064, "learning_rate": 8.000000000000001e-06, "loss": 0.1572, "step": 4 }, { "epoch": 0.07905138339920949, "grad_norm": 0.7020868062973022, "learning_rate": 1e-05, "loss": 0.1578, "step": 5 }, { "epoch": 0.09486166007905138, "grad_norm": 1.2612448930740356, "learning_rate": 1.2e-05, "loss": 0.2245, "step": 6 }, { "epoch": 0.11067193675889328, "grad_norm": 1.2954490184783936, "learning_rate": 1.4000000000000001e-05, "loss": 0.2101, "step": 7 }, { "epoch": 0.12648221343873517, "grad_norm": 0.8434045314788818, "learning_rate": 1.6000000000000003e-05, "loss": 0.1529, "step": 8 }, { "epoch": 0.1422924901185771, "grad_norm": 0.9214808940887451, "learning_rate": 1.8e-05, "loss": 0.1522, "step": 9 }, { "epoch": 0.15810276679841898, "grad_norm": 1.3638683557510376, "learning_rate": 2e-05, "loss": 0.3282, "step": 10 }, { "epoch": 0.17391304347826086, "grad_norm": 2.2487895488739014, "learning_rate": 2.2000000000000003e-05, "loss": 0.472, "step": 11 }, { "epoch": 0.18972332015810275, "grad_norm": 1.98398756980896, "learning_rate": 2.4e-05, "loss": 0.4099, "step": 12 }, { "epoch": 0.20553359683794467, "grad_norm": 1.796846866607666, "learning_rate": 2.6000000000000002e-05, "loss": 0.3184, "step": 13 }, { "epoch": 0.22134387351778656, "grad_norm": 1.6363037824630737, "learning_rate": 2.8000000000000003e-05, "loss": 0.3551, "step": 14 }, { "epoch": 0.23715415019762845, "grad_norm": 1.927720308303833, "learning_rate": 3e-05, "loss": 0.405, "step": 15 }, { "epoch": 0.25296442687747034, "grad_norm": 1.0266072750091553, "learning_rate": 3.2000000000000005e-05, "loss": 0.2991, "step": 16 }, { "epoch": 0.25296442687747034, "eval_loss": 0.21815715730190277, "eval_runtime": 8.042, "eval_samples_per_second": 13.305, "eval_steps_per_second": 3.357, "step": 16 }, { "epoch": 0.26877470355731226, "grad_norm": 0.4768655002117157, "learning_rate": 3.4000000000000007e-05, "loss": 0.2191, "step": 17 }, { "epoch": 0.2845849802371542, "grad_norm": 0.9923710823059082, "learning_rate": 3.6e-05, "loss": 0.384, "step": 18 }, { "epoch": 0.30039525691699603, "grad_norm": 0.5063422322273254, "learning_rate": 3.8e-05, "loss": 0.1493, "step": 19 }, { "epoch": 0.31620553359683795, "grad_norm": 0.5010712742805481, "learning_rate": 4e-05, "loss": 0.0675, "step": 20 }, { "epoch": 0.33201581027667987, "grad_norm": 0.48852574825286865, "learning_rate": 4.2e-05, "loss": 0.0739, "step": 21 }, { "epoch": 0.34782608695652173, "grad_norm": 0.4387320578098297, "learning_rate": 4.4000000000000006e-05, "loss": 0.0751, "step": 22 }, { "epoch": 0.36363636363636365, "grad_norm": 0.3455885648727417, "learning_rate": 4.600000000000001e-05, "loss": 0.0516, "step": 23 }, { "epoch": 0.3794466403162055, "grad_norm": 0.29778942465782166, "learning_rate": 4.8e-05, "loss": 0.0421, "step": 24 }, { "epoch": 0.3952569169960474, "grad_norm": 0.3956562578678131, "learning_rate": 5e-05, "loss": 0.057, "step": 25 }, { "epoch": 0.41106719367588934, "grad_norm": 0.8842503428459167, "learning_rate": 5.2000000000000004e-05, "loss": 0.1188, "step": 26 }, { "epoch": 0.4268774703557312, "grad_norm": 0.9197725653648376, "learning_rate": 5.4000000000000005e-05, "loss": 0.1128, "step": 27 }, { "epoch": 0.4426877470355731, "grad_norm": 0.9175456762313843, "learning_rate": 5.6000000000000006e-05, "loss": 0.0628, "step": 28 }, { "epoch": 0.45849802371541504, "grad_norm": 0.5987579822540283, "learning_rate": 5.8e-05, "loss": 0.0499, "step": 29 }, { "epoch": 0.4743083003952569, "grad_norm": 0.8026472330093384, "learning_rate": 6e-05, "loss": 0.0619, "step": 30 }, { "epoch": 0.4901185770750988, "grad_norm": 0.5789671540260315, "learning_rate": 6.2e-05, "loss": 0.0394, "step": 31 }, { "epoch": 0.5059288537549407, "grad_norm": 0.7335872054100037, "learning_rate": 6.400000000000001e-05, "loss": 0.2007, "step": 32 }, { "epoch": 0.5059288537549407, "eval_loss": 0.050875596702098846, "eval_runtime": 8.2485, "eval_samples_per_second": 12.972, "eval_steps_per_second": 3.273, "step": 32 }, { "epoch": 0.5217391304347826, "grad_norm": 0.6524657011032104, "learning_rate": 6.6e-05, "loss": 0.1999, "step": 33 }, { "epoch": 0.5375494071146245, "grad_norm": 1.1696100234985352, "learning_rate": 6.800000000000001e-05, "loss": 0.3284, "step": 34 }, { "epoch": 0.5533596837944664, "grad_norm": 0.16617901623249054, "learning_rate": 7e-05, "loss": 0.0075, "step": 35 }, { "epoch": 0.5691699604743083, "grad_norm": 0.34399452805519104, "learning_rate": 7.2e-05, "loss": 0.0108, "step": 36 }, { "epoch": 0.5849802371541502, "grad_norm": 0.39466115832328796, "learning_rate": 7.4e-05, "loss": 0.0283, "step": 37 }, { "epoch": 0.6007905138339921, "grad_norm": 0.31488093733787537, "learning_rate": 7.6e-05, "loss": 0.0178, "step": 38 }, { "epoch": 0.616600790513834, "grad_norm": 0.32184290885925293, "learning_rate": 7.800000000000001e-05, "loss": 0.011, "step": 39 }, { "epoch": 0.6324110671936759, "grad_norm": 0.395563006401062, "learning_rate": 8e-05, "loss": 0.0096, "step": 40 }, { "epoch": 0.6482213438735178, "grad_norm": 0.22120489180088043, "learning_rate": 8.2e-05, "loss": 0.005, "step": 41 }, { "epoch": 0.6640316205533597, "grad_norm": 0.3320792317390442, "learning_rate": 8.4e-05, "loss": 0.008, "step": 42 }, { "epoch": 0.6798418972332015, "grad_norm": 0.28633660078048706, "learning_rate": 8.6e-05, "loss": 0.005, "step": 43 }, { "epoch": 0.6956521739130435, "grad_norm": 0.4111138582229614, "learning_rate": 8.800000000000001e-05, "loss": 0.0084, "step": 44 }, { "epoch": 0.7114624505928854, "grad_norm": 0.06865093857049942, "learning_rate": 9e-05, "loss": 0.0019, "step": 45 }, { "epoch": 0.7272727272727273, "grad_norm": 0.05319277197122574, "learning_rate": 9.200000000000001e-05, "loss": 0.0016, "step": 46 }, { "epoch": 0.7430830039525692, "grad_norm": 0.6792236566543579, "learning_rate": 9.4e-05, "loss": 0.1265, "step": 47 }, { "epoch": 0.758893280632411, "grad_norm": 0.5738235712051392, "learning_rate": 9.6e-05, "loss": 0.0992, "step": 48 }, { "epoch": 0.758893280632411, "eval_loss": 0.025439240038394928, "eval_runtime": 8.0993, "eval_samples_per_second": 13.211, "eval_steps_per_second": 3.334, "step": 48 }, { "epoch": 0.7747035573122529, "grad_norm": 0.8376194834709167, "learning_rate": 9.8e-05, "loss": 0.1525, "step": 49 }, { "epoch": 0.7905138339920948, "grad_norm": 0.18561908602714539, "learning_rate": 0.0001, "loss": 0.0172, "step": 50 }, { "epoch": 0.8063241106719368, "grad_norm": 0.31320735812187195, "learning_rate": 9.85470908713026e-05, "loss": 0.0038, "step": 51 }, { "epoch": 0.8221343873517787, "grad_norm": 0.4068452715873718, "learning_rate": 9.42728012826605e-05, "loss": 0.0178, "step": 52 }, { "epoch": 0.8379446640316206, "grad_norm": 0.16684125363826752, "learning_rate": 8.742553740855506e-05, "loss": 0.0087, "step": 53 }, { "epoch": 0.8537549407114624, "grad_norm": 0.05175252631306648, "learning_rate": 7.840323733655778e-05, "loss": 0.0011, "step": 54 }, { "epoch": 0.8695652173913043, "grad_norm": 0.018029799684882164, "learning_rate": 6.773024435212678e-05, "loss": 0.0006, "step": 55 }, { "epoch": 0.8853754940711462, "grad_norm": 0.03412799909710884, "learning_rate": 5.602683401276615e-05, "loss": 0.0009, "step": 56 }, { "epoch": 0.9011857707509882, "grad_norm": 0.06215568631887436, "learning_rate": 4.397316598723385e-05, "loss": 0.0011, "step": 57 }, { "epoch": 0.9169960474308301, "grad_norm": 0.05936681851744652, "learning_rate": 3.226975564787322e-05, "loss": 0.0011, "step": 58 }, { "epoch": 0.932806324110672, "grad_norm": 0.045637015253305435, "learning_rate": 2.1596762663442218e-05, "loss": 0.0011, "step": 59 }, { "epoch": 0.9486166007905138, "grad_norm": 0.03515447676181793, "learning_rate": 1.257446259144494e-05, "loss": 0.001, "step": 60 }, { "epoch": 0.9644268774703557, "grad_norm": 0.04813205078244209, "learning_rate": 5.727198717339511e-06, "loss": 0.0011, "step": 61 }, { "epoch": 0.9802371541501976, "grad_norm": 0.05862165987491608, "learning_rate": 1.4529091286973995e-06, "loss": 0.0012, "step": 62 }, { "epoch": 0.9960474308300395, "grad_norm": 0.07167425751686096, "learning_rate": 0.0, "loss": 0.0096, "step": 63 } ], "logging_steps": 1, "max_steps": 63, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 16, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9264114014617600.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }