gsm8k-llama160m / trainer_state.json
gsm8k teacher fwd distill (commit a4fda50)
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.9850267379679143,
"global_step": 116,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.02,
"learning_rate": 5e-06,
"loss": 2.127,
"step": 1
},
{
"epoch": 0.03,
"learning_rate": 1e-05,
"loss": 2.0803,
"step": 2
},
{
"epoch": 0.05,
"learning_rate": 1.5000000000000002e-05,
"loss": 2.0116,
"step": 3
},
{
"epoch": 0.07,
"learning_rate": 2e-05,
"loss": 1.9464,
"step": 4
},
{
"epoch": 0.09,
"learning_rate": 1.9996066263830533e-05,
"loss": 1.8489,
"step": 5
},
{
"epoch": 0.1,
"learning_rate": 1.998426815017817e-05,
"loss": 1.6761,
"step": 6
},
{
"epoch": 0.12,
"learning_rate": 1.9964614941176194e-05,
"loss": 1.6074,
"step": 7
},
{
"epoch": 0.14,
"learning_rate": 1.9937122098932428e-05,
"loss": 1.538,
"step": 8
},
{
"epoch": 0.15,
"learning_rate": 1.9901811253364458e-05,
"loss": 1.5003,
"step": 9
},
{
"epoch": 0.17,
"learning_rate": 1.985871018518236e-05,
"loss": 1.4459,
"step": 10
},
{
"epoch": 0.19,
"learning_rate": 1.9807852804032306e-05,
"loss": 1.3862,
"step": 11
},
{
"epoch": 0.21,
"learning_rate": 1.9749279121818235e-05,
"loss": 1.2781,
"step": 12
},
{
"epoch": 0.22,
"learning_rate": 1.9683035221222617e-05,
"loss": 1.2523,
"step": 13
},
{
"epoch": 0.24,
"learning_rate": 1.9609173219450998e-05,
"loss": 1.2374,
"step": 14
},
{
"epoch": 0.26,
"learning_rate": 1.9527751227228964e-05,
"loss": 1.2061,
"step": 15
},
{
"epoch": 0.27,
"learning_rate": 1.9438833303083677e-05,
"loss": 1.1725,
"step": 16
},
{
"epoch": 0.29,
"learning_rate": 1.9342489402945997e-05,
"loss": 1.165,
"step": 17
},
{
"epoch": 0.31,
"learning_rate": 1.9238795325112867e-05,
"loss": 1.1516,
"step": 18
},
{
"epoch": 0.33,
"learning_rate": 1.912783265061319e-05,
"loss": 1.1238,
"step": 19
},
{
"epoch": 0.34,
"learning_rate": 1.900968867902419e-05,
"loss": 1.0967,
"step": 20
},
{
"epoch": 0.36,
"learning_rate": 1.8884456359788725e-05,
"loss": 1.1067,
"step": 21
},
{
"epoch": 0.38,
"learning_rate": 1.8752234219087538e-05,
"loss": 1.0945,
"step": 22
},
{
"epoch": 0.39,
"learning_rate": 1.8613126282324092e-05,
"loss": 1.0476,
"step": 23
},
{
"epoch": 0.41,
"learning_rate": 1.8467241992282842e-05,
"loss": 1.0808,
"step": 24
},
{
"epoch": 0.43,
"learning_rate": 1.8314696123025456e-05,
"loss": 1.0276,
"step": 25
},
{
"epoch": 0.44,
"learning_rate": 1.8155608689592604e-05,
"loss": 1.0342,
"step": 26
},
{
"epoch": 0.46,
"learning_rate": 1.7990104853582494e-05,
"loss": 1.0136,
"step": 27
},
{
"epoch": 0.48,
"learning_rate": 1.78183148246803e-05,
"loss": 1.0192,
"step": 28
},
{
"epoch": 0.5,
"learning_rate": 1.7640373758216075e-05,
"loss": 1.0024,
"step": 29
},
{
"epoch": 0.51,
"learning_rate": 1.7456421648831658e-05,
"loss": 0.984,
"step": 30
},
{
"epoch": 0.53,
"learning_rate": 1.7266603220340273e-05,
"loss": 0.9774,
"step": 31
},
{
"epoch": 0.55,
"learning_rate": 1.7071067811865477e-05,
"loss": 0.9401,
"step": 32
},
{
"epoch": 0.56,
"learning_rate": 1.686996926034902e-05,
"loss": 0.9797,
"step": 33
},
{
"epoch": 0.58,
"learning_rate": 1.6663465779520042e-05,
"loss": 0.9534,
"step": 34
},
{
"epoch": 0.6,
"learning_rate": 1.645171983542088e-05,
"loss": 0.9148,
"step": 35
},
{
"epoch": 0.62,
"learning_rate": 1.6234898018587336e-05,
"loss": 0.9408,
"step": 36
},
{
"epoch": 0.63,
"learning_rate": 1.601317091298406e-05,
"loss": 0.9417,
"step": 37
},
{
"epoch": 0.65,
"learning_rate": 1.578671296179806e-05,
"loss": 0.9484,
"step": 38
},
{
"epoch": 0.67,
"learning_rate": 1.5555702330196024e-05,
"loss": 0.9714,
"step": 39
},
{
"epoch": 0.68,
"learning_rate": 1.5320320765153367e-05,
"loss": 0.9233,
"step": 40
},
{
"epoch": 0.7,
"learning_rate": 1.5080753452465296e-05,
"loss": 0.9293,
"step": 41
},
{
"epoch": 0.72,
"learning_rate": 1.4837188871052399e-05,
"loss": 0.9469,
"step": 42
},
{
"epoch": 0.74,
"learning_rate": 1.4589818644675378e-05,
"loss": 0.9224,
"step": 43
},
{
"epoch": 0.75,
"learning_rate": 1.4338837391175582e-05,
"loss": 0.8792,
"step": 44
},
{
"epoch": 0.77,
"learning_rate": 1.4084442569359964e-05,
"loss": 0.9298,
"step": 45
},
{
"epoch": 0.79,
"learning_rate": 1.3826834323650899e-05,
"loss": 0.8869,
"step": 46
},
{
"epoch": 0.8,
"learning_rate": 1.3566215326623131e-05,
"loss": 0.9078,
"step": 47
},
{
"epoch": 0.82,
"learning_rate": 1.3302790619551673e-05,
"loss": 0.8703,
"step": 48
},
{
"epoch": 0.84,
"learning_rate": 1.3036767451096148e-05,
"loss": 0.8648,
"step": 49
},
{
"epoch": 0.86,
"learning_rate": 1.2768355114248493e-05,
"loss": 0.9123,
"step": 50
},
{
"epoch": 0.87,
"learning_rate": 1.249776478167227e-05,
"loss": 0.8753,
"step": 51
},
{
"epoch": 0.89,
"learning_rate": 1.2225209339563144e-05,
"loss": 0.8996,
"step": 52
},
{
"epoch": 0.91,
"learning_rate": 1.1950903220161286e-05,
"loss": 0.891,
"step": 53
},
{
"epoch": 0.92,
"learning_rate": 1.1675062233047365e-05,
"loss": 0.8497,
"step": 54
},
{
"epoch": 0.94,
"learning_rate": 1.1397903395354996e-05,
"loss": 0.9031,
"step": 55
},
{
"epoch": 0.96,
"learning_rate": 1.1119644761033079e-05,
"loss": 0.8373,
"step": 56
},
{
"epoch": 0.98,
"learning_rate": 1.0840505249292477e-05,
"loss": 0.8297,
"step": 57
},
{
"epoch": 0.99,
"learning_rate": 1.0560704472371919e-05,
"loss": 0.8923,
"step": 58
},
{
"epoch": 1.01,
"learning_rate": 1.028046256275869e-05,
"loss": 0.8534,
"step": 59
},
{
"epoch": 1.03,
"learning_rate": 1e-05,
"loss": 0.8152,
"step": 60
},
{
"epoch": 1.04,
"learning_rate": 9.719537437241311e-06,
"loss": 0.8488,
"step": 61
},
{
"epoch": 1.06,
"learning_rate": 9.439295527628083e-06,
"loss": 0.8453,
"step": 62
},
{
"epoch": 1.08,
"learning_rate": 9.159494750707527e-06,
"loss": 0.8441,
"step": 63
},
{
"epoch": 1.1,
"learning_rate": 8.880355238966923e-06,
"loss": 0.8281,
"step": 64
},
{
"epoch": 1.11,
"learning_rate": 8.602096604645009e-06,
"loss": 0.8209,
"step": 65
},
{
"epoch": 1.13,
"learning_rate": 8.324937766952638e-06,
"loss": 0.7932,
"step": 66
},
{
"epoch": 1.15,
"learning_rate": 8.04909677983872e-06,
"loss": 0.806,
"step": 67
},
{
"epoch": 1.16,
"learning_rate": 7.774790660436857e-06,
"loss": 0.8218,
"step": 68
},
{
"epoch": 1.18,
"learning_rate": 7.50223521832773e-06,
"loss": 0.7919,
"step": 69
},
{
"epoch": 1.2,
"learning_rate": 7.2316448857515076e-06,
"loss": 0.7795,
"step": 70
},
{
"epoch": 1.21,
"learning_rate": 6.963232548903853e-06,
"loss": 0.7774,
"step": 71
},
{
"epoch": 1.23,
"learning_rate": 6.697209380448333e-06,
"loss": 0.7844,
"step": 72
},
{
"epoch": 1.25,
"learning_rate": 6.43378467337687e-06,
"loss": 0.8,
"step": 73
},
{
"epoch": 1.27,
"learning_rate": 6.173165676349103e-06,
"loss": 0.7743,
"step": 74
},
{
"epoch": 1.28,
"learning_rate": 5.91555743064004e-06,
"loss": 0.7808,
"step": 75
},
{
"epoch": 1.3,
"learning_rate": 5.66116260882442e-06,
"loss": 0.7804,
"step": 76
},
{
"epoch": 1.32,
"learning_rate": 5.410181355324622e-06,
"loss": 0.8098,
"step": 77
},
{
"epoch": 1.33,
"learning_rate": 5.1628111289476025e-06,
"loss": 0.769,
"step": 78
},
{
"epoch": 1.35,
"learning_rate": 4.919246547534709e-06,
"loss": 0.7901,
"step": 79
},
{
"epoch": 1.37,
"learning_rate": 4.679679234846636e-06,
"loss": 0.8071,
"step": 80
},
{
"epoch": 1.39,
"learning_rate": 4.444297669803981e-06,
"loss": 0.7557,
"step": 81
},
{
"epoch": 1.4,
"learning_rate": 4.213287038201943e-06,
"loss": 0.8192,
"step": 82
},
{
"epoch": 1.42,
"learning_rate": 3.986829087015941e-06,
"loss": 0.7987,
"step": 83
},
{
"epoch": 1.44,
"learning_rate": 3.7651019814126656e-06,
"loss": 0.7918,
"step": 84
},
{
"epoch": 1.45,
"learning_rate": 3.5482801645791266e-06,
"loss": 0.8145,
"step": 85
},
{
"epoch": 1.47,
"learning_rate": 3.3365342204799613e-06,
"loss": 0.7737,
"step": 86
},
{
"epoch": 1.49,
"learning_rate": 3.1300307396509833e-06,
"loss": 0.8118,
"step": 87
},
{
"epoch": 1.51,
"learning_rate": 2.9289321881345257e-06,
"loss": 0.7845,
"step": 88
},
{
"epoch": 1.52,
"learning_rate": 2.7333967796597317e-06,
"loss": 0.7557,
"step": 89
},
{
"epoch": 1.54,
"learning_rate": 2.5435783511683444e-06,
"loss": 0.7898,
"step": 90
},
{
"epoch": 1.56,
"learning_rate": 2.3596262417839256e-06,
"loss": 0.7754,
"step": 91
},
{
"epoch": 1.57,
"learning_rate": 2.1816851753197023e-06,
"loss": 0.7875,
"step": 92
},
{
"epoch": 1.59,
"learning_rate": 2.009895146417512e-06,
"loss": 0.7739,
"step": 93
},
{
"epoch": 1.61,
"learning_rate": 1.8443913104073984e-06,
"loss": 0.7745,
"step": 94
},
{
"epoch": 1.63,
"learning_rate": 1.6853038769745466e-06,
"loss": 0.7868,
"step": 95
},
{
"epoch": 1.64,
"learning_rate": 1.5327580077171589e-06,
"loss": 0.7942,
"step": 96
},
{
"epoch": 1.66,
"learning_rate": 1.3868737176759105e-06,
"loss": 0.8101,
"step": 97
},
{
"epoch": 1.68,
"learning_rate": 1.2477657809124632e-06,
"loss": 0.7955,
"step": 98
},
{
"epoch": 1.69,
"learning_rate": 1.1155436402112785e-06,
"loss": 0.7964,
"step": 99
},
{
"epoch": 1.71,
"learning_rate": 9.903113209758098e-07,
"loss": 0.8203,
"step": 100
},
{
"epoch": 1.73,
"learning_rate": 8.721673493868111e-07,
"loss": 0.8105,
"step": 101
},
{
"epoch": 1.75,
"learning_rate": 7.612046748871327e-07,
"loss": 0.775,
"step": 102
},
{
"epoch": 1.76,
"learning_rate": 6.57510597054003e-07,
"loss": 0.77,
"step": 103
},
{
"epoch": 1.78,
"learning_rate": 5.611666969163243e-07,
"loss": 0.7998,
"step": 104
},
{
"epoch": 1.8,
"learning_rate": 4.7224877277103673e-07,
"loss": 0.7854,
"step": 105
},
{
"epoch": 1.81,
"learning_rate": 3.908267805490051e-07,
"loss": 0.8004,
"step": 106
},
{
"epoch": 1.83,
"learning_rate": 3.1696477877738664e-07,
"loss": 0.7801,
"step": 107
},
{
"epoch": 1.85,
"learning_rate": 2.507208781817638e-07,
"loss": 0.7748,
"step": 108
},
{
"epoch": 1.87,
"learning_rate": 1.921471959676957e-07,
"loss": 0.8109,
"step": 109
},
{
"epoch": 1.88,
"learning_rate": 1.4128981481764115e-07,
"loss": 0.8023,
"step": 110
},
{
"epoch": 1.9,
"learning_rate": 9.818874663554356e-08,
"loss": 0.7938,
"step": 111
},
{
"epoch": 1.92,
"learning_rate": 6.287790106757396e-08,
"loss": 0.8001,
"step": 112
},
{
"epoch": 1.93,
"learning_rate": 3.538505882380916e-08,
"loss": 0.775,
"step": 113
},
{
"epoch": 1.95,
"learning_rate": 1.5731849821833955e-08,
"loss": 0.7893,
"step": 114
},
{
"epoch": 1.97,
"learning_rate": 3.933736169471347e-09,
"loss": 0.7897,
"step": 115
},
{
"epoch": 1.99,
"learning_rate": 0.0,
"loss": 0.7657,
"step": 116
},
{
"epoch": 1.99,
"step": 116,
"total_flos": 2.5137614882340864e+16,
"train_loss": 0.9649588188220715,
"train_runtime": 7099.8763,
"train_samples_per_second": 2.105,
"train_steps_per_second": 0.016
}
],
"max_steps": 116,
"num_train_epochs": 2,
"total_flos": 2.5137614882340864e+16,
"trial_name": null,
"trial_params": null
}