{
"best_metric": 0.445011705160141,
"best_model_checkpoint": "miner_id_24/checkpoint-100",
"epoch": 0.12202562538133008,
"eval_steps": 50,
"global_step": 100,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0012202562538133007,
"grad_norm": 1.5322504043579102,
"learning_rate": 1e-06,
"loss": 0.98,
"step": 1
},
{
"epoch": 0.0012202562538133007,
"eval_loss": 1.7899479866027832,
"eval_runtime": 39.7434,
"eval_samples_per_second": 8.681,
"eval_steps_per_second": 2.189,
"step": 1
},
{
"epoch": 0.0024405125076266015,
"grad_norm": 2.2656161785125732,
"learning_rate": 2e-06,
"loss": 0.8622,
"step": 2
},
{
"epoch": 0.0036607687614399025,
"grad_norm": 1.5969613790512085,
"learning_rate": 3e-06,
"loss": 0.8886,
"step": 3
},
{
"epoch": 0.004881025015253203,
"grad_norm": 1.7182639837265015,
"learning_rate": 4e-06,
"loss": 0.9019,
"step": 4
},
{
"epoch": 0.006101281269066504,
"grad_norm": 1.8653854131698608,
"learning_rate": 4.9999999999999996e-06,
"loss": 0.926,
"step": 5
},
{
"epoch": 0.007321537522879805,
"grad_norm": 1.7472790479660034,
"learning_rate": 6e-06,
"loss": 0.8747,
"step": 6
},
{
"epoch": 0.008541793776693106,
"grad_norm": 2.0147242546081543,
"learning_rate": 7e-06,
"loss": 1.1381,
"step": 7
},
{
"epoch": 0.009762050030506406,
"grad_norm": 2.0822384357452393,
"learning_rate": 8e-06,
"loss": 1.0722,
"step": 8
},
{
"epoch": 0.010982306284319707,
"grad_norm": 2.372750997543335,
"learning_rate": 9e-06,
"loss": 1.0916,
"step": 9
},
{
"epoch": 0.012202562538133009,
"grad_norm": 2.3036062717437744,
"learning_rate": 9.999999999999999e-06,
"loss": 1.2219,
"step": 10
},
{
"epoch": 0.013422818791946308,
"grad_norm": 1.9618552923202515,
"learning_rate": 1.1e-05,
"loss": 1.1836,
"step": 11
},
{
"epoch": 0.01464307504575961,
"grad_norm": 3.2483971118927,
"learning_rate": 1.2e-05,
"loss": 0.994,
"step": 12
},
{
"epoch": 0.01586333129957291,
"grad_norm": 2.3026437759399414,
"learning_rate": 1.3000000000000001e-05,
"loss": 0.8704,
"step": 13
},
{
"epoch": 0.017083587553386213,
"grad_norm": 3.6897010803222656,
"learning_rate": 1.4e-05,
"loss": 1.3528,
"step": 14
},
{
"epoch": 0.018303843807199512,
"grad_norm": 2.8790388107299805,
"learning_rate": 1.5e-05,
"loss": 0.8721,
"step": 15
},
{
"epoch": 0.019524100061012812,
"grad_norm": 2.256960391998291,
"learning_rate": 1.6e-05,
"loss": 0.8006,
"step": 16
},
{
"epoch": 0.020744356314826115,
"grad_norm": 2.7027554512023926,
"learning_rate": 1.7e-05,
"loss": 1.0828,
"step": 17
},
{
"epoch": 0.021964612568639415,
"grad_norm": 2.999537467956543,
"learning_rate": 1.8e-05,
"loss": 1.0613,
"step": 18
},
{
"epoch": 0.023184868822452714,
"grad_norm": 4.485517501831055,
"learning_rate": 1.9e-05,
"loss": 1.0636,
"step": 19
},
{
"epoch": 0.024405125076266018,
"grad_norm": 3.9415929317474365,
"learning_rate": 1.9999999999999998e-05,
"loss": 1.2694,
"step": 20
},
{
"epoch": 0.025625381330079317,
"grad_norm": 2.4583797454833984,
"learning_rate": 2.1e-05,
"loss": 0.8239,
"step": 21
},
{
"epoch": 0.026845637583892617,
"grad_norm": 4.539880752563477,
"learning_rate": 2.2e-05,
"loss": 1.069,
"step": 22
},
{
"epoch": 0.028065893837705917,
"grad_norm": 2.4033968448638916,
"learning_rate": 2.3000000000000003e-05,
"loss": 0.7204,
"step": 23
},
{
"epoch": 0.02928615009151922,
"grad_norm": 3.703325033187866,
"learning_rate": 2.4e-05,
"loss": 1.0484,
"step": 24
},
{
"epoch": 0.03050640634533252,
"grad_norm": 3.5305097103118896,
"learning_rate": 2.5e-05,
"loss": 0.8979,
"step": 25
},
{
"epoch": 0.03172666259914582,
"grad_norm": 2.919682741165161,
"learning_rate": 2.6000000000000002e-05,
"loss": 0.7295,
"step": 26
},
{
"epoch": 0.03294691885295912,
"grad_norm": 3.6534736156463623,
"learning_rate": 2.7000000000000002e-05,
"loss": 1.1221,
"step": 27
},
{
"epoch": 0.034167175106772425,
"grad_norm": 2.0373101234436035,
"learning_rate": 2.8e-05,
"loss": 0.5132,
"step": 28
},
{
"epoch": 0.035387431360585725,
"grad_norm": 2.241091251373291,
"learning_rate": 2.9e-05,
"loss": 0.3159,
"step": 29
},
{
"epoch": 0.036607687614399025,
"grad_norm": 2.355558156967163,
"learning_rate": 3e-05,
"loss": 0.7025,
"step": 30
},
{
"epoch": 0.037827943868212324,
"grad_norm": 3.299184799194336,
"learning_rate": 2.9984895998119723e-05,
"loss": 0.4992,
"step": 31
},
{
"epoch": 0.039048200122025624,
"grad_norm": 5.94033145904541,
"learning_rate": 2.993961440992859e-05,
"loss": 1.2181,
"step": 32
},
{
"epoch": 0.040268456375838924,
"grad_norm": 2.8268990516662598,
"learning_rate": 2.9864246426519023e-05,
"loss": 0.4696,
"step": 33
},
{
"epoch": 0.04148871262965223,
"grad_norm": 2.7509398460388184,
"learning_rate": 2.9758943828979444e-05,
"loss": 0.4154,
"step": 34
},
{
"epoch": 0.04270896888346553,
"grad_norm": 2.9647717475891113,
"learning_rate": 2.9623918682727355e-05,
"loss": 0.67,
"step": 35
},
{
"epoch": 0.04392922513727883,
"grad_norm": 2.9392335414886475,
"learning_rate": 2.9459442910437798e-05,
"loss": 0.4028,
"step": 36
},
{
"epoch": 0.04514948139109213,
"grad_norm": 2.092904806137085,
"learning_rate": 2.9265847744427305e-05,
"loss": 0.7793,
"step": 37
},
{
"epoch": 0.04636973764490543,
"grad_norm": 2.533620595932007,
"learning_rate": 2.904352305959606e-05,
"loss": 0.3301,
"step": 38
},
{
"epoch": 0.04758999389871873,
"grad_norm": 2.668667793273926,
"learning_rate": 2.8792916588271762e-05,
"loss": 0.4193,
"step": 39
},
{
"epoch": 0.048810250152532035,
"grad_norm": 4.816693305969238,
"learning_rate": 2.8514533018536286e-05,
"loss": 0.5477,
"step": 40
},
{
"epoch": 0.050030506406345335,
"grad_norm": 4.066459655761719,
"learning_rate": 2.820893297785107e-05,
"loss": 0.4218,
"step": 41
},
{
"epoch": 0.051250762660158634,
"grad_norm": 3.1090385913848877,
"learning_rate": 2.7876731904027994e-05,
"loss": 0.2515,
"step": 42
},
{
"epoch": 0.052471018913971934,
"grad_norm": 2.967532157897949,
"learning_rate": 2.7518598805819542e-05,
"loss": 0.3014,
"step": 43
},
{
"epoch": 0.053691275167785234,
"grad_norm": 1.980920433998108,
"learning_rate": 2.7135254915624213e-05,
"loss": 0.2278,
"step": 44
},
{
"epoch": 0.05491153142159853,
"grad_norm": 2.5400314331054688,
"learning_rate": 2.672747223702045e-05,
"loss": 0.2601,
"step": 45
},
{
"epoch": 0.05613178767541183,
"grad_norm": 3.4992973804473877,
"learning_rate": 2.6296071990054167e-05,
"loss": 0.4071,
"step": 46
},
{
"epoch": 0.05735204392922514,
"grad_norm": 1.1667309999465942,
"learning_rate": 2.5841922957410875e-05,
"loss": 0.0667,
"step": 47
},
{
"epoch": 0.05857230018303844,
"grad_norm": 4.832851886749268,
"learning_rate": 2.5365939734802973e-05,
"loss": 0.3789,
"step": 48
},
{
"epoch": 0.05979255643685174,
"grad_norm": 5.837202072143555,
"learning_rate": 2.4869080889095693e-05,
"loss": 0.4912,
"step": 49
},
{
"epoch": 0.06101281269066504,
"grad_norm": 7.007938861846924,
"learning_rate": 2.4352347027881003e-05,
"loss": 0.2656,
"step": 50
},
{
"epoch": 0.06101281269066504,
"eval_loss": 0.492077499628067,
"eval_runtime": 40.2455,
"eval_samples_per_second": 8.572,
"eval_steps_per_second": 2.162,
"step": 50
},
{
"epoch": 0.06223306894447834,
"grad_norm": 1.555983543395996,
"learning_rate": 2.3816778784387097e-05,
"loss": 0.4603,
"step": 51
},
{
"epoch": 0.06345332519829164,
"grad_norm": 1.726062297821045,
"learning_rate": 2.3263454721781537e-05,
"loss": 0.4604,
"step": 52
},
{
"epoch": 0.06467358145210494,
"grad_norm": 1.718953013420105,
"learning_rate": 2.2693489161088592e-05,
"loss": 0.6968,
"step": 53
},
{
"epoch": 0.06589383770591824,
"grad_norm": 1.4064198732376099,
"learning_rate": 2.210802993709498e-05,
"loss": 0.5862,
"step": 54
},
{
"epoch": 0.06711409395973154,
"grad_norm": 1.369136095046997,
"learning_rate": 2.1508256086763372e-05,
"loss": 0.4746,
"step": 55
},
{
"epoch": 0.06833435021354485,
"grad_norm": 1.502777338027954,
"learning_rate": 2.0895375474808857e-05,
"loss": 0.5513,
"step": 56
},
{
"epoch": 0.06955460646735814,
"grad_norm": 1.6199352741241455,
"learning_rate": 2.0270622361220143e-05,
"loss": 0.4704,
"step": 57
},
{
"epoch": 0.07077486272117145,
"grad_norm": 1.1840685606002808,
"learning_rate": 1.963525491562421e-05,
"loss": 0.3981,
"step": 58
},
{
"epoch": 0.07199511897498474,
"grad_norm": 1.39909029006958,
"learning_rate": 1.8990552683500128e-05,
"loss": 0.2991,
"step": 59
},
{
"epoch": 0.07321537522879805,
"grad_norm": 1.3725414276123047,
"learning_rate": 1.8337814009344716e-05,
"loss": 0.7763,
"step": 60
},
{
"epoch": 0.07443563148261134,
"grad_norm": 1.4019646644592285,
"learning_rate": 1.767835342197955e-05,
"loss": 0.5279,
"step": 61
},
{
"epoch": 0.07565588773642465,
"grad_norm": 1.5673801898956299,
"learning_rate": 1.7013498987264832e-05,
"loss": 0.6006,
"step": 62
},
{
"epoch": 0.07687614399023796,
"grad_norm": 1.1575379371643066,
"learning_rate": 1.6344589633551502e-05,
"loss": 0.3972,
"step": 63
},
{
"epoch": 0.07809640024405125,
"grad_norm": 1.2837570905685425,
"learning_rate": 1.5672972455257726e-05,
"loss": 0.5369,
"step": 64
},
{
"epoch": 0.07931665649786455,
"grad_norm": 1.871349811553955,
"learning_rate": 1.5e-05,
"loss": 0.2951,
"step": 65
},
{
"epoch": 0.08053691275167785,
"grad_norm": 1.4740747213363647,
"learning_rate": 1.4327027544742281e-05,
"loss": 0.7773,
"step": 66
},
{
"epoch": 0.08175716900549115,
"grad_norm": 1.1336884498596191,
"learning_rate": 1.36554103664485e-05,
"loss": 0.3291,
"step": 67
},
{
"epoch": 0.08297742525930446,
"grad_norm": 1.4894648790359497,
"learning_rate": 1.2986501012735174e-05,
"loss": 0.5024,
"step": 68
},
{
"epoch": 0.08419768151311775,
"grad_norm": 1.3414102792739868,
"learning_rate": 1.2321646578020452e-05,
"loss": 0.4071,
"step": 69
},
{
"epoch": 0.08541793776693106,
"grad_norm": 1.8325251340866089,
"learning_rate": 1.1662185990655285e-05,
"loss": 0.8575,
"step": 70
},
{
"epoch": 0.08663819402074435,
"grad_norm": 1.1957671642303467,
"learning_rate": 1.1009447316499875e-05,
"loss": 0.3624,
"step": 71
},
{
"epoch": 0.08785845027455766,
"grad_norm": 1.479073405265808,
"learning_rate": 1.036474508437579e-05,
"loss": 0.3765,
"step": 72
},
{
"epoch": 0.08907870652837095,
"grad_norm": 1.1539698839187622,
"learning_rate": 9.729377638779859e-06,
"loss": 0.318,
"step": 73
},
{
"epoch": 0.09029896278218426,
"grad_norm": 1.653338074684143,
"learning_rate": 9.104624525191147e-06,
"loss": 0.3384,
"step": 74
},
{
"epoch": 0.09151921903599756,
"grad_norm": 1.7880301475524902,
"learning_rate": 8.491743913236629e-06,
"loss": 0.7647,
"step": 75
},
{
"epoch": 0.09273947528981086,
"grad_norm": 1.749808669090271,
"learning_rate": 7.89197006290502e-06,
"loss": 0.727,
"step": 76
},
{
"epoch": 0.09395973154362416,
"grad_norm": 1.8830204010009766,
"learning_rate": 7.30651083891141e-06,
"loss": 0.6698,
"step": 77
},
{
"epoch": 0.09517998779743746,
"grad_norm": 2.900890588760376,
"learning_rate": 6.736545278218464e-06,
"loss": 0.2807,
"step": 78
},
{
"epoch": 0.09640024405125076,
"grad_norm": 2.5810508728027344,
"learning_rate": 6.1832212156129045e-06,
"loss": 0.5424,
"step": 79
},
{
"epoch": 0.09762050030506407,
"grad_norm": 1.8283663988113403,
"learning_rate": 5.647652972118998e-06,
"loss": 0.7818,
"step": 80
},
{
"epoch": 0.09884075655887736,
"grad_norm": 2.1838881969451904,
"learning_rate": 5.130919110904311e-06,
"loss": 0.5499,
"step": 81
},
{
"epoch": 0.10006101281269067,
"grad_norm": 2.056957960128784,
"learning_rate": 4.6340602651970304e-06,
"loss": 0.6527,
"step": 82
},
{
"epoch": 0.10128126906650396,
"grad_norm": 1.682894229888916,
"learning_rate": 4.158077042589129e-06,
"loss": 0.3389,
"step": 83
},
{
"epoch": 0.10250152532031727,
"grad_norm": 1.9048279523849487,
"learning_rate": 3.7039280099458373e-06,
"loss": 0.5434,
"step": 84
},
{
"epoch": 0.10372178157413056,
"grad_norm": 1.9600788354873657,
"learning_rate": 3.272527762979553e-06,
"loss": 0.4242,
"step": 85
},
{
"epoch": 0.10494203782794387,
"grad_norm": 2.029824733734131,
"learning_rate": 2.86474508437579e-06,
"loss": 0.754,
"step": 86
},
{
"epoch": 0.10616229408175717,
"grad_norm": 1.647495150566101,
"learning_rate": 2.4814011941804603e-06,
"loss": 0.2308,
"step": 87
},
{
"epoch": 0.10738255033557047,
"grad_norm": 2.4339029788970947,
"learning_rate": 2.1232680959720085e-06,
"loss": 0.6364,
"step": 88
},
{
"epoch": 0.10860280658938377,
"grad_norm": 3.1728689670562744,
"learning_rate": 1.79106702214893e-06,
"loss": 0.3479,
"step": 89
},
{
"epoch": 0.10982306284319707,
"grad_norm": 1.8751670122146606,
"learning_rate": 1.4854669814637145e-06,
"loss": 0.2656,
"step": 90
},
{
"epoch": 0.11104331909701037,
"grad_norm": 1.807173252105713,
"learning_rate": 1.2070834117282414e-06,
"loss": 0.1919,
"step": 91
},
{
"epoch": 0.11226357535082367,
"grad_norm": 3.3415565490722656,
"learning_rate": 9.56476940403942e-07,
"loss": 0.2879,
"step": 92
},
{
"epoch": 0.11348383160463697,
"grad_norm": 6.014771938323975,
"learning_rate": 7.341522555726971e-07,
"loss": 0.486,
"step": 93
},
{
"epoch": 0.11470408785845028,
"grad_norm": 1.8159458637237549,
"learning_rate": 5.405570895622014e-07,
"loss": 0.6534,
"step": 94
},
{
"epoch": 0.11592434411226357,
"grad_norm": 2.759377956390381,
"learning_rate": 3.760813172726457e-07,
"loss": 0.3343,
"step": 95
},
{
"epoch": 0.11714460036607688,
"grad_norm": 2.1716294288635254,
"learning_rate": 2.41056171020555e-07,
"loss": 0.2218,
"step": 96
},
{
"epoch": 0.11836485661989017,
"grad_norm": 3.249907970428467,
"learning_rate": 1.357535734809795e-07,
"loss": 0.3062,
"step": 97
},
{
"epoch": 0.11958511287370348,
"grad_norm": 3.0555734634399414,
"learning_rate": 6.038559007141397e-08,
"loss": 0.2289,
"step": 98
},
{
"epoch": 0.12080536912751678,
"grad_norm": 5.055708408355713,
"learning_rate": 1.510400188028116e-08,
"loss": 0.2759,
"step": 99
},
{
"epoch": 0.12202562538133008,
"grad_norm": 7.7404255867004395,
"learning_rate": 0.0,
"loss": 0.5774,
"step": 100
},
{
"epoch": 0.12202562538133008,
"eval_loss": 0.445011705160141,
"eval_runtime": 40.2119,
"eval_samples_per_second": 8.58,
"eval_steps_per_second": 2.164,
"step": 100
}
],
"logging_steps": 1,
"max_steps": 100,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 5,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.572495321923584e+16,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}