{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.009173259947253756,
"eval_steps": 500,
"global_step": 80,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": 0.0,
"learning_rate": 0.0,
"loss": 5.8857,
"step": 1
},
{
"epoch": 0.0,
"grad_norm": 0.0,
"learning_rate": 0.0,
"loss": 5.8613,
"step": 2
},
{
"epoch": 0.0,
"grad_norm": 0.0,
"learning_rate": 0.0,
"loss": 5.8757,
"step": 3
},
{
"epoch": 0.0,
"grad_norm": 22.244311559060773,
"learning_rate": 3.816793893129771e-06,
"loss": 6.0653,
"step": 4
},
{
"epoch": 0.0,
"grad_norm": 15.057639900452392,
"learning_rate": 7.633587786259541e-06,
"loss": 5.9572,
"step": 5
},
{
"epoch": 0.0,
"grad_norm": 38.092887646390466,
"learning_rate": 1.1450381679389314e-05,
"loss": 5.7617,
"step": 6
},
{
"epoch": 0.0,
"grad_norm": 15.31117053293346,
"learning_rate": 1.5267175572519083e-05,
"loss": 5.8945,
"step": 7
},
{
"epoch": 0.0,
"grad_norm": 21.702345688707492,
"learning_rate": 1.9083969465648855e-05,
"loss": 5.9337,
"step": 8
},
{
"epoch": 0.0,
"grad_norm": 21.702345688707492,
"learning_rate": 1.9083969465648855e-05,
"loss": 5.8771,
"step": 9
},
{
"epoch": 0.0,
"grad_norm": 26.25713504824,
"learning_rate": 2.2900763358778628e-05,
"loss": 5.879,
"step": 10
},
{
"epoch": 0.0,
"grad_norm": 88.79206625984197,
"learning_rate": 2.6717557251908397e-05,
"loss": 5.6244,
"step": 11
},
{
"epoch": 0.0,
"grad_norm": 13.979462978780887,
"learning_rate": 3.0534351145038166e-05,
"loss": 5.7982,
"step": 12
},
{
"epoch": 0.0,
"grad_norm": 13.979462978780887,
"learning_rate": 3.0534351145038166e-05,
"loss": 5.8216,
"step": 13
},
{
"epoch": 0.0,
"grad_norm": 58.311502538831355,
"learning_rate": 3.435114503816794e-05,
"loss": 5.7539,
"step": 14
},
{
"epoch": 0.0,
"grad_norm": 20.71528208128746,
"learning_rate": 3.816793893129771e-05,
"loss": 5.7114,
"step": 15
},
{
"epoch": 0.0,
"grad_norm": 33.69759468630642,
"learning_rate": 4.198473282442748e-05,
"loss": 5.6904,
"step": 16
},
{
"epoch": 0.0,
"grad_norm": 17.28319140833229,
"learning_rate": 4.5801526717557256e-05,
"loss": 5.6718,
"step": 17
},
{
"epoch": 0.0,
"grad_norm": 44.39956377017333,
"learning_rate": 4.9618320610687025e-05,
"loss": 5.4446,
"step": 18
},
{
"epoch": 0.0,
"grad_norm": 91.68382396043916,
"learning_rate": 5.3435114503816794e-05,
"loss": 5.6799,
"step": 19
},
{
"epoch": 0.0,
"grad_norm": 91.68382396043916,
"learning_rate": 5.3435114503816794e-05,
"loss": 5.606,
"step": 20
},
{
"epoch": 0.0,
"grad_norm": 192.72226388224067,
"learning_rate": 5.725190839694656e-05,
"loss": 5.5796,
"step": 21
},
{
"epoch": 0.0,
"grad_norm": 45.566936397354795,
"learning_rate": 6.106870229007633e-05,
"loss": 5.6014,
"step": 22
},
{
"epoch": 0.0,
"grad_norm": 54.111992406676734,
"learning_rate": 6.488549618320611e-05,
"loss": 5.4642,
"step": 23
},
{
"epoch": 0.0,
"grad_norm": 47.77738772861109,
"learning_rate": 6.870229007633588e-05,
"loss": 5.3002,
"step": 24
},
{
"epoch": 0.0,
"grad_norm": 52.88816210898902,
"learning_rate": 7.251908396946565e-05,
"loss": 5.8573,
"step": 25
},
{
"epoch": 0.0,
"grad_norm": 43.38379795566033,
"learning_rate": 7.633587786259542e-05,
"loss": 5.5942,
"step": 26
},
{
"epoch": 0.0,
"grad_norm": 112.9679807995391,
"learning_rate": 8.015267175572518e-05,
"loss": 5.7442,
"step": 27
},
{
"epoch": 0.0,
"grad_norm": 112.9679807995391,
"learning_rate": 8.015267175572518e-05,
"loss": 5.447,
"step": 28
},
{
"epoch": 0.0,
"grad_norm": 164.11451980650267,
"learning_rate": 8.396946564885496e-05,
"loss": 5.7154,
"step": 29
},
{
"epoch": 0.0,
"grad_norm": 115.9334173332188,
"learning_rate": 8.778625954198472e-05,
"loss": 5.9088,
"step": 30
},
{
"epoch": 0.0,
"grad_norm": 145.9836977981191,
"learning_rate": 9.160305343511451e-05,
"loss": 5.4605,
"step": 31
},
{
"epoch": 0.0,
"grad_norm": 114.64052697776405,
"learning_rate": 9.541984732824429e-05,
"loss": 5.697,
"step": 32
},
{
"epoch": 0.0,
"grad_norm": 202.12636675775389,
"learning_rate": 9.923664122137405e-05,
"loss": 6.0274,
"step": 33
},
{
"epoch": 0.0,
"grad_norm": 160.88000887426793,
"learning_rate": 0.00010305343511450383,
"loss": 6.2896,
"step": 34
},
{
"epoch": 0.0,
"grad_norm": 145.16182847186317,
"learning_rate": 0.00010687022900763359,
"loss": 5.9883,
"step": 35
},
{
"epoch": 0.0,
"grad_norm": 104.5781091944148,
"learning_rate": 0.00011068702290076336,
"loss": 6.1505,
"step": 36
},
{
"epoch": 0.0,
"grad_norm": 55.72279835011099,
"learning_rate": 0.00011450381679389313,
"loss": 6.458,
"step": 37
},
{
"epoch": 0.0,
"grad_norm": 72.60539121615658,
"learning_rate": 0.0001183206106870229,
"loss": 6.4766,
"step": 38
},
{
"epoch": 0.0,
"grad_norm": 152.31919342671264,
"learning_rate": 0.00012213740458015266,
"loss": 6.6228,
"step": 39
},
{
"epoch": 0.0,
"grad_norm": 195.38778604806365,
"learning_rate": 0.00012595419847328244,
"loss": 6.5874,
"step": 40
},
{
"epoch": 0.0,
"grad_norm": 98.21218214875543,
"learning_rate": 0.00012977099236641222,
"loss": 6.2979,
"step": 41
},
{
"epoch": 0.0,
"grad_norm": 117.40378533203793,
"learning_rate": 0.000133587786259542,
"loss": 6.1422,
"step": 42
},
{
"epoch": 0.0,
"grad_norm": 76.43242080692808,
"learning_rate": 0.00013740458015267177,
"loss": 6.0982,
"step": 43
},
{
"epoch": 0.01,
"grad_norm": 161.5295826913437,
"learning_rate": 0.00014122137404580154,
"loss": 5.9792,
"step": 44
},
{
"epoch": 0.01,
"grad_norm": 54.30211860707633,
"learning_rate": 0.0001450381679389313,
"loss": 6.0895,
"step": 45
},
{
"epoch": 0.01,
"grad_norm": 96.35953226922737,
"learning_rate": 0.00014885496183206107,
"loss": 6.1023,
"step": 46
},
{
"epoch": 0.01,
"grad_norm": 49.71381292121367,
"learning_rate": 0.00015267175572519084,
"loss": 5.9927,
"step": 47
},
{
"epoch": 0.01,
"grad_norm": 92.40570872689418,
"learning_rate": 0.00015648854961832062,
"loss": 5.8947,
"step": 48
},
{
"epoch": 0.01,
"grad_norm": 70.58634543270558,
"learning_rate": 0.00016030534351145037,
"loss": 5.5419,
"step": 49
},
{
"epoch": 0.01,
"grad_norm": 99.21861402306824,
"learning_rate": 0.00016412213740458014,
"loss": 5.533,
"step": 50
},
{
"epoch": 0.01,
"grad_norm": 60.43737769128788,
"learning_rate": 0.00016793893129770992,
"loss": 5.7271,
"step": 51
},
{
"epoch": 0.01,
"grad_norm": 40.38259047816709,
"learning_rate": 0.0001717557251908397,
"loss": 5.7707,
"step": 52
},
{
"epoch": 0.01,
"grad_norm": 50.37624352755525,
"learning_rate": 0.00017557251908396944,
"loss": 5.4807,
"step": 53
},
{
"epoch": 0.01,
"grad_norm": 105.31786701509579,
"learning_rate": 0.00017938931297709925,
"loss": 5.5782,
"step": 54
},
{
"epoch": 0.01,
"grad_norm": 58.697213953188964,
"learning_rate": 0.00018320610687022902,
"loss": 5.5337,
"step": 55
},
{
"epoch": 0.01,
"grad_norm": 110.55644774315732,
"learning_rate": 0.0001870229007633588,
"loss": 5.5836,
"step": 56
},
{
"epoch": 0.01,
"grad_norm": 14.426822607818815,
"learning_rate": 0.00019083969465648857,
"loss": 5.6155,
"step": 57
},
{
"epoch": 0.01,
"grad_norm": 26.228166827626183,
"learning_rate": 0.00019465648854961832,
"loss": 5.6817,
"step": 58
},
{
"epoch": 0.01,
"grad_norm": 27.269174056089717,
"learning_rate": 0.0001984732824427481,
"loss": 5.2749,
"step": 59
},
{
"epoch": 0.01,
"grad_norm": 21.695242218914665,
"learning_rate": 0.00020229007633587788,
"loss": 5.5323,
"step": 60
},
{
"epoch": 0.01,
"grad_norm": 68.62936938874972,
"learning_rate": 0.00020610687022900765,
"loss": 5.804,
"step": 61
},
{
"epoch": 0.01,
"grad_norm": 25.97127488754509,
"learning_rate": 0.0002099236641221374,
"loss": 5.4587,
"step": 62
},
{
"epoch": 0.01,
"grad_norm": 15.325961009638357,
"learning_rate": 0.00021374045801526718,
"loss": 5.4342,
"step": 63
},
{
"epoch": 0.01,
"grad_norm": 19.875772589083574,
"learning_rate": 0.00021755725190839695,
"loss": 5.5249,
"step": 64
},
{
"epoch": 0.01,
"grad_norm": 69.24118271312939,
"learning_rate": 0.00022137404580152673,
"loss": 5.1242,
"step": 65
},
{
"epoch": 0.01,
"grad_norm": 26.355890547603202,
"learning_rate": 0.00022519083969465648,
"loss": 5.5468,
"step": 66
},
{
"epoch": 0.01,
"grad_norm": 10.563329361046026,
"learning_rate": 0.00022900763358778625,
"loss": 5.2759,
"step": 67
},
{
"epoch": 0.01,
"grad_norm": 91.00091143398366,
"learning_rate": 0.00023282442748091603,
"loss": 5.4683,
"step": 68
},
{
"epoch": 0.01,
"grad_norm": 31.924406772853743,
"learning_rate": 0.0002366412213740458,
"loss": 5.4741,
"step": 69
},
{
"epoch": 0.01,
"grad_norm": 59.162721435471,
"learning_rate": 0.00024045801526717558,
"loss": 5.2497,
"step": 70
},
{
"epoch": 0.01,
"grad_norm": 31.455685925896024,
"learning_rate": 0.00024427480916030533,
"loss": 5.3254,
"step": 71
},
{
"epoch": 0.01,
"grad_norm": 67.5878959609375,
"learning_rate": 0.00024809160305343513,
"loss": 5.4352,
"step": 72
},
{
"epoch": 0.01,
"grad_norm": 42.716427641408124,
"learning_rate": 0.0002519083969465649,
"loss": 5.1745,
"step": 73
},
{
"epoch": 0.01,
"grad_norm": 51.763664942049346,
"learning_rate": 0.00025572519083969463,
"loss": 5.3703,
"step": 74
},
{
"epoch": 0.01,
"grad_norm": 45.70575367441054,
"learning_rate": 0.00025954198473282443,
"loss": 5.3987,
"step": 75
},
{
"epoch": 0.01,
"grad_norm": 22.180135968343524,
"learning_rate": 0.0002633587786259542,
"loss": 5.4224,
"step": 76
},
{
"epoch": 0.01,
"grad_norm": 60.31973560441679,
"learning_rate": 0.000267175572519084,
"loss": 5.6587,
"step": 77
},
{
"epoch": 0.01,
"grad_norm": 33.89974772165083,
"learning_rate": 0.00027099236641221373,
"loss": 5.6768,
"step": 78
},
{
"epoch": 0.01,
"grad_norm": 34.7175502203455,
"learning_rate": 0.00027480916030534353,
"loss": 5.4154,
"step": 79
},
{
"epoch": 0.01,
"grad_norm": 34.357468729191154,
"learning_rate": 0.0002786259541984733,
"loss": 5.4172,
"step": 80
}
],
"logging_steps": 1.0,
"max_steps": 8721,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 10,
"total_flos": 419593936896.0,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}