{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9978308026030369,
"eval_steps": 58,
"global_step": 230,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.004338394793926247,
"grad_norm": 0.2975526452064514,
"learning_rate": 4.000000000000001e-06,
"loss": 3.4415,
"step": 1
},
{
"epoch": 0.004338394793926247,
"eval_loss": 4.956099033355713,
"eval_runtime": 43.9816,
"eval_samples_per_second": 8.845,
"eval_steps_per_second": 2.228,
"step": 1
},
{
"epoch": 0.008676789587852495,
"grad_norm": 0.41739514470100403,
"learning_rate": 8.000000000000001e-06,
"loss": 3.5934,
"step": 2
},
{
"epoch": 0.013015184381778741,
"grad_norm": 0.47740957140922546,
"learning_rate": 1.2e-05,
"loss": 3.7866,
"step": 3
},
{
"epoch": 0.01735357917570499,
"grad_norm": 0.5908945798873901,
"learning_rate": 1.6000000000000003e-05,
"loss": 3.8979,
"step": 4
},
{
"epoch": 0.021691973969631236,
"grad_norm": 0.602057933807373,
"learning_rate": 2e-05,
"loss": 3.8257,
"step": 5
},
{
"epoch": 0.026030368763557483,
"grad_norm": 0.7276618480682373,
"learning_rate": 2.4e-05,
"loss": 4.0716,
"step": 6
},
{
"epoch": 0.03036876355748373,
"grad_norm": 0.7895606160163879,
"learning_rate": 2.8000000000000003e-05,
"loss": 4.1982,
"step": 7
},
{
"epoch": 0.03470715835140998,
"grad_norm": 0.9524717926979065,
"learning_rate": 3.2000000000000005e-05,
"loss": 4.1916,
"step": 8
},
{
"epoch": 0.039045553145336226,
"grad_norm": 0.9786620736122131,
"learning_rate": 3.6e-05,
"loss": 4.0969,
"step": 9
},
{
"epoch": 0.04338394793926247,
"grad_norm": 1.0913058519363403,
"learning_rate": 4e-05,
"loss": 4.3758,
"step": 10
},
{
"epoch": 0.04772234273318872,
"grad_norm": 1.2519152164459229,
"learning_rate": 4.4000000000000006e-05,
"loss": 4.3174,
"step": 11
},
{
"epoch": 0.052060737527114966,
"grad_norm": 1.4428540468215942,
"learning_rate": 4.8e-05,
"loss": 4.2927,
"step": 12
},
{
"epoch": 0.05639913232104121,
"grad_norm": 1.557953953742981,
"learning_rate": 5.2000000000000004e-05,
"loss": 4.3237,
"step": 13
},
{
"epoch": 0.06073752711496746,
"grad_norm": 1.791407585144043,
"learning_rate": 5.6000000000000006e-05,
"loss": 4.3922,
"step": 14
},
{
"epoch": 0.0650759219088937,
"grad_norm": 1.829128623008728,
"learning_rate": 6e-05,
"loss": 4.2218,
"step": 15
},
{
"epoch": 0.06941431670281996,
"grad_norm": 1.8731590509414673,
"learning_rate": 6.400000000000001e-05,
"loss": 4.294,
"step": 16
},
{
"epoch": 0.0737527114967462,
"grad_norm": 2.140212297439575,
"learning_rate": 6.800000000000001e-05,
"loss": 4.3197,
"step": 17
},
{
"epoch": 0.07809110629067245,
"grad_norm": 2.5610997676849365,
"learning_rate": 7.2e-05,
"loss": 4.3203,
"step": 18
},
{
"epoch": 0.0824295010845987,
"grad_norm": 2.5937764644622803,
"learning_rate": 7.6e-05,
"loss": 4.2128,
"step": 19
},
{
"epoch": 0.08676789587852494,
"grad_norm": 1.9964145421981812,
"learning_rate": 8e-05,
"loss": 4.1141,
"step": 20
},
{
"epoch": 0.0911062906724512,
"grad_norm": 1.9274357557296753,
"learning_rate": 8.4e-05,
"loss": 3.9559,
"step": 21
},
{
"epoch": 0.09544468546637744,
"grad_norm": 2.1689515113830566,
"learning_rate": 8.800000000000001e-05,
"loss": 4.0459,
"step": 22
},
{
"epoch": 0.09978308026030369,
"grad_norm": 2.417027235031128,
"learning_rate": 9.200000000000001e-05,
"loss": 3.8996,
"step": 23
},
{
"epoch": 0.10412147505422993,
"grad_norm": 2.925503969192505,
"learning_rate": 9.6e-05,
"loss": 4.0918,
"step": 24
},
{
"epoch": 0.10845986984815618,
"grad_norm": 4.8928961753845215,
"learning_rate": 0.0001,
"loss": 4.332,
"step": 25
},
{
"epoch": 0.11279826464208242,
"grad_norm": 3.4765207767486572,
"learning_rate": 0.00010400000000000001,
"loss": 3.7618,
"step": 26
},
{
"epoch": 0.11713665943600868,
"grad_norm": 3.5958409309387207,
"learning_rate": 0.00010800000000000001,
"loss": 3.7645,
"step": 27
},
{
"epoch": 0.12147505422993492,
"grad_norm": 3.053165912628174,
"learning_rate": 0.00011200000000000001,
"loss": 3.6536,
"step": 28
},
{
"epoch": 0.12581344902386118,
"grad_norm": 2.3347203731536865,
"learning_rate": 0.000116,
"loss": 3.8871,
"step": 29
},
{
"epoch": 0.1301518438177874,
"grad_norm": 1.4141613245010376,
"learning_rate": 0.00012,
"loss": 3.6367,
"step": 30
},
{
"epoch": 0.13449023861171366,
"grad_norm": 1.1042953729629517,
"learning_rate": 0.000124,
"loss": 3.4273,
"step": 31
},
{
"epoch": 0.13882863340563992,
"grad_norm": 0.9391370415687561,
"learning_rate": 0.00012800000000000002,
"loss": 3.635,
"step": 32
},
{
"epoch": 0.14316702819956617,
"grad_norm": 1.028341293334961,
"learning_rate": 0.000132,
"loss": 3.845,
"step": 33
},
{
"epoch": 0.1475054229934924,
"grad_norm": 1.0668063163757324,
"learning_rate": 0.00013600000000000003,
"loss": 3.732,
"step": 34
},
{
"epoch": 0.15184381778741865,
"grad_norm": 1.0369871854782104,
"learning_rate": 0.00014,
"loss": 3.6734,
"step": 35
},
{
"epoch": 0.1561822125813449,
"grad_norm": 1.0699695348739624,
"learning_rate": 0.000144,
"loss": 3.5469,
"step": 36
},
{
"epoch": 0.16052060737527116,
"grad_norm": 1.1715625524520874,
"learning_rate": 0.000148,
"loss": 3.5759,
"step": 37
},
{
"epoch": 0.1648590021691974,
"grad_norm": 1.2680530548095703,
"learning_rate": 0.000152,
"loss": 3.7013,
"step": 38
},
{
"epoch": 0.16919739696312364,
"grad_norm": 1.2043352127075195,
"learning_rate": 0.00015600000000000002,
"loss": 3.74,
"step": 39
},
{
"epoch": 0.1735357917570499,
"grad_norm": 1.342244029045105,
"learning_rate": 0.00016,
"loss": 3.7761,
"step": 40
},
{
"epoch": 0.17787418655097614,
"grad_norm": 1.4112831354141235,
"learning_rate": 0.000164,
"loss": 3.6449,
"step": 41
},
{
"epoch": 0.1822125813449024,
"grad_norm": 1.3947268724441528,
"learning_rate": 0.000168,
"loss": 3.6043,
"step": 42
},
{
"epoch": 0.18655097613882862,
"grad_norm": 1.5763946771621704,
"learning_rate": 0.000172,
"loss": 3.4768,
"step": 43
},
{
"epoch": 0.19088937093275488,
"grad_norm": 1.9006760120391846,
"learning_rate": 0.00017600000000000002,
"loss": 3.6424,
"step": 44
},
{
"epoch": 0.19522776572668113,
"grad_norm": 2.0071113109588623,
"learning_rate": 0.00018,
"loss": 3.7499,
"step": 45
},
{
"epoch": 0.19956616052060738,
"grad_norm": 2.002067804336548,
"learning_rate": 0.00018400000000000003,
"loss": 3.6082,
"step": 46
},
{
"epoch": 0.2039045553145336,
"grad_norm": 2.4698357582092285,
"learning_rate": 0.000188,
"loss": 3.7604,
"step": 47
},
{
"epoch": 0.20824295010845986,
"grad_norm": 3.051906108856201,
"learning_rate": 0.000192,
"loss": 3.7546,
"step": 48
},
{
"epoch": 0.21258134490238612,
"grad_norm": 3.100890636444092,
"learning_rate": 0.000196,
"loss": 3.6134,
"step": 49
},
{
"epoch": 0.21691973969631237,
"grad_norm": 4.4481425285339355,
"learning_rate": 0.0002,
"loss": 3.355,
"step": 50
},
{
"epoch": 0.22125813449023862,
"grad_norm": 4.157866954803467,
"learning_rate": 0.00019998476951563915,
"loss": 3.5229,
"step": 51
},
{
"epoch": 0.22559652928416485,
"grad_norm": 5.159533500671387,
"learning_rate": 0.0001999390827019096,
"loss": 3.9098,
"step": 52
},
{
"epoch": 0.2299349240780911,
"grad_norm": 4.372255325317383,
"learning_rate": 0.0001998629534754574,
"loss": 3.8285,
"step": 53
},
{
"epoch": 0.23427331887201736,
"grad_norm": 2.7389180660247803,
"learning_rate": 0.00019975640502598244,
"loss": 3.5892,
"step": 54
},
{
"epoch": 0.2386117136659436,
"grad_norm": 1.623792290687561,
"learning_rate": 0.00019961946980917456,
"loss": 3.6253,
"step": 55
},
{
"epoch": 0.24295010845986983,
"grad_norm": 1.0698862075805664,
"learning_rate": 0.00019945218953682734,
"loss": 3.4747,
"step": 56
},
{
"epoch": 0.2472885032537961,
"grad_norm": 1.0480446815490723,
"learning_rate": 0.00019925461516413223,
"loss": 3.5076,
"step": 57
},
{
"epoch": 0.25162689804772237,
"grad_norm": 1.1356984376907349,
"learning_rate": 0.00019902680687415705,
"loss": 3.4875,
"step": 58
},
{
"epoch": 0.25162689804772237,
"eval_loss": 3.5454585552215576,
"eval_runtime": 43.9485,
"eval_samples_per_second": 8.851,
"eval_steps_per_second": 2.23,
"step": 58
},
{
"epoch": 0.2559652928416486,
"grad_norm": 1.147985816001892,
"learning_rate": 0.00019876883405951377,
"loss": 3.5368,
"step": 59
},
{
"epoch": 0.2603036876355748,
"grad_norm": 1.167962670326233,
"learning_rate": 0.00019848077530122083,
"loss": 3.4885,
"step": 60
},
{
"epoch": 0.2646420824295011,
"grad_norm": 1.1241693496704102,
"learning_rate": 0.00019816271834476642,
"loss": 3.5335,
"step": 61
},
{
"epoch": 0.26898047722342733,
"grad_norm": 1.0841178894042969,
"learning_rate": 0.00019781476007338058,
"loss": 3.5822,
"step": 62
},
{
"epoch": 0.27331887201735355,
"grad_norm": 1.1276164054870605,
"learning_rate": 0.00019743700647852354,
"loss": 3.4757,
"step": 63
},
{
"epoch": 0.27765726681127983,
"grad_norm": 1.192659854888916,
"learning_rate": 0.00019702957262759965,
"loss": 3.4212,
"step": 64
},
{
"epoch": 0.28199566160520606,
"grad_norm": 1.2061688899993896,
"learning_rate": 0.00019659258262890683,
"loss": 3.4564,
"step": 65
},
{
"epoch": 0.28633405639913234,
"grad_norm": 1.4012079238891602,
"learning_rate": 0.0001961261695938319,
"loss": 3.423,
"step": 66
},
{
"epoch": 0.29067245119305857,
"grad_norm": 1.3591368198394775,
"learning_rate": 0.00019563047559630357,
"loss": 3.5284,
"step": 67
},
{
"epoch": 0.2950108459869848,
"grad_norm": 1.3555010557174683,
"learning_rate": 0.00019510565162951537,
"loss": 3.4406,
"step": 68
},
{
"epoch": 0.2993492407809111,
"grad_norm": 1.4745391607284546,
"learning_rate": 0.0001945518575599317,
"loss": 3.3899,
"step": 69
},
{
"epoch": 0.3036876355748373,
"grad_norm": 1.6432572603225708,
"learning_rate": 0.00019396926207859084,
"loss": 3.4343,
"step": 70
},
{
"epoch": 0.3080260303687636,
"grad_norm": 1.9187488555908203,
"learning_rate": 0.00019335804264972018,
"loss": 3.5881,
"step": 71
},
{
"epoch": 0.3123644251626898,
"grad_norm": 2.1937949657440186,
"learning_rate": 0.00019271838545667876,
"loss": 3.3765,
"step": 72
},
{
"epoch": 0.31670281995661603,
"grad_norm": 2.376640558242798,
"learning_rate": 0.00019205048534524406,
"loss": 3.2758,
"step": 73
},
{
"epoch": 0.3210412147505423,
"grad_norm": 3.0442004203796387,
"learning_rate": 0.0001913545457642601,
"loss": 3.5366,
"step": 74
},
{
"epoch": 0.32537960954446854,
"grad_norm": 3.6359612941741943,
"learning_rate": 0.000190630778703665,
"loss": 3.0313,
"step": 75
},
{
"epoch": 0.3297180043383948,
"grad_norm": 4.367193698883057,
"learning_rate": 0.0001898794046299167,
"loss": 3.3864,
"step": 76
},
{
"epoch": 0.33405639913232105,
"grad_norm": 5.261653900146484,
"learning_rate": 0.0001891006524188368,
"loss": 3.5411,
"step": 77
},
{
"epoch": 0.3383947939262473,
"grad_norm": 5.341316223144531,
"learning_rate": 0.00018829475928589271,
"loss": 3.843,
"step": 78
},
{
"epoch": 0.34273318872017355,
"grad_norm": 2.9030559062957764,
"learning_rate": 0.00018746197071393958,
"loss": 3.4254,
"step": 79
},
{
"epoch": 0.3470715835140998,
"grad_norm": 1.4780312776565552,
"learning_rate": 0.00018660254037844388,
"loss": 3.4877,
"step": 80
},
{
"epoch": 0.351409978308026,
"grad_norm": 1.0593628883361816,
"learning_rate": 0.00018571673007021123,
"loss": 3.3987,
"step": 81
},
{
"epoch": 0.3557483731019523,
"grad_norm": 0.9910492897033691,
"learning_rate": 0.0001848048096156426,
"loss": 3.4944,
"step": 82
},
{
"epoch": 0.3600867678958785,
"grad_norm": 1.004767656326294,
"learning_rate": 0.00018386705679454242,
"loss": 3.4143,
"step": 83
},
{
"epoch": 0.3644251626898048,
"grad_norm": 1.012804627418518,
"learning_rate": 0.00018290375725550417,
"loss": 3.4504,
"step": 84
},
{
"epoch": 0.368763557483731,
"grad_norm": 1.0758857727050781,
"learning_rate": 0.0001819152044288992,
"loss": 3.5181,
"step": 85
},
{
"epoch": 0.37310195227765725,
"grad_norm": 1.0776313543319702,
"learning_rate": 0.00018090169943749476,
"loss": 3.4293,
"step": 86
},
{
"epoch": 0.3774403470715835,
"grad_norm": 1.0856565237045288,
"learning_rate": 0.00017986355100472928,
"loss": 3.3012,
"step": 87
},
{
"epoch": 0.38177874186550975,
"grad_norm": 1.146246075630188,
"learning_rate": 0.00017880107536067218,
"loss": 3.5778,
"step": 88
},
{
"epoch": 0.38611713665943603,
"grad_norm": 1.1812922954559326,
"learning_rate": 0.0001777145961456971,
"loss": 3.2835,
"step": 89
},
{
"epoch": 0.39045553145336226,
"grad_norm": 1.3535960912704468,
"learning_rate": 0.0001766044443118978,
"loss": 3.1863,
"step": 90
},
{
"epoch": 0.3947939262472885,
"grad_norm": 1.312524437904358,
"learning_rate": 0.00017547095802227723,
"loss": 3.3794,
"step": 91
},
{
"epoch": 0.39913232104121477,
"grad_norm": 1.2628040313720703,
"learning_rate": 0.00017431448254773944,
"loss": 3.2127,
"step": 92
},
{
"epoch": 0.403470715835141,
"grad_norm": 1.3810231685638428,
"learning_rate": 0.00017313537016191706,
"loss": 3.3664,
"step": 93
},
{
"epoch": 0.4078091106290672,
"grad_norm": 1.5726513862609863,
"learning_rate": 0.0001719339800338651,
"loss": 3.4052,
"step": 94
},
{
"epoch": 0.4121475054229935,
"grad_norm": 1.5839647054672241,
"learning_rate": 0.00017071067811865476,
"loss": 3.2746,
"step": 95
},
{
"epoch": 0.4164859002169197,
"grad_norm": 1.7605924606323242,
"learning_rate": 0.00016946583704589973,
"loss": 3.48,
"step": 96
},
{
"epoch": 0.420824295010846,
"grad_norm": 2.3345723152160645,
"learning_rate": 0.00016819983600624986,
"loss": 3.2033,
"step": 97
},
{
"epoch": 0.42516268980477223,
"grad_norm": 1.9480637311935425,
"learning_rate": 0.00016691306063588583,
"loss": 3.3796,
"step": 98
},
{
"epoch": 0.42950108459869846,
"grad_norm": 2.3618791103363037,
"learning_rate": 0.00016560590289905073,
"loss": 3.1398,
"step": 99
},
{
"epoch": 0.43383947939262474,
"grad_norm": 3.546729326248169,
"learning_rate": 0.00016427876096865394,
"loss": 3.0558,
"step": 100
},
{
"epoch": 0.43817787418655096,
"grad_norm": 1.5932743549346924,
"learning_rate": 0.00016293203910498376,
"loss": 3.3932,
"step": 101
},
{
"epoch": 0.44251626898047725,
"grad_norm": 2.039661407470703,
"learning_rate": 0.0001615661475325658,
"loss": 3.4066,
"step": 102
},
{
"epoch": 0.44685466377440347,
"grad_norm": 1.742119312286377,
"learning_rate": 0.00016018150231520486,
"loss": 3.2823,
"step": 103
},
{
"epoch": 0.4511930585683297,
"grad_norm": 1.5700186491012573,
"learning_rate": 0.00015877852522924732,
"loss": 3.3591,
"step": 104
},
{
"epoch": 0.455531453362256,
"grad_norm": 1.136389970779419,
"learning_rate": 0.0001573576436351046,
"loss": 3.4663,
"step": 105
},
{
"epoch": 0.4598698481561822,
"grad_norm": 0.8537334203720093,
"learning_rate": 0.0001559192903470747,
"loss": 3.4367,
"step": 106
},
{
"epoch": 0.4642082429501085,
"grad_norm": 0.8642299175262451,
"learning_rate": 0.00015446390350150273,
"loss": 3.287,
"step": 107
},
{
"epoch": 0.4685466377440347,
"grad_norm": 0.9279235601425171,
"learning_rate": 0.0001529919264233205,
"loss": 3.2911,
"step": 108
},
{
"epoch": 0.47288503253796094,
"grad_norm": 0.9121331572532654,
"learning_rate": 0.00015150380749100545,
"loss": 3.3101,
"step": 109
},
{
"epoch": 0.4772234273318872,
"grad_norm": 0.9868795275688171,
"learning_rate": 0.00015000000000000001,
"loss": 3.3431,
"step": 110
},
{
"epoch": 0.48156182212581344,
"grad_norm": 1.0646886825561523,
"learning_rate": 0.00014848096202463372,
"loss": 3.3876,
"step": 111
},
{
"epoch": 0.48590021691973967,
"grad_norm": 1.0819416046142578,
"learning_rate": 0.00014694715627858908,
"loss": 3.2128,
"step": 112
},
{
"epoch": 0.49023861171366595,
"grad_norm": 1.0728636980056763,
"learning_rate": 0.00014539904997395468,
"loss": 3.2076,
"step": 113
},
{
"epoch": 0.4945770065075922,
"grad_norm": 1.1562669277191162,
"learning_rate": 0.00014383711467890774,
"loss": 3.2825,
"step": 114
},
{
"epoch": 0.49891540130151846,
"grad_norm": 1.1967557668685913,
"learning_rate": 0.00014226182617406996,
"loss": 3.2185,
"step": 115
},
{
"epoch": 0.5032537960954447,
"grad_norm": 1.3139584064483643,
"learning_rate": 0.00014067366430758004,
"loss": 3.1373,
"step": 116
},
{
"epoch": 0.5032537960954447,
"eval_loss": 3.2728052139282227,
"eval_runtime": 43.9403,
"eval_samples_per_second": 8.853,
"eval_steps_per_second": 2.23,
"step": 116
},
{
"epoch": 0.5075921908893709,
"grad_norm": 1.3170753717422485,
"learning_rate": 0.00013907311284892736,
"loss": 2.9572,
"step": 117
},
{
"epoch": 0.5119305856832972,
"grad_norm": 1.5243107080459595,
"learning_rate": 0.00013746065934159123,
"loss": 3.3082,
"step": 118
},
{
"epoch": 0.5162689804772235,
"grad_norm": 1.5845880508422852,
"learning_rate": 0.00013583679495453,
"loss": 3.4819,
"step": 119
},
{
"epoch": 0.5206073752711496,
"grad_norm": 1.66307532787323,
"learning_rate": 0.00013420201433256689,
"loss": 3.1131,
"step": 120
},
{
"epoch": 0.5249457700650759,
"grad_norm": 1.6470588445663452,
"learning_rate": 0.00013255681544571568,
"loss": 3.2847,
"step": 121
},
{
"epoch": 0.5292841648590022,
"grad_norm": 2.1118075847625732,
"learning_rate": 0.00013090169943749476,
"loss": 3.4669,
"step": 122
},
{
"epoch": 0.5336225596529284,
"grad_norm": 2.056396722793579,
"learning_rate": 0.00012923717047227368,
"loss": 3.1136,
"step": 123
},
{
"epoch": 0.5379609544468547,
"grad_norm": 2.2389657497406006,
"learning_rate": 0.0001275637355816999,
"loss": 2.9323,
"step": 124
},
{
"epoch": 0.5422993492407809,
"grad_norm": 2.863621711730957,
"learning_rate": 0.00012588190451025207,
"loss": 2.9585,
"step": 125
},
{
"epoch": 0.5466377440347071,
"grad_norm": 0.8712321519851685,
"learning_rate": 0.00012419218955996676,
"loss": 3.1439,
"step": 126
},
{
"epoch": 0.5509761388286334,
"grad_norm": 1.0713740587234497,
"learning_rate": 0.0001224951054343865,
"loss": 3.2213,
"step": 127
},
{
"epoch": 0.5553145336225597,
"grad_norm": 1.104315996170044,
"learning_rate": 0.00012079116908177593,
"loss": 3.4522,
"step": 128
},
{
"epoch": 0.559652928416486,
"grad_norm": 1.0883917808532715,
"learning_rate": 0.00011908089953765449,
"loss": 3.3503,
"step": 129
},
{
"epoch": 0.5639913232104121,
"grad_norm": 1.0000834465026855,
"learning_rate": 0.00011736481776669306,
"loss": 3.4036,
"step": 130
},
{
"epoch": 0.5683297180043384,
"grad_norm": 0.8869354128837585,
"learning_rate": 0.0001156434465040231,
"loss": 3.2749,
"step": 131
},
{
"epoch": 0.5726681127982647,
"grad_norm": 0.8651937246322632,
"learning_rate": 0.00011391731009600654,
"loss": 3.3679,
"step": 132
},
{
"epoch": 0.5770065075921909,
"grad_norm": 0.9174556136131287,
"learning_rate": 0.00011218693434051475,
"loss": 3.311,
"step": 133
},
{
"epoch": 0.5813449023861171,
"grad_norm": 0.930533230304718,
"learning_rate": 0.00011045284632676536,
"loss": 3.3761,
"step": 134
},
{
"epoch": 0.5856832971800434,
"grad_norm": 0.9851680994033813,
"learning_rate": 0.00010871557427476583,
"loss": 3.2752,
"step": 135
},
{
"epoch": 0.5900216919739696,
"grad_norm": 0.9633740782737732,
"learning_rate": 0.00010697564737441252,
"loss": 3.2373,
"step": 136
},
{
"epoch": 0.5943600867678959,
"grad_norm": 1.132585048675537,
"learning_rate": 0.0001052335956242944,
"loss": 3.2323,
"step": 137
},
{
"epoch": 0.5986984815618221,
"grad_norm": 1.1232091188430786,
"learning_rate": 0.00010348994967025012,
"loss": 3.2874,
"step": 138
},
{
"epoch": 0.6030368763557483,
"grad_norm": 1.2559125423431396,
"learning_rate": 0.00010174524064372837,
"loss": 3.2367,
"step": 139
},
{
"epoch": 0.6073752711496746,
"grad_norm": 1.2623041868209839,
"learning_rate": 0.0001,
"loss": 3.2243,
"step": 140
},
{
"epoch": 0.6117136659436009,
"grad_norm": 1.3554457426071167,
"learning_rate": 9.825475935627165e-05,
"loss": 3.4802,
"step": 141
},
{
"epoch": 0.6160520607375272,
"grad_norm": 1.4170132875442505,
"learning_rate": 9.651005032974994e-05,
"loss": 3.354,
"step": 142
},
{
"epoch": 0.6203904555314533,
"grad_norm": 1.4309097528457642,
"learning_rate": 9.476640437570562e-05,
"loss": 3.1352,
"step": 143
},
{
"epoch": 0.6247288503253796,
"grad_norm": 1.5829153060913086,
"learning_rate": 9.302435262558747e-05,
"loss": 3.2455,
"step": 144
},
{
"epoch": 0.6290672451193059,
"grad_norm": 1.8210502862930298,
"learning_rate": 9.128442572523417e-05,
"loss": 3.2991,
"step": 145
},
{
"epoch": 0.6334056399132321,
"grad_norm": 1.842761516571045,
"learning_rate": 8.954715367323468e-05,
"loss": 3.2255,
"step": 146
},
{
"epoch": 0.6377440347071583,
"grad_norm": 1.9258646965026855,
"learning_rate": 8.781306565948528e-05,
"loss": 3.1397,
"step": 147
},
{
"epoch": 0.6420824295010846,
"grad_norm": 2.1189215183258057,
"learning_rate": 8.608268990399349e-05,
"loss": 3.0414,
"step": 148
},
{
"epoch": 0.6464208242950108,
"grad_norm": 2.4063761234283447,
"learning_rate": 8.435655349597689e-05,
"loss": 2.8524,
"step": 149
},
{
"epoch": 0.6507592190889371,
"grad_norm": 3.6420836448669434,
"learning_rate": 8.263518223330697e-05,
"loss": 3.0156,
"step": 150
},
{
"epoch": 0.6550976138828634,
"grad_norm": 0.7080674171447754,
"learning_rate": 8.091910046234552e-05,
"loss": 3.1636,
"step": 151
},
{
"epoch": 0.6594360086767896,
"grad_norm": 0.798520565032959,
"learning_rate": 7.920883091822408e-05,
"loss": 3.212,
"step": 152
},
{
"epoch": 0.6637744034707158,
"grad_norm": 0.8640486001968384,
"learning_rate": 7.750489456561352e-05,
"loss": 3.1644,
"step": 153
},
{
"epoch": 0.6681127982646421,
"grad_norm": 0.870906412601471,
"learning_rate": 7.580781044003324e-05,
"loss": 3.1876,
"step": 154
},
{
"epoch": 0.6724511930585684,
"grad_norm": 0.8581348061561584,
"learning_rate": 7.411809548974792e-05,
"loss": 3.2739,
"step": 155
},
{
"epoch": 0.6767895878524945,
"grad_norm": 0.8691614270210266,
"learning_rate": 7.243626441830009e-05,
"loss": 3.2444,
"step": 156
},
{
"epoch": 0.6811279826464208,
"grad_norm": 0.9455673098564148,
"learning_rate": 7.076282952772633e-05,
"loss": 3.3004,
"step": 157
},
{
"epoch": 0.6854663774403471,
"grad_norm": 0.8873337507247925,
"learning_rate": 6.909830056250527e-05,
"loss": 3.1778,
"step": 158
},
{
"epoch": 0.6898047722342733,
"grad_norm": 0.910775363445282,
"learning_rate": 6.744318455428436e-05,
"loss": 3.1346,
"step": 159
},
{
"epoch": 0.6941431670281996,
"grad_norm": 0.9872409105300903,
"learning_rate": 6.579798566743314e-05,
"loss": 3.1665,
"step": 160
},
{
"epoch": 0.6984815618221258,
"grad_norm": 1.0516481399536133,
"learning_rate": 6.416320504546997e-05,
"loss": 3.3064,
"step": 161
},
{
"epoch": 0.702819956616052,
"grad_norm": 1.0263571739196777,
"learning_rate": 6.25393406584088e-05,
"loss": 3.3698,
"step": 162
},
{
"epoch": 0.7071583514099783,
"grad_norm": 1.1050878763198853,
"learning_rate": 6.092688715107264e-05,
"loss": 3.2436,
"step": 163
},
{
"epoch": 0.7114967462039046,
"grad_norm": 1.1121841669082642,
"learning_rate": 5.9326335692419995e-05,
"loss": 2.9433,
"step": 164
},
{
"epoch": 0.7158351409978309,
"grad_norm": 1.2424358129501343,
"learning_rate": 5.773817382593008e-05,
"loss": 3.3616,
"step": 165
},
{
"epoch": 0.720173535791757,
"grad_norm": 1.1899327039718628,
"learning_rate": 5.616288532109225e-05,
"loss": 3.0392,
"step": 166
},
{
"epoch": 0.7245119305856833,
"grad_norm": 1.3395730257034302,
"learning_rate": 5.4600950026045326e-05,
"loss": 3.0905,
"step": 167
},
{
"epoch": 0.7288503253796096,
"grad_norm": 1.4268842935562134,
"learning_rate": 5.305284372141095e-05,
"loss": 3.1247,
"step": 168
},
{
"epoch": 0.7331887201735358,
"grad_norm": 1.5514875650405884,
"learning_rate": 5.15190379753663e-05,
"loss": 3.3772,
"step": 169
},
{
"epoch": 0.737527114967462,
"grad_norm": 1.8371058702468872,
"learning_rate": 5.000000000000002e-05,
"loss": 2.9262,
"step": 170
},
{
"epoch": 0.7418655097613883,
"grad_norm": 1.7641676664352417,
"learning_rate": 4.8496192508994576e-05,
"loss": 3.0113,
"step": 171
},
{
"epoch": 0.7462039045553145,
"grad_norm": 1.8325039148330688,
"learning_rate": 4.700807357667952e-05,
"loss": 3.0551,
"step": 172
},
{
"epoch": 0.7505422993492408,
"grad_norm": 1.9740185737609863,
"learning_rate": 4.5536096498497295e-05,
"loss": 3.1479,
"step": 173
},
{
"epoch": 0.754880694143167,
"grad_norm": 2.327420234680176,
"learning_rate": 4.4080709652925336e-05,
"loss": 3.2149,
"step": 174
},
{
"epoch": 0.754880694143167,
"eval_loss": 3.163884162902832,
"eval_runtime": 43.9914,
"eval_samples_per_second": 8.843,
"eval_steps_per_second": 2.228,
"step": 174
},
{
"epoch": 0.7592190889370932,
"grad_norm": 3.558817148208618,
"learning_rate": 4.264235636489542e-05,
"loss": 3.0184,
"step": 175
},
{
"epoch": 0.7635574837310195,
"grad_norm": 0.48007091879844666,
"learning_rate": 4.12214747707527e-05,
"loss": 3.0708,
"step": 176
},
{
"epoch": 0.7678958785249458,
"grad_norm": 0.606035590171814,
"learning_rate": 3.981849768479517e-05,
"loss": 3.1584,
"step": 177
},
{
"epoch": 0.7722342733188721,
"grad_norm": 0.6647348999977112,
"learning_rate": 3.843385246743417e-05,
"loss": 3.3003,
"step": 178
},
{
"epoch": 0.7765726681127982,
"grad_norm": 0.6956514120101929,
"learning_rate": 3.7067960895016275e-05,
"loss": 3.2168,
"step": 179
},
{
"epoch": 0.7809110629067245,
"grad_norm": 0.7575390338897705,
"learning_rate": 3.5721239031346066e-05,
"loss": 3.2576,
"step": 180
},
{
"epoch": 0.7852494577006508,
"grad_norm": 0.8106915950775146,
"learning_rate": 3.439409710094929e-05,
"loss": 3.124,
"step": 181
},
{
"epoch": 0.789587852494577,
"grad_norm": 0.873997688293457,
"learning_rate": 3.308693936411421e-05,
"loss": 3.0977,
"step": 182
},
{
"epoch": 0.7939262472885033,
"grad_norm": 0.9612168073654175,
"learning_rate": 3.1800163993750166e-05,
"loss": 3.435,
"step": 183
},
{
"epoch": 0.7982646420824295,
"grad_norm": 0.9549990892410278,
"learning_rate": 3.053416295410026e-05,
"loss": 3.2216,
"step": 184
},
{
"epoch": 0.8026030368763557,
"grad_norm": 0.9309582710266113,
"learning_rate": 2.9289321881345254e-05,
"loss": 3.0546,
"step": 185
},
{
"epoch": 0.806941431670282,
"grad_norm": 1.0800687074661255,
"learning_rate": 2.8066019966134904e-05,
"loss": 3.2283,
"step": 186
},
{
"epoch": 0.8112798264642083,
"grad_norm": 1.0009733438491821,
"learning_rate": 2.6864629838082956e-05,
"loss": 3.1638,
"step": 187
},
{
"epoch": 0.8156182212581344,
"grad_norm": 1.1134998798370361,
"learning_rate": 2.5685517452260567e-05,
"loss": 3.2642,
"step": 188
},
{
"epoch": 0.8199566160520607,
"grad_norm": 1.1395593881607056,
"learning_rate": 2.45290419777228e-05,
"loss": 3.0712,
"step": 189
},
{
"epoch": 0.824295010845987,
"grad_norm": 1.1547160148620605,
"learning_rate": 2.339555568810221e-05,
"loss": 3.2101,
"step": 190
},
{
"epoch": 0.8286334056399133,
"grad_norm": 1.2223323583602905,
"learning_rate": 2.2285403854302912e-05,
"loss": 3.1109,
"step": 191
},
{
"epoch": 0.8329718004338394,
"grad_norm": 1.4417051076889038,
"learning_rate": 2.119892463932781e-05,
"loss": 3.2497,
"step": 192
},
{
"epoch": 0.8373101952277657,
"grad_norm": 1.3542780876159668,
"learning_rate": 2.013644899527074e-05,
"loss": 3.23,
"step": 193
},
{
"epoch": 0.841648590021692,
"grad_norm": 1.5529882907867432,
"learning_rate": 1.9098300562505266e-05,
"loss": 3.2404,
"step": 194
},
{
"epoch": 0.8459869848156182,
"grad_norm": 1.63187575340271,
"learning_rate": 1.808479557110081e-05,
"loss": 3.0776,
"step": 195
},
{
"epoch": 0.8503253796095445,
"grad_norm": 1.6470518112182617,
"learning_rate": 1.7096242744495837e-05,
"loss": 3.1312,
"step": 196
},
{
"epoch": 0.8546637744034707,
"grad_norm": 1.8358676433563232,
"learning_rate": 1.6132943205457606e-05,
"loss": 3.0136,
"step": 197
},
{
"epoch": 0.8590021691973969,
"grad_norm": 2.2392208576202393,
"learning_rate": 1.5195190384357404e-05,
"loss": 3.0873,
"step": 198
},
{
"epoch": 0.8633405639913232,
"grad_norm": 2.3587329387664795,
"learning_rate": 1.4283269929788779e-05,
"loss": 3.1336,
"step": 199
},
{
"epoch": 0.8676789587852495,
"grad_norm": 3.4689748287200928,
"learning_rate": 1.339745962155613e-05,
"loss": 3.2269,
"step": 200
},
{
"epoch": 0.8720173535791758,
"grad_norm": 0.4676075577735901,
"learning_rate": 1.2538029286060426e-05,
"loss": 3.2194,
"step": 201
},
{
"epoch": 0.8763557483731019,
"grad_norm": 0.5948041081428528,
"learning_rate": 1.1705240714107302e-05,
"loss": 3.2006,
"step": 202
},
{
"epoch": 0.8806941431670282,
"grad_norm": 0.6200747489929199,
"learning_rate": 1.0899347581163221e-05,
"loss": 3.1966,
"step": 203
},
{
"epoch": 0.8850325379609545,
"grad_norm": 0.6264815926551819,
"learning_rate": 1.0120595370083318e-05,
"loss": 3.1552,
"step": 204
},
{
"epoch": 0.8893709327548807,
"grad_norm": 0.6958035230636597,
"learning_rate": 9.369221296335006e-06,
"loss": 3.21,
"step": 205
},
{
"epoch": 0.8937093275488069,
"grad_norm": 0.7550477981567383,
"learning_rate": 8.645454235739903e-06,
"loss": 3.2491,
"step": 206
},
{
"epoch": 0.8980477223427332,
"grad_norm": 0.78013014793396,
"learning_rate": 7.949514654755962e-06,
"loss": 3.158,
"step": 207
},
{
"epoch": 0.9023861171366594,
"grad_norm": 0.786949098110199,
"learning_rate": 7.281614543321269e-06,
"loss": 3.2446,
"step": 208
},
{
"epoch": 0.9067245119305857,
"grad_norm": 0.8102577924728394,
"learning_rate": 6.6419573502798374e-06,
"loss": 3.2238,
"step": 209
},
{
"epoch": 0.911062906724512,
"grad_norm": 0.8839837908744812,
"learning_rate": 6.030737921409169e-06,
"loss": 3.1035,
"step": 210
},
{
"epoch": 0.9154013015184381,
"grad_norm": 0.9286414980888367,
"learning_rate": 5.448142440068316e-06,
"loss": 3.2198,
"step": 211
},
{
"epoch": 0.9197396963123644,
"grad_norm": 1.031367540359497,
"learning_rate": 4.8943483704846475e-06,
"loss": 3.207,
"step": 212
},
{
"epoch": 0.9240780911062907,
"grad_norm": 1.1086468696594238,
"learning_rate": 4.369524403696457e-06,
"loss": 3.2715,
"step": 213
},
{
"epoch": 0.928416485900217,
"grad_norm": 1.0586810111999512,
"learning_rate": 3.873830406168111e-06,
"loss": 3.2091,
"step": 214
},
{
"epoch": 0.9327548806941431,
"grad_norm": 1.0433012247085571,
"learning_rate": 3.40741737109318e-06,
"loss": 3.11,
"step": 215
},
{
"epoch": 0.9370932754880694,
"grad_norm": 1.214693546295166,
"learning_rate": 2.970427372400353e-06,
"loss": 3.1984,
"step": 216
},
{
"epoch": 0.9414316702819957,
"grad_norm": 1.3140201568603516,
"learning_rate": 2.5629935214764865e-06,
"loss": 3.1381,
"step": 217
},
{
"epoch": 0.9457700650759219,
"grad_norm": 1.439610242843628,
"learning_rate": 2.1852399266194314e-06,
"loss": 3.2828,
"step": 218
},
{
"epoch": 0.9501084598698482,
"grad_norm": 1.447763204574585,
"learning_rate": 1.8372816552336026e-06,
"loss": 3.1896,
"step": 219
},
{
"epoch": 0.9544468546637744,
"grad_norm": 1.5651224851608276,
"learning_rate": 1.5192246987791981e-06,
"loss": 3.0176,
"step": 220
},
{
"epoch": 0.9587852494577006,
"grad_norm": 1.7138340473175049,
"learning_rate": 1.231165940486234e-06,
"loss": 3.0649,
"step": 221
},
{
"epoch": 0.9631236442516269,
"grad_norm": 1.7278990745544434,
"learning_rate": 9.731931258429638e-07,
"loss": 2.901,
"step": 222
},
{
"epoch": 0.9674620390455532,
"grad_norm": 1.8585275411605835,
"learning_rate": 7.453848358678017e-07,
"loss": 2.9228,
"step": 223
},
{
"epoch": 0.9718004338394793,
"grad_norm": 2.438549757003784,
"learning_rate": 5.478104631726711e-07,
"loss": 2.9518,
"step": 224
},
{
"epoch": 0.9761388286334056,
"grad_norm": 3.6199636459350586,
"learning_rate": 3.805301908254455e-07,
"loss": 2.9323,
"step": 225
},
{
"epoch": 0.9804772234273319,
"grad_norm": 0.5968942046165466,
"learning_rate": 2.4359497401758024e-07,
"loss": 3.0911,
"step": 226
},
{
"epoch": 0.9848156182212582,
"grad_norm": 0.8684574365615845,
"learning_rate": 1.3704652454261668e-07,
"loss": 3.2219,
"step": 227
},
{
"epoch": 0.9891540130151844,
"grad_norm": 1.1482932567596436,
"learning_rate": 6.09172980904238e-08,
"loss": 3.0405,
"step": 228
},
{
"epoch": 0.9934924078091106,
"grad_norm": 1.4431898593902588,
"learning_rate": 1.5230484360873044e-08,
"loss": 3.1692,
"step": 229
},
{
"epoch": 0.9978308026030369,
"grad_norm": 1.78467857837677,
"learning_rate": 0.0,
"loss": 2.932,
"step": 230
}
],
"logging_steps": 1,
"max_steps": 230,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 58,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5.142510989790413e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}