{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9978308026030369,
"eval_steps": 58,
"global_step": 230,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.004338394793926247,
"grad_norm": 0.2975526452064514,
"learning_rate": 4.000000000000001e-06,
"loss": 3.4415,
"step": 1
},
{
"epoch": 0.004338394793926247,
"eval_loss": 4.956099033355713,
"eval_runtime": 43.9816,
"eval_samples_per_second": 8.845,
"eval_steps_per_second": 2.228,
"step": 1
},
{
"epoch": 0.008676789587852495,
"grad_norm": 0.41739514470100403,
"learning_rate": 8.000000000000001e-06,
"loss": 3.5934,
"step": 2
},
{
"epoch": 0.013015184381778741,
"grad_norm": 0.47740957140922546,
"learning_rate": 1.2e-05,
"loss": 3.7866,
"step": 3
},
{
"epoch": 0.01735357917570499,
"grad_norm": 0.5908945798873901,
"learning_rate": 1.6000000000000003e-05,
"loss": 3.8979,
"step": 4
},
{
"epoch": 0.021691973969631236,
"grad_norm": 0.602057933807373,
"learning_rate": 2e-05,
"loss": 3.8257,
"step": 5
},
{
"epoch": 0.026030368763557483,
"grad_norm": 0.7276618480682373,
"learning_rate": 2.4e-05,
"loss": 4.0716,
"step": 6
},
{
"epoch": 0.03036876355748373,
"grad_norm": 0.7895606160163879,
"learning_rate": 2.8000000000000003e-05,
"loss": 4.1982,
"step": 7
},
{
"epoch": 0.03470715835140998,
"grad_norm": 0.9524717926979065,
"learning_rate": 3.2000000000000005e-05,
"loss": 4.1916,
"step": 8
},
{
"epoch": 0.039045553145336226,
"grad_norm": 0.9786620736122131,
"learning_rate": 3.6e-05,
"loss": 4.0969,
"step": 9
},
{
"epoch": 0.04338394793926247,
"grad_norm": 1.0913058519363403,
"learning_rate": 4e-05,
"loss": 4.3758,
"step": 10
},
{
"epoch": 0.04772234273318872,
"grad_norm": 1.2519152164459229,
"learning_rate": 4.4000000000000006e-05,
"loss": 4.3174,
"step": 11
},
{
"epoch": 0.052060737527114966,
"grad_norm": 1.4428540468215942,
"learning_rate": 4.8e-05,
"loss": 4.2927,
"step": 12
},
{
"epoch": 0.05639913232104121,
"grad_norm": 1.557953953742981,
"learning_rate": 5.2000000000000004e-05,
"loss": 4.3237,
"step": 13
},
{
"epoch": 0.06073752711496746,
"grad_norm": 1.791407585144043,
"learning_rate": 5.6000000000000006e-05,
"loss": 4.3922,
"step": 14
},
{
"epoch": 0.0650759219088937,
"grad_norm": 1.829128623008728,
"learning_rate": 6e-05,
"loss": 4.2218,
"step": 15
},
{
"epoch": 0.06941431670281996,
"grad_norm": 1.8731590509414673,
"learning_rate": 6.400000000000001e-05,
"loss": 4.294,
"step": 16
},
{
"epoch": 0.0737527114967462,
"grad_norm": 2.140212297439575,
"learning_rate": 6.800000000000001e-05,
"loss": 4.3197,
"step": 17
},
{
"epoch": 0.07809110629067245,
"grad_norm": 2.5610997676849365,
"learning_rate": 7.2e-05,
"loss": 4.3203,
"step": 18
},
{
"epoch": 0.0824295010845987,
"grad_norm": 2.5937764644622803,
"learning_rate": 7.6e-05,
"loss": 4.2128,
"step": 19
},
{
"epoch": 0.08676789587852494,
"grad_norm": 1.9964145421981812,
"learning_rate": 8e-05,
"loss": 4.1141,
"step": 20
},
{
"epoch": 0.0911062906724512,
"grad_norm": 1.9274357557296753,
"learning_rate": 8.4e-05,
"loss": 3.9559,
"step": 21
},
{
"epoch": 0.09544468546637744,
"grad_norm": 2.1689515113830566,
"learning_rate": 8.800000000000001e-05,
"loss": 4.0459,
"step": 22
},
{
"epoch": 0.09978308026030369,
"grad_norm": 2.417027235031128,
"learning_rate": 9.200000000000001e-05,
"loss": 3.8996,
"step": 23
},
{
"epoch": 0.10412147505422993,
"grad_norm": 2.925503969192505,
"learning_rate": 9.6e-05,
"loss": 4.0918,
"step": 24
},
{
"epoch": 0.10845986984815618,
"grad_norm": 4.8928961753845215,
"learning_rate": 0.0001,
"loss": 4.332,
"step": 25
},
{
"epoch": 0.11279826464208242,
"grad_norm": 3.4765207767486572,
"learning_rate": 0.00010400000000000001,
"loss": 3.7618,
"step": 26
},
{
"epoch": 0.11713665943600868,
"grad_norm": 3.5958409309387207,
"learning_rate": 0.00010800000000000001,
"loss": 3.7645,
"step": 27
},
{
"epoch": 0.12147505422993492,
"grad_norm": 3.053165912628174,
"learning_rate": 0.00011200000000000001,
"loss": 3.6536,
"step": 28
},
{
"epoch": 0.12581344902386118,
"grad_norm": 2.3347203731536865,
"learning_rate": 0.000116,
"loss": 3.8871,
"step": 29
},
{
"epoch": 0.1301518438177874,
"grad_norm": 1.4141613245010376,
"learning_rate": 0.00012,
"loss": 3.6367,
"step": 30
},
{
"epoch": 0.13449023861171366,
"grad_norm": 1.1042953729629517,
"learning_rate": 0.000124,
"loss": 3.4273,
"step": 31
},
{
"epoch": 0.13882863340563992,
"grad_norm": 0.9391370415687561,
"learning_rate": 0.00012800000000000002,
"loss": 3.635,
"step": 32
},
{
"epoch": 0.14316702819956617,
"grad_norm": 1.028341293334961,
"learning_rate": 0.000132,
"loss": 3.845,
"step": 33
},
{
"epoch": 0.1475054229934924,
"grad_norm": 1.0668063163757324,
"learning_rate": 0.00013600000000000003,
"loss": 3.732,
"step": 34
},
{
"epoch": 0.15184381778741865,
"grad_norm": 1.0369871854782104,
"learning_rate": 0.00014,
"loss": 3.6734,
"step": 35
},
{
"epoch": 0.1561822125813449,
"grad_norm": 1.0699695348739624,
"learning_rate": 0.000144,
"loss": 3.5469,
"step": 36
},
{
"epoch": 0.16052060737527116,
"grad_norm": 1.1715625524520874,
"learning_rate": 0.000148,
"loss": 3.5759,
"step": 37
},
{
"epoch": 0.1648590021691974,
"grad_norm": 1.2680530548095703,
"learning_rate": 0.000152,
"loss": 3.7013,
"step": 38
},
{
"epoch": 0.16919739696312364,
"grad_norm": 1.2043352127075195,
"learning_rate": 0.00015600000000000002,
"loss": 3.74,
"step": 39
},
{
"epoch": 0.1735357917570499,
"grad_norm": 1.342244029045105,
"learning_rate": 0.00016,
"loss": 3.7761,
"step": 40
},
{
"epoch": 0.17787418655097614,
"grad_norm": 1.4112831354141235,
"learning_rate": 0.000164,
"loss": 3.6449,
"step": 41
},
{
"epoch": 0.1822125813449024,
"grad_norm": 1.3947268724441528,
"learning_rate": 0.000168,
"loss": 3.6043,
"step": 42
},
{
"epoch": 0.18655097613882862,
"grad_norm": 1.5763946771621704,
"learning_rate": 0.000172,
"loss": 3.4768,
"step": 43
},
{
"epoch": 0.19088937093275488,
"grad_norm": 1.9006760120391846,
"learning_rate": 0.00017600000000000002,
"loss": 3.6424,
"step": 44
},
{
"epoch": 0.19522776572668113,
"grad_norm": 2.0071113109588623,
"learning_rate": 0.00018,
"loss": 3.7499,
"step": 45
},
{
"epoch": 0.19956616052060738,
"grad_norm": 2.002067804336548,
"learning_rate": 0.00018400000000000003,
"loss": 3.6082,
"step": 46
},
{
"epoch": 0.2039045553145336,
"grad_norm": 2.4698357582092285,
"learning_rate": 0.000188,
"loss": 3.7604,
"step": 47
},
{
"epoch": 0.20824295010845986,
"grad_norm": 3.051906108856201,
"learning_rate": 0.000192,
"loss": 3.7546,
"step": 48
},
{
"epoch": 0.21258134490238612,
"grad_norm": 3.100890636444092,
"learning_rate": 0.000196,
"loss": 3.6134,
"step": 49
},
{
"epoch": 0.21691973969631237,
"grad_norm": 4.4481425285339355,
"learning_rate": 0.0002,
"loss": 3.355,
"step": 50
},
{
"epoch": 0.22125813449023862,
"grad_norm": 4.157866954803467,
"learning_rate": 0.00019998476951563915,
"loss": 3.5229,
"step": 51
},
{
"epoch": 0.22559652928416485,
"grad_norm": 5.159533500671387,
"learning_rate": 0.0001999390827019096,
"loss": 3.9098,
"step": 52
},
{
"epoch": 0.2299349240780911,
"grad_norm": 4.372255325317383,
"learning_rate": 0.0001998629534754574,
"loss": 3.8285,
"step": 53
},
{
"epoch": 0.23427331887201736,
"grad_norm": 2.7389180660247803,
"learning_rate": 0.00019975640502598244,
"loss": 3.5892,
"step": 54
},
{
"epoch": 0.2386117136659436,
"grad_norm": 1.623792290687561,
"learning_rate": 0.00019961946980917456,
"loss": 3.6253,
"step": 55
},
{
"epoch": 0.24295010845986983,
"grad_norm": 1.0698862075805664,
"learning_rate": 0.00019945218953682734,
"loss": 3.4747,
"step": 56
},
{
"epoch": 0.2472885032537961,
"grad_norm": 1.0480446815490723,
"learning_rate": 0.00019925461516413223,
"loss": 3.5076,
"step": 57
},
{
"epoch": 0.25162689804772237,
"grad_norm": 1.1356984376907349,
"learning_rate": 0.00019902680687415705,
"loss": 3.4875,
"step": 58
},
{
"epoch": 0.25162689804772237,
"eval_loss": 3.5454585552215576,
"eval_runtime": 43.9485,
"eval_samples_per_second": 8.851,
"eval_steps_per_second": 2.23,
"step": 58
},
{
"epoch": 0.2559652928416486,
"grad_norm": 1.147985816001892,
"learning_rate": 0.00019876883405951377,
"loss": 3.5368,
"step": 59
},
{
"epoch": 0.2603036876355748,
"grad_norm": 1.167962670326233,
"learning_rate": 0.00019848077530122083,
"loss": 3.4885,
"step": 60
},
{
"epoch": 0.2646420824295011,
"grad_norm": 1.1241693496704102,
"learning_rate": 0.00019816271834476642,
"loss": 3.5335,
"step": 61
},
{
"epoch": 0.26898047722342733,
"grad_norm": 1.0841178894042969,
"learning_rate": 0.00019781476007338058,
"loss": 3.5822,
"step": 62
},
{
"epoch": 0.27331887201735355,
"grad_norm": 1.1276164054870605,
"learning_rate": 0.00019743700647852354,
"loss": 3.4757,
"step": 63
},
{
"epoch": 0.27765726681127983,
"grad_norm": 1.192659854888916,
"learning_rate": 0.00019702957262759965,
"loss": 3.4212,
"step": 64
},
{
"epoch": 0.28199566160520606,
"grad_norm": 1.2061688899993896,
"learning_rate": 0.00019659258262890683,
"loss": 3.4564,
"step": 65
},
{
"epoch": 0.28633405639913234,
"grad_norm": 1.4012079238891602,
"learning_rate": 0.0001961261695938319,
"loss": 3.423,
"step": 66
},
{
"epoch": 0.29067245119305857,
"grad_norm": 1.3591368198394775,
"learning_rate": 0.00019563047559630357,
"loss": 3.5284,
"step": 67
},
{
"epoch": 0.2950108459869848,
"grad_norm": 1.3555010557174683,
"learning_rate": 0.00019510565162951537,
"loss": 3.4406,
"step": 68
},
{
"epoch": 0.2993492407809111,
"grad_norm": 1.4745391607284546,
"learning_rate": 0.0001945518575599317,
"loss": 3.3899,
"step": 69
},
{
"epoch": 0.3036876355748373,
"grad_norm": 1.6432572603225708,
"learning_rate": 0.00019396926207859084,
"loss": 3.4343,
"step": 70
},
{
"epoch": 0.3080260303687636,
"grad_norm": 1.9187488555908203,
"learning_rate": 0.00019335804264972018,
"loss": 3.5881,
"step": 71
},
{
"epoch": 0.3123644251626898,
"grad_norm": 2.1937949657440186,
"learning_rate": 0.00019271838545667876,
"loss": 3.3765,
"step": 72
},
{
"epoch": 0.31670281995661603,
"grad_norm": 2.376640558242798,
"learning_rate": 0.00019205048534524406,
"loss": 3.2758,
"step": 73
},
{
"epoch": 0.3210412147505423,
"grad_norm": 3.0442004203796387,
"learning_rate": 0.0001913545457642601,
"loss": 3.5366,
"step": 74
},
{
"epoch": 0.32537960954446854,
"grad_norm": 3.6359612941741943,
"learning_rate": 0.000190630778703665,
"loss": 3.0313,
"step": 75
},
{
"epoch": 0.3297180043383948,
"grad_norm": 4.367193698883057,
"learning_rate": 0.0001898794046299167,
"loss": 3.3864,
"step": 76
},
{
"epoch": 0.33405639913232105,
"grad_norm": 5.261653900146484,
"learning_rate": 0.0001891006524188368,
"loss": 3.5411,
"step": 77
},
{
"epoch": 0.3383947939262473,
"grad_norm": 5.341316223144531,
"learning_rate": 0.00018829475928589271,
"loss": 3.843,
"step": 78
},
{
"epoch": 0.34273318872017355,
"grad_norm": 2.9030559062957764,
"learning_rate": 0.00018746197071393958,
"loss": 3.4254,
"step": 79
},
{
"epoch": 0.3470715835140998,
"grad_norm": 1.4780312776565552,
"learning_rate": 0.00018660254037844388,
"loss": 3.4877,
"step": 80
},
{
"epoch": 0.351409978308026,
"grad_norm": 1.0593628883361816,
"learning_rate": 0.00018571673007021123,
"loss": 3.3987,
"step": 81
},
{
"epoch": 0.3557483731019523,
"grad_norm": 0.9910492897033691,
"learning_rate": 0.0001848048096156426,
"loss": 3.4944,
"step": 82
},
{
"epoch": 0.3600867678958785,
"grad_norm": 1.004767656326294,
"learning_rate": 0.00018386705679454242,
"loss": 3.4143,
"step": 83
},
{
"epoch": 0.3644251626898048,
"grad_norm": 1.012804627418518,
"learning_rate": 0.00018290375725550417,
"loss": 3.4504,
"step": 84
},
{
"epoch": 0.368763557483731,
"grad_norm": 1.0758857727050781,
"learning_rate": 0.0001819152044288992,
"loss": 3.5181,
"step": 85
},
{
"epoch": 0.37310195227765725,
"grad_norm": 1.0776313543319702,
"learning_rate": 0.00018090169943749476,
"loss": 3.4293,
"step": 86
},
{
"epoch": 0.3774403470715835,
"grad_norm": 1.0856565237045288,
"learning_rate": 0.00017986355100472928,
"loss": 3.3012,
"step": 87
},
{
"epoch": 0.38177874186550975,
"grad_norm": 1.146246075630188,
"learning_rate": 0.00017880107536067218,
"loss": 3.5778,
"step": 88
},
{
"epoch": 0.38611713665943603,
"grad_norm": 1.1812922954559326,
"learning_rate": 0.0001777145961456971,
"loss": 3.2835,
"step": 89
},
{
"epoch": 0.39045553145336226,
"grad_norm": 1.3535960912704468,
"learning_rate": 0.0001766044443118978,
"loss": 3.1863,
"step": 90
},
{
"epoch": 0.3947939262472885,
"grad_norm": 1.312524437904358,
"learning_rate": 0.00017547095802227723,
"loss": 3.3794,
"step": 91
},
{
"epoch": 0.39913232104121477,
"grad_norm": 1.2628040313720703,
"learning_rate": 0.00017431448254773944,
"loss": 3.2127,
"step": 92
},
{
"epoch": 0.403470715835141,
"grad_norm": 1.3810231685638428,
"learning_rate": 0.00017313537016191706,
"loss": 3.3664,
"step": 93
},
{
"epoch": 0.4078091106290672,
"grad_norm": 1.5726513862609863,
"learning_rate": 0.0001719339800338651,
"loss": 3.4052,
"step": 94
},
{
"epoch": 0.4121475054229935,
"grad_norm": 1.5839647054672241,
"learning_rate": 0.00017071067811865476,
"loss": 3.2746,
"step": 95
},
{
"epoch": 0.4164859002169197,
"grad_norm": 1.7605924606323242,
"learning_rate": 0.00016946583704589973,
"loss": 3.48,
"step": 96
},
{
"epoch": 0.420824295010846,
"grad_norm": 2.3345723152160645,
"learning_rate": 0.00016819983600624986,
"loss": 3.2033,
"step": 97
},
{
"epoch": 0.42516268980477223,
"grad_norm": 1.9480637311935425,
"learning_rate": 0.00016691306063588583,
"loss": 3.3796,
"step": 98
},
{
"epoch": 0.42950108459869846,
"grad_norm": 2.3618791103363037,
"learning_rate": 0.00016560590289905073,
"loss": 3.1398,
"step": 99
},
{
"epoch": 0.43383947939262474,
"grad_norm": 3.546729326248169,
"learning_rate": 0.00016427876096865394,
"loss": 3.0558,
"step": 100
},
{
"epoch": 0.43817787418655096,
"grad_norm": 1.5932743549346924,
"learning_rate": 0.00016293203910498376,
"loss": 3.3932,
"step": 101
},
{
"epoch": 0.44251626898047725,
"grad_norm": 2.039661407470703,
"learning_rate": 0.0001615661475325658,
"loss": 3.4066,
"step": 102
},
{
"epoch": 0.44685466377440347,
"grad_norm": 1.742119312286377,
"learning_rate": 0.00016018150231520486,
"loss": 3.2823,
"step": 103
},
{
"epoch": 0.4511930585683297,
"grad_norm": 1.5700186491012573,
"learning_rate": 0.00015877852522924732,
"loss": 3.3591,
"step": 104
},
{
"epoch": 0.455531453362256,
"grad_norm": 1.136389970779419,
"learning_rate": 0.0001573576436351046,
"loss": 3.4663,
"step": 105
},
{
"epoch": 0.4598698481561822,
"grad_norm": 0.8537334203720093,
"learning_rate": 0.0001559192903470747,
"loss": 3.4367,
"step": 106
},
{
"epoch": 0.4642082429501085,
"grad_norm": 0.8642299175262451,
"learning_rate": 0.00015446390350150273,
"loss": 3.287,
"step": 107
},
{
"epoch": 0.4685466377440347,
"grad_norm": 0.9279235601425171,
"learning_rate": 0.0001529919264233205,
"loss": 3.2911,
"step": 108
},
{
"epoch": 0.47288503253796094,
"grad_norm": 0.9121331572532654,
"learning_rate": 0.00015150380749100545,
"loss": 3.3101,
"step": 109
},
{
"epoch": 0.4772234273318872,
"grad_norm": 0.9868795275688171,
"learning_rate": 0.00015000000000000001,
"loss": 3.3431,
"step": 110
},
{
"epoch": 0.48156182212581344,
"grad_norm": 1.0646886825561523,
"learning_rate": 0.00014848096202463372,
"loss": 3.3876,
"step": 111
},
{
"epoch": 0.48590021691973967,
"grad_norm": 1.0819416046142578,
"learning_rate": 0.00014694715627858908,
"loss": 3.2128,
"step": 112
},
{
"epoch": 0.49023861171366595,
"grad_norm": 1.0728636980056763,
"learning_rate": 0.00014539904997395468,
"loss": 3.2076,
"step": 113
},
{
"epoch": 0.4945770065075922,
"grad_norm": 1.1562669277191162,
"learning_rate": 0.00014383711467890774,
"loss": 3.2825,
"step": 114
},
{
"epoch": 0.49891540130151846,
"grad_norm": 1.1967557668685913,
"learning_rate": 0.00014226182617406996,
"loss": 3.2185,
"step": 115
},
{
"epoch": 0.5032537960954447,
"grad_norm": 1.3139584064483643,
"learning_rate": 0.00014067366430758004,
"loss": 3.1373,
"step": 116
},
{
"epoch": 0.5032537960954447,
"eval_loss": 3.2728052139282227,
"eval_runtime": 43.9403,
"eval_samples_per_second": 8.853,
"eval_steps_per_second": 2.23,
"step": 116
},
{
"epoch": 0.5075921908893709,
"grad_norm": 1.3170753717422485,
"learning_rate": 0.00013907311284892736,
"loss": 2.9572,
"step": 117
},
{
"epoch": 0.5119305856832972,
"grad_norm": 1.5243107080459595,
"learning_rate": 0.00013746065934159123,
"loss": 3.3082,
"step": 118
},
{
"epoch": 0.5162689804772235,
"grad_norm": 1.5845880508422852,
"learning_rate": 0.00013583679495453,
"loss": 3.4819,
"step": 119
},
{
"epoch": 0.5206073752711496,
"grad_norm": 1.66307532787323,
"learning_rate": 0.00013420201433256689,
"loss": 3.1131,
"step": 120
},
{
"epoch": 0.5249457700650759,
"grad_norm": 1.6470588445663452,
"learning_rate": 0.00013255681544571568,
"loss": 3.2847,
"step": 121
},
{
"epoch": 0.5292841648590022,
"grad_norm": 2.1118075847625732,
"learning_rate": 0.00013090169943749476,
"loss": 3.4669,
"step": 122
},
{
"epoch": 0.5336225596529284,
"grad_norm": 2.056396722793579,
"learning_rate": 0.00012923717047227368,
"loss": 3.1136,
"step": 123
},
{
"epoch": 0.5379609544468547,
"grad_norm": 2.2389657497406006,
"learning_rate": 0.0001275637355816999,
"loss": 2.9323,
"step": 124
},
{
"epoch": 0.5422993492407809,
"grad_norm": 2.863621711730957,
"learning_rate": 0.00012588190451025207,
"loss": 2.9585,
"step": 125
},
{
"epoch": 0.5466377440347071,
"grad_norm": 0.8712321519851685,
"learning_rate": 0.00012419218955996676,
"loss": 3.1439,
"step": 126
},
{
"epoch": 0.5509761388286334,
"grad_norm": 1.0713740587234497,
"learning_rate": 0.0001224951054343865,
"loss": 3.2213,
"step": 127
},
{
"epoch": 0.5553145336225597,
"grad_norm": 1.104315996170044,
"learning_rate": 0.00012079116908177593,
"loss": 3.4522,
"step": 128
},
{
"epoch": 0.559652928416486,
"grad_norm": 1.0883917808532715,
"learning_rate": 0.00011908089953765449,
"loss": 3.3503,
"step": 129
},
{
"epoch": 0.5639913232104121,
"grad_norm": 1.0000834465026855,
"learning_rate": 0.00011736481776669306,
"loss": 3.4036,
"step": 130
},
{
"epoch": 0.5683297180043384,
"grad_norm": 0.8869354128837585,
"learning_rate": 0.0001156434465040231,
"loss": 3.2749,
"step": 131
},
{
"epoch": 0.5726681127982647,
"grad_norm": 0.8651937246322632,
"learning_rate": 0.00011391731009600654,
"loss": 3.3679,
"step": 132
},
{
"epoch": 0.5770065075921909,
"grad_norm": 0.9174556136131287,
"learning_rate": 0.00011218693434051475,
"loss": 3.311,
"step": 133
},
{
"epoch": 0.5813449023861171,
"grad_norm": 0.930533230304718,
"learning_rate": 0.00011045284632676536,
"loss": 3.3761,
"step": 134
},
{
"epoch": 0.5856832971800434,
"grad_norm": 0.9851680994033813,
"learning_rate": 0.00010871557427476583,
"loss": 3.2752,
"step": 135
},
{
"epoch": 0.5900216919739696,
"grad_norm": 0.9633740782737732,
"learning_rate": 0.00010697564737441252,
"loss": 3.2373,
"step": 136
},
{
"epoch": 0.5943600867678959,
"grad_norm": 1.132585048675537,
"learning_rate": 0.0001052335956242944,
"loss": 3.2323,
"step": 137
},
{
"epoch": 0.5986984815618221,
"grad_norm": 1.1232091188430786,
"learning_rate": 0.00010348994967025012,
"loss": 3.2874,
"step": 138
},
{
"epoch": 0.6030368763557483,
"grad_norm": 1.2559125423431396,
"learning_rate": 0.00010174524064372837,
"loss": 3.2367,
"step": 139
},
{
"epoch": 0.6073752711496746,
"grad_norm": 1.2623041868209839,
"learning_rate": 0.0001,
"loss": 3.2243,
"step": 140
},
{
"epoch": 0.6117136659436009,
"grad_norm": 1.3554457426071167,
"learning_rate": 9.825475935627165e-05,
"loss": 3.4802,
"step": 141
},
{
"epoch": 0.6160520607375272,
"grad_norm": 1.4170132875442505,
"learning_rate": 9.651005032974994e-05,
"loss": 3.354,
"step": 142
},
{
"epoch": 0.6203904555314533,
"grad_norm": 1.4309097528457642,
"learning_rate": 9.476640437570562e-05,
"loss": 3.1352,
"step": 143
},
{
"epoch": 0.6247288503253796,
"grad_norm": 1.5829153060913086,
"learning_rate": 9.302435262558747e-05,
"loss": 3.2455,
"step": 144
},
{
"epoch": 0.6290672451193059,
"grad_norm": 1.8210502862930298,
"learning_rate": 9.128442572523417e-05,
"loss": 3.2991,
"step": 145
},
{
"epoch": 0.6334056399132321,
"grad_norm": 1.842761516571045,
"learning_rate": 8.954715367323468e-05,
"loss": 3.2255,
"step": 146
},
{
"epoch": 0.6377440347071583,
"grad_norm": 1.9258646965026855,
"learning_rate": 8.781306565948528e-05,
"loss": 3.1397,
"step": 147
},
{
"epoch": 0.6420824295010846,
"grad_norm": 2.1189215183258057,
"learning_rate": 8.608268990399349e-05,
"loss": 3.0414,
"step": 148
},
{
"epoch": 0.6464208242950108,
"grad_norm": 2.4063761234283447,
"learning_rate": 8.435655349597689e-05,
"loss": 2.8524,
"step": 149
},
{
"epoch": 0.6507592190889371,
"grad_norm": 3.6420836448669434,
"learning_rate": 8.263518223330697e-05,
"loss": 3.0156,
"step": 150
},
{
"epoch": 0.6550976138828634,
"grad_norm": 0.7080674171447754,
"learning_rate": 8.091910046234552e-05,
"loss": 3.1636,
"step": 151
},
{
"epoch": 0.6594360086767896,
"grad_norm": 0.798520565032959,
"learning_rate": 7.920883091822408e-05,
"loss": 3.212,
"step": 152
},
{
"epoch": 0.6637744034707158,
"grad_norm": 0.8640486001968384,
"learning_rate": 7.750489456561352e-05,
"loss": 3.1644,
"step": 153
},
{
"epoch": 0.6681127982646421,
"grad_norm": 0.870906412601471,
"learning_rate": 7.580781044003324e-05,
"loss": 3.1876,
"step": 154
},
{
"epoch": 0.6724511930585684,
"grad_norm": 0.8581348061561584,
"learning_rate": 7.411809548974792e-05,
"loss": 3.2739,
"step": 155
},
{
"epoch": 0.6767895878524945,
"grad_norm": 0.8691614270210266,
"learning_rate": 7.243626441830009e-05,
"loss": 3.2444,
"step": 156
},
{
"epoch": 0.6811279826464208,
"grad_norm": 0.9455673098564148,
"learning_rate": 7.076282952772633e-05,
"loss": 3.3004,
"step": 157
},
{
"epoch": 0.6854663774403471,
"grad_norm": 0.8873337507247925,
"learning_rate": 6.909830056250527e-05,
"loss": 3.1778,
"step": 158
},
{
"epoch": 0.6898047722342733,
"grad_norm": 0.910775363445282,
"learning_rate": 6.744318455428436e-05,
"loss": 3.1346,
"step": 159
},
{
"epoch": 0.6941431670281996,
"grad_norm": 0.9872409105300903,
"learning_rate": 6.579798566743314e-05,
"loss": 3.1665,
"step": 160
},
{
"epoch": 0.6984815618221258,
"grad_norm": 1.0516481399536133,
"learning_rate": 6.416320504546997e-05,
"loss": 3.3064,
"step": 161
},
{
"epoch": 0.702819956616052,
"grad_norm": 1.0263571739196777,
"learning_rate": 6.25393406584088e-05,
"loss": 3.3698,
"step": 162
},
{
"epoch": 0.7071583514099783,
"grad_norm": 1.1050878763198853,
"learning_rate": 6.092688715107264e-05,
"loss": 3.2436,
"step": 163
},
{
"epoch": 0.7114967462039046,
"grad_norm": 1.1121841669082642,
"learning_rate": 5.9326335692419995e-05,
"loss": 2.9433,
"step": 164
},
{
"epoch": 0.7158351409978309,
"grad_norm": 1.2424358129501343,
"learning_rate": 5.773817382593008e-05,
"loss": 3.3616,
"step": 165
},
{
"epoch": 0.720173535791757,
"grad_norm": 1.1899327039718628,
"learning_rate": 5.616288532109225e-05,
"loss": 3.0392,
"step": 166
},
{
"epoch": 0.7245119305856833,
"grad_norm": 1.3395730257034302,
"learning_rate": 5.4600950026045326e-05,
"loss": 3.0905,
"step": 167
},
{
"epoch": 0.7288503253796096,
"grad_norm": 1.4268842935562134,
"learning_rate": 5.305284372141095e-05,
"loss": 3.1247,
"step": 168
},
{
"epoch": 0.7331887201735358,
"grad_norm": 1.5514875650405884,
"learning_rate": 5.15190379753663e-05,
"loss": 3.3772,
"step": 169
},
{
"epoch": 0.737527114967462,
"grad_norm": 1.8371058702468872,
"learning_rate": 5.000000000000002e-05,
"loss": 2.9262,
"step": 170
},
{
"epoch": 0.7418655097613883,
"grad_norm": 1.7641676664352417,
"learning_rate": 4.8496192508994576e-05,
"loss": 3.0113,
"step": 171
},
{
"epoch": 0.7462039045553145,
"grad_norm": 1.8325039148330688,
"learning_rate": 4.700807357667952e-05,
"loss": 3.0551,
"step": 172
},
{
"epoch": 0.7505422993492408,
"grad_norm": 1.9740185737609863,
"learning_rate": 4.5536096498497295e-05,
"loss": 3.1479,
"step": 173
},
{
"epoch": 0.754880694143167,
"grad_norm": 2.327420234680176,
"learning_rate": 4.4080709652925336e-05,
"loss": 3.2149,
"step": 174
},
{
"epoch": 0.754880694143167,
"eval_loss": 3.163884162902832,
"eval_runtime": 43.9914,
"eval_samples_per_second": 8.843,
"eval_steps_per_second": 2.228,
"step": 174
},
{
"epoch": 0.7592190889370932,
"grad_norm": 3.558817148208618,
"learning_rate": 4.264235636489542e-05,
"loss": 3.0184,
"step": 175
},
{
"epoch": 0.7635574837310195,
"grad_norm": 0.48007091879844666,
"learning_rate": 4.12214747707527e-05,
"loss": 3.0708,
"step": 176
},
{
"epoch": 0.7678958785249458,
"grad_norm": 0.606035590171814,
"learning_rate": 3.981849768479517e-05,
"loss": 3.1584,
"step": 177
},
{
"epoch": 0.7722342733188721,
"grad_norm": 0.6647348999977112,
"learning_rate": 3.843385246743417e-05,
"loss": 3.3003,
"step": 178
},
{
"epoch": 0.7765726681127982,
"grad_norm": 0.6956514120101929,
"learning_rate": 3.7067960895016275e-05,
"loss": 3.2168,
"step": 179
},
{
"epoch": 0.7809110629067245,
"grad_norm": 0.7575390338897705,
"learning_rate": 3.5721239031346066e-05,
"loss": 3.2576,
"step": 180
},
{
"epoch": 0.7852494577006508,
"grad_norm": 0.8106915950775146,
"learning_rate": 3.439409710094929e-05,
"loss": 3.124,
"step": 181
},
{
"epoch": 0.789587852494577,
"grad_norm": 0.873997688293457,
"learning_rate": 3.308693936411421e-05,
"loss": 3.0977,
"step": 182
},
{
"epoch": 0.7939262472885033,
"grad_norm": 0.9612168073654175,
"learning_rate": 3.1800163993750166e-05,
"loss": 3.435,
"step": 183
},
{
"epoch": 0.7982646420824295,
"grad_norm": 0.9549990892410278,
"learning_rate": 3.053416295410026e-05,
"loss": 3.2216,
"step": 184
},
{
"epoch": 0.8026030368763557,
"grad_norm": 0.9309582710266113,
"learning_rate": 2.9289321881345254e-05,
"loss": 3.0546,
"step": 185
},
{
"epoch": 0.806941431670282,
"grad_norm": 1.0800687074661255,
"learning_rate": 2.8066019966134904e-05,
"loss": 3.2283,
"step": 186
},
{
"epoch": 0.8112798264642083,
"grad_norm": 1.0009733438491821,
"learning_rate": 2.6864629838082956e-05,
"loss": 3.1638,
"step": 187
},
{
"epoch": 0.8156182212581344,
"grad_norm": 1.1134998798370361,
"learning_rate": 2.5685517452260567e-05,
"loss": 3.2642,
"step": 188
},
{
"epoch": 0.8199566160520607,
"grad_norm": 1.1395593881607056,
"learning_rate": 2.45290419777228e-05,
"loss": 3.0712,
"step": 189
},
{
"epoch": 0.824295010845987,
"grad_norm": 1.1547160148620605,
"learning_rate": 2.339555568810221e-05,
"loss": 3.2101,
"step": 190
},
{
"epoch": 0.8286334056399133,
"grad_norm": 1.2223323583602905,
"learning_rate": 2.2285403854302912e-05,
"loss": 3.1109,
"step": 191
},
{
"epoch": 0.8329718004338394,
"grad_norm": 1.4417051076889038,
"learning_rate": 2.119892463932781e-05,
"loss": 3.2497,
"step": 192
},
{
"epoch": 0.8373101952277657,
"grad_norm": 1.3542780876159668,
"learning_rate": 2.013644899527074e-05,
"loss": 3.23,
"step": 193
},
{
"epoch": 0.841648590021692,
"grad_norm": 1.5529882907867432,
"learning_rate": 1.9098300562505266e-05,
"loss": 3.2404,
"step": 194
},
{
"epoch": 0.8459869848156182,
"grad_norm": 1.63187575340271,
"learning_rate": 1.808479557110081e-05,
"loss": 3.0776,
"step": 195
},
{
"epoch": 0.8503253796095445,
"grad_norm": 1.6470518112182617,
"learning_rate": 1.7096242744495837e-05,
"loss": 3.1312,
"step": 196
},
{
"epoch": 0.8546637744034707,
"grad_norm": 1.8358676433563232,
"learning_rate": 1.6132943205457606e-05,
"loss": 3.0136,
"step": 197
},
{
"epoch": 0.8590021691973969,
"grad_norm": 2.2392208576202393,
"learning_rate": 1.5195190384357404e-05,
"loss": 3.0873,
"step": 198
},
{
"epoch": 0.8633405639913232,
"grad_norm": 2.3587329387664795,
"learning_rate": 1.4283269929788779e-05,
"loss": 3.1336,
"step": 199
},
{
"epoch": 0.8676789587852495,
"grad_norm": 3.4689748287200928,
"learning_rate": 1.339745962155613e-05,
"loss": 3.2269,
"step": 200
},
{
"epoch": 0.8720173535791758,
"grad_norm": 0.4676075577735901,
"learning_rate": 1.2538029286060426e-05,
"loss": 3.2194,
"step": 201
},
{
"epoch": 0.8763557483731019,
"grad_norm": 0.5948041081428528,
"learning_rate": 1.1705240714107302e-05,
"loss": 3.2006,
"step": 202
},
{
"epoch": 0.8806941431670282,
"grad_norm": 0.6200747489929199,
"learning_rate": 1.0899347581163221e-05,
"loss": 3.1966,
"step": 203
},
{
"epoch": 0.8850325379609545,
"grad_norm": 0.6264815926551819,
"learning_rate": 1.0120595370083318e-05,
"loss": 3.1552,
"step": 204
},
{
"epoch": 0.8893709327548807,
"grad_norm": 0.6958035230636597,
"learning_rate": 9.369221296335006e-06,
"loss": 3.21,
"step": 205
},
{
"epoch": 0.8937093275488069,
"grad_norm": 0.7550477981567383,
"learning_rate": 8.645454235739903e-06,
"loss": 3.2491,
"step": 206
},
{
"epoch": 0.8980477223427332,
"grad_norm": 0.78013014793396,
"learning_rate": 7.949514654755962e-06,
"loss": 3.158,
"step": 207
},
{
"epoch": 0.9023861171366594,
"grad_norm": 0.786949098110199,
"learning_rate": 7.281614543321269e-06,
"loss": 3.2446,
"step": 208
},
{
"epoch": 0.9067245119305857,
"grad_norm": 0.8102577924728394,
"learning_rate": 6.6419573502798374e-06,
"loss": 3.2238,
"step": 209
},
{
"epoch": 0.911062906724512,
"grad_norm": 0.8839837908744812,
"learning_rate": 6.030737921409169e-06,
"loss": 3.1035,
"step": 210
},
{
"epoch": 0.9154013015184381,
"grad_norm": 0.9286414980888367,
"learning_rate": 5.448142440068316e-06,
"loss": 3.2198,
"step": 211
},
{
"epoch": 0.9197396963123644,
"grad_norm": 1.031367540359497,
"learning_rate": 4.8943483704846475e-06,
"loss": 3.207,
"step": 212
},
{
"epoch": 0.9240780911062907,
"grad_norm": 1.1086468696594238,
"learning_rate": 4.369524403696457e-06,
"loss": 3.2715,
"step": 213
},
{
"epoch": 0.928416485900217,
"grad_norm": 1.0586810111999512,
"learning_rate": 3.873830406168111e-06,
"loss": 3.2091,
"step": 214
},
{
"epoch": 0.9327548806941431,
"grad_norm": 1.0433012247085571,
"learning_rate": 3.40741737109318e-06,
"loss": 3.11,
"step": 215
},
{
"epoch": 0.9370932754880694,
"grad_norm": 1.214693546295166,
"learning_rate": 2.970427372400353e-06,
"loss": 3.1984,
"step": 216
},
{
"epoch": 0.9414316702819957,
"grad_norm": 1.3140201568603516,
"learning_rate": 2.5629935214764865e-06,
"loss": 3.1381,
"step": 217
},
{
"epoch": 0.9457700650759219,
"grad_norm": 1.439610242843628,
"learning_rate": 2.1852399266194314e-06,
"loss": 3.2828,
"step": 218
},
{
"epoch": 0.9501084598698482,
"grad_norm": 1.447763204574585,
"learning_rate": 1.8372816552336026e-06,
"loss": 3.1896,
"step": 219
},
{
"epoch": 0.9544468546637744,
"grad_norm": 1.5651224851608276,
"learning_rate": 1.5192246987791981e-06,
"loss": 3.0176,
"step": 220
},
{
"epoch": 0.9587852494577006,
"grad_norm": 1.7138340473175049,
"learning_rate": 1.231165940486234e-06,
"loss": 3.0649,
"step": 221
},
{
"epoch": 0.9631236442516269,
"grad_norm": 1.7278990745544434,
"learning_rate": 9.731931258429638e-07,
"loss": 2.901,
"step": 222
},
{
"epoch": 0.9674620390455532,
"grad_norm": 1.8585275411605835,
"learning_rate": 7.453848358678017e-07,
"loss": 2.9228,
"step": 223
},
{
"epoch": 0.9718004338394793,
"grad_norm": 2.438549757003784,
"learning_rate": 5.478104631726711e-07,
"loss": 2.9518,
"step": 224
},
{
"epoch": 0.9761388286334056,
"grad_norm": 3.6199636459350586,
"learning_rate": 3.805301908254455e-07,
"loss": 2.9323,
"step": 225
},
{
"epoch": 0.9804772234273319,
"grad_norm": 0.5968942046165466,
"learning_rate": 2.4359497401758024e-07,
"loss": 3.0911,
"step": 226
},
{
"epoch": 0.9848156182212582,
"grad_norm": 0.8684574365615845,
"learning_rate": 1.3704652454261668e-07,
"loss": 3.2219,
"step": 227
},
{
"epoch": 0.9891540130151844,
"grad_norm": 1.1482932567596436,
"learning_rate": 6.09172980904238e-08,
"loss": 3.0405,
"step": 228
},
{
"epoch": 0.9934924078091106,
"grad_norm": 1.4431898593902588,
"learning_rate": 1.5230484360873044e-08,
"loss": 3.1692,
"step": 229
},
{
"epoch": 0.9978308026030369,
"grad_norm": 1.78467857837677,
"learning_rate": 0.0,
"loss": 2.932,
"step": 230
}
],
"logging_steps": 1,
"max_steps": 230,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 58,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5.142510989790413e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}