{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9978308026030369, "eval_steps": 58, "global_step": 230, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004338394793926247, "grad_norm": 0.2975526452064514, "learning_rate": 4.000000000000001e-06, "loss": 3.4415, "step": 1 }, { "epoch": 0.004338394793926247, "eval_loss": 4.956099033355713, "eval_runtime": 43.9816, "eval_samples_per_second": 8.845, "eval_steps_per_second": 2.228, "step": 1 }, { "epoch": 0.008676789587852495, "grad_norm": 0.41739514470100403, "learning_rate": 8.000000000000001e-06, "loss": 3.5934, "step": 2 }, { "epoch": 0.013015184381778741, "grad_norm": 0.47740957140922546, "learning_rate": 1.2e-05, "loss": 3.7866, "step": 3 }, { "epoch": 0.01735357917570499, "grad_norm": 0.5908945798873901, "learning_rate": 1.6000000000000003e-05, "loss": 3.8979, "step": 4 }, { "epoch": 0.021691973969631236, "grad_norm": 0.602057933807373, "learning_rate": 2e-05, "loss": 3.8257, "step": 5 }, { "epoch": 0.026030368763557483, "grad_norm": 0.7276618480682373, "learning_rate": 2.4e-05, "loss": 4.0716, "step": 6 }, { "epoch": 0.03036876355748373, "grad_norm": 0.7895606160163879, "learning_rate": 2.8000000000000003e-05, "loss": 4.1982, "step": 7 }, { "epoch": 0.03470715835140998, "grad_norm": 0.9524717926979065, "learning_rate": 3.2000000000000005e-05, "loss": 4.1916, "step": 8 }, { "epoch": 0.039045553145336226, "grad_norm": 0.9786620736122131, "learning_rate": 3.6e-05, "loss": 4.0969, "step": 9 }, { "epoch": 0.04338394793926247, "grad_norm": 1.0913058519363403, "learning_rate": 4e-05, "loss": 4.3758, "step": 10 }, { "epoch": 0.04772234273318872, "grad_norm": 1.2519152164459229, "learning_rate": 4.4000000000000006e-05, "loss": 4.3174, "step": 11 }, { "epoch": 0.052060737527114966, "grad_norm": 1.4428540468215942, "learning_rate": 4.8e-05, "loss": 4.2927, "step": 12 }, { "epoch": 0.05639913232104121, "grad_norm": 1.557953953742981, "learning_rate": 5.2000000000000004e-05, "loss": 4.3237, "step": 13 }, { "epoch": 0.06073752711496746, "grad_norm": 1.791407585144043, "learning_rate": 5.6000000000000006e-05, "loss": 4.3922, "step": 14 }, { "epoch": 0.0650759219088937, "grad_norm": 1.829128623008728, "learning_rate": 6e-05, "loss": 4.2218, "step": 15 }, { "epoch": 0.06941431670281996, "grad_norm": 1.8731590509414673, "learning_rate": 6.400000000000001e-05, "loss": 4.294, "step": 16 }, { "epoch": 0.0737527114967462, "grad_norm": 2.140212297439575, "learning_rate": 6.800000000000001e-05, "loss": 4.3197, "step": 17 }, { "epoch": 0.07809110629067245, "grad_norm": 2.5610997676849365, "learning_rate": 7.2e-05, "loss": 4.3203, "step": 18 }, { "epoch": 0.0824295010845987, "grad_norm": 2.5937764644622803, "learning_rate": 7.6e-05, "loss": 4.2128, "step": 19 }, { "epoch": 0.08676789587852494, "grad_norm": 1.9964145421981812, "learning_rate": 8e-05, "loss": 4.1141, "step": 20 }, { "epoch": 0.0911062906724512, "grad_norm": 1.9274357557296753, "learning_rate": 8.4e-05, "loss": 3.9559, "step": 21 }, { "epoch": 0.09544468546637744, "grad_norm": 2.1689515113830566, "learning_rate": 8.800000000000001e-05, "loss": 4.0459, "step": 22 }, { "epoch": 0.09978308026030369, "grad_norm": 2.417027235031128, "learning_rate": 9.200000000000001e-05, "loss": 3.8996, "step": 23 }, { "epoch": 0.10412147505422993, "grad_norm": 2.925503969192505, "learning_rate": 9.6e-05, "loss": 4.0918, "step": 24 }, { "epoch": 0.10845986984815618, "grad_norm": 4.8928961753845215, "learning_rate": 0.0001, "loss": 4.332, "step": 25 }, { "epoch": 0.11279826464208242, "grad_norm": 3.4765207767486572, "learning_rate": 0.00010400000000000001, "loss": 3.7618, "step": 26 }, { "epoch": 0.11713665943600868, "grad_norm": 3.5958409309387207, "learning_rate": 0.00010800000000000001, "loss": 3.7645, "step": 27 }, { "epoch": 0.12147505422993492, "grad_norm": 3.053165912628174, "learning_rate": 0.00011200000000000001, "loss": 3.6536, "step": 28 }, { "epoch": 0.12581344902386118, "grad_norm": 2.3347203731536865, "learning_rate": 0.000116, "loss": 3.8871, "step": 29 }, { "epoch": 0.1301518438177874, "grad_norm": 1.4141613245010376, "learning_rate": 0.00012, "loss": 3.6367, "step": 30 }, { "epoch": 0.13449023861171366, "grad_norm": 1.1042953729629517, "learning_rate": 0.000124, "loss": 3.4273, "step": 31 }, { "epoch": 0.13882863340563992, "grad_norm": 0.9391370415687561, "learning_rate": 0.00012800000000000002, "loss": 3.635, "step": 32 }, { "epoch": 0.14316702819956617, "grad_norm": 1.028341293334961, "learning_rate": 0.000132, "loss": 3.845, "step": 33 }, { "epoch": 0.1475054229934924, "grad_norm": 1.0668063163757324, "learning_rate": 0.00013600000000000003, "loss": 3.732, "step": 34 }, { "epoch": 0.15184381778741865, "grad_norm": 1.0369871854782104, "learning_rate": 0.00014, "loss": 3.6734, "step": 35 }, { "epoch": 0.1561822125813449, "grad_norm": 1.0699695348739624, "learning_rate": 0.000144, "loss": 3.5469, "step": 36 }, { "epoch": 0.16052060737527116, "grad_norm": 1.1715625524520874, "learning_rate": 0.000148, "loss": 3.5759, "step": 37 }, { "epoch": 0.1648590021691974, "grad_norm": 1.2680530548095703, "learning_rate": 0.000152, "loss": 3.7013, "step": 38 }, { "epoch": 0.16919739696312364, "grad_norm": 1.2043352127075195, "learning_rate": 0.00015600000000000002, "loss": 3.74, "step": 39 }, { "epoch": 0.1735357917570499, "grad_norm": 1.342244029045105, "learning_rate": 0.00016, "loss": 3.7761, "step": 40 }, { "epoch": 0.17787418655097614, "grad_norm": 1.4112831354141235, "learning_rate": 0.000164, "loss": 3.6449, "step": 41 }, { "epoch": 0.1822125813449024, "grad_norm": 1.3947268724441528, "learning_rate": 0.000168, "loss": 3.6043, "step": 42 }, { "epoch": 0.18655097613882862, "grad_norm": 1.5763946771621704, "learning_rate": 0.000172, "loss": 3.4768, "step": 43 }, { "epoch": 0.19088937093275488, "grad_norm": 1.9006760120391846, "learning_rate": 0.00017600000000000002, "loss": 3.6424, "step": 44 }, { "epoch": 0.19522776572668113, "grad_norm": 2.0071113109588623, "learning_rate": 0.00018, "loss": 3.7499, "step": 45 }, { "epoch": 0.19956616052060738, "grad_norm": 2.002067804336548, "learning_rate": 0.00018400000000000003, "loss": 3.6082, "step": 46 }, { "epoch": 0.2039045553145336, "grad_norm": 2.4698357582092285, "learning_rate": 0.000188, "loss": 3.7604, "step": 47 }, { "epoch": 0.20824295010845986, "grad_norm": 3.051906108856201, "learning_rate": 0.000192, "loss": 3.7546, "step": 48 }, { "epoch": 0.21258134490238612, "grad_norm": 3.100890636444092, "learning_rate": 0.000196, "loss": 3.6134, "step": 49 }, { "epoch": 0.21691973969631237, "grad_norm": 4.4481425285339355, "learning_rate": 0.0002, "loss": 3.355, "step": 50 }, { "epoch": 0.22125813449023862, "grad_norm": 4.157866954803467, "learning_rate": 0.00019998476951563915, "loss": 3.5229, "step": 51 }, { "epoch": 0.22559652928416485, "grad_norm": 5.159533500671387, "learning_rate": 0.0001999390827019096, "loss": 3.9098, "step": 52 }, { "epoch": 0.2299349240780911, "grad_norm": 4.372255325317383, "learning_rate": 0.0001998629534754574, "loss": 3.8285, "step": 53 }, { "epoch": 0.23427331887201736, "grad_norm": 2.7389180660247803, "learning_rate": 0.00019975640502598244, "loss": 3.5892, "step": 54 }, { "epoch": 0.2386117136659436, "grad_norm": 1.623792290687561, "learning_rate": 0.00019961946980917456, "loss": 3.6253, "step": 55 }, { "epoch": 0.24295010845986983, "grad_norm": 1.0698862075805664, "learning_rate": 0.00019945218953682734, "loss": 3.4747, "step": 56 }, { "epoch": 0.2472885032537961, "grad_norm": 1.0480446815490723, "learning_rate": 0.00019925461516413223, "loss": 3.5076, "step": 57 }, { "epoch": 0.25162689804772237, "grad_norm": 1.1356984376907349, "learning_rate": 0.00019902680687415705, "loss": 3.4875, "step": 58 }, { "epoch": 0.25162689804772237, "eval_loss": 3.5454585552215576, "eval_runtime": 43.9485, "eval_samples_per_second": 8.851, "eval_steps_per_second": 2.23, "step": 58 }, { "epoch": 0.2559652928416486, "grad_norm": 1.147985816001892, "learning_rate": 0.00019876883405951377, "loss": 3.5368, "step": 59 }, { "epoch": 0.2603036876355748, "grad_norm": 1.167962670326233, "learning_rate": 0.00019848077530122083, "loss": 3.4885, "step": 60 }, { "epoch": 0.2646420824295011, "grad_norm": 1.1241693496704102, "learning_rate": 0.00019816271834476642, "loss": 3.5335, "step": 61 }, { "epoch": 0.26898047722342733, "grad_norm": 1.0841178894042969, "learning_rate": 0.00019781476007338058, "loss": 3.5822, "step": 62 }, { "epoch": 0.27331887201735355, "grad_norm": 1.1276164054870605, "learning_rate": 0.00019743700647852354, "loss": 3.4757, "step": 63 }, { "epoch": 0.27765726681127983, "grad_norm": 1.192659854888916, "learning_rate": 0.00019702957262759965, "loss": 3.4212, "step": 64 }, { "epoch": 0.28199566160520606, "grad_norm": 1.2061688899993896, "learning_rate": 0.00019659258262890683, "loss": 3.4564, "step": 65 }, { "epoch": 0.28633405639913234, "grad_norm": 1.4012079238891602, "learning_rate": 0.0001961261695938319, "loss": 3.423, "step": 66 }, { "epoch": 0.29067245119305857, "grad_norm": 1.3591368198394775, "learning_rate": 0.00019563047559630357, "loss": 3.5284, "step": 67 }, { "epoch": 0.2950108459869848, "grad_norm": 1.3555010557174683, "learning_rate": 0.00019510565162951537, "loss": 3.4406, "step": 68 }, { "epoch": 0.2993492407809111, "grad_norm": 1.4745391607284546, "learning_rate": 0.0001945518575599317, "loss": 3.3899, "step": 69 }, { "epoch": 0.3036876355748373, "grad_norm": 1.6432572603225708, "learning_rate": 0.00019396926207859084, "loss": 3.4343, "step": 70 }, { "epoch": 0.3080260303687636, "grad_norm": 1.9187488555908203, "learning_rate": 0.00019335804264972018, "loss": 3.5881, "step": 71 }, { "epoch": 0.3123644251626898, "grad_norm": 2.1937949657440186, "learning_rate": 0.00019271838545667876, "loss": 3.3765, "step": 72 }, { "epoch": 0.31670281995661603, "grad_norm": 2.376640558242798, "learning_rate": 0.00019205048534524406, "loss": 3.2758, "step": 73 }, { "epoch": 0.3210412147505423, "grad_norm": 3.0442004203796387, "learning_rate": 0.0001913545457642601, "loss": 3.5366, "step": 74 }, { "epoch": 0.32537960954446854, "grad_norm": 3.6359612941741943, "learning_rate": 0.000190630778703665, "loss": 3.0313, "step": 75 }, { "epoch": 0.3297180043383948, "grad_norm": 4.367193698883057, "learning_rate": 0.0001898794046299167, "loss": 3.3864, "step": 76 }, { "epoch": 0.33405639913232105, "grad_norm": 5.261653900146484, "learning_rate": 0.0001891006524188368, "loss": 3.5411, "step": 77 }, { "epoch": 0.3383947939262473, "grad_norm": 5.341316223144531, "learning_rate": 0.00018829475928589271, "loss": 3.843, "step": 78 }, { "epoch": 0.34273318872017355, "grad_norm": 2.9030559062957764, "learning_rate": 0.00018746197071393958, "loss": 3.4254, "step": 79 }, { "epoch": 0.3470715835140998, "grad_norm": 1.4780312776565552, "learning_rate": 0.00018660254037844388, "loss": 3.4877, "step": 80 }, { "epoch": 0.351409978308026, "grad_norm": 1.0593628883361816, "learning_rate": 0.00018571673007021123, "loss": 3.3987, "step": 81 }, { "epoch": 0.3557483731019523, "grad_norm": 0.9910492897033691, "learning_rate": 0.0001848048096156426, "loss": 3.4944, "step": 82 }, { "epoch": 0.3600867678958785, "grad_norm": 1.004767656326294, "learning_rate": 0.00018386705679454242, "loss": 3.4143, "step": 83 }, { "epoch": 0.3644251626898048, "grad_norm": 1.012804627418518, "learning_rate": 0.00018290375725550417, "loss": 3.4504, "step": 84 }, { "epoch": 0.368763557483731, "grad_norm": 1.0758857727050781, "learning_rate": 0.0001819152044288992, "loss": 3.5181, "step": 85 }, { "epoch": 0.37310195227765725, "grad_norm": 1.0776313543319702, "learning_rate": 0.00018090169943749476, "loss": 3.4293, "step": 86 }, { "epoch": 0.3774403470715835, "grad_norm": 1.0856565237045288, "learning_rate": 0.00017986355100472928, "loss": 3.3012, "step": 87 }, { "epoch": 0.38177874186550975, "grad_norm": 1.146246075630188, "learning_rate": 0.00017880107536067218, "loss": 3.5778, "step": 88 }, { "epoch": 0.38611713665943603, "grad_norm": 1.1812922954559326, "learning_rate": 0.0001777145961456971, "loss": 3.2835, "step": 89 }, { "epoch": 0.39045553145336226, "grad_norm": 1.3535960912704468, "learning_rate": 0.0001766044443118978, "loss": 3.1863, "step": 90 }, { "epoch": 0.3947939262472885, "grad_norm": 1.312524437904358, "learning_rate": 0.00017547095802227723, "loss": 3.3794, "step": 91 }, { "epoch": 0.39913232104121477, "grad_norm": 1.2628040313720703, "learning_rate": 0.00017431448254773944, "loss": 3.2127, "step": 92 }, { "epoch": 0.403470715835141, "grad_norm": 1.3810231685638428, "learning_rate": 0.00017313537016191706, "loss": 3.3664, "step": 93 }, { "epoch": 0.4078091106290672, "grad_norm": 1.5726513862609863, "learning_rate": 0.0001719339800338651, "loss": 3.4052, "step": 94 }, { "epoch": 0.4121475054229935, "grad_norm": 1.5839647054672241, "learning_rate": 0.00017071067811865476, "loss": 3.2746, "step": 95 }, { "epoch": 0.4164859002169197, "grad_norm": 1.7605924606323242, "learning_rate": 0.00016946583704589973, "loss": 3.48, "step": 96 }, { "epoch": 0.420824295010846, "grad_norm": 2.3345723152160645, "learning_rate": 0.00016819983600624986, "loss": 3.2033, "step": 97 }, { "epoch": 0.42516268980477223, "grad_norm": 1.9480637311935425, "learning_rate": 0.00016691306063588583, "loss": 3.3796, "step": 98 }, { "epoch": 0.42950108459869846, "grad_norm": 2.3618791103363037, "learning_rate": 0.00016560590289905073, "loss": 3.1398, "step": 99 }, { "epoch": 0.43383947939262474, "grad_norm": 3.546729326248169, "learning_rate": 0.00016427876096865394, "loss": 3.0558, "step": 100 }, { "epoch": 0.43817787418655096, "grad_norm": 1.5932743549346924, "learning_rate": 0.00016293203910498376, "loss": 3.3932, "step": 101 }, { "epoch": 0.44251626898047725, "grad_norm": 2.039661407470703, "learning_rate": 0.0001615661475325658, "loss": 3.4066, "step": 102 }, { "epoch": 0.44685466377440347, "grad_norm": 1.742119312286377, "learning_rate": 0.00016018150231520486, "loss": 3.2823, "step": 103 }, { "epoch": 0.4511930585683297, "grad_norm": 1.5700186491012573, "learning_rate": 0.00015877852522924732, "loss": 3.3591, "step": 104 }, { "epoch": 0.455531453362256, "grad_norm": 1.136389970779419, "learning_rate": 0.0001573576436351046, "loss": 3.4663, "step": 105 }, { "epoch": 0.4598698481561822, "grad_norm": 0.8537334203720093, "learning_rate": 0.0001559192903470747, "loss": 3.4367, "step": 106 }, { "epoch": 0.4642082429501085, "grad_norm": 0.8642299175262451, "learning_rate": 0.00015446390350150273, "loss": 3.287, "step": 107 }, { "epoch": 0.4685466377440347, "grad_norm": 0.9279235601425171, "learning_rate": 0.0001529919264233205, "loss": 3.2911, "step": 108 }, { "epoch": 0.47288503253796094, "grad_norm": 0.9121331572532654, "learning_rate": 0.00015150380749100545, "loss": 3.3101, "step": 109 }, { "epoch": 0.4772234273318872, "grad_norm": 0.9868795275688171, "learning_rate": 0.00015000000000000001, "loss": 3.3431, "step": 110 }, { "epoch": 0.48156182212581344, "grad_norm": 1.0646886825561523, "learning_rate": 0.00014848096202463372, "loss": 3.3876, "step": 111 }, { "epoch": 0.48590021691973967, "grad_norm": 1.0819416046142578, "learning_rate": 0.00014694715627858908, "loss": 3.2128, "step": 112 }, { "epoch": 0.49023861171366595, "grad_norm": 1.0728636980056763, "learning_rate": 0.00014539904997395468, "loss": 3.2076, "step": 113 }, { "epoch": 0.4945770065075922, "grad_norm": 1.1562669277191162, "learning_rate": 0.00014383711467890774, "loss": 3.2825, "step": 114 }, { "epoch": 0.49891540130151846, "grad_norm": 1.1967557668685913, "learning_rate": 0.00014226182617406996, "loss": 3.2185, "step": 115 }, { "epoch": 0.5032537960954447, "grad_norm": 1.3139584064483643, "learning_rate": 0.00014067366430758004, "loss": 3.1373, "step": 116 }, { "epoch": 0.5032537960954447, "eval_loss": 3.2728052139282227, "eval_runtime": 43.9403, "eval_samples_per_second": 8.853, "eval_steps_per_second": 2.23, "step": 116 }, { "epoch": 0.5075921908893709, "grad_norm": 1.3170753717422485, "learning_rate": 0.00013907311284892736, "loss": 2.9572, "step": 117 }, { "epoch": 0.5119305856832972, "grad_norm": 1.5243107080459595, "learning_rate": 0.00013746065934159123, "loss": 3.3082, "step": 118 }, { "epoch": 0.5162689804772235, "grad_norm": 1.5845880508422852, "learning_rate": 0.00013583679495453, "loss": 3.4819, "step": 119 }, { "epoch": 0.5206073752711496, "grad_norm": 1.66307532787323, "learning_rate": 0.00013420201433256689, "loss": 3.1131, "step": 120 }, { "epoch": 0.5249457700650759, "grad_norm": 1.6470588445663452, "learning_rate": 0.00013255681544571568, "loss": 3.2847, "step": 121 }, { "epoch": 0.5292841648590022, "grad_norm": 2.1118075847625732, "learning_rate": 0.00013090169943749476, "loss": 3.4669, "step": 122 }, { "epoch": 0.5336225596529284, "grad_norm": 2.056396722793579, "learning_rate": 0.00012923717047227368, "loss": 3.1136, "step": 123 }, { "epoch": 0.5379609544468547, "grad_norm": 2.2389657497406006, "learning_rate": 0.0001275637355816999, "loss": 2.9323, "step": 124 }, { "epoch": 0.5422993492407809, "grad_norm": 2.863621711730957, "learning_rate": 0.00012588190451025207, "loss": 2.9585, "step": 125 }, { "epoch": 0.5466377440347071, "grad_norm": 0.8712321519851685, "learning_rate": 0.00012419218955996676, "loss": 3.1439, "step": 126 }, { "epoch": 0.5509761388286334, "grad_norm": 1.0713740587234497, "learning_rate": 0.0001224951054343865, "loss": 3.2213, "step": 127 }, { "epoch": 0.5553145336225597, "grad_norm": 1.104315996170044, "learning_rate": 0.00012079116908177593, "loss": 3.4522, "step": 128 }, { "epoch": 0.559652928416486, "grad_norm": 1.0883917808532715, "learning_rate": 0.00011908089953765449, "loss": 3.3503, "step": 129 }, { "epoch": 0.5639913232104121, "grad_norm": 1.0000834465026855, "learning_rate": 0.00011736481776669306, "loss": 3.4036, "step": 130 }, { "epoch": 0.5683297180043384, "grad_norm": 0.8869354128837585, "learning_rate": 0.0001156434465040231, "loss": 3.2749, "step": 131 }, { "epoch": 0.5726681127982647, "grad_norm": 0.8651937246322632, "learning_rate": 0.00011391731009600654, "loss": 3.3679, "step": 132 }, { "epoch": 0.5770065075921909, "grad_norm": 0.9174556136131287, "learning_rate": 0.00011218693434051475, "loss": 3.311, "step": 133 }, { "epoch": 0.5813449023861171, "grad_norm": 0.930533230304718, "learning_rate": 0.00011045284632676536, "loss": 3.3761, "step": 134 }, { "epoch": 0.5856832971800434, "grad_norm": 0.9851680994033813, "learning_rate": 0.00010871557427476583, "loss": 3.2752, "step": 135 }, { "epoch": 0.5900216919739696, "grad_norm": 0.9633740782737732, "learning_rate": 0.00010697564737441252, "loss": 3.2373, "step": 136 }, { "epoch": 0.5943600867678959, "grad_norm": 1.132585048675537, "learning_rate": 0.0001052335956242944, "loss": 3.2323, "step": 137 }, { "epoch": 0.5986984815618221, "grad_norm": 1.1232091188430786, "learning_rate": 0.00010348994967025012, "loss": 3.2874, "step": 138 }, { "epoch": 0.6030368763557483, "grad_norm": 1.2559125423431396, "learning_rate": 0.00010174524064372837, "loss": 3.2367, "step": 139 }, { "epoch": 0.6073752711496746, "grad_norm": 1.2623041868209839, "learning_rate": 0.0001, "loss": 3.2243, "step": 140 }, { "epoch": 0.6117136659436009, "grad_norm": 1.3554457426071167, "learning_rate": 9.825475935627165e-05, "loss": 3.4802, "step": 141 }, { "epoch": 0.6160520607375272, "grad_norm": 1.4170132875442505, "learning_rate": 9.651005032974994e-05, "loss": 3.354, "step": 142 }, { "epoch": 0.6203904555314533, "grad_norm": 1.4309097528457642, "learning_rate": 9.476640437570562e-05, "loss": 3.1352, "step": 143 }, { "epoch": 0.6247288503253796, "grad_norm": 1.5829153060913086, "learning_rate": 9.302435262558747e-05, "loss": 3.2455, "step": 144 }, { "epoch": 0.6290672451193059, "grad_norm": 1.8210502862930298, "learning_rate": 9.128442572523417e-05, "loss": 3.2991, "step": 145 }, { "epoch": 0.6334056399132321, "grad_norm": 1.842761516571045, "learning_rate": 8.954715367323468e-05, "loss": 3.2255, "step": 146 }, { "epoch": 0.6377440347071583, "grad_norm": 1.9258646965026855, "learning_rate": 8.781306565948528e-05, "loss": 3.1397, "step": 147 }, { "epoch": 0.6420824295010846, "grad_norm": 2.1189215183258057, "learning_rate": 8.608268990399349e-05, "loss": 3.0414, "step": 148 }, { "epoch": 0.6464208242950108, "grad_norm": 2.4063761234283447, "learning_rate": 8.435655349597689e-05, "loss": 2.8524, "step": 149 }, { "epoch": 0.6507592190889371, "grad_norm": 3.6420836448669434, "learning_rate": 8.263518223330697e-05, "loss": 3.0156, "step": 150 }, { "epoch": 0.6550976138828634, "grad_norm": 0.7080674171447754, "learning_rate": 8.091910046234552e-05, "loss": 3.1636, "step": 151 }, { "epoch": 0.6594360086767896, "grad_norm": 0.798520565032959, "learning_rate": 7.920883091822408e-05, "loss": 3.212, "step": 152 }, { "epoch": 0.6637744034707158, "grad_norm": 0.8640486001968384, "learning_rate": 7.750489456561352e-05, "loss": 3.1644, "step": 153 }, { "epoch": 0.6681127982646421, "grad_norm": 0.870906412601471, "learning_rate": 7.580781044003324e-05, "loss": 3.1876, "step": 154 }, { "epoch": 0.6724511930585684, "grad_norm": 0.8581348061561584, "learning_rate": 7.411809548974792e-05, "loss": 3.2739, "step": 155 }, { "epoch": 0.6767895878524945, "grad_norm": 0.8691614270210266, "learning_rate": 7.243626441830009e-05, "loss": 3.2444, "step": 156 }, { "epoch": 0.6811279826464208, "grad_norm": 0.9455673098564148, "learning_rate": 7.076282952772633e-05, "loss": 3.3004, "step": 157 }, { "epoch": 0.6854663774403471, "grad_norm": 0.8873337507247925, "learning_rate": 6.909830056250527e-05, "loss": 3.1778, "step": 158 }, { "epoch": 0.6898047722342733, "grad_norm": 0.910775363445282, "learning_rate": 6.744318455428436e-05, "loss": 3.1346, "step": 159 }, { "epoch": 0.6941431670281996, "grad_norm": 0.9872409105300903, "learning_rate": 6.579798566743314e-05, "loss": 3.1665, "step": 160 }, { "epoch": 0.6984815618221258, "grad_norm": 1.0516481399536133, "learning_rate": 6.416320504546997e-05, "loss": 3.3064, "step": 161 }, { "epoch": 0.702819956616052, "grad_norm": 1.0263571739196777, "learning_rate": 6.25393406584088e-05, "loss": 3.3698, "step": 162 }, { "epoch": 0.7071583514099783, "grad_norm": 1.1050878763198853, "learning_rate": 6.092688715107264e-05, "loss": 3.2436, "step": 163 }, { "epoch": 0.7114967462039046, "grad_norm": 1.1121841669082642, "learning_rate": 5.9326335692419995e-05, "loss": 2.9433, "step": 164 }, { "epoch": 0.7158351409978309, "grad_norm": 1.2424358129501343, "learning_rate": 5.773817382593008e-05, "loss": 3.3616, "step": 165 }, { "epoch": 0.720173535791757, "grad_norm": 1.1899327039718628, "learning_rate": 5.616288532109225e-05, "loss": 3.0392, "step": 166 }, { "epoch": 0.7245119305856833, "grad_norm": 1.3395730257034302, "learning_rate": 5.4600950026045326e-05, "loss": 3.0905, "step": 167 }, { "epoch": 0.7288503253796096, "grad_norm": 1.4268842935562134, "learning_rate": 5.305284372141095e-05, "loss": 3.1247, "step": 168 }, { "epoch": 0.7331887201735358, "grad_norm": 1.5514875650405884, "learning_rate": 5.15190379753663e-05, "loss": 3.3772, "step": 169 }, { "epoch": 0.737527114967462, "grad_norm": 1.8371058702468872, "learning_rate": 5.000000000000002e-05, "loss": 2.9262, "step": 170 }, { "epoch": 0.7418655097613883, "grad_norm": 1.7641676664352417, "learning_rate": 4.8496192508994576e-05, "loss": 3.0113, "step": 171 }, { "epoch": 0.7462039045553145, "grad_norm": 1.8325039148330688, "learning_rate": 4.700807357667952e-05, "loss": 3.0551, "step": 172 }, { "epoch": 0.7505422993492408, "grad_norm": 1.9740185737609863, "learning_rate": 4.5536096498497295e-05, "loss": 3.1479, "step": 173 }, { "epoch": 0.754880694143167, "grad_norm": 2.327420234680176, "learning_rate": 4.4080709652925336e-05, "loss": 3.2149, "step": 174 }, { "epoch": 0.754880694143167, "eval_loss": 3.163884162902832, "eval_runtime": 43.9914, "eval_samples_per_second": 8.843, "eval_steps_per_second": 2.228, "step": 174 }, { "epoch": 0.7592190889370932, "grad_norm": 3.558817148208618, "learning_rate": 4.264235636489542e-05, "loss": 3.0184, "step": 175 }, { "epoch": 0.7635574837310195, "grad_norm": 0.48007091879844666, "learning_rate": 4.12214747707527e-05, "loss": 3.0708, "step": 176 }, { "epoch": 0.7678958785249458, "grad_norm": 0.606035590171814, "learning_rate": 3.981849768479517e-05, "loss": 3.1584, "step": 177 }, { "epoch": 0.7722342733188721, "grad_norm": 0.6647348999977112, "learning_rate": 3.843385246743417e-05, "loss": 3.3003, "step": 178 }, { "epoch": 0.7765726681127982, "grad_norm": 0.6956514120101929, "learning_rate": 3.7067960895016275e-05, "loss": 3.2168, "step": 179 }, { "epoch": 0.7809110629067245, "grad_norm": 0.7575390338897705, "learning_rate": 3.5721239031346066e-05, "loss": 3.2576, "step": 180 }, { "epoch": 0.7852494577006508, "grad_norm": 0.8106915950775146, "learning_rate": 3.439409710094929e-05, "loss": 3.124, "step": 181 }, { "epoch": 0.789587852494577, "grad_norm": 0.873997688293457, "learning_rate": 3.308693936411421e-05, "loss": 3.0977, "step": 182 }, { "epoch": 0.7939262472885033, "grad_norm": 0.9612168073654175, "learning_rate": 3.1800163993750166e-05, "loss": 3.435, "step": 183 }, { "epoch": 0.7982646420824295, "grad_norm": 0.9549990892410278, "learning_rate": 3.053416295410026e-05, "loss": 3.2216, "step": 184 }, { "epoch": 0.8026030368763557, "grad_norm": 0.9309582710266113, "learning_rate": 2.9289321881345254e-05, "loss": 3.0546, "step": 185 }, { "epoch": 0.806941431670282, "grad_norm": 1.0800687074661255, "learning_rate": 2.8066019966134904e-05, "loss": 3.2283, "step": 186 }, { "epoch": 0.8112798264642083, "grad_norm": 1.0009733438491821, "learning_rate": 2.6864629838082956e-05, "loss": 3.1638, "step": 187 }, { "epoch": 0.8156182212581344, "grad_norm": 1.1134998798370361, "learning_rate": 2.5685517452260567e-05, "loss": 3.2642, "step": 188 }, { "epoch": 0.8199566160520607, "grad_norm": 1.1395593881607056, "learning_rate": 2.45290419777228e-05, "loss": 3.0712, "step": 189 }, { "epoch": 0.824295010845987, "grad_norm": 1.1547160148620605, "learning_rate": 2.339555568810221e-05, "loss": 3.2101, "step": 190 }, { "epoch": 0.8286334056399133, "grad_norm": 1.2223323583602905, "learning_rate": 2.2285403854302912e-05, "loss": 3.1109, "step": 191 }, { "epoch": 0.8329718004338394, "grad_norm": 1.4417051076889038, "learning_rate": 2.119892463932781e-05, "loss": 3.2497, "step": 192 }, { "epoch": 0.8373101952277657, "grad_norm": 1.3542780876159668, "learning_rate": 2.013644899527074e-05, "loss": 3.23, "step": 193 }, { "epoch": 0.841648590021692, "grad_norm": 1.5529882907867432, "learning_rate": 1.9098300562505266e-05, "loss": 3.2404, "step": 194 }, { "epoch": 0.8459869848156182, "grad_norm": 1.63187575340271, "learning_rate": 1.808479557110081e-05, "loss": 3.0776, "step": 195 }, { "epoch": 0.8503253796095445, "grad_norm": 1.6470518112182617, "learning_rate": 1.7096242744495837e-05, "loss": 3.1312, "step": 196 }, { "epoch": 0.8546637744034707, "grad_norm": 1.8358676433563232, "learning_rate": 1.6132943205457606e-05, "loss": 3.0136, "step": 197 }, { "epoch": 0.8590021691973969, "grad_norm": 2.2392208576202393, "learning_rate": 1.5195190384357404e-05, "loss": 3.0873, "step": 198 }, { "epoch": 0.8633405639913232, "grad_norm": 2.3587329387664795, "learning_rate": 1.4283269929788779e-05, "loss": 3.1336, "step": 199 }, { "epoch": 0.8676789587852495, "grad_norm": 3.4689748287200928, "learning_rate": 1.339745962155613e-05, "loss": 3.2269, "step": 200 }, { "epoch": 0.8720173535791758, "grad_norm": 0.4676075577735901, "learning_rate": 1.2538029286060426e-05, "loss": 3.2194, "step": 201 }, { "epoch": 0.8763557483731019, "grad_norm": 0.5948041081428528, "learning_rate": 1.1705240714107302e-05, "loss": 3.2006, "step": 202 }, { "epoch": 0.8806941431670282, "grad_norm": 0.6200747489929199, "learning_rate": 1.0899347581163221e-05, "loss": 3.1966, "step": 203 }, { "epoch": 0.8850325379609545, "grad_norm": 0.6264815926551819, "learning_rate": 1.0120595370083318e-05, "loss": 3.1552, "step": 204 }, { "epoch": 0.8893709327548807, "grad_norm": 0.6958035230636597, "learning_rate": 9.369221296335006e-06, "loss": 3.21, "step": 205 }, { "epoch": 0.8937093275488069, "grad_norm": 0.7550477981567383, "learning_rate": 8.645454235739903e-06, "loss": 3.2491, "step": 206 }, { "epoch": 0.8980477223427332, "grad_norm": 0.78013014793396, "learning_rate": 7.949514654755962e-06, "loss": 3.158, "step": 207 }, { "epoch": 0.9023861171366594, "grad_norm": 0.786949098110199, "learning_rate": 7.281614543321269e-06, "loss": 3.2446, "step": 208 }, { "epoch": 0.9067245119305857, "grad_norm": 0.8102577924728394, "learning_rate": 6.6419573502798374e-06, "loss": 3.2238, "step": 209 }, { "epoch": 0.911062906724512, "grad_norm": 0.8839837908744812, "learning_rate": 6.030737921409169e-06, "loss": 3.1035, "step": 210 }, { "epoch": 0.9154013015184381, "grad_norm": 0.9286414980888367, "learning_rate": 5.448142440068316e-06, "loss": 3.2198, "step": 211 }, { "epoch": 0.9197396963123644, "grad_norm": 1.031367540359497, "learning_rate": 4.8943483704846475e-06, "loss": 3.207, "step": 212 }, { "epoch": 0.9240780911062907, "grad_norm": 1.1086468696594238, "learning_rate": 4.369524403696457e-06, "loss": 3.2715, "step": 213 }, { "epoch": 0.928416485900217, "grad_norm": 1.0586810111999512, "learning_rate": 3.873830406168111e-06, "loss": 3.2091, "step": 214 }, { "epoch": 0.9327548806941431, "grad_norm": 1.0433012247085571, "learning_rate": 3.40741737109318e-06, "loss": 3.11, "step": 215 }, { "epoch": 0.9370932754880694, "grad_norm": 1.214693546295166, "learning_rate": 2.970427372400353e-06, "loss": 3.1984, "step": 216 }, { "epoch": 0.9414316702819957, "grad_norm": 1.3140201568603516, "learning_rate": 2.5629935214764865e-06, "loss": 3.1381, "step": 217 }, { "epoch": 0.9457700650759219, "grad_norm": 1.439610242843628, "learning_rate": 2.1852399266194314e-06, "loss": 3.2828, "step": 218 }, { "epoch": 0.9501084598698482, "grad_norm": 1.447763204574585, "learning_rate": 1.8372816552336026e-06, "loss": 3.1896, "step": 219 }, { "epoch": 0.9544468546637744, "grad_norm": 1.5651224851608276, "learning_rate": 1.5192246987791981e-06, "loss": 3.0176, "step": 220 }, { "epoch": 0.9587852494577006, "grad_norm": 1.7138340473175049, "learning_rate": 1.231165940486234e-06, "loss": 3.0649, "step": 221 }, { "epoch": 0.9631236442516269, "grad_norm": 1.7278990745544434, "learning_rate": 9.731931258429638e-07, "loss": 2.901, "step": 222 }, { "epoch": 0.9674620390455532, "grad_norm": 1.8585275411605835, "learning_rate": 7.453848358678017e-07, "loss": 2.9228, "step": 223 }, { "epoch": 0.9718004338394793, "grad_norm": 2.438549757003784, "learning_rate": 5.478104631726711e-07, "loss": 2.9518, "step": 224 }, { "epoch": 0.9761388286334056, "grad_norm": 3.6199636459350586, "learning_rate": 3.805301908254455e-07, "loss": 2.9323, "step": 225 }, { "epoch": 0.9804772234273319, "grad_norm": 0.5968942046165466, "learning_rate": 2.4359497401758024e-07, "loss": 3.0911, "step": 226 }, { "epoch": 0.9848156182212582, "grad_norm": 0.8684574365615845, "learning_rate": 1.3704652454261668e-07, "loss": 3.2219, "step": 227 }, { "epoch": 0.9891540130151844, "grad_norm": 1.1482932567596436, "learning_rate": 6.09172980904238e-08, "loss": 3.0405, "step": 228 }, { "epoch": 0.9934924078091106, "grad_norm": 1.4431898593902588, "learning_rate": 1.5230484360873044e-08, "loss": 3.1692, "step": 229 }, { "epoch": 0.9978308026030369, "grad_norm": 1.78467857837677, "learning_rate": 0.0, "loss": 2.932, "step": 230 } ], "logging_steps": 1, "max_steps": 230, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 58, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.142510989790413e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }