{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9978308026030369,
  "eval_steps": 58,
  "global_step": 230,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.004338394793926247,
      "grad_norm": 0.2975526452064514,
      "learning_rate": 4.000000000000001e-06,
      "loss": 3.4415,
      "step": 1
    },
    {
      "epoch": 0.004338394793926247,
      "eval_loss": 4.956099033355713,
      "eval_runtime": 43.9816,
      "eval_samples_per_second": 8.845,
      "eval_steps_per_second": 2.228,
      "step": 1
    },
    {
      "epoch": 0.008676789587852495,
      "grad_norm": 0.41739514470100403,
      "learning_rate": 8.000000000000001e-06,
      "loss": 3.5934,
      "step": 2
    },
    {
      "epoch": 0.013015184381778741,
      "grad_norm": 0.47740957140922546,
      "learning_rate": 1.2e-05,
      "loss": 3.7866,
      "step": 3
    },
    {
      "epoch": 0.01735357917570499,
      "grad_norm": 0.5908945798873901,
      "learning_rate": 1.6000000000000003e-05,
      "loss": 3.8979,
      "step": 4
    },
    {
      "epoch": 0.021691973969631236,
      "grad_norm": 0.602057933807373,
      "learning_rate": 2e-05,
      "loss": 3.8257,
      "step": 5
    },
    {
      "epoch": 0.026030368763557483,
      "grad_norm": 0.7276618480682373,
      "learning_rate": 2.4e-05,
      "loss": 4.0716,
      "step": 6
    },
    {
      "epoch": 0.03036876355748373,
      "grad_norm": 0.7895606160163879,
      "learning_rate": 2.8000000000000003e-05,
      "loss": 4.1982,
      "step": 7
    },
    {
      "epoch": 0.03470715835140998,
      "grad_norm": 0.9524717926979065,
      "learning_rate": 3.2000000000000005e-05,
      "loss": 4.1916,
      "step": 8
    },
    {
      "epoch": 0.039045553145336226,
      "grad_norm": 0.9786620736122131,
      "learning_rate": 3.6e-05,
      "loss": 4.0969,
      "step": 9
    },
    {
      "epoch": 0.04338394793926247,
      "grad_norm": 1.0913058519363403,
      "learning_rate": 4e-05,
      "loss": 4.3758,
      "step": 10
    },
    {
      "epoch": 0.04772234273318872,
      "grad_norm": 1.2519152164459229,
      "learning_rate": 4.4000000000000006e-05,
      "loss": 4.3174,
      "step": 11
    },
    {
      "epoch": 0.052060737527114966,
      "grad_norm": 1.4428540468215942,
      "learning_rate": 4.8e-05,
      "loss": 4.2927,
      "step": 12
    },
    {
      "epoch": 0.05639913232104121,
      "grad_norm": 1.557953953742981,
      "learning_rate": 5.2000000000000004e-05,
      "loss": 4.3237,
      "step": 13
    },
    {
      "epoch": 0.06073752711496746,
      "grad_norm": 1.791407585144043,
      "learning_rate": 5.6000000000000006e-05,
      "loss": 4.3922,
      "step": 14
    },
    {
      "epoch": 0.0650759219088937,
      "grad_norm": 1.829128623008728,
      "learning_rate": 6e-05,
      "loss": 4.2218,
      "step": 15
    },
    {
      "epoch": 0.06941431670281996,
      "grad_norm": 1.8731590509414673,
      "learning_rate": 6.400000000000001e-05,
      "loss": 4.294,
      "step": 16
    },
    {
      "epoch": 0.0737527114967462,
      "grad_norm": 2.140212297439575,
      "learning_rate": 6.800000000000001e-05,
      "loss": 4.3197,
      "step": 17
    },
    {
      "epoch": 0.07809110629067245,
      "grad_norm": 2.5610997676849365,
      "learning_rate": 7.2e-05,
      "loss": 4.3203,
      "step": 18
    },
    {
      "epoch": 0.0824295010845987,
      "grad_norm": 2.5937764644622803,
      "learning_rate": 7.6e-05,
      "loss": 4.2128,
      "step": 19
    },
    {
      "epoch": 0.08676789587852494,
      "grad_norm": 1.9964145421981812,
      "learning_rate": 8e-05,
      "loss": 4.1141,
      "step": 20
    },
    {
      "epoch": 0.0911062906724512,
      "grad_norm": 1.9274357557296753,
      "learning_rate": 8.4e-05,
      "loss": 3.9559,
      "step": 21
    },
    {
      "epoch": 0.09544468546637744,
      "grad_norm": 2.1689515113830566,
      "learning_rate": 8.800000000000001e-05,
      "loss": 4.0459,
      "step": 22
    },
    {
      "epoch": 0.09978308026030369,
      "grad_norm": 2.417027235031128,
      "learning_rate": 9.200000000000001e-05,
      "loss": 3.8996,
      "step": 23
    },
    {
      "epoch": 0.10412147505422993,
      "grad_norm": 2.925503969192505,
      "learning_rate": 9.6e-05,
      "loss": 4.0918,
      "step": 24
    },
    {
      "epoch": 0.10845986984815618,
      "grad_norm": 4.8928961753845215,
      "learning_rate": 0.0001,
      "loss": 4.332,
      "step": 25
    },
    {
      "epoch": 0.11279826464208242,
      "grad_norm": 3.4765207767486572,
      "learning_rate": 0.00010400000000000001,
      "loss": 3.7618,
      "step": 26
    },
    {
      "epoch": 0.11713665943600868,
      "grad_norm": 3.5958409309387207,
      "learning_rate": 0.00010800000000000001,
      "loss": 3.7645,
      "step": 27
    },
    {
      "epoch": 0.12147505422993492,
      "grad_norm": 3.053165912628174,
      "learning_rate": 0.00011200000000000001,
      "loss": 3.6536,
      "step": 28
    },
    {
      "epoch": 0.12581344902386118,
      "grad_norm": 2.3347203731536865,
      "learning_rate": 0.000116,
      "loss": 3.8871,
      "step": 29
    },
    {
      "epoch": 0.1301518438177874,
      "grad_norm": 1.4141613245010376,
      "learning_rate": 0.00012,
      "loss": 3.6367,
      "step": 30
    },
    {
      "epoch": 0.13449023861171366,
      "grad_norm": 1.1042953729629517,
      "learning_rate": 0.000124,
      "loss": 3.4273,
      "step": 31
    },
    {
      "epoch": 0.13882863340563992,
      "grad_norm": 0.9391370415687561,
      "learning_rate": 0.00012800000000000002,
      "loss": 3.635,
      "step": 32
    },
    {
      "epoch": 0.14316702819956617,
      "grad_norm": 1.028341293334961,
      "learning_rate": 0.000132,
      "loss": 3.845,
      "step": 33
    },
    {
      "epoch": 0.1475054229934924,
      "grad_norm": 1.0668063163757324,
      "learning_rate": 0.00013600000000000003,
      "loss": 3.732,
      "step": 34
    },
    {
      "epoch": 0.15184381778741865,
      "grad_norm": 1.0369871854782104,
      "learning_rate": 0.00014,
      "loss": 3.6734,
      "step": 35
    },
    {
      "epoch": 0.1561822125813449,
      "grad_norm": 1.0699695348739624,
      "learning_rate": 0.000144,
      "loss": 3.5469,
      "step": 36
    },
    {
      "epoch": 0.16052060737527116,
      "grad_norm": 1.1715625524520874,
      "learning_rate": 0.000148,
      "loss": 3.5759,
      "step": 37
    },
    {
      "epoch": 0.1648590021691974,
      "grad_norm": 1.2680530548095703,
      "learning_rate": 0.000152,
      "loss": 3.7013,
      "step": 38
    },
    {
      "epoch": 0.16919739696312364,
      "grad_norm": 1.2043352127075195,
      "learning_rate": 0.00015600000000000002,
      "loss": 3.74,
      "step": 39
    },
    {
      "epoch": 0.1735357917570499,
      "grad_norm": 1.342244029045105,
      "learning_rate": 0.00016,
      "loss": 3.7761,
      "step": 40
    },
    {
      "epoch": 0.17787418655097614,
      "grad_norm": 1.4112831354141235,
      "learning_rate": 0.000164,
      "loss": 3.6449,
      "step": 41
    },
    {
      "epoch": 0.1822125813449024,
      "grad_norm": 1.3947268724441528,
      "learning_rate": 0.000168,
      "loss": 3.6043,
      "step": 42
    },
    {
      "epoch": 0.18655097613882862,
      "grad_norm": 1.5763946771621704,
      "learning_rate": 0.000172,
      "loss": 3.4768,
      "step": 43
    },
    {
      "epoch": 0.19088937093275488,
      "grad_norm": 1.9006760120391846,
      "learning_rate": 0.00017600000000000002,
      "loss": 3.6424,
      "step": 44
    },
    {
      "epoch": 0.19522776572668113,
      "grad_norm": 2.0071113109588623,
      "learning_rate": 0.00018,
      "loss": 3.7499,
      "step": 45
    },
    {
      "epoch": 0.19956616052060738,
      "grad_norm": 2.002067804336548,
      "learning_rate": 0.00018400000000000003,
      "loss": 3.6082,
      "step": 46
    },
    {
      "epoch": 0.2039045553145336,
      "grad_norm": 2.4698357582092285,
      "learning_rate": 0.000188,
      "loss": 3.7604,
      "step": 47
    },
    {
      "epoch": 0.20824295010845986,
      "grad_norm": 3.051906108856201,
      "learning_rate": 0.000192,
      "loss": 3.7546,
      "step": 48
    },
    {
      "epoch": 0.21258134490238612,
      "grad_norm": 3.100890636444092,
      "learning_rate": 0.000196,
      "loss": 3.6134,
      "step": 49
    },
    {
      "epoch": 0.21691973969631237,
      "grad_norm": 4.4481425285339355,
      "learning_rate": 0.0002,
      "loss": 3.355,
      "step": 50
    },
    {
      "epoch": 0.22125813449023862,
      "grad_norm": 4.157866954803467,
      "learning_rate": 0.00019998476951563915,
      "loss": 3.5229,
      "step": 51
    },
    {
      "epoch": 0.22559652928416485,
      "grad_norm": 5.159533500671387,
      "learning_rate": 0.0001999390827019096,
      "loss": 3.9098,
      "step": 52
    },
    {
      "epoch": 0.2299349240780911,
      "grad_norm": 4.372255325317383,
      "learning_rate": 0.0001998629534754574,
      "loss": 3.8285,
      "step": 53
    },
    {
      "epoch": 0.23427331887201736,
      "grad_norm": 2.7389180660247803,
      "learning_rate": 0.00019975640502598244,
      "loss": 3.5892,
      "step": 54
    },
    {
      "epoch": 0.2386117136659436,
      "grad_norm": 1.623792290687561,
      "learning_rate": 0.00019961946980917456,
      "loss": 3.6253,
      "step": 55
    },
    {
      "epoch": 0.24295010845986983,
      "grad_norm": 1.0698862075805664,
      "learning_rate": 0.00019945218953682734,
      "loss": 3.4747,
      "step": 56
    },
    {
      "epoch": 0.2472885032537961,
      "grad_norm": 1.0480446815490723,
      "learning_rate": 0.00019925461516413223,
      "loss": 3.5076,
      "step": 57
    },
    {
      "epoch": 0.25162689804772237,
      "grad_norm": 1.1356984376907349,
      "learning_rate": 0.00019902680687415705,
      "loss": 3.4875,
      "step": 58
    },
    {
      "epoch": 0.25162689804772237,
      "eval_loss": 3.5454585552215576,
      "eval_runtime": 43.9485,
      "eval_samples_per_second": 8.851,
      "eval_steps_per_second": 2.23,
      "step": 58
    },
    {
      "epoch": 0.2559652928416486,
      "grad_norm": 1.147985816001892,
      "learning_rate": 0.00019876883405951377,
      "loss": 3.5368,
      "step": 59
    },
    {
      "epoch": 0.2603036876355748,
      "grad_norm": 1.167962670326233,
      "learning_rate": 0.00019848077530122083,
      "loss": 3.4885,
      "step": 60
    },
    {
      "epoch": 0.2646420824295011,
      "grad_norm": 1.1241693496704102,
      "learning_rate": 0.00019816271834476642,
      "loss": 3.5335,
      "step": 61
    },
    {
      "epoch": 0.26898047722342733,
      "grad_norm": 1.0841178894042969,
      "learning_rate": 0.00019781476007338058,
      "loss": 3.5822,
      "step": 62
    },
    {
      "epoch": 0.27331887201735355,
      "grad_norm": 1.1276164054870605,
      "learning_rate": 0.00019743700647852354,
      "loss": 3.4757,
      "step": 63
    },
    {
      "epoch": 0.27765726681127983,
      "grad_norm": 1.192659854888916,
      "learning_rate": 0.00019702957262759965,
      "loss": 3.4212,
      "step": 64
    },
    {
      "epoch": 0.28199566160520606,
      "grad_norm": 1.2061688899993896,
      "learning_rate": 0.00019659258262890683,
      "loss": 3.4564,
      "step": 65
    },
    {
      "epoch": 0.28633405639913234,
      "grad_norm": 1.4012079238891602,
      "learning_rate": 0.0001961261695938319,
      "loss": 3.423,
      "step": 66
    },
    {
      "epoch": 0.29067245119305857,
      "grad_norm": 1.3591368198394775,
      "learning_rate": 0.00019563047559630357,
      "loss": 3.5284,
      "step": 67
    },
    {
      "epoch": 0.2950108459869848,
      "grad_norm": 1.3555010557174683,
      "learning_rate": 0.00019510565162951537,
      "loss": 3.4406,
      "step": 68
    },
    {
      "epoch": 0.2993492407809111,
      "grad_norm": 1.4745391607284546,
      "learning_rate": 0.0001945518575599317,
      "loss": 3.3899,
      "step": 69
    },
    {
      "epoch": 0.3036876355748373,
      "grad_norm": 1.6432572603225708,
      "learning_rate": 0.00019396926207859084,
      "loss": 3.4343,
      "step": 70
    },
    {
      "epoch": 0.3080260303687636,
      "grad_norm": 1.9187488555908203,
      "learning_rate": 0.00019335804264972018,
      "loss": 3.5881,
      "step": 71
    },
    {
      "epoch": 0.3123644251626898,
      "grad_norm": 2.1937949657440186,
      "learning_rate": 0.00019271838545667876,
      "loss": 3.3765,
      "step": 72
    },
    {
      "epoch": 0.31670281995661603,
      "grad_norm": 2.376640558242798,
      "learning_rate": 0.00019205048534524406,
      "loss": 3.2758,
      "step": 73
    },
    {
      "epoch": 0.3210412147505423,
      "grad_norm": 3.0442004203796387,
      "learning_rate": 0.0001913545457642601,
      "loss": 3.5366,
      "step": 74
    },
    {
      "epoch": 0.32537960954446854,
      "grad_norm": 3.6359612941741943,
      "learning_rate": 0.000190630778703665,
      "loss": 3.0313,
      "step": 75
    },
    {
      "epoch": 0.3297180043383948,
      "grad_norm": 4.367193698883057,
      "learning_rate": 0.0001898794046299167,
      "loss": 3.3864,
      "step": 76
    },
    {
      "epoch": 0.33405639913232105,
      "grad_norm": 5.261653900146484,
      "learning_rate": 0.0001891006524188368,
      "loss": 3.5411,
      "step": 77
    },
    {
      "epoch": 0.3383947939262473,
      "grad_norm": 5.341316223144531,
      "learning_rate": 0.00018829475928589271,
      "loss": 3.843,
      "step": 78
    },
    {
      "epoch": 0.34273318872017355,
      "grad_norm": 2.9030559062957764,
      "learning_rate": 0.00018746197071393958,
      "loss": 3.4254,
      "step": 79
    },
    {
      "epoch": 0.3470715835140998,
      "grad_norm": 1.4780312776565552,
      "learning_rate": 0.00018660254037844388,
      "loss": 3.4877,
      "step": 80
    },
    {
      "epoch": 0.351409978308026,
      "grad_norm": 1.0593628883361816,
      "learning_rate": 0.00018571673007021123,
      "loss": 3.3987,
      "step": 81
    },
    {
      "epoch": 0.3557483731019523,
      "grad_norm": 0.9910492897033691,
      "learning_rate": 0.0001848048096156426,
      "loss": 3.4944,
      "step": 82
    },
    {
      "epoch": 0.3600867678958785,
      "grad_norm": 1.004767656326294,
      "learning_rate": 0.00018386705679454242,
      "loss": 3.4143,
      "step": 83
    },
    {
      "epoch": 0.3644251626898048,
      "grad_norm": 1.012804627418518,
      "learning_rate": 0.00018290375725550417,
      "loss": 3.4504,
      "step": 84
    },
    {
      "epoch": 0.368763557483731,
      "grad_norm": 1.0758857727050781,
      "learning_rate": 0.0001819152044288992,
      "loss": 3.5181,
      "step": 85
    },
    {
      "epoch": 0.37310195227765725,
      "grad_norm": 1.0776313543319702,
      "learning_rate": 0.00018090169943749476,
      "loss": 3.4293,
      "step": 86
    },
    {
      "epoch": 0.3774403470715835,
      "grad_norm": 1.0856565237045288,
      "learning_rate": 0.00017986355100472928,
      "loss": 3.3012,
      "step": 87
    },
    {
      "epoch": 0.38177874186550975,
      "grad_norm": 1.146246075630188,
      "learning_rate": 0.00017880107536067218,
      "loss": 3.5778,
      "step": 88
    },
    {
      "epoch": 0.38611713665943603,
      "grad_norm": 1.1812922954559326,
      "learning_rate": 0.0001777145961456971,
      "loss": 3.2835,
      "step": 89
    },
    {
      "epoch": 0.39045553145336226,
      "grad_norm": 1.3535960912704468,
      "learning_rate": 0.0001766044443118978,
      "loss": 3.1863,
      "step": 90
    },
    {
      "epoch": 0.3947939262472885,
      "grad_norm": 1.312524437904358,
      "learning_rate": 0.00017547095802227723,
      "loss": 3.3794,
      "step": 91
    },
    {
      "epoch": 0.39913232104121477,
      "grad_norm": 1.2628040313720703,
      "learning_rate": 0.00017431448254773944,
      "loss": 3.2127,
      "step": 92
    },
    {
      "epoch": 0.403470715835141,
      "grad_norm": 1.3810231685638428,
      "learning_rate": 0.00017313537016191706,
      "loss": 3.3664,
      "step": 93
    },
    {
      "epoch": 0.4078091106290672,
      "grad_norm": 1.5726513862609863,
      "learning_rate": 0.0001719339800338651,
      "loss": 3.4052,
      "step": 94
    },
    {
      "epoch": 0.4121475054229935,
      "grad_norm": 1.5839647054672241,
      "learning_rate": 0.00017071067811865476,
      "loss": 3.2746,
      "step": 95
    },
    {
      "epoch": 0.4164859002169197,
      "grad_norm": 1.7605924606323242,
      "learning_rate": 0.00016946583704589973,
      "loss": 3.48,
      "step": 96
    },
    {
      "epoch": 0.420824295010846,
      "grad_norm": 2.3345723152160645,
      "learning_rate": 0.00016819983600624986,
      "loss": 3.2033,
      "step": 97
    },
    {
      "epoch": 0.42516268980477223,
      "grad_norm": 1.9480637311935425,
      "learning_rate": 0.00016691306063588583,
      "loss": 3.3796,
      "step": 98
    },
    {
      "epoch": 0.42950108459869846,
      "grad_norm": 2.3618791103363037,
      "learning_rate": 0.00016560590289905073,
      "loss": 3.1398,
      "step": 99
    },
    {
      "epoch": 0.43383947939262474,
      "grad_norm": 3.546729326248169,
      "learning_rate": 0.00016427876096865394,
      "loss": 3.0558,
      "step": 100
    },
    {
      "epoch": 0.43817787418655096,
      "grad_norm": 1.5932743549346924,
      "learning_rate": 0.00016293203910498376,
      "loss": 3.3932,
      "step": 101
    },
    {
      "epoch": 0.44251626898047725,
      "grad_norm": 2.039661407470703,
      "learning_rate": 0.0001615661475325658,
      "loss": 3.4066,
      "step": 102
    },
    {
      "epoch": 0.44685466377440347,
      "grad_norm": 1.742119312286377,
      "learning_rate": 0.00016018150231520486,
      "loss": 3.2823,
      "step": 103
    },
    {
      "epoch": 0.4511930585683297,
      "grad_norm": 1.5700186491012573,
      "learning_rate": 0.00015877852522924732,
      "loss": 3.3591,
      "step": 104
    },
    {
      "epoch": 0.455531453362256,
      "grad_norm": 1.136389970779419,
      "learning_rate": 0.0001573576436351046,
      "loss": 3.4663,
      "step": 105
    },
    {
      "epoch": 0.4598698481561822,
      "grad_norm": 0.8537334203720093,
      "learning_rate": 0.0001559192903470747,
      "loss": 3.4367,
      "step": 106
    },
    {
      "epoch": 0.4642082429501085,
      "grad_norm": 0.8642299175262451,
      "learning_rate": 0.00015446390350150273,
      "loss": 3.287,
      "step": 107
    },
    {
      "epoch": 0.4685466377440347,
      "grad_norm": 0.9279235601425171,
      "learning_rate": 0.0001529919264233205,
      "loss": 3.2911,
      "step": 108
    },
    {
      "epoch": 0.47288503253796094,
      "grad_norm": 0.9121331572532654,
      "learning_rate": 0.00015150380749100545,
      "loss": 3.3101,
      "step": 109
    },
    {
      "epoch": 0.4772234273318872,
      "grad_norm": 0.9868795275688171,
      "learning_rate": 0.00015000000000000001,
      "loss": 3.3431,
      "step": 110
    },
    {
      "epoch": 0.48156182212581344,
      "grad_norm": 1.0646886825561523,
      "learning_rate": 0.00014848096202463372,
      "loss": 3.3876,
      "step": 111
    },
    {
      "epoch": 0.48590021691973967,
      "grad_norm": 1.0819416046142578,
      "learning_rate": 0.00014694715627858908,
      "loss": 3.2128,
      "step": 112
    },
    {
      "epoch": 0.49023861171366595,
      "grad_norm": 1.0728636980056763,
      "learning_rate": 0.00014539904997395468,
      "loss": 3.2076,
      "step": 113
    },
    {
      "epoch": 0.4945770065075922,
      "grad_norm": 1.1562669277191162,
      "learning_rate": 0.00014383711467890774,
      "loss": 3.2825,
      "step": 114
    },
    {
      "epoch": 0.49891540130151846,
      "grad_norm": 1.1967557668685913,
      "learning_rate": 0.00014226182617406996,
      "loss": 3.2185,
      "step": 115
    },
    {
      "epoch": 0.5032537960954447,
      "grad_norm": 1.3139584064483643,
      "learning_rate": 0.00014067366430758004,
      "loss": 3.1373,
      "step": 116
    },
    {
      "epoch": 0.5032537960954447,
      "eval_loss": 3.2728052139282227,
      "eval_runtime": 43.9403,
      "eval_samples_per_second": 8.853,
      "eval_steps_per_second": 2.23,
      "step": 116
    },
    {
      "epoch": 0.5075921908893709,
      "grad_norm": 1.3170753717422485,
      "learning_rate": 0.00013907311284892736,
      "loss": 2.9572,
      "step": 117
    },
    {
      "epoch": 0.5119305856832972,
      "grad_norm": 1.5243107080459595,
      "learning_rate": 0.00013746065934159123,
      "loss": 3.3082,
      "step": 118
    },
    {
      "epoch": 0.5162689804772235,
      "grad_norm": 1.5845880508422852,
      "learning_rate": 0.00013583679495453,
      "loss": 3.4819,
      "step": 119
    },
    {
      "epoch": 0.5206073752711496,
      "grad_norm": 1.66307532787323,
      "learning_rate": 0.00013420201433256689,
      "loss": 3.1131,
      "step": 120
    },
    {
      "epoch": 0.5249457700650759,
      "grad_norm": 1.6470588445663452,
      "learning_rate": 0.00013255681544571568,
      "loss": 3.2847,
      "step": 121
    },
    {
      "epoch": 0.5292841648590022,
      "grad_norm": 2.1118075847625732,
      "learning_rate": 0.00013090169943749476,
      "loss": 3.4669,
      "step": 122
    },
    {
      "epoch": 0.5336225596529284,
      "grad_norm": 2.056396722793579,
      "learning_rate": 0.00012923717047227368,
      "loss": 3.1136,
      "step": 123
    },
    {
      "epoch": 0.5379609544468547,
      "grad_norm": 2.2389657497406006,
      "learning_rate": 0.0001275637355816999,
      "loss": 2.9323,
      "step": 124
    },
    {
      "epoch": 0.5422993492407809,
      "grad_norm": 2.863621711730957,
      "learning_rate": 0.00012588190451025207,
      "loss": 2.9585,
      "step": 125
    },
    {
      "epoch": 0.5466377440347071,
      "grad_norm": 0.8712321519851685,
      "learning_rate": 0.00012419218955996676,
      "loss": 3.1439,
      "step": 126
    },
    {
      "epoch": 0.5509761388286334,
      "grad_norm": 1.0713740587234497,
      "learning_rate": 0.0001224951054343865,
      "loss": 3.2213,
      "step": 127
    },
    {
      "epoch": 0.5553145336225597,
      "grad_norm": 1.104315996170044,
      "learning_rate": 0.00012079116908177593,
      "loss": 3.4522,
      "step": 128
    },
    {
      "epoch": 0.559652928416486,
      "grad_norm": 1.0883917808532715,
      "learning_rate": 0.00011908089953765449,
      "loss": 3.3503,
      "step": 129
    },
    {
      "epoch": 0.5639913232104121,
      "grad_norm": 1.0000834465026855,
      "learning_rate": 0.00011736481776669306,
      "loss": 3.4036,
      "step": 130
    },
    {
      "epoch": 0.5683297180043384,
      "grad_norm": 0.8869354128837585,
      "learning_rate": 0.0001156434465040231,
      "loss": 3.2749,
      "step": 131
    },
    {
      "epoch": 0.5726681127982647,
      "grad_norm": 0.8651937246322632,
      "learning_rate": 0.00011391731009600654,
      "loss": 3.3679,
      "step": 132
    },
    {
      "epoch": 0.5770065075921909,
      "grad_norm": 0.9174556136131287,
      "learning_rate": 0.00011218693434051475,
      "loss": 3.311,
      "step": 133
    },
    {
      "epoch": 0.5813449023861171,
      "grad_norm": 0.930533230304718,
      "learning_rate": 0.00011045284632676536,
      "loss": 3.3761,
      "step": 134
    },
    {
      "epoch": 0.5856832971800434,
      "grad_norm": 0.9851680994033813,
      "learning_rate": 0.00010871557427476583,
      "loss": 3.2752,
      "step": 135
    },
    {
      "epoch": 0.5900216919739696,
      "grad_norm": 0.9633740782737732,
      "learning_rate": 0.00010697564737441252,
      "loss": 3.2373,
      "step": 136
    },
    {
      "epoch": 0.5943600867678959,
      "grad_norm": 1.132585048675537,
      "learning_rate": 0.0001052335956242944,
      "loss": 3.2323,
      "step": 137
    },
    {
      "epoch": 0.5986984815618221,
      "grad_norm": 1.1232091188430786,
      "learning_rate": 0.00010348994967025012,
      "loss": 3.2874,
      "step": 138
    },
    {
      "epoch": 0.6030368763557483,
      "grad_norm": 1.2559125423431396,
      "learning_rate": 0.00010174524064372837,
      "loss": 3.2367,
      "step": 139
    },
    {
      "epoch": 0.6073752711496746,
      "grad_norm": 1.2623041868209839,
      "learning_rate": 0.0001,
      "loss": 3.2243,
      "step": 140
    },
    {
      "epoch": 0.6117136659436009,
      "grad_norm": 1.3554457426071167,
      "learning_rate": 9.825475935627165e-05,
      "loss": 3.4802,
      "step": 141
    },
    {
      "epoch": 0.6160520607375272,
      "grad_norm": 1.4170132875442505,
      "learning_rate": 9.651005032974994e-05,
      "loss": 3.354,
      "step": 142
    },
    {
      "epoch": 0.6203904555314533,
      "grad_norm": 1.4309097528457642,
      "learning_rate": 9.476640437570562e-05,
      "loss": 3.1352,
      "step": 143
    },
    {
      "epoch": 0.6247288503253796,
      "grad_norm": 1.5829153060913086,
      "learning_rate": 9.302435262558747e-05,
      "loss": 3.2455,
      "step": 144
    },
    {
      "epoch": 0.6290672451193059,
      "grad_norm": 1.8210502862930298,
      "learning_rate": 9.128442572523417e-05,
      "loss": 3.2991,
      "step": 145
    },
    {
      "epoch": 0.6334056399132321,
      "grad_norm": 1.842761516571045,
      "learning_rate": 8.954715367323468e-05,
      "loss": 3.2255,
      "step": 146
    },
    {
      "epoch": 0.6377440347071583,
      "grad_norm": 1.9258646965026855,
      "learning_rate": 8.781306565948528e-05,
      "loss": 3.1397,
      "step": 147
    },
    {
      "epoch": 0.6420824295010846,
      "grad_norm": 2.1189215183258057,
      "learning_rate": 8.608268990399349e-05,
      "loss": 3.0414,
      "step": 148
    },
    {
      "epoch": 0.6464208242950108,
      "grad_norm": 2.4063761234283447,
      "learning_rate": 8.435655349597689e-05,
      "loss": 2.8524,
      "step": 149
    },
    {
      "epoch": 0.6507592190889371,
      "grad_norm": 3.6420836448669434,
      "learning_rate": 8.263518223330697e-05,
      "loss": 3.0156,
      "step": 150
    },
    {
      "epoch": 0.6550976138828634,
      "grad_norm": 0.7080674171447754,
      "learning_rate": 8.091910046234552e-05,
      "loss": 3.1636,
      "step": 151
    },
    {
      "epoch": 0.6594360086767896,
      "grad_norm": 0.798520565032959,
      "learning_rate": 7.920883091822408e-05,
      "loss": 3.212,
      "step": 152
    },
    {
      "epoch": 0.6637744034707158,
      "grad_norm": 0.8640486001968384,
      "learning_rate": 7.750489456561352e-05,
      "loss": 3.1644,
      "step": 153
    },
    {
      "epoch": 0.6681127982646421,
      "grad_norm": 0.870906412601471,
      "learning_rate": 7.580781044003324e-05,
      "loss": 3.1876,
      "step": 154
    },
    {
      "epoch": 0.6724511930585684,
      "grad_norm": 0.8581348061561584,
      "learning_rate": 7.411809548974792e-05,
      "loss": 3.2739,
      "step": 155
    },
    {
      "epoch": 0.6767895878524945,
      "grad_norm": 0.8691614270210266,
      "learning_rate": 7.243626441830009e-05,
      "loss": 3.2444,
      "step": 156
    },
    {
      "epoch": 0.6811279826464208,
      "grad_norm": 0.9455673098564148,
      "learning_rate": 7.076282952772633e-05,
      "loss": 3.3004,
      "step": 157
    },
    {
      "epoch": 0.6854663774403471,
      "grad_norm": 0.8873337507247925,
      "learning_rate": 6.909830056250527e-05,
      "loss": 3.1778,
      "step": 158
    },
    {
      "epoch": 0.6898047722342733,
      "grad_norm": 0.910775363445282,
      "learning_rate": 6.744318455428436e-05,
      "loss": 3.1346,
      "step": 159
    },
    {
      "epoch": 0.6941431670281996,
      "grad_norm": 0.9872409105300903,
      "learning_rate": 6.579798566743314e-05,
      "loss": 3.1665,
      "step": 160
    },
    {
      "epoch": 0.6984815618221258,
      "grad_norm": 1.0516481399536133,
      "learning_rate": 6.416320504546997e-05,
      "loss": 3.3064,
      "step": 161
    },
    {
      "epoch": 0.702819956616052,
      "grad_norm": 1.0263571739196777,
      "learning_rate": 6.25393406584088e-05,
      "loss": 3.3698,
      "step": 162
    },
    {
      "epoch": 0.7071583514099783,
      "grad_norm": 1.1050878763198853,
      "learning_rate": 6.092688715107264e-05,
      "loss": 3.2436,
      "step": 163
    },
    {
      "epoch": 0.7114967462039046,
      "grad_norm": 1.1121841669082642,
      "learning_rate": 5.9326335692419995e-05,
      "loss": 2.9433,
      "step": 164
    },
    {
      "epoch": 0.7158351409978309,
      "grad_norm": 1.2424358129501343,
      "learning_rate": 5.773817382593008e-05,
      "loss": 3.3616,
      "step": 165
    },
    {
      "epoch": 0.720173535791757,
      "grad_norm": 1.1899327039718628,
      "learning_rate": 5.616288532109225e-05,
      "loss": 3.0392,
      "step": 166
    },
    {
      "epoch": 0.7245119305856833,
      "grad_norm": 1.3395730257034302,
      "learning_rate": 5.4600950026045326e-05,
      "loss": 3.0905,
      "step": 167
    },
    {
      "epoch": 0.7288503253796096,
      "grad_norm": 1.4268842935562134,
      "learning_rate": 5.305284372141095e-05,
      "loss": 3.1247,
      "step": 168
    },
    {
      "epoch": 0.7331887201735358,
      "grad_norm": 1.5514875650405884,
      "learning_rate": 5.15190379753663e-05,
      "loss": 3.3772,
      "step": 169
    },
    {
      "epoch": 0.737527114967462,
      "grad_norm": 1.8371058702468872,
      "learning_rate": 5.000000000000002e-05,
      "loss": 2.9262,
      "step": 170
    },
    {
      "epoch": 0.7418655097613883,
      "grad_norm": 1.7641676664352417,
      "learning_rate": 4.8496192508994576e-05,
      "loss": 3.0113,
      "step": 171
    },
    {
      "epoch": 0.7462039045553145,
      "grad_norm": 1.8325039148330688,
      "learning_rate": 4.700807357667952e-05,
      "loss": 3.0551,
      "step": 172
    },
    {
      "epoch": 0.7505422993492408,
      "grad_norm": 1.9740185737609863,
      "learning_rate": 4.5536096498497295e-05,
      "loss": 3.1479,
      "step": 173
    },
    {
      "epoch": 0.754880694143167,
      "grad_norm": 2.327420234680176,
      "learning_rate": 4.4080709652925336e-05,
      "loss": 3.2149,
      "step": 174
    },
    {
      "epoch": 0.754880694143167,
      "eval_loss": 3.163884162902832,
      "eval_runtime": 43.9914,
      "eval_samples_per_second": 8.843,
      "eval_steps_per_second": 2.228,
      "step": 174
    },
    {
      "epoch": 0.7592190889370932,
      "grad_norm": 3.558817148208618,
      "learning_rate": 4.264235636489542e-05,
      "loss": 3.0184,
      "step": 175
    },
    {
      "epoch": 0.7635574837310195,
      "grad_norm": 0.48007091879844666,
      "learning_rate": 4.12214747707527e-05,
      "loss": 3.0708,
      "step": 176
    },
    {
      "epoch": 0.7678958785249458,
      "grad_norm": 0.606035590171814,
      "learning_rate": 3.981849768479517e-05,
      "loss": 3.1584,
      "step": 177
    },
    {
      "epoch": 0.7722342733188721,
      "grad_norm": 0.6647348999977112,
      "learning_rate": 3.843385246743417e-05,
      "loss": 3.3003,
      "step": 178
    },
    {
      "epoch": 0.7765726681127982,
      "grad_norm": 0.6956514120101929,
      "learning_rate": 3.7067960895016275e-05,
      "loss": 3.2168,
      "step": 179
    },
    {
      "epoch": 0.7809110629067245,
      "grad_norm": 0.7575390338897705,
      "learning_rate": 3.5721239031346066e-05,
      "loss": 3.2576,
      "step": 180
    },
    {
      "epoch": 0.7852494577006508,
      "grad_norm": 0.8106915950775146,
      "learning_rate": 3.439409710094929e-05,
      "loss": 3.124,
      "step": 181
    },
    {
      "epoch": 0.789587852494577,
      "grad_norm": 0.873997688293457,
      "learning_rate": 3.308693936411421e-05,
      "loss": 3.0977,
      "step": 182
    },
    {
      "epoch": 0.7939262472885033,
      "grad_norm": 0.9612168073654175,
      "learning_rate": 3.1800163993750166e-05,
      "loss": 3.435,
      "step": 183
    },
    {
      "epoch": 0.7982646420824295,
      "grad_norm": 0.9549990892410278,
      "learning_rate": 3.053416295410026e-05,
      "loss": 3.2216,
      "step": 184
    },
    {
      "epoch": 0.8026030368763557,
      "grad_norm": 0.9309582710266113,
      "learning_rate": 2.9289321881345254e-05,
      "loss": 3.0546,
      "step": 185
    },
    {
      "epoch": 0.806941431670282,
      "grad_norm": 1.0800687074661255,
      "learning_rate": 2.8066019966134904e-05,
      "loss": 3.2283,
      "step": 186
    },
    {
      "epoch": 0.8112798264642083,
      "grad_norm": 1.0009733438491821,
      "learning_rate": 2.6864629838082956e-05,
      "loss": 3.1638,
      "step": 187
    },
    {
      "epoch": 0.8156182212581344,
      "grad_norm": 1.1134998798370361,
      "learning_rate": 2.5685517452260567e-05,
      "loss": 3.2642,
      "step": 188
    },
    {
      "epoch": 0.8199566160520607,
      "grad_norm": 1.1395593881607056,
      "learning_rate": 2.45290419777228e-05,
      "loss": 3.0712,
      "step": 189
    },
    {
      "epoch": 0.824295010845987,
      "grad_norm": 1.1547160148620605,
      "learning_rate": 2.339555568810221e-05,
      "loss": 3.2101,
      "step": 190
    },
    {
      "epoch": 0.8286334056399133,
      "grad_norm": 1.2223323583602905,
      "learning_rate": 2.2285403854302912e-05,
      "loss": 3.1109,
      "step": 191
    },
    {
      "epoch": 0.8329718004338394,
      "grad_norm": 1.4417051076889038,
      "learning_rate": 2.119892463932781e-05,
      "loss": 3.2497,
      "step": 192
    },
    {
      "epoch": 0.8373101952277657,
      "grad_norm": 1.3542780876159668,
      "learning_rate": 2.013644899527074e-05,
      "loss": 3.23,
      "step": 193
    },
    {
      "epoch": 0.841648590021692,
      "grad_norm": 1.5529882907867432,
      "learning_rate": 1.9098300562505266e-05,
      "loss": 3.2404,
      "step": 194
    },
    {
      "epoch": 0.8459869848156182,
      "grad_norm": 1.63187575340271,
      "learning_rate": 1.808479557110081e-05,
      "loss": 3.0776,
      "step": 195
    },
    {
      "epoch": 0.8503253796095445,
      "grad_norm": 1.6470518112182617,
      "learning_rate": 1.7096242744495837e-05,
      "loss": 3.1312,
      "step": 196
    },
    {
      "epoch": 0.8546637744034707,
      "grad_norm": 1.8358676433563232,
      "learning_rate": 1.6132943205457606e-05,
      "loss": 3.0136,
      "step": 197
    },
    {
      "epoch": 0.8590021691973969,
      "grad_norm": 2.2392208576202393,
      "learning_rate": 1.5195190384357404e-05,
      "loss": 3.0873,
      "step": 198
    },
    {
      "epoch": 0.8633405639913232,
      "grad_norm": 2.3587329387664795,
      "learning_rate": 1.4283269929788779e-05,
      "loss": 3.1336,
      "step": 199
    },
    {
      "epoch": 0.8676789587852495,
      "grad_norm": 3.4689748287200928,
      "learning_rate": 1.339745962155613e-05,
      "loss": 3.2269,
      "step": 200
    },
    {
      "epoch": 0.8720173535791758,
      "grad_norm": 0.4676075577735901,
      "learning_rate": 1.2538029286060426e-05,
      "loss": 3.2194,
      "step": 201
    },
    {
      "epoch": 0.8763557483731019,
      "grad_norm": 0.5948041081428528,
      "learning_rate": 1.1705240714107302e-05,
      "loss": 3.2006,
      "step": 202
    },
    {
      "epoch": 0.8806941431670282,
      "grad_norm": 0.6200747489929199,
      "learning_rate": 1.0899347581163221e-05,
      "loss": 3.1966,
      "step": 203
    },
    {
      "epoch": 0.8850325379609545,
      "grad_norm": 0.6264815926551819,
      "learning_rate": 1.0120595370083318e-05,
      "loss": 3.1552,
      "step": 204
    },
    {
      "epoch": 0.8893709327548807,
      "grad_norm": 0.6958035230636597,
      "learning_rate": 9.369221296335006e-06,
      "loss": 3.21,
      "step": 205
    },
    {
      "epoch": 0.8937093275488069,
      "grad_norm": 0.7550477981567383,
      "learning_rate": 8.645454235739903e-06,
      "loss": 3.2491,
      "step": 206
    },
    {
      "epoch": 0.8980477223427332,
      "grad_norm": 0.78013014793396,
      "learning_rate": 7.949514654755962e-06,
      "loss": 3.158,
      "step": 207
    },
    {
      "epoch": 0.9023861171366594,
      "grad_norm": 0.786949098110199,
      "learning_rate": 7.281614543321269e-06,
      "loss": 3.2446,
      "step": 208
    },
    {
      "epoch": 0.9067245119305857,
      "grad_norm": 0.8102577924728394,
      "learning_rate": 6.6419573502798374e-06,
      "loss": 3.2238,
      "step": 209
    },
    {
      "epoch": 0.911062906724512,
      "grad_norm": 0.8839837908744812,
      "learning_rate": 6.030737921409169e-06,
      "loss": 3.1035,
      "step": 210
    },
    {
      "epoch": 0.9154013015184381,
      "grad_norm": 0.9286414980888367,
      "learning_rate": 5.448142440068316e-06,
      "loss": 3.2198,
      "step": 211
    },
    {
      "epoch": 0.9197396963123644,
      "grad_norm": 1.031367540359497,
      "learning_rate": 4.8943483704846475e-06,
      "loss": 3.207,
      "step": 212
    },
    {
      "epoch": 0.9240780911062907,
      "grad_norm": 1.1086468696594238,
      "learning_rate": 4.369524403696457e-06,
      "loss": 3.2715,
      "step": 213
    },
    {
      "epoch": 0.928416485900217,
      "grad_norm": 1.0586810111999512,
      "learning_rate": 3.873830406168111e-06,
      "loss": 3.2091,
      "step": 214
    },
    {
      "epoch": 0.9327548806941431,
      "grad_norm": 1.0433012247085571,
      "learning_rate": 3.40741737109318e-06,
      "loss": 3.11,
      "step": 215
    },
    {
      "epoch": 0.9370932754880694,
      "grad_norm": 1.214693546295166,
      "learning_rate": 2.970427372400353e-06,
      "loss": 3.1984,
      "step": 216
    },
    {
      "epoch": 0.9414316702819957,
      "grad_norm": 1.3140201568603516,
      "learning_rate": 2.5629935214764865e-06,
      "loss": 3.1381,
      "step": 217
    },
    {
      "epoch": 0.9457700650759219,
      "grad_norm": 1.439610242843628,
      "learning_rate": 2.1852399266194314e-06,
      "loss": 3.2828,
      "step": 218
    },
    {
      "epoch": 0.9501084598698482,
      "grad_norm": 1.447763204574585,
      "learning_rate": 1.8372816552336026e-06,
      "loss": 3.1896,
      "step": 219
    },
    {
      "epoch": 0.9544468546637744,
      "grad_norm": 1.5651224851608276,
      "learning_rate": 1.5192246987791981e-06,
      "loss": 3.0176,
      "step": 220
    },
    {
      "epoch": 0.9587852494577006,
      "grad_norm": 1.7138340473175049,
      "learning_rate": 1.231165940486234e-06,
      "loss": 3.0649,
      "step": 221
    },
    {
      "epoch": 0.9631236442516269,
      "grad_norm": 1.7278990745544434,
      "learning_rate": 9.731931258429638e-07,
      "loss": 2.901,
      "step": 222
    },
    {
      "epoch": 0.9674620390455532,
      "grad_norm": 1.8585275411605835,
      "learning_rate": 7.453848358678017e-07,
      "loss": 2.9228,
      "step": 223
    },
    {
      "epoch": 0.9718004338394793,
      "grad_norm": 2.438549757003784,
      "learning_rate": 5.478104631726711e-07,
      "loss": 2.9518,
      "step": 224
    },
    {
      "epoch": 0.9761388286334056,
      "grad_norm": 3.6199636459350586,
      "learning_rate": 3.805301908254455e-07,
      "loss": 2.9323,
      "step": 225
    },
    {
      "epoch": 0.9804772234273319,
      "grad_norm": 0.5968942046165466,
      "learning_rate": 2.4359497401758024e-07,
      "loss": 3.0911,
      "step": 226
    },
    {
      "epoch": 0.9848156182212582,
      "grad_norm": 0.8684574365615845,
      "learning_rate": 1.3704652454261668e-07,
      "loss": 3.2219,
      "step": 227
    },
    {
      "epoch": 0.9891540130151844,
      "grad_norm": 1.1482932567596436,
      "learning_rate": 6.09172980904238e-08,
      "loss": 3.0405,
      "step": 228
    },
    {
      "epoch": 0.9934924078091106,
      "grad_norm": 1.4431898593902588,
      "learning_rate": 1.5230484360873044e-08,
      "loss": 3.1692,
      "step": 229
    },
    {
      "epoch": 0.9978308026030369,
      "grad_norm": 1.78467857837677,
      "learning_rate": 0.0,
      "loss": 2.932,
      "step": 230
    }
  ],
  "logging_steps": 1,
  "max_steps": 230,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 58,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 5.142510989790413e+17,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}