{
  "best_metric": 1.6708096265792847,
  "best_model_checkpoint": "miner_id_24/checkpoint-250",
  "epoch": 1.9065776930409915,
  "eval_steps": 25,
  "global_step": 250,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0076263107721639654,
      "grad_norm": 0.9196067452430725,
      "learning_rate": 4.285714285714285e-05,
      "loss": 3.2672,
      "step": 1
    },
    {
      "epoch": 0.0076263107721639654,
      "eval_loss": 3.787529230117798,
      "eval_runtime": 0.2862,
      "eval_samples_per_second": 174.68,
      "eval_steps_per_second": 45.417,
      "step": 1
    },
    {
      "epoch": 0.015252621544327931,
      "grad_norm": 0.8998702168464661,
      "learning_rate": 8.57142857142857e-05,
      "loss": 3.4083,
      "step": 2
    },
    {
      "epoch": 0.022878932316491896,
      "grad_norm": 0.9599621295928955,
      "learning_rate": 0.00012857142857142855,
      "loss": 3.5199,
      "step": 3
    },
    {
      "epoch": 0.030505243088655862,
      "grad_norm": 0.9405797123908997,
      "learning_rate": 0.0001714285714285714,
      "loss": 3.5021,
      "step": 4
    },
    {
      "epoch": 0.03813155386081983,
      "grad_norm": 0.9620326161384583,
      "learning_rate": 0.00021428571428571427,
      "loss": 3.5165,
      "step": 5
    },
    {
      "epoch": 0.04575786463298379,
      "grad_norm": 0.9596439003944397,
      "learning_rate": 0.0002571428571428571,
      "loss": 3.4724,
      "step": 6
    },
    {
      "epoch": 0.05338417540514776,
      "grad_norm": 0.917304277420044,
      "learning_rate": 0.0003,
      "loss": 3.4023,
      "step": 7
    },
    {
      "epoch": 0.061010486177311724,
      "grad_norm": 0.8579983711242676,
      "learning_rate": 0.0002999898347482845,
      "loss": 3.3249,
      "step": 8
    },
    {
      "epoch": 0.06863679694947569,
      "grad_norm": 0.8907164335250854,
      "learning_rate": 0.00029995934052398757,
      "loss": 3.2235,
      "step": 9
    },
    {
      "epoch": 0.07626310772163966,
      "grad_norm": 0.9348911046981812,
      "learning_rate": 0.00029990852191942715,
      "loss": 3.1403,
      "step": 10
    },
    {
      "epoch": 0.08388941849380362,
      "grad_norm": 0.9178524613380432,
      "learning_rate": 0.0002998373865876983,
      "loss": 3.1472,
      "step": 11
    },
    {
      "epoch": 0.09151572926596759,
      "grad_norm": 1.1779893636703491,
      "learning_rate": 0.0002997459452415201,
      "loss": 3.1228,
      "step": 12
    },
    {
      "epoch": 0.09914204003813155,
      "grad_norm": 1.2700427770614624,
      "learning_rate": 0.00029963421165162316,
      "loss": 2.8118,
      "step": 13
    },
    {
      "epoch": 0.10676835081029552,
      "grad_norm": 1.2745684385299683,
      "learning_rate": 0.00029950220264467496,
      "loss": 2.5948,
      "step": 14
    },
    {
      "epoch": 0.11439466158245949,
      "grad_norm": 1.0857181549072266,
      "learning_rate": 0.0002993499381007466,
      "loss": 2.5336,
      "step": 15
    },
    {
      "epoch": 0.12202097235462345,
      "grad_norm": 0.926810622215271,
      "learning_rate": 0.00029917744095031806,
      "loss": 2.5163,
      "step": 16
    },
    {
      "epoch": 0.12964728312678742,
      "grad_norm": 0.7827774882316589,
      "learning_rate": 0.0002989847371708258,
      "loss": 2.4039,
      "step": 17
    },
    {
      "epoch": 0.13727359389895138,
      "grad_norm": 0.6036872863769531,
      "learning_rate": 0.00029877185578275025,
      "loss": 2.3962,
      "step": 18
    },
    {
      "epoch": 0.14489990467111535,
      "grad_norm": 0.8695657253265381,
      "learning_rate": 0.0002985388288452454,
      "loss": 2.3643,
      "step": 19
    },
    {
      "epoch": 0.15252621544327932,
      "grad_norm": 0.9263712763786316,
      "learning_rate": 0.0002982856914513109,
      "loss": 2.3404,
      "step": 20
    },
    {
      "epoch": 0.1601525262154433,
      "grad_norm": 0.41602811217308044,
      "learning_rate": 0.00029801248172250705,
      "loss": 2.3645,
      "step": 21
    },
    {
      "epoch": 0.16777883698760723,
      "grad_norm": 0.8675795197486877,
      "learning_rate": 0.0002977192408032142,
      "loss": 2.3502,
      "step": 22
    },
    {
      "epoch": 0.1754051477597712,
      "grad_norm": 0.9460420608520508,
      "learning_rate": 0.0002974060128544361,
      "loss": 2.374,
      "step": 23
    },
    {
      "epoch": 0.18303145853193517,
      "grad_norm": 0.6238827705383301,
      "learning_rate": 0.0002970728450471497,
      "loss": 2.4411,
      "step": 24
    },
    {
      "epoch": 0.19065776930409914,
      "grad_norm": 0.6706355214118958,
      "learning_rate": 0.0002967197875552013,
      "loss": 2.5664,
      "step": 25
    },
    {
      "epoch": 0.19065776930409914,
      "eval_loss": 2.3339245319366455,
      "eval_runtime": 0.2802,
      "eval_samples_per_second": 178.467,
      "eval_steps_per_second": 46.402,
      "step": 25
    },
    {
      "epoch": 0.1982840800762631,
      "grad_norm": 1.384001612663269,
      "learning_rate": 0.0002963468935477506,
      "loss": 2.1595,
      "step": 26
    },
    {
      "epoch": 0.20591039084842708,
      "grad_norm": 0.8498388528823853,
      "learning_rate": 0.00029595421918126344,
      "loss": 2.109,
      "step": 27
    },
    {
      "epoch": 0.21353670162059105,
      "grad_norm": 0.3540984094142914,
      "learning_rate": 0.00029554182359105497,
      "loss": 2.1542,
      "step": 28
    },
    {
      "epoch": 0.22116301239275502,
      "grad_norm": 0.6427699327468872,
      "learning_rate": 0.00029510976888238435,
      "loss": 2.1396,
      "step": 29
    },
    {
      "epoch": 0.22878932316491898,
      "grad_norm": 0.9017099738121033,
      "learning_rate": 0.0002946581201211013,
      "loss": 2.1525,
      "step": 30
    },
    {
      "epoch": 0.23641563393708293,
      "grad_norm": 0.5602626204490662,
      "learning_rate": 0.00029418694532384816,
      "loss": 2.1099,
      "step": 31
    },
    {
      "epoch": 0.2440419447092469,
      "grad_norm": 0.35870033502578735,
      "learning_rate": 0.0002936963154478161,
      "loss": 2.073,
      "step": 32
    },
    {
      "epoch": 0.25166825548141086,
      "grad_norm": 0.4111023545265198,
      "learning_rate": 0.0002931863043800599,
      "loss": 2.1075,
      "step": 33
    },
    {
      "epoch": 0.25929456625357483,
      "grad_norm": 0.5750699043273926,
      "learning_rate": 0.00029265698892637034,
      "loss": 2.0506,
      "step": 34
    },
    {
      "epoch": 0.2669208770257388,
      "grad_norm": 0.687319278717041,
      "learning_rate": 0.00029210844879970775,
      "loss": 2.154,
      "step": 35
    },
    {
      "epoch": 0.27454718779790277,
      "grad_norm": 0.49932411313056946,
      "learning_rate": 0.0002915407666081976,
      "loss": 2.165,
      "step": 36
    },
    {
      "epoch": 0.28217349857006674,
      "grad_norm": 0.48046040534973145,
      "learning_rate": 0.0002909540278426897,
      "loss": 2.1741,
      "step": 37
    },
    {
      "epoch": 0.2897998093422307,
      "grad_norm": 0.797716498374939,
      "learning_rate": 0.0002903483208638841,
      "loss": 2.1329,
      "step": 38
    },
    {
      "epoch": 0.2974261201143947,
      "grad_norm": 0.425375372171402,
      "learning_rate": 0.0002897237368890237,
      "loss": 1.916,
      "step": 39
    },
    {
      "epoch": 0.30505243088655865,
      "grad_norm": 0.40925440192222595,
      "learning_rate": 0.0002890803699781578,
      "loss": 1.904,
      "step": 40
    },
    {
      "epoch": 0.3126787416587226,
      "grad_norm": 0.575661838054657,
      "learning_rate": 0.0002884183170199766,
      "loss": 1.9828,
      "step": 41
    },
    {
      "epoch": 0.3203050524308866,
      "grad_norm": 0.7085393667221069,
      "learning_rate": 0.0002877376777172205,
      "loss": 1.9275,
      "step": 42
    },
    {
      "epoch": 0.3279313632030505,
      "grad_norm": 0.4340769648551941,
      "learning_rate": 0.00028703855457166483,
      "loss": 1.9285,
      "step": 43
    },
    {
      "epoch": 0.33555767397521447,
      "grad_norm": 0.32672634720802307,
      "learning_rate": 0.00028632105286868374,
      "loss": 1.9382,
      "step": 44
    },
    {
      "epoch": 0.34318398474737843,
      "grad_norm": 0.3794914782047272,
      "learning_rate": 0.0002855852806613945,
      "loss": 1.8947,
      "step": 45
    },
    {
      "epoch": 0.3508102955195424,
      "grad_norm": 0.37154510617256165,
      "learning_rate": 0.00028483134875438527,
      "loss": 1.9039,
      "step": 46
    },
    {
      "epoch": 0.35843660629170637,
      "grad_norm": 0.3726259768009186,
      "learning_rate": 0.0002840593706870279,
      "loss": 1.9194,
      "step": 47
    },
    {
      "epoch": 0.36606291706387034,
      "grad_norm": 0.370515912771225,
      "learning_rate": 0.00028326946271637986,
      "loss": 1.8939,
      "step": 48
    },
    {
      "epoch": 0.3736892278360343,
      "grad_norm": 0.42510128021240234,
      "learning_rate": 0.00028246174379967606,
      "loss": 1.9624,
      "step": 49
    },
    {
      "epoch": 0.3813155386081983,
      "grad_norm": 0.531184732913971,
      "learning_rate": 0.0002816363355764142,
      "loss": 2.0973,
      "step": 50
    },
    {
      "epoch": 0.3813155386081983,
      "eval_loss": 1.9253315925598145,
      "eval_runtime": 0.2792,
      "eval_samples_per_second": 179.063,
      "eval_steps_per_second": 46.557,
      "step": 50
    },
    {
      "epoch": 0.38894184938036225,
      "grad_norm": 1.3231308460235596,
      "learning_rate": 0.00028079336235003674,
      "loss": 1.8672,
      "step": 51
    },
    {
      "epoch": 0.3965681601525262,
      "grad_norm": 0.3953985869884491,
      "learning_rate": 0.0002799329510692108,
      "loss": 1.7821,
      "step": 52
    },
    {
      "epoch": 0.4041944709246902,
      "grad_norm": 0.34664496779441833,
      "learning_rate": 0.0002790552313087104,
      "loss": 1.8357,
      "step": 53
    },
    {
      "epoch": 0.41182078169685415,
      "grad_norm": 0.32160669565200806,
      "learning_rate": 0.0002781603352499031,
      "loss": 1.8114,
      "step": 54
    },
    {
      "epoch": 0.4194470924690181,
      "grad_norm": 0.35647809505462646,
      "learning_rate": 0.0002772483976608436,
      "loss": 1.9044,
      "step": 55
    },
    {
      "epoch": 0.4270734032411821,
      "grad_norm": 0.5951141715049744,
      "learning_rate": 0.0002763195558759784,
      "loss": 1.8211,
      "step": 56
    },
    {
      "epoch": 0.43469971401334606,
      "grad_norm": 0.5895670652389526,
      "learning_rate": 0.00027537394977546377,
      "loss": 1.8736,
      "step": 57
    },
    {
      "epoch": 0.44232602478551003,
      "grad_norm": 0.3808022439479828,
      "learning_rate": 0.00027441172176410027,
      "loss": 1.8487,
      "step": 58
    },
    {
      "epoch": 0.449952335557674,
      "grad_norm": 0.3616071343421936,
      "learning_rate": 0.000273433016749887,
      "loss": 1.86,
      "step": 59
    },
    {
      "epoch": 0.45757864632983797,
      "grad_norm": 0.34098148345947266,
      "learning_rate": 0.00027243798212219926,
      "loss": 1.8849,
      "step": 60
    },
    {
      "epoch": 0.4652049571020019,
      "grad_norm": 0.3677496910095215,
      "learning_rate": 0.0002714267677295918,
      "loss": 1.9066,
      "step": 61
    },
    {
      "epoch": 0.47283126787416585,
      "grad_norm": 0.42355066537857056,
      "learning_rate": 0.0002703995258572327,
      "loss": 1.8943,
      "step": 62
    },
    {
      "epoch": 0.4804575786463298,
      "grad_norm": 0.8111021518707275,
      "learning_rate": 0.0002693564112039695,
      "loss": 1.8816,
      "step": 63
    },
    {
      "epoch": 0.4880838894184938,
      "grad_norm": 0.5244035720825195,
      "learning_rate": 0.00026829758085903196,
      "loss": 1.7596,
      "step": 64
    },
    {
      "epoch": 0.49571020019065776,
      "grad_norm": 0.49128928780555725,
      "learning_rate": 0.0002672231942783754,
      "loss": 1.7882,
      "step": 65
    },
    {
      "epoch": 0.5033365109628217,
      "grad_norm": 0.35969385504722595,
      "learning_rate": 0.000266133413260667,
      "loss": 1.7607,
      "step": 66
    },
    {
      "epoch": 0.5109628217349858,
      "grad_norm": 0.30560654401779175,
      "learning_rate": 0.0002650284019229195,
      "loss": 1.7378,
      "step": 67
    },
    {
      "epoch": 0.5185891325071497,
      "grad_norm": 0.2821063697338104,
      "learning_rate": 0.0002639083266757757,
      "loss": 1.7452,
      "step": 68
    },
    {
      "epoch": 0.5262154432793136,
      "grad_norm": 0.30434897541999817,
      "learning_rate": 0.000262773356198448,
      "loss": 1.757,
      "step": 69
    },
    {
      "epoch": 0.5338417540514776,
      "grad_norm": 0.35866251587867737,
      "learning_rate": 0.0002616236614133155,
      "loss": 1.8456,
      "step": 70
    },
    {
      "epoch": 0.5414680648236415,
      "grad_norm": 0.36527636647224426,
      "learning_rate": 0.0002604594154601839,
      "loss": 1.7636,
      "step": 71
    },
    {
      "epoch": 0.5490943755958055,
      "grad_norm": 0.4293094575405121,
      "learning_rate": 0.00025928079367021134,
      "loss": 1.7983,
      "step": 72
    },
    {
      "epoch": 0.5567206863679695,
      "grad_norm": 0.4558382034301758,
      "learning_rate": 0.000258087973539504,
      "loss": 1.8167,
      "step": 73
    },
    {
      "epoch": 0.5643469971401335,
      "grad_norm": 0.46237555146217346,
      "learning_rate": 0.00025688113470238616,
      "loss": 1.8516,
      "step": 74
    },
    {
      "epoch": 0.5719733079122974,
      "grad_norm": 0.5830234885215759,
      "learning_rate": 0.00025566045890434747,
      "loss": 1.8979,
      "step": 75
    },
    {
      "epoch": 0.5719733079122974,
      "eval_loss": 1.8215560913085938,
      "eval_runtime": 0.2804,
      "eval_samples_per_second": 178.348,
      "eval_steps_per_second": 46.371,
      "step": 75
    },
    {
      "epoch": 0.5795996186844614,
      "grad_norm": 0.4658236503601074,
      "learning_rate": 0.00025442612997467315,
      "loss": 1.7275,
      "step": 76
    },
    {
      "epoch": 0.5872259294566253,
      "grad_norm": 0.45749977231025696,
      "learning_rate": 0.0002531783337987598,
      "loss": 1.7482,
      "step": 77
    },
    {
      "epoch": 0.5948522402287894,
      "grad_norm": 0.4462372660636902,
      "learning_rate": 0.0002519172582901218,
      "loss": 1.7561,
      "step": 78
    },
    {
      "epoch": 0.6024785510009533,
      "grad_norm": 0.38078993558883667,
      "learning_rate": 0.00025064309336209214,
      "loss": 1.7398,
      "step": 79
    },
    {
      "epoch": 0.6101048617731173,
      "grad_norm": 0.3360918164253235,
      "learning_rate": 0.00024935603089922215,
      "loss": 1.7546,
      "step": 80
    },
    {
      "epoch": 0.6177311725452812,
      "grad_norm": 0.30906444787979126,
      "learning_rate": 0.0002480562647283846,
      "loss": 1.7487,
      "step": 81
    },
    {
      "epoch": 0.6253574833174452,
      "grad_norm": 0.36905303597450256,
      "learning_rate": 0.00024674399058958394,
      "loss": 1.7589,
      "step": 82
    },
    {
      "epoch": 0.6329837940896091,
      "grad_norm": 0.3760243058204651,
      "learning_rate": 0.0002454194061064785,
      "loss": 1.7732,
      "step": 83
    },
    {
      "epoch": 0.6406101048617732,
      "grad_norm": 0.41075772047042847,
      "learning_rate": 0.0002440827107566192,
      "loss": 1.7812,
      "step": 84
    },
    {
      "epoch": 0.6482364156339371,
      "grad_norm": 0.41905277967453003,
      "learning_rate": 0.00024273410584140913,
      "loss": 1.7692,
      "step": 85
    },
    {
      "epoch": 0.655862726406101,
      "grad_norm": 0.41411152482032776,
      "learning_rate": 0.00024137379445578774,
      "loss": 1.8508,
      "step": 86
    },
    {
      "epoch": 0.663489037178265,
      "grad_norm": 0.47408804297447205,
      "learning_rate": 0.0002400019814576463,
      "loss": 1.847,
      "step": 87
    },
    {
      "epoch": 0.6711153479504289,
      "grad_norm": 0.4235435724258423,
      "learning_rate": 0.00023861887343697624,
      "loss": 1.8122,
      "step": 88
    },
    {
      "epoch": 0.678741658722593,
      "grad_norm": 0.3859807252883911,
      "learning_rate": 0.00023722467868475812,
      "loss": 1.6975,
      "step": 89
    },
    {
      "epoch": 0.6863679694947569,
      "grad_norm": 0.3468429148197174,
      "learning_rate": 0.0002358196071615933,
      "loss": 1.694,
      "step": 90
    },
    {
      "epoch": 0.6939942802669209,
      "grad_norm": 0.3533165156841278,
      "learning_rate": 0.00023440387046608487,
      "loss": 1.6882,
      "step": 91
    },
    {
      "epoch": 0.7016205910390848,
      "grad_norm": 0.3266445994377136,
      "learning_rate": 0.00023297768180297187,
      "loss": 1.6909,
      "step": 92
    },
    {
      "epoch": 0.7092469018112488,
      "grad_norm": 0.3379365801811218,
      "learning_rate": 0.00023154125595102083,
      "loss": 1.7055,
      "step": 93
    },
    {
      "epoch": 0.7168732125834127,
      "grad_norm": 0.32845547795295715,
      "learning_rate": 0.00023009480923068157,
      "loss": 1.7529,
      "step": 94
    },
    {
      "epoch": 0.7244995233555768,
      "grad_norm": 0.3393206000328064,
      "learning_rate": 0.00022863855947150968,
      "loss": 1.7702,
      "step": 95
    },
    {
      "epoch": 0.7321258341277407,
      "grad_norm": 0.3422520160675049,
      "learning_rate": 0.0002271727259793624,
      "loss": 1.7063,
      "step": 96
    },
    {
      "epoch": 0.7397521448999047,
      "grad_norm": 0.378302663564682,
      "learning_rate": 0.0002256975295033719,
      "loss": 1.7602,
      "step": 97
    },
    {
      "epoch": 0.7473784556720686,
      "grad_norm": 0.41780129075050354,
      "learning_rate": 0.0002242131922027012,
      "loss": 1.8039,
      "step": 98
    },
    {
      "epoch": 0.7550047664442326,
      "grad_norm": 0.44365987181663513,
      "learning_rate": 0.00022271993761308807,
      "loss": 1.7738,
      "step": 99
    },
    {
      "epoch": 0.7626310772163966,
      "grad_norm": 0.5752303600311279,
      "learning_rate": 0.00022121799061318104,
      "loss": 1.9044,
      "step": 100
    },
    {
      "epoch": 0.7626310772163966,
      "eval_loss": 1.7645323276519775,
      "eval_runtime": 0.28,
      "eval_samples_per_second": 178.583,
      "eval_steps_per_second": 46.431,
      "step": 100
    },
    {
      "epoch": 0.7702573879885606,
      "grad_norm": 0.5027046203613281,
      "learning_rate": 0.00021970757739067358,
      "loss": 1.6627,
      "step": 101
    },
    {
      "epoch": 0.7778836987607245,
      "grad_norm": 0.4292294681072235,
      "learning_rate": 0.00021818892540824148,
      "loss": 1.6495,
      "step": 102
    },
    {
      "epoch": 0.7855100095328885,
      "grad_norm": 0.37569937109947205,
      "learning_rate": 0.00021666226336928708,
      "loss": 1.6692,
      "step": 103
    },
    {
      "epoch": 0.7931363203050524,
      "grad_norm": 0.3786998391151428,
      "learning_rate": 0.00021512782118349806,
      "loss": 1.6581,
      "step": 104
    },
    {
      "epoch": 0.8007626310772163,
      "grad_norm": 0.36832037568092346,
      "learning_rate": 0.0002135858299322234,
      "loss": 1.6714,
      "step": 105
    },
    {
      "epoch": 0.8083889418493804,
      "grad_norm": 0.336896687746048,
      "learning_rate": 0.00021203652183367363,
      "loss": 1.7207,
      "step": 106
    },
    {
      "epoch": 0.8160152526215443,
      "grad_norm": 0.35588911175727844,
      "learning_rate": 0.00021048013020794968,
      "loss": 1.7085,
      "step": 107
    },
    {
      "epoch": 0.8236415633937083,
      "grad_norm": 0.4056905508041382,
      "learning_rate": 0.00020891688944190548,
      "loss": 1.7094,
      "step": 108
    },
    {
      "epoch": 0.8312678741658722,
      "grad_norm": 0.40606462955474854,
      "learning_rate": 0.00020734703495385037,
      "loss": 1.7239,
      "step": 109
    },
    {
      "epoch": 0.8388941849380362,
      "grad_norm": 0.40182796120643616,
      "learning_rate": 0.0002057708031580958,
      "loss": 1.7333,
      "step": 110
    },
    {
      "epoch": 0.8465204957102002,
      "grad_norm": 0.44995224475860596,
      "learning_rate": 0.00020418843142935237,
      "loss": 1.7049,
      "step": 111
    },
    {
      "epoch": 0.8541468064823642,
      "grad_norm": 0.4388667047023773,
      "learning_rate": 0.00020260015806698213,
      "loss": 1.783,
      "step": 112
    },
    {
      "epoch": 0.8617731172545281,
      "grad_norm": 0.44865381717681885,
      "learning_rate": 0.00020100622225911128,
      "loss": 1.7508,
      "step": 113
    },
    {
      "epoch": 0.8693994280266921,
      "grad_norm": 0.4294881522655487,
      "learning_rate": 0.00019940686404660947,
      "loss": 1.6571,
      "step": 114
    },
    {
      "epoch": 0.877025738798856,
      "grad_norm": 0.3841921091079712,
      "learning_rate": 0.00019780232428694063,
      "loss": 1.695,
      "step": 115
    },
    {
      "epoch": 0.8846520495710201,
      "grad_norm": 0.3785157799720764,
      "learning_rate": 0.0001961928446178906,
      "loss": 1.6545,
      "step": 116
    },
    {
      "epoch": 0.892278360343184,
      "grad_norm": 0.33887144923210144,
      "learning_rate": 0.00019457866742117737,
      "loss": 1.6715,
      "step": 117
    },
    {
      "epoch": 0.899904671115348,
      "grad_norm": 0.32863056659698486,
      "learning_rate": 0.00019296003578594948,
      "loss": 1.6952,
      "step": 118
    },
    {
      "epoch": 0.9075309818875119,
      "grad_norm": 0.3387095034122467,
      "learning_rate": 0.00019133719347217733,
      "loss": 1.6291,
      "step": 119
    },
    {
      "epoch": 0.9151572926596759,
      "grad_norm": 0.3665367066860199,
      "learning_rate": 0.00018971038487394402,
      "loss": 1.7321,
      "step": 120
    },
    {
      "epoch": 0.9227836034318398,
      "grad_norm": 0.3495505452156067,
      "learning_rate": 0.00018807985498264066,
      "loss": 1.6587,
      "step": 121
    },
    {
      "epoch": 0.9304099142040038,
      "grad_norm": 0.40355828404426575,
      "learning_rate": 0.00018644584935007127,
      "loss": 1.7027,
      "step": 122
    },
    {
      "epoch": 0.9380362249761678,
      "grad_norm": 0.4357530474662781,
      "learning_rate": 0.0001848086140514738,
      "loss": 1.7724,
      "step": 123
    },
    {
      "epoch": 0.9456625357483317,
      "grad_norm": 0.44958412647247314,
      "learning_rate": 0.000183168395648462,
      "loss": 1.7454,
      "step": 124
    },
    {
      "epoch": 0.9532888465204957,
      "grad_norm": 0.5644071102142334,
      "learning_rate": 0.00018152544115189416,
      "loss": 1.8156,
      "step": 125
    },
    {
      "epoch": 0.9532888465204957,
      "eval_loss": 1.7262933254241943,
      "eval_runtime": 0.2797,
      "eval_samples_per_second": 178.792,
      "eval_steps_per_second": 46.486,
      "step": 125
    },
    {
      "epoch": 0.9609151572926596,
      "grad_norm": 0.45155230164527893,
      "learning_rate": 0.0001798799979846742,
      "loss": 1.6338,
      "step": 126
    },
    {
      "epoch": 0.9685414680648237,
      "grad_norm": 0.41884180903434753,
      "learning_rate": 0.00017823231394449072,
      "loss": 1.6829,
      "step": 127
    },
    {
      "epoch": 0.9761677788369876,
      "grad_norm": 0.3723445534706116,
      "learning_rate": 0.0001765826371664994,
      "loss": 1.6707,
      "step": 128
    },
    {
      "epoch": 0.9837940896091516,
      "grad_norm": 0.3832674026489258,
      "learning_rate": 0.00017493121608595511,
      "loss": 1.7397,
      "step": 129
    },
    {
      "epoch": 0.9914204003813155,
      "grad_norm": 0.37408822774887085,
      "learning_rate": 0.00017327829940079817,
      "loss": 1.6765,
      "step": 130
    },
    {
      "epoch": 0.9990467111534795,
      "grad_norm": 0.42527204751968384,
      "learning_rate": 0.00017162413603420142,
      "loss": 1.791,
      "step": 131
    },
    {
      "epoch": 1.0066730219256435,
      "grad_norm": 1.3834341764450073,
      "learning_rate": 0.00016996897509708345,
      "loss": 3.4039,
      "step": 132
    },
    {
      "epoch": 1.0142993326978074,
      "grad_norm": 0.43991145491600037,
      "learning_rate": 0.00016831306585059317,
      "loss": 1.6506,
      "step": 133
    },
    {
      "epoch": 1.0219256434699715,
      "grad_norm": 0.37817224860191345,
      "learning_rate": 0.0001666566576685722,
      "loss": 1.5943,
      "step": 134
    },
    {
      "epoch": 1.0295519542421354,
      "grad_norm": 0.34133297204971313,
      "learning_rate": 0.000165,
      "loss": 1.5756,
      "step": 135
    },
    {
      "epoch": 1.0371782650142993,
      "grad_norm": 0.32997646927833557,
      "learning_rate": 0.0001633433423314278,
      "loss": 1.6236,
      "step": 136
    },
    {
      "epoch": 1.0448045757864632,
      "grad_norm": 0.3591316342353821,
      "learning_rate": 0.00016168693414940683,
      "loss": 1.6221,
      "step": 137
    },
    {
      "epoch": 1.0524308865586272,
      "grad_norm": 0.3701683580875397,
      "learning_rate": 0.00016003102490291655,
      "loss": 1.6099,
      "step": 138
    },
    {
      "epoch": 1.0600571973307913,
      "grad_norm": 0.3921620845794678,
      "learning_rate": 0.00015837586396579858,
      "loss": 1.6507,
      "step": 139
    },
    {
      "epoch": 1.0676835081029552,
      "grad_norm": 0.411425918340683,
      "learning_rate": 0.00015672170059920183,
      "loss": 1.6658,
      "step": 140
    },
    {
      "epoch": 1.0753098188751191,
      "grad_norm": 0.4283716082572937,
      "learning_rate": 0.00015506878391404488,
      "loss": 1.6525,
      "step": 141
    },
    {
      "epoch": 1.082936129647283,
      "grad_norm": 0.43306857347488403,
      "learning_rate": 0.00015341736283350064,
      "loss": 1.6808,
      "step": 142
    },
    {
      "epoch": 1.0905624404194472,
      "grad_norm": 0.46868017315864563,
      "learning_rate": 0.0001517676860555093,
      "loss": 1.7022,
      "step": 143
    },
    {
      "epoch": 1.098188751191611,
      "grad_norm": 0.3939070403575897,
      "learning_rate": 0.0001501200020153258,
      "loss": 1.608,
      "step": 144
    },
    {
      "epoch": 1.105815061963775,
      "grad_norm": 0.5079172253608704,
      "learning_rate": 0.00014847455884810581,
      "loss": 1.664,
      "step": 145
    },
    {
      "epoch": 1.113441372735939,
      "grad_norm": 0.4701906740665436,
      "learning_rate": 0.00014683160435153796,
      "loss": 1.5924,
      "step": 146
    },
    {
      "epoch": 1.121067683508103,
      "grad_norm": 0.4438985288143158,
      "learning_rate": 0.00014519138594852615,
      "loss": 1.6186,
      "step": 147
    },
    {
      "epoch": 1.128693994280267,
      "grad_norm": 0.40085965394973755,
      "learning_rate": 0.00014355415064992873,
      "loss": 1.6421,
      "step": 148
    },
    {
      "epoch": 1.1363203050524309,
      "grad_norm": 0.3875059485435486,
      "learning_rate": 0.00014192014501735934,
      "loss": 1.5903,
      "step": 149
    },
    {
      "epoch": 1.1439466158245948,
      "grad_norm": 0.3794068992137909,
      "learning_rate": 0.00014028961512605598,
      "loss": 1.6741,
      "step": 150
    },
    {
      "epoch": 1.1439466158245948,
      "eval_loss": 1.7033360004425049,
      "eval_runtime": 0.2798,
      "eval_samples_per_second": 178.678,
      "eval_steps_per_second": 46.456,
      "step": 150
    },
    {
      "epoch": 1.1515729265967587,
      "grad_norm": 0.38320392370224,
      "learning_rate": 0.00013866280652782267,
      "loss": 1.6258,
      "step": 151
    },
    {
      "epoch": 1.1591992373689228,
      "grad_norm": 0.3931479752063751,
      "learning_rate": 0.00013703996421405052,
      "loss": 1.6313,
      "step": 152
    },
    {
      "epoch": 1.1668255481410867,
      "grad_norm": 0.4570615589618683,
      "learning_rate": 0.00013542133257882257,
      "loss": 1.6801,
      "step": 153
    },
    {
      "epoch": 1.1744518589132507,
      "grad_norm": 0.47714921832084656,
      "learning_rate": 0.0001338071553821094,
      "loss": 1.6307,
      "step": 154
    },
    {
      "epoch": 1.1820781696854148,
      "grad_norm": 0.4591121971607208,
      "learning_rate": 0.00013219767571305937,
      "loss": 1.7064,
      "step": 155
    },
    {
      "epoch": 1.1897044804575787,
      "grad_norm": 0.5732882022857666,
      "learning_rate": 0.00013059313595339053,
      "loss": 1.7405,
      "step": 156
    },
    {
      "epoch": 1.1973307912297426,
      "grad_norm": 0.4108792543411255,
      "learning_rate": 0.00012899377774088872,
      "loss": 1.6063,
      "step": 157
    },
    {
      "epoch": 1.2049571020019065,
      "grad_norm": 0.42478087544441223,
      "learning_rate": 0.00012739984193301784,
      "loss": 1.5782,
      "step": 158
    },
    {
      "epoch": 1.2125834127740704,
      "grad_norm": 0.44489574432373047,
      "learning_rate": 0.0001258115685706476,
      "loss": 1.5959,
      "step": 159
    },
    {
      "epoch": 1.2202097235462346,
      "grad_norm": 0.41875404119491577,
      "learning_rate": 0.0001242291968419042,
      "loss": 1.6163,
      "step": 160
    },
    {
      "epoch": 1.2278360343183985,
      "grad_norm": 0.3827251195907593,
      "learning_rate": 0.00012265296504614963,
      "loss": 1.6228,
      "step": 161
    },
    {
      "epoch": 1.2354623450905624,
      "grad_norm": 0.3817841112613678,
      "learning_rate": 0.0001210831105580945,
      "loss": 1.5694,
      "step": 162
    },
    {
      "epoch": 1.2430886558627263,
      "grad_norm": 0.3716193735599518,
      "learning_rate": 0.00011951986979205029,
      "loss": 1.6367,
      "step": 163
    },
    {
      "epoch": 1.2507149666348902,
      "grad_norm": 0.37516316771507263,
      "learning_rate": 0.00011796347816632634,
      "loss": 1.6157,
      "step": 164
    },
    {
      "epoch": 1.2583412774070544,
      "grad_norm": 0.4124738276004791,
      "learning_rate": 0.00011641417006777658,
      "loss": 1.5697,
      "step": 165
    },
    {
      "epoch": 1.2659675881792183,
      "grad_norm": 0.4279733896255493,
      "learning_rate": 0.00011487217881650195,
      "loss": 1.6447,
      "step": 166
    },
    {
      "epoch": 1.2735938989513822,
      "grad_norm": 0.4881094992160797,
      "learning_rate": 0.00011333773663071288,
      "loss": 1.6122,
      "step": 167
    },
    {
      "epoch": 1.2812202097235463,
      "grad_norm": 0.521618127822876,
      "learning_rate": 0.00011181107459175851,
      "loss": 1.7202,
      "step": 168
    },
    {
      "epoch": 1.2888465204957102,
      "grad_norm": 0.3777306079864502,
      "learning_rate": 0.00011029242260932638,
      "loss": 1.5756,
      "step": 169
    },
    {
      "epoch": 1.2964728312678742,
      "grad_norm": 0.4733025133609772,
      "learning_rate": 0.000108782009386819,
      "loss": 1.6479,
      "step": 170
    },
    {
      "epoch": 1.304099142040038,
      "grad_norm": 0.41860291361808777,
      "learning_rate": 0.00010728006238691194,
      "loss": 1.5983,
      "step": 171
    },
    {
      "epoch": 1.311725452812202,
      "grad_norm": 0.46844813227653503,
      "learning_rate": 0.00010578680779729879,
      "loss": 1.578,
      "step": 172
    },
    {
      "epoch": 1.3193517635843661,
      "grad_norm": 0.40656524896621704,
      "learning_rate": 0.0001043024704966281,
      "loss": 1.6255,
      "step": 173
    },
    {
      "epoch": 1.32697807435653,
      "grad_norm": 0.38256990909576416,
      "learning_rate": 0.00010282727402063758,
      "loss": 1.5675,
      "step": 174
    },
    {
      "epoch": 1.334604385128694,
      "grad_norm": 0.3779941201210022,
      "learning_rate": 0.00010136144052849031,
      "loss": 1.5789,
      "step": 175
    },
    {
      "epoch": 1.334604385128694,
      "eval_loss": 1.695977807044983,
      "eval_runtime": 0.2808,
      "eval_samples_per_second": 178.081,
      "eval_steps_per_second": 46.301,
      "step": 175
    },
    {
      "epoch": 1.342230695900858,
      "grad_norm": 0.3875720798969269,
      "learning_rate": 9.990519076931843e-05,
      "loss": 1.656,
      "step": 176
    },
    {
      "epoch": 1.349857006673022,
      "grad_norm": 0.38068655133247375,
      "learning_rate": 9.845874404897915e-05,
      "loss": 1.623,
      "step": 177
    },
    {
      "epoch": 1.357483317445186,
      "grad_norm": 0.4605511724948883,
      "learning_rate": 9.702231819702814e-05,
      "loss": 1.627,
      "step": 178
    },
    {
      "epoch": 1.3651096282173498,
      "grad_norm": 0.4176296889781952,
      "learning_rate": 9.559612953391507e-05,
      "loss": 1.6706,
      "step": 179
    },
    {
      "epoch": 1.3727359389895137,
      "grad_norm": 0.4767864942550659,
      "learning_rate": 9.418039283840671e-05,
      "loss": 1.6709,
      "step": 180
    },
    {
      "epoch": 1.3803622497616779,
      "grad_norm": 0.567336916923523,
      "learning_rate": 9.27753213152419e-05,
      "loss": 1.8214,
      "step": 181
    },
    {
      "epoch": 1.3879885605338418,
      "grad_norm": 0.4051623046398163,
      "learning_rate": 9.138112656302376e-05,
      "loss": 1.6248,
      "step": 182
    },
    {
      "epoch": 1.3956148713060057,
      "grad_norm": 0.380164235830307,
      "learning_rate": 8.999801854235373e-05,
      "loss": 1.5668,
      "step": 183
    },
    {
      "epoch": 1.4032411820781696,
      "grad_norm": 0.37853559851646423,
      "learning_rate": 8.862620554421221e-05,
      "loss": 1.6079,
      "step": 184
    },
    {
      "epoch": 1.4108674928503335,
      "grad_norm": 0.38022464513778687,
      "learning_rate": 8.726589415859088e-05,
      "loss": 1.6109,
      "step": 185
    },
    {
      "epoch": 1.4184938036224977,
      "grad_norm": 0.3726414442062378,
      "learning_rate": 8.591728924338075e-05,
      "loss": 1.5726,
      "step": 186
    },
    {
      "epoch": 1.4261201143946616,
      "grad_norm": 0.39313751459121704,
      "learning_rate": 8.45805938935215e-05,
      "loss": 1.5881,
      "step": 187
    },
    {
      "epoch": 1.4337464251668255,
      "grad_norm": 0.40941789746284485,
      "learning_rate": 8.325600941041607e-05,
      "loss": 1.6375,
      "step": 188
    },
    {
      "epoch": 1.4413727359389896,
      "grad_norm": 0.38843002915382385,
      "learning_rate": 8.194373527161539e-05,
      "loss": 1.5911,
      "step": 189
    },
    {
      "epoch": 1.4489990467111535,
      "grad_norm": 0.38351744413375854,
      "learning_rate": 8.064396910077785e-05,
      "loss": 1.6153,
      "step": 190
    },
    {
      "epoch": 1.4566253574833175,
      "grad_norm": 0.42547914385795593,
      "learning_rate": 7.935690663790787e-05,
      "loss": 1.5872,
      "step": 191
    },
    {
      "epoch": 1.4642516682554814,
      "grad_norm": 0.45052269101142883,
      "learning_rate": 7.808274170987818e-05,
      "loss": 1.6048,
      "step": 192
    },
    {
      "epoch": 1.4718779790276453,
      "grad_norm": 0.5102285742759705,
      "learning_rate": 7.682166620124017e-05,
      "loss": 1.6611,
      "step": 193
    },
    {
      "epoch": 1.4795042897998094,
      "grad_norm": 0.3973918855190277,
      "learning_rate": 7.55738700253268e-05,
      "loss": 1.6591,
      "step": 194
    },
    {
      "epoch": 1.4871306005719733,
      "grad_norm": 0.4221573770046234,
      "learning_rate": 7.43395410956525e-05,
      "loss": 1.5788,
      "step": 195
    },
    {
      "epoch": 1.4947569113441372,
      "grad_norm": 0.38037997484207153,
      "learning_rate": 7.311886529761383e-05,
      "loss": 1.543,
      "step": 196
    },
    {
      "epoch": 1.5023832221163014,
      "grad_norm": 0.3961423635482788,
      "learning_rate": 7.191202646049596e-05,
      "loss": 1.5559,
      "step": 197
    },
    {
      "epoch": 1.510009532888465,
      "grad_norm": 0.40945950150489807,
      "learning_rate": 7.071920632978867e-05,
      "loss": 1.6016,
      "step": 198
    },
    {
      "epoch": 1.5176358436606292,
      "grad_norm": 0.3885650038719177,
      "learning_rate": 6.954058453981609e-05,
      "loss": 1.587,
      "step": 199
    },
    {
      "epoch": 1.5252621544327931,
      "grad_norm": 0.4135299623012543,
      "learning_rate": 6.837633858668448e-05,
      "loss": 1.6103,
      "step": 200
    },
    {
      "epoch": 1.5252621544327931,
      "eval_loss": 1.6848325729370117,
      "eval_runtime": 0.2814,
      "eval_samples_per_second": 177.685,
      "eval_steps_per_second": 46.198,
      "step": 200
    },
    {
      "epoch": 1.532888465204957,
      "grad_norm": 0.382758229970932,
      "learning_rate": 6.722664380155198e-05,
      "loss": 1.6259,
      "step": 201
    },
    {
      "epoch": 1.5405147759771212,
      "grad_norm": 0.3803237974643707,
      "learning_rate": 6.609167332422427e-05,
      "loss": 1.5547,
      "step": 202
    },
    {
      "epoch": 1.548141086749285,
      "grad_norm": 0.4076535701751709,
      "learning_rate": 6.497159807708055e-05,
      "loss": 1.5846,
      "step": 203
    },
    {
      "epoch": 1.555767397521449,
      "grad_norm": 0.41619113087654114,
      "learning_rate": 6.386658673933301e-05,
      "loss": 1.6648,
      "step": 204
    },
    {
      "epoch": 1.5633937082936131,
      "grad_norm": 0.4555559754371643,
      "learning_rate": 6.277680572162459e-05,
      "loss": 1.6636,
      "step": 205
    },
    {
      "epoch": 1.5710200190657768,
      "grad_norm": 0.5715373158454895,
      "learning_rate": 6.170241914096804e-05,
      "loss": 1.7265,
      "step": 206
    },
    {
      "epoch": 1.578646329837941,
      "grad_norm": 0.39986926317214966,
      "learning_rate": 6.06435887960305e-05,
      "loss": 1.614,
      "step": 207
    },
    {
      "epoch": 1.5862726406101049,
      "grad_norm": 0.4014238119125366,
      "learning_rate": 5.960047414276724e-05,
      "loss": 1.5169,
      "step": 208
    },
    {
      "epoch": 1.5938989513822688,
      "grad_norm": 0.40676450729370117,
      "learning_rate": 5.857323227040816e-05,
      "loss": 1.5836,
      "step": 209
    },
    {
      "epoch": 1.601525262154433,
      "grad_norm": 0.38130614161491394,
      "learning_rate": 5.756201787780074e-05,
      "loss": 1.5636,
      "step": 210
    },
    {
      "epoch": 1.6091515729265966,
      "grad_norm": 0.4001769721508026,
      "learning_rate": 5.656698325011295e-05,
      "loss": 1.5641,
      "step": 211
    },
    {
      "epoch": 1.6167778836987607,
      "grad_norm": 0.3762960433959961,
      "learning_rate": 5.5588278235899724e-05,
      "loss": 1.615,
      "step": 212
    },
    {
      "epoch": 1.6244041944709247,
      "grad_norm": 0.372916042804718,
      "learning_rate": 5.462605022453621e-05,
      "loss": 1.6307,
      "step": 213
    },
    {
      "epoch": 1.6320305052430886,
      "grad_norm": 0.4022381603717804,
      "learning_rate": 5.368044412402161e-05,
      "loss": 1.5634,
      "step": 214
    },
    {
      "epoch": 1.6396568160152527,
      "grad_norm": 0.39602571725845337,
      "learning_rate": 5.275160233915637e-05,
      "loss": 1.6328,
      "step": 215
    },
    {
      "epoch": 1.6472831267874166,
      "grad_norm": 0.4366743266582489,
      "learning_rate": 5.183966475009686e-05,
      "loss": 1.6038,
      "step": 216
    },
    {
      "epoch": 1.6549094375595805,
      "grad_norm": 0.458132803440094,
      "learning_rate": 5.0944768691289534e-05,
      "loss": 1.6384,
      "step": 217
    },
    {
      "epoch": 1.6625357483317447,
      "grad_norm": 0.4853437840938568,
      "learning_rate": 5.0067048930789196e-05,
      "loss": 1.6787,
      "step": 218
    },
    {
      "epoch": 1.6701620591039084,
      "grad_norm": 0.4034062325954437,
      "learning_rate": 4.920663764996328e-05,
      "loss": 1.5721,
      "step": 219
    },
    {
      "epoch": 1.6777883698760725,
      "grad_norm": 0.4523112177848816,
      "learning_rate": 4.8363664423585795e-05,
      "loss": 1.6327,
      "step": 220
    },
    {
      "epoch": 1.6854146806482364,
      "grad_norm": 0.4193178415298462,
      "learning_rate": 4.753825620032397e-05,
      "loss": 1.5354,
      "step": 221
    },
    {
      "epoch": 1.6930409914204003,
      "grad_norm": 0.3819790482521057,
      "learning_rate": 4.673053728362012e-05,
      "loss": 1.5833,
      "step": 222
    },
    {
      "epoch": 1.7006673021925645,
      "grad_norm": 0.3742893636226654,
      "learning_rate": 4.5940629312972085e-05,
      "loss": 1.5805,
      "step": 223
    },
    {
      "epoch": 1.7082936129647281,
      "grad_norm": 0.37698328495025635,
      "learning_rate": 4.516865124561473e-05,
      "loss": 1.5632,
      "step": 224
    },
    {
      "epoch": 1.7159199237368923,
      "grad_norm": 0.39410701394081116,
      "learning_rate": 4.4414719338605445e-05,
      "loss": 1.6016,
      "step": 225
    },
    {
      "epoch": 1.7159199237368923,
      "eval_loss": 1.6763724088668823,
      "eval_runtime": 0.281,
      "eval_samples_per_second": 177.963,
      "eval_steps_per_second": 46.27,
      "step": 225
    },
    {
      "epoch": 1.7235462345090562,
      "grad_norm": 0.41081416606903076,
      "learning_rate": 4.367894713131622e-05,
      "loss": 1.5998,
      "step": 226
    },
    {
      "epoch": 1.73117254528122,
      "grad_norm": 0.4001181721687317,
      "learning_rate": 4.296144542833515e-05,
      "loss": 1.6213,
      "step": 227
    },
    {
      "epoch": 1.7387988560533842,
      "grad_norm": 0.4368586242198944,
      "learning_rate": 4.226232228277948e-05,
      "loss": 1.6338,
      "step": 228
    },
    {
      "epoch": 1.7464251668255482,
      "grad_norm": 0.4104710817337036,
      "learning_rate": 4.1581682980023354e-05,
      "loss": 1.6433,
      "step": 229
    },
    {
      "epoch": 1.754051477597712,
      "grad_norm": 0.4876128137111664,
      "learning_rate": 4.0919630021842204e-05,
      "loss": 1.6381,
      "step": 230
    },
    {
      "epoch": 1.7616777883698762,
      "grad_norm": 0.5875245332717896,
      "learning_rate": 4.027626311097629e-05,
      "loss": 1.7134,
      "step": 231
    },
    {
      "epoch": 1.76930409914204,
      "grad_norm": 0.40997183322906494,
      "learning_rate": 3.965167913611591e-05,
      "loss": 1.5599,
      "step": 232
    },
    {
      "epoch": 1.776930409914204,
      "grad_norm": 0.42695024609565735,
      "learning_rate": 3.9045972157310256e-05,
      "loss": 1.5685,
      "step": 233
    },
    {
      "epoch": 1.784556720686368,
      "grad_norm": 0.4017082452774048,
      "learning_rate": 3.845923339180239e-05,
      "loss": 1.5493,
      "step": 234
    },
    {
      "epoch": 1.7921830314585319,
      "grad_norm": 0.38347816467285156,
      "learning_rate": 3.78915512002922e-05,
      "loss": 1.5464,
      "step": 235
    },
    {
      "epoch": 1.799809342230696,
      "grad_norm": 0.40365535020828247,
      "learning_rate": 3.734301107362964e-05,
      "loss": 1.6257,
      "step": 236
    },
    {
      "epoch": 1.80743565300286,
      "grad_norm": 0.3748120963573456,
      "learning_rate": 3.681369561994005e-05,
      "loss": 1.5456,
      "step": 237
    },
    {
      "epoch": 1.8150619637750238,
      "grad_norm": 0.36911335587501526,
      "learning_rate": 3.6303684552183827e-05,
      "loss": 1.5886,
      "step": 238
    },
    {
      "epoch": 1.822688274547188,
      "grad_norm": 0.3796713948249817,
      "learning_rate": 3.581305467615181e-05,
      "loss": 1.5858,
      "step": 239
    },
    {
      "epoch": 1.8303145853193517,
      "grad_norm": 0.4003843665122986,
      "learning_rate": 3.5341879878898615e-05,
      "loss": 1.6126,
      "step": 240
    },
    {
      "epoch": 1.8379408960915158,
      "grad_norm": 0.4427483379840851,
      "learning_rate": 3.489023111761562e-05,
      "loss": 1.6487,
      "step": 241
    },
    {
      "epoch": 1.8455672068636797,
      "grad_norm": 0.4299188554286957,
      "learning_rate": 3.445817640894497e-05,
      "loss": 1.6723,
      "step": 242
    },
    {
      "epoch": 1.8531935176358436,
      "grad_norm": 0.5241357684135437,
      "learning_rate": 3.404578081873656e-05,
      "loss": 1.6198,
      "step": 243
    },
    {
      "epoch": 1.8608198284080077,
      "grad_norm": 0.40499526262283325,
      "learning_rate": 3.365310645224939e-05,
      "loss": 1.5758,
      "step": 244
    },
    {
      "epoch": 1.8684461391801714,
      "grad_norm": 0.43495872616767883,
      "learning_rate": 3.328021244479866e-05,
      "loss": 1.5897,
      "step": 245
    },
    {
      "epoch": 1.8760724499523356,
      "grad_norm": 0.39778581261634827,
      "learning_rate": 3.292715495285028e-05,
      "loss": 1.5267,
      "step": 246
    },
    {
      "epoch": 1.8836987607244995,
      "grad_norm": 0.3671037256717682,
      "learning_rate": 3.259398714556389e-05,
      "loss": 1.5499,
      "step": 247
    },
    {
      "epoch": 1.8913250714966634,
      "grad_norm": 0.38671308755874634,
      "learning_rate": 3.2280759196785803e-05,
      "loss": 1.628,
      "step": 248
    },
    {
      "epoch": 1.8989513822688275,
      "grad_norm": 0.41623926162719727,
      "learning_rate": 3.1987518277492934e-05,
      "loss": 1.5699,
      "step": 249
    },
    {
      "epoch": 1.9065776930409915,
      "grad_norm": 0.38484105467796326,
      "learning_rate": 3.171430854868911e-05,
      "loss": 1.5702,
      "step": 250
    },
    {
      "epoch": 1.9065776930409915,
      "eval_loss": 1.6708096265792847,
      "eval_runtime": 0.2807,
      "eval_samples_per_second": 178.136,
      "eval_steps_per_second": 46.315,
      "step": 250
    }
  ],
  "logging_steps": 1,
  "max_steps": 263,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 50,
  "stateful_callbacks": {
    "EarlyStoppingCallback": {
      "args": {
        "early_stopping_patience": 1,
        "early_stopping_threshold": 0.0
      },
      "attributes": {
        "early_stopping_patience_counter": 0
      }
    },
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1.11070068867072e+17,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}