{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9992646286374619,
  "eval_steps": 500,
  "global_step": 1189,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.008404244143292362,
      "grad_norm": 0.7383340001106262,
      "learning_rate": 2e-05,
      "loss": 2.4066,
      "step": 10
    },
    {
      "epoch": 0.016808488286584725,
      "grad_norm": 0.46394309401512146,
      "learning_rate": 4e-05,
      "loss": 2.0375,
      "step": 20
    },
    {
      "epoch": 0.025212732429877087,
      "grad_norm": 0.4739478528499603,
      "learning_rate": 6e-05,
      "loss": 1.5044,
      "step": 30
    },
    {
      "epoch": 0.03361697657316945,
      "grad_norm": 0.20930196344852448,
      "learning_rate": 8e-05,
      "loss": 0.8704,
      "step": 40
    },
    {
      "epoch": 0.04202122071646181,
      "grad_norm": 0.15288038551807404,
      "learning_rate": 0.0001,
      "loss": 0.6533,
      "step": 50
    },
    {
      "epoch": 0.050425464859754174,
      "grad_norm": 0.13073962926864624,
      "learning_rate": 0.00012,
      "loss": 0.586,
      "step": 60
    },
    {
      "epoch": 0.058829709003046536,
      "grad_norm": 0.14555367827415466,
      "learning_rate": 0.00014,
      "loss": 0.5793,
      "step": 70
    },
    {
      "epoch": 0.0672339531463389,
      "grad_norm": 0.12397414445877075,
      "learning_rate": 0.00016,
      "loss": 0.581,
      "step": 80
    },
    {
      "epoch": 0.07563819728963127,
      "grad_norm": 0.13021130859851837,
      "learning_rate": 0.00018,
      "loss": 0.5512,
      "step": 90
    },
    {
      "epoch": 0.08404244143292362,
      "grad_norm": 0.13012883067131042,
      "learning_rate": 0.0002,
      "loss": 0.5403,
      "step": 100
    },
    {
      "epoch": 0.09244668557621599,
      "grad_norm": 0.11942347884178162,
      "learning_rate": 0.00019942313239111625,
      "loss": 0.5247,
      "step": 110
    },
    {
      "epoch": 0.10085092971950835,
      "grad_norm": 0.11690942198038101,
      "learning_rate": 0.0001988462647822325,
      "loss": 0.5417,
      "step": 120
    },
    {
      "epoch": 0.10925517386280072,
      "grad_norm": 0.1355101615190506,
      "learning_rate": 0.00019826939717334873,
      "loss": 0.5273,
      "step": 130
    },
    {
      "epoch": 0.11765941800609307,
      "grad_norm": 0.1345665603876114,
      "learning_rate": 0.00019769252956446497,
      "loss": 0.5243,
      "step": 140
    },
    {
      "epoch": 0.12606366214938544,
      "grad_norm": 0.12515193223953247,
      "learning_rate": 0.0001971156619555812,
      "loss": 0.5344,
      "step": 150
    },
    {
      "epoch": 0.1344679062926778,
      "grad_norm": 0.15686553716659546,
      "learning_rate": 0.00019653879434669745,
      "loss": 0.5118,
      "step": 160
    },
    {
      "epoch": 0.14287215043597015,
      "grad_norm": 0.12068944424390793,
      "learning_rate": 0.0001959619267378137,
      "loss": 0.4979,
      "step": 170
    },
    {
      "epoch": 0.15127639457926254,
      "grad_norm": 0.13319459557533264,
      "learning_rate": 0.00019538505912892993,
      "loss": 0.503,
      "step": 180
    },
    {
      "epoch": 0.1596806387225549,
      "grad_norm": 0.11806949228048325,
      "learning_rate": 0.00019480819152004617,
      "loss": 0.49,
      "step": 190
    },
    {
      "epoch": 0.16808488286584725,
      "grad_norm": 0.12932075560092926,
      "learning_rate": 0.00019423132391116238,
      "loss": 0.514,
      "step": 200
    },
    {
      "epoch": 0.17648912700913963,
      "grad_norm": 0.11743929982185364,
      "learning_rate": 0.00019365445630227862,
      "loss": 0.4788,
      "step": 210
    },
    {
      "epoch": 0.18489337115243198,
      "grad_norm": 0.11788313835859299,
      "learning_rate": 0.00019307758869339486,
      "loss": 0.4891,
      "step": 220
    },
    {
      "epoch": 0.19329761529572434,
      "grad_norm": 0.11414741724729538,
      "learning_rate": 0.0001925007210845111,
      "loss": 0.5033,
      "step": 230
    },
    {
      "epoch": 0.2017018594390167,
      "grad_norm": 0.11419043689966202,
      "learning_rate": 0.00019192385347562737,
      "loss": 0.4844,
      "step": 240
    },
    {
      "epoch": 0.21010610358230908,
      "grad_norm": 0.12788020074367523,
      "learning_rate": 0.0001913469858667436,
      "loss": 0.4697,
      "step": 250
    },
    {
      "epoch": 0.21851034772560143,
      "grad_norm": 0.13661302626132965,
      "learning_rate": 0.00019077011825785982,
      "loss": 0.4627,
      "step": 260
    },
    {
      "epoch": 0.2269145918688938,
      "grad_norm": 0.12041325867176056,
      "learning_rate": 0.00019019325064897606,
      "loss": 0.4964,
      "step": 270
    },
    {
      "epoch": 0.23531883601218614,
      "grad_norm": 0.133742094039917,
      "learning_rate": 0.0001896163830400923,
      "loss": 0.4658,
      "step": 280
    },
    {
      "epoch": 0.24372308015547853,
      "grad_norm": 0.1261977106332779,
      "learning_rate": 0.00018903951543120854,
      "loss": 0.4781,
      "step": 290
    },
    {
      "epoch": 0.2521273242987709,
      "grad_norm": 0.130150705575943,
      "learning_rate": 0.00018846264782232478,
      "loss": 0.4922,
      "step": 300
    },
    {
      "epoch": 0.26053156844206327,
      "grad_norm": 0.13174410164356232,
      "learning_rate": 0.00018788578021344102,
      "loss": 0.4559,
      "step": 310
    },
    {
      "epoch": 0.2689358125853556,
      "grad_norm": 0.1186077669262886,
      "learning_rate": 0.00018730891260455726,
      "loss": 0.4722,
      "step": 320
    },
    {
      "epoch": 0.277340056728648,
      "grad_norm": 0.116569384932518,
      "learning_rate": 0.0001867320449956735,
      "loss": 0.4457,
      "step": 330
    },
    {
      "epoch": 0.2857443008719403,
      "grad_norm": 0.12219471484422684,
      "learning_rate": 0.00018615517738678974,
      "loss": 0.4849,
      "step": 340
    },
    {
      "epoch": 0.2941485450152327,
      "grad_norm": 0.12746909260749817,
      "learning_rate": 0.00018557830977790598,
      "loss": 0.4821,
      "step": 350
    },
    {
      "epoch": 0.30255278915852507,
      "grad_norm": 0.14125944674015045,
      "learning_rate": 0.00018500144216902222,
      "loss": 0.4605,
      "step": 360
    },
    {
      "epoch": 0.3109570333018174,
      "grad_norm": 0.19157269597053528,
      "learning_rate": 0.00018442457456013846,
      "loss": 0.4541,
      "step": 370
    },
    {
      "epoch": 0.3193612774451098,
      "grad_norm": 0.12603330612182617,
      "learning_rate": 0.0001838477069512547,
      "loss": 0.4536,
      "step": 380
    },
    {
      "epoch": 0.32776552158840216,
      "grad_norm": 0.12653909623622894,
      "learning_rate": 0.00018327083934237091,
      "loss": 0.4468,
      "step": 390
    },
    {
      "epoch": 0.3361697657316945,
      "grad_norm": 0.15930472314357758,
      "learning_rate": 0.00018269397173348718,
      "loss": 0.4542,
      "step": 400
    },
    {
      "epoch": 0.3445740098749869,
      "grad_norm": 0.13266988098621368,
      "learning_rate": 0.00018211710412460342,
      "loss": 0.4335,
      "step": 410
    },
    {
      "epoch": 0.35297825401827926,
      "grad_norm": 0.12103667855262756,
      "learning_rate": 0.00018154023651571966,
      "loss": 0.4575,
      "step": 420
    },
    {
      "epoch": 0.3613824981615716,
      "grad_norm": 0.14439740777015686,
      "learning_rate": 0.0001809633689068359,
      "loss": 0.4377,
      "step": 430
    },
    {
      "epoch": 0.36978674230486397,
      "grad_norm": 0.12652407586574554,
      "learning_rate": 0.00018038650129795214,
      "loss": 0.4363,
      "step": 440
    },
    {
      "epoch": 0.3781909864481563,
      "grad_norm": 0.14594405889511108,
      "learning_rate": 0.00017980963368906835,
      "loss": 0.4306,
      "step": 450
    },
    {
      "epoch": 0.3865952305914487,
      "grad_norm": 0.12562687695026398,
      "learning_rate": 0.0001792327660801846,
      "loss": 0.4501,
      "step": 460
    },
    {
      "epoch": 0.39499947473474106,
      "grad_norm": 0.14584492146968842,
      "learning_rate": 0.00017865589847130083,
      "loss": 0.4509,
      "step": 470
    },
    {
      "epoch": 0.4034037188780334,
      "grad_norm": 0.13192500174045563,
      "learning_rate": 0.00017807903086241707,
      "loss": 0.4505,
      "step": 480
    },
    {
      "epoch": 0.4118079630213258,
      "grad_norm": 0.14266645908355713,
      "learning_rate": 0.00017750216325353331,
      "loss": 0.4585,
      "step": 490
    },
    {
      "epoch": 0.42021220716461816,
      "grad_norm": 0.1400412619113922,
      "learning_rate": 0.00017692529564464958,
      "loss": 0.4365,
      "step": 500
    },
    {
      "epoch": 0.4286164513079105,
      "grad_norm": 0.14728468656539917,
      "learning_rate": 0.0001763484280357658,
      "loss": 0.4303,
      "step": 510
    },
    {
      "epoch": 0.43702069545120287,
      "grad_norm": 0.15791365504264832,
      "learning_rate": 0.00017577156042688203,
      "loss": 0.4407,
      "step": 520
    },
    {
      "epoch": 0.4454249395944952,
      "grad_norm": 0.15447258949279785,
      "learning_rate": 0.00017519469281799827,
      "loss": 0.4365,
      "step": 530
    },
    {
      "epoch": 0.4538291837377876,
      "grad_norm": 0.1518252044916153,
      "learning_rate": 0.00017461782520911451,
      "loss": 0.4305,
      "step": 540
    },
    {
      "epoch": 0.46223342788107996,
      "grad_norm": 0.1154065877199173,
      "learning_rate": 0.00017404095760023075,
      "loss": 0.4212,
      "step": 550
    },
    {
      "epoch": 0.4706376720243723,
      "grad_norm": 0.12900012731552124,
      "learning_rate": 0.000173464089991347,
      "loss": 0.4277,
      "step": 560
    },
    {
      "epoch": 0.47904191616766467,
      "grad_norm": 0.1349458247423172,
      "learning_rate": 0.00017288722238246323,
      "loss": 0.4051,
      "step": 570
    },
    {
      "epoch": 0.48744616031095706,
      "grad_norm": 0.16337165236473083,
      "learning_rate": 0.00017231035477357947,
      "loss": 0.407,
      "step": 580
    },
    {
      "epoch": 0.4958504044542494,
      "grad_norm": 0.13420593738555908,
      "learning_rate": 0.0001717334871646957,
      "loss": 0.4138,
      "step": 590
    },
    {
      "epoch": 0.5042546485975418,
      "grad_norm": 0.13840581476688385,
      "learning_rate": 0.00017115661955581195,
      "loss": 0.4099,
      "step": 600
    },
    {
      "epoch": 0.5126588927408341,
      "grad_norm": 0.1378021389245987,
      "learning_rate": 0.0001705797519469282,
      "loss": 0.4254,
      "step": 610
    },
    {
      "epoch": 0.5210631368841265,
      "grad_norm": 0.1607150137424469,
      "learning_rate": 0.00017000288433804443,
      "loss": 0.4353,
      "step": 620
    },
    {
      "epoch": 0.5294673810274189,
      "grad_norm": 0.13462169468402863,
      "learning_rate": 0.00016942601672916067,
      "loss": 0.4267,
      "step": 630
    },
    {
      "epoch": 0.5378716251707112,
      "grad_norm": 0.14311543107032776,
      "learning_rate": 0.00016884914912027689,
      "loss": 0.4301,
      "step": 640
    },
    {
      "epoch": 0.5462758693140036,
      "grad_norm": 0.15559442341327667,
      "learning_rate": 0.00016827228151139313,
      "loss": 0.4102,
      "step": 650
    },
    {
      "epoch": 0.554680113457296,
      "grad_norm": 0.15557149052619934,
      "learning_rate": 0.00016769541390250937,
      "loss": 0.4136,
      "step": 660
    },
    {
      "epoch": 0.5630843576005883,
      "grad_norm": 0.135511115193367,
      "learning_rate": 0.00016711854629362563,
      "loss": 0.4153,
      "step": 670
    },
    {
      "epoch": 0.5714886017438806,
      "grad_norm": 0.13760776817798615,
      "learning_rate": 0.00016654167868474187,
      "loss": 0.4145,
      "step": 680
    },
    {
      "epoch": 0.579892845887173,
      "grad_norm": 0.14971590042114258,
      "learning_rate": 0.0001659648110758581,
      "loss": 0.3875,
      "step": 690
    },
    {
      "epoch": 0.5882970900304654,
      "grad_norm": 0.16005663573741913,
      "learning_rate": 0.00016538794346697433,
      "loss": 0.3938,
      "step": 700
    },
    {
      "epoch": 0.5967013341737577,
      "grad_norm": 0.1625218689441681,
      "learning_rate": 0.00016481107585809057,
      "loss": 0.3871,
      "step": 710
    },
    {
      "epoch": 0.6051055783170501,
      "grad_norm": 0.17047689855098724,
      "learning_rate": 0.0001642342082492068,
      "loss": 0.412,
      "step": 720
    },
    {
      "epoch": 0.6135098224603425,
      "grad_norm": 0.13825903832912445,
      "learning_rate": 0.00016365734064032305,
      "loss": 0.3948,
      "step": 730
    },
    {
      "epoch": 0.6219140666036348,
      "grad_norm": 0.14830929040908813,
      "learning_rate": 0.00016308047303143929,
      "loss": 0.3927,
      "step": 740
    },
    {
      "epoch": 0.6303183107469272,
      "grad_norm": 0.13950933516025543,
      "learning_rate": 0.00016250360542255553,
      "loss": 0.4051,
      "step": 750
    },
    {
      "epoch": 0.6387225548902196,
      "grad_norm": 0.15511371195316315,
      "learning_rate": 0.0001619267378136718,
      "loss": 0.4041,
      "step": 760
    },
    {
      "epoch": 0.6471267990335119,
      "grad_norm": 0.14828190207481384,
      "learning_rate": 0.000161349870204788,
      "loss": 0.3824,
      "step": 770
    },
    {
      "epoch": 0.6555310431768043,
      "grad_norm": 0.144051194190979,
      "learning_rate": 0.00016077300259590425,
      "loss": 0.3829,
      "step": 780
    },
    {
      "epoch": 0.6639352873200967,
      "grad_norm": 0.14780694246292114,
      "learning_rate": 0.00016019613498702049,
      "loss": 0.3814,
      "step": 790
    },
    {
      "epoch": 0.672339531463389,
      "grad_norm": 0.15042325854301453,
      "learning_rate": 0.00015961926737813673,
      "loss": 0.3962,
      "step": 800
    },
    {
      "epoch": 0.6807437756066814,
      "grad_norm": 0.16325107216835022,
      "learning_rate": 0.00015904239976925297,
      "loss": 0.3801,
      "step": 810
    },
    {
      "epoch": 0.6891480197499738,
      "grad_norm": 0.14843328297138214,
      "learning_rate": 0.0001584655321603692,
      "loss": 0.4082,
      "step": 820
    },
    {
      "epoch": 0.6975522638932661,
      "grad_norm": 0.16731064021587372,
      "learning_rate": 0.00015788866455148545,
      "loss": 0.4192,
      "step": 830
    },
    {
      "epoch": 0.7059565080365585,
      "grad_norm": 0.18703435361385345,
      "learning_rate": 0.00015731179694260169,
      "loss": 0.4009,
      "step": 840
    },
    {
      "epoch": 0.7143607521798508,
      "grad_norm": 0.13935630023479462,
      "learning_rate": 0.00015673492933371793,
      "loss": 0.3618,
      "step": 850
    },
    {
      "epoch": 0.7227649963231432,
      "grad_norm": 0.13263636827468872,
      "learning_rate": 0.00015615806172483417,
      "loss": 0.3963,
      "step": 860
    },
    {
      "epoch": 0.7311692404664355,
      "grad_norm": 0.14940643310546875,
      "learning_rate": 0.0001555811941159504,
      "loss": 0.3585,
      "step": 870
    },
    {
      "epoch": 0.7395734846097279,
      "grad_norm": 0.14807912707328796,
      "learning_rate": 0.00015500432650706665,
      "loss": 0.3748,
      "step": 880
    },
    {
      "epoch": 0.7479777287530203,
      "grad_norm": 0.15254080295562744,
      "learning_rate": 0.00015442745889818286,
      "loss": 0.3718,
      "step": 890
    },
    {
      "epoch": 0.7563819728963126,
      "grad_norm": 0.16590768098831177,
      "learning_rate": 0.0001538505912892991,
      "loss": 0.386,
      "step": 900
    },
    {
      "epoch": 0.764786217039605,
      "grad_norm": 0.15733902156352997,
      "learning_rate": 0.00015327372368041534,
      "loss": 0.3756,
      "step": 910
    },
    {
      "epoch": 0.7731904611828974,
      "grad_norm": 0.13757385313510895,
      "learning_rate": 0.00015269685607153158,
      "loss": 0.3843,
      "step": 920
    },
    {
      "epoch": 0.7815947053261897,
      "grad_norm": 0.14952607452869415,
      "learning_rate": 0.00015211998846264784,
      "loss": 0.3634,
      "step": 930
    },
    {
      "epoch": 0.7899989494694821,
      "grad_norm": 0.1516282558441162,
      "learning_rate": 0.00015154312085376408,
      "loss": 0.3798,
      "step": 940
    },
    {
      "epoch": 0.7984031936127745,
      "grad_norm": 0.17785628139972687,
      "learning_rate": 0.00015096625324488032,
      "loss": 0.3681,
      "step": 950
    },
    {
      "epoch": 0.8068074377560668,
      "grad_norm": 0.171351820230484,
      "learning_rate": 0.00015038938563599654,
      "loss": 0.3686,
      "step": 960
    },
    {
      "epoch": 0.8152116818993592,
      "grad_norm": 0.1742231398820877,
      "learning_rate": 0.00014981251802711278,
      "loss": 0.3792,
      "step": 970
    },
    {
      "epoch": 0.8236159260426515,
      "grad_norm": 0.16650599241256714,
      "learning_rate": 0.00014923565041822902,
      "loss": 0.3577,
      "step": 980
    },
    {
      "epoch": 0.8320201701859439,
      "grad_norm": 0.1497887670993805,
      "learning_rate": 0.00014865878280934526,
      "loss": 0.3553,
      "step": 990
    },
    {
      "epoch": 0.8404244143292363,
      "grad_norm": 0.14781557023525238,
      "learning_rate": 0.0001480819152004615,
      "loss": 0.3538,
      "step": 1000
    },
    {
      "epoch": 0.8488286584725286,
      "grad_norm": 0.15724751353263855,
      "learning_rate": 0.00014750504759157774,
      "loss": 0.3597,
      "step": 1010
    },
    {
      "epoch": 0.857232902615821,
      "grad_norm": 0.18635571002960205,
      "learning_rate": 0.00014692817998269398,
      "loss": 0.3615,
      "step": 1020
    },
    {
      "epoch": 0.8656371467591134,
      "grad_norm": 0.17742526531219482,
      "learning_rate": 0.00014635131237381022,
      "loss": 0.348,
      "step": 1030
    },
    {
      "epoch": 0.8740413909024057,
      "grad_norm": 0.20535768568515778,
      "learning_rate": 0.00014577444476492646,
      "loss": 0.3343,
      "step": 1040
    },
    {
      "epoch": 0.8824456350456981,
      "grad_norm": 0.18968522548675537,
      "learning_rate": 0.0001451975771560427,
      "loss": 0.3615,
      "step": 1050
    },
    {
      "epoch": 0.8908498791889904,
      "grad_norm": 0.1528492122888565,
      "learning_rate": 0.00014462070954715894,
      "loss": 0.3786,
      "step": 1060
    },
    {
      "epoch": 0.8992541233322828,
      "grad_norm": 0.15841075778007507,
      "learning_rate": 0.00014404384193827518,
      "loss": 0.3761,
      "step": 1070
    },
    {
      "epoch": 0.9076583674755752,
      "grad_norm": 0.15167982876300812,
      "learning_rate": 0.0001434669743293914,
      "loss": 0.3528,
      "step": 1080
    },
    {
      "epoch": 0.9160626116188675,
      "grad_norm": 0.14096671342849731,
      "learning_rate": 0.00014289010672050766,
      "loss": 0.371,
      "step": 1090
    },
    {
      "epoch": 0.9244668557621599,
      "grad_norm": 0.1579194813966751,
      "learning_rate": 0.0001423132391116239,
      "loss": 0.3491,
      "step": 1100
    },
    {
      "epoch": 0.9328710999054523,
      "grad_norm": 0.16789057850837708,
      "learning_rate": 0.00014173637150274014,
      "loss": 0.3536,
      "step": 1110
    },
    {
      "epoch": 0.9412753440487446,
      "grad_norm": 0.13980717957019806,
      "learning_rate": 0.00014115950389385638,
      "loss": 0.3423,
      "step": 1120
    },
    {
      "epoch": 0.949679588192037,
      "grad_norm": 0.19879643619060516,
      "learning_rate": 0.00014058263628497262,
      "loss": 0.3285,
      "step": 1130
    },
    {
      "epoch": 0.9580838323353293,
      "grad_norm": 0.16574440896511078,
      "learning_rate": 0.00014000576867608886,
      "loss": 0.3568,
      "step": 1140
    },
    {
      "epoch": 0.9664880764786217,
      "grad_norm": 0.15376180410385132,
      "learning_rate": 0.00013942890106720507,
      "loss": 0.3558,
      "step": 1150
    },
    {
      "epoch": 0.9748923206219141,
      "grad_norm": 0.17232170701026917,
      "learning_rate": 0.0001388520334583213,
      "loss": 0.342,
      "step": 1160
    },
    {
      "epoch": 0.9832965647652064,
      "grad_norm": 0.1959993690252304,
      "learning_rate": 0.00013827516584943755,
      "loss": 0.3458,
      "step": 1170
    },
    {
      "epoch": 0.9917008089084988,
      "grad_norm": 0.14029347896575928,
      "learning_rate": 0.0001376982982405538,
      "loss": 0.3297,
      "step": 1180
    }
  ],
  "logging_steps": 10,
  "max_steps": 3567,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 8.069523410383258e+16,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}