|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.9994547062516325, |
|
"eval_steps": 500, |
|
"global_step": 1704, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0005865344520256058, |
|
"grad_norm": 0.134765625, |
|
"learning_rate": 1.1695906432748538e-06, |
|
"loss": 1.5828, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0029326722601280297, |
|
"grad_norm": 0.1298828125, |
|
"learning_rate": 5.8479532163742686e-06, |
|
"loss": 1.5339, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.005865344520256059, |
|
"grad_norm": 0.12890625, |
|
"learning_rate": 1.1695906432748537e-05, |
|
"loss": 1.5449, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.008798016780384088, |
|
"grad_norm": 0.1328125, |
|
"learning_rate": 1.7543859649122806e-05, |
|
"loss": 1.5426, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.011730689040512119, |
|
"grad_norm": 0.1279296875, |
|
"learning_rate": 2.3391812865497074e-05, |
|
"loss": 1.5446, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.014663361300640148, |
|
"grad_norm": 0.142578125, |
|
"learning_rate": 2.9239766081871346e-05, |
|
"loss": 1.5188, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.017596033560768175, |
|
"grad_norm": 0.1298828125, |
|
"learning_rate": 3.508771929824561e-05, |
|
"loss": 1.5175, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.020528705820896206, |
|
"grad_norm": 0.119140625, |
|
"learning_rate": 4.093567251461988e-05, |
|
"loss": 1.4769, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.023461378081024237, |
|
"grad_norm": 0.1171875, |
|
"learning_rate": 4.678362573099415e-05, |
|
"loss": 1.4503, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.026394050341152265, |
|
"grad_norm": 0.0947265625, |
|
"learning_rate": 5.2631578947368424e-05, |
|
"loss": 1.4078, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.029326722601280296, |
|
"grad_norm": 0.08642578125, |
|
"learning_rate": 5.847953216374269e-05, |
|
"loss": 1.3803, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.03225939486140832, |
|
"grad_norm": 0.07373046875, |
|
"learning_rate": 6.432748538011695e-05, |
|
"loss": 1.365, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.03519206712153635, |
|
"grad_norm": 0.06689453125, |
|
"learning_rate": 7.017543859649122e-05, |
|
"loss": 1.3275, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.038124739381664385, |
|
"grad_norm": 0.0634765625, |
|
"learning_rate": 7.602339181286549e-05, |
|
"loss": 1.3026, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.04105741164179241, |
|
"grad_norm": 0.06005859375, |
|
"learning_rate": 8.187134502923976e-05, |
|
"loss": 1.2994, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.04399008390192044, |
|
"grad_norm": 0.0634765625, |
|
"learning_rate": 8.771929824561403e-05, |
|
"loss": 1.2961, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.046922756162048475, |
|
"grad_norm": 0.06201171875, |
|
"learning_rate": 9.35672514619883e-05, |
|
"loss": 1.2742, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.0498554284221765, |
|
"grad_norm": 0.04833984375, |
|
"learning_rate": 9.941520467836257e-05, |
|
"loss": 1.2694, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.05278810068230453, |
|
"grad_norm": 0.0537109375, |
|
"learning_rate": 0.00010526315789473685, |
|
"loss": 1.2601, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.05572077294243256, |
|
"grad_norm": 0.05419921875, |
|
"learning_rate": 0.00011111111111111112, |
|
"loss": 1.2419, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.05865344520256059, |
|
"grad_norm": 0.053466796875, |
|
"learning_rate": 0.00011695906432748539, |
|
"loss": 1.2357, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.06158611746268862, |
|
"grad_norm": 0.060302734375, |
|
"learning_rate": 0.00012280701754385965, |
|
"loss": 1.2227, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.06451878972281665, |
|
"grad_norm": 0.052978515625, |
|
"learning_rate": 0.0001286549707602339, |
|
"loss": 1.2327, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.06745146198294467, |
|
"grad_norm": 0.061767578125, |
|
"learning_rate": 0.0001345029239766082, |
|
"loss": 1.2221, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.0703841342430727, |
|
"grad_norm": 0.05859375, |
|
"learning_rate": 0.00014035087719298245, |
|
"loss": 1.209, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.07331680650320074, |
|
"grad_norm": 0.064453125, |
|
"learning_rate": 0.00014619883040935673, |
|
"loss": 1.2159, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.07624947876332877, |
|
"grad_norm": 0.0791015625, |
|
"learning_rate": 0.00015204678362573098, |
|
"loss": 1.1813, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.0791821510234568, |
|
"grad_norm": 0.0693359375, |
|
"learning_rate": 0.00015789473684210527, |
|
"loss": 1.2059, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.08211482328358483, |
|
"grad_norm": 0.06396484375, |
|
"learning_rate": 0.00016374269005847952, |
|
"loss": 1.2146, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.08504749554371285, |
|
"grad_norm": 0.09326171875, |
|
"learning_rate": 0.0001695906432748538, |
|
"loss": 1.1965, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.08798016780384088, |
|
"grad_norm": 0.0791015625, |
|
"learning_rate": 0.00017543859649122806, |
|
"loss": 1.1822, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.09091284006396891, |
|
"grad_norm": 0.08349609375, |
|
"learning_rate": 0.00018128654970760234, |
|
"loss": 1.1824, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.09384551232409695, |
|
"grad_norm": 0.078125, |
|
"learning_rate": 0.0001871345029239766, |
|
"loss": 1.1992, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.09677818458422498, |
|
"grad_norm": 0.0791015625, |
|
"learning_rate": 0.00019298245614035088, |
|
"loss": 1.1806, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.099710856844353, |
|
"grad_norm": 0.07666015625, |
|
"learning_rate": 0.00019883040935672513, |
|
"loss": 1.1926, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.10264352910448103, |
|
"grad_norm": 0.07666015625, |
|
"learning_rate": 0.00019999664028072614, |
|
"loss": 1.1773, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.10557620136460906, |
|
"grad_norm": 0.0830078125, |
|
"learning_rate": 0.000199982991808088, |
|
"loss": 1.1422, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.10850887362473709, |
|
"grad_norm": 0.11279296875, |
|
"learning_rate": 0.00019995884603149402, |
|
"loss": 1.1739, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.11144154588486511, |
|
"grad_norm": 0.0859375, |
|
"learning_rate": 0.00019992420548603092, |
|
"loss": 1.1652, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.11437421814499316, |
|
"grad_norm": 0.08154296875, |
|
"learning_rate": 0.00019987907380864062, |
|
"loss": 1.1597, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.11730689040512118, |
|
"grad_norm": 0.07861328125, |
|
"learning_rate": 0.00019982345573773844, |
|
"loss": 1.1497, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.12023956266524921, |
|
"grad_norm": 0.08544921875, |
|
"learning_rate": 0.00019975735711271552, |
|
"loss": 1.1419, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.12317223492537724, |
|
"grad_norm": 0.080078125, |
|
"learning_rate": 0.00019968078487332566, |
|
"loss": 1.1583, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.12610490718550527, |
|
"grad_norm": 0.08203125, |
|
"learning_rate": 0.000199593747058957, |
|
"loss": 1.1501, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.1290375794456333, |
|
"grad_norm": 0.0830078125, |
|
"learning_rate": 0.00019949625280778777, |
|
"loss": 1.1552, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.13197025170576132, |
|
"grad_norm": 0.080078125, |
|
"learning_rate": 0.00019938831235582672, |
|
"loss": 1.1285, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.13490292396588935, |
|
"grad_norm": 0.07861328125, |
|
"learning_rate": 0.0001992699370358387, |
|
"loss": 1.1525, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.13783559622601738, |
|
"grad_norm": 0.0830078125, |
|
"learning_rate": 0.00019914113927615472, |
|
"loss": 1.1405, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.1407682684861454, |
|
"grad_norm": 0.07861328125, |
|
"learning_rate": 0.00019900193259936704, |
|
"loss": 1.1432, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.14370094074627343, |
|
"grad_norm": 0.08056640625, |
|
"learning_rate": 0.00019885233162090946, |
|
"loss": 1.1523, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.1466336130064015, |
|
"grad_norm": 0.0751953125, |
|
"learning_rate": 0.00019869235204752285, |
|
"loss": 1.1592, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.14956628526652951, |
|
"grad_norm": 0.08544921875, |
|
"learning_rate": 0.00019852201067560606, |
|
"loss": 1.1535, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.15249895752665754, |
|
"grad_norm": 0.0771484375, |
|
"learning_rate": 0.00019834132538945246, |
|
"loss": 1.1429, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.15543162978678557, |
|
"grad_norm": 0.07763671875, |
|
"learning_rate": 0.00019815031515937225, |
|
"loss": 1.1385, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.1583643020469136, |
|
"grad_norm": 0.08642578125, |
|
"learning_rate": 0.00019794900003970077, |
|
"loss": 1.1356, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.16129697430704162, |
|
"grad_norm": 0.0849609375, |
|
"learning_rate": 0.00019773740116669288, |
|
"loss": 1.1311, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.16422964656716965, |
|
"grad_norm": 0.0869140625, |
|
"learning_rate": 0.00019751554075630404, |
|
"loss": 1.1243, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.16716231882729768, |
|
"grad_norm": 0.0908203125, |
|
"learning_rate": 0.0001972834421018576, |
|
"loss": 1.1475, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.1700949910874257, |
|
"grad_norm": 0.08056640625, |
|
"learning_rate": 0.0001970411295715994, |
|
"loss": 1.1338, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.17302766334755373, |
|
"grad_norm": 0.0810546875, |
|
"learning_rate": 0.0001967886286061393, |
|
"loss": 1.1238, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.17596033560768176, |
|
"grad_norm": 0.08642578125, |
|
"learning_rate": 0.00019652596571578004, |
|
"loss": 1.1322, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.1788930078678098, |
|
"grad_norm": 0.0712890625, |
|
"learning_rate": 0.00019625316847773395, |
|
"loss": 1.1305, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.18182568012793782, |
|
"grad_norm": 0.07666015625, |
|
"learning_rate": 0.0001959702655332277, |
|
"loss": 1.1273, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.18475835238806584, |
|
"grad_norm": 0.08349609375, |
|
"learning_rate": 0.00019567728658449504, |
|
"loss": 1.1306, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.1876910246481939, |
|
"grad_norm": 0.0830078125, |
|
"learning_rate": 0.00019537426239165853, |
|
"loss": 1.1239, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.19062369690832193, |
|
"grad_norm": 0.0869140625, |
|
"learning_rate": 0.00019506122476949981, |
|
"loss": 1.1234, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.19355636916844995, |
|
"grad_norm": 0.09228515625, |
|
"learning_rate": 0.00019473820658411957, |
|
"loss": 1.1115, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.19648904142857798, |
|
"grad_norm": 0.09033203125, |
|
"learning_rate": 0.0001944052417494867, |
|
"loss": 1.1127, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.199421713688706, |
|
"grad_norm": 0.08740234375, |
|
"learning_rate": 0.0001940623652238777, |
|
"loss": 1.1315, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.20235438594883404, |
|
"grad_norm": 0.0771484375, |
|
"learning_rate": 0.00019370961300620637, |
|
"loss": 1.1211, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.20528705820896206, |
|
"grad_norm": 0.07421875, |
|
"learning_rate": 0.00019334702213224446, |
|
"loss": 1.1162, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.2082197304690901, |
|
"grad_norm": 0.07421875, |
|
"learning_rate": 0.00019297463067073287, |
|
"loss": 1.129, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.21115240272921812, |
|
"grad_norm": 0.0791015625, |
|
"learning_rate": 0.000192592477719385, |
|
"loss": 1.108, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.21408507498934615, |
|
"grad_norm": 0.080078125, |
|
"learning_rate": 0.00019220060340078188, |
|
"loss": 1.1167, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.21701774724947417, |
|
"grad_norm": 0.0869140625, |
|
"learning_rate": 0.00019179904885815958, |
|
"loss": 1.1485, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.2199504195096022, |
|
"grad_norm": 0.0751953125, |
|
"learning_rate": 0.00019138785625108957, |
|
"loss": 1.1183, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.22288309176973023, |
|
"grad_norm": 0.0869140625, |
|
"learning_rate": 0.00019096706875105235, |
|
"loss": 1.1176, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.22581576402985826, |
|
"grad_norm": 0.08203125, |
|
"learning_rate": 0.0001905367305369048, |
|
"loss": 1.1184, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.2287484362899863, |
|
"grad_norm": 0.0732421875, |
|
"learning_rate": 0.0001900968867902419, |
|
"loss": 1.1197, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.23168110855011434, |
|
"grad_norm": 0.09521484375, |
|
"learning_rate": 0.000189647583690653, |
|
"loss": 1.1265, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.23461378081024237, |
|
"grad_norm": 0.08251953125, |
|
"learning_rate": 0.00018918886841087334, |
|
"loss": 1.1209, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.2375464530703704, |
|
"grad_norm": 0.0732421875, |
|
"learning_rate": 0.00018872078911183146, |
|
"loss": 1.1084, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.24047912533049842, |
|
"grad_norm": 0.08447265625, |
|
"learning_rate": 0.00018824339493759263, |
|
"loss": 1.132, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.24341179759062645, |
|
"grad_norm": 0.07666015625, |
|
"learning_rate": 0.00018775673601019923, |
|
"loss": 1.1122, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.24634446985075448, |
|
"grad_norm": 0.07666015625, |
|
"learning_rate": 0.00018726086342440846, |
|
"loss": 1.1161, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.2492771421108825, |
|
"grad_norm": 0.07275390625, |
|
"learning_rate": 0.00018675582924232762, |
|
"loss": 1.1183, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.25220981437101053, |
|
"grad_norm": 0.078125, |
|
"learning_rate": 0.00018624168648794832, |
|
"loss": 1.1029, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.2551424866311386, |
|
"grad_norm": 0.07666015625, |
|
"learning_rate": 0.0001857184891415794, |
|
"loss": 1.1236, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.2580751588912666, |
|
"grad_norm": 0.0810546875, |
|
"learning_rate": 0.00018518629213417929, |
|
"loss": 1.1036, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.26100783115139464, |
|
"grad_norm": 0.07275390625, |
|
"learning_rate": 0.00018464515134158896, |
|
"loss": 1.1047, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.26394050341152264, |
|
"grad_norm": 0.07470703125, |
|
"learning_rate": 0.00018409512357866548, |
|
"loss": 1.1153, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.2668731756716507, |
|
"grad_norm": 0.07666015625, |
|
"learning_rate": 0.00018353626659331683, |
|
"loss": 1.1238, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.2698058479317787, |
|
"grad_norm": 0.07568359375, |
|
"learning_rate": 0.00018296863906043894, |
|
"loss": 1.1149, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.27273852019190675, |
|
"grad_norm": 0.08056640625, |
|
"learning_rate": 0.00018239230057575542, |
|
"loss": 1.1112, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.27567119245203475, |
|
"grad_norm": 0.0751953125, |
|
"learning_rate": 0.0001818073116495606, |
|
"loss": 1.1043, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.2786038647121628, |
|
"grad_norm": 0.0732421875, |
|
"learning_rate": 0.0001812137337003663, |
|
"loss": 1.1055, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.2815365369722908, |
|
"grad_norm": 0.0732421875, |
|
"learning_rate": 0.00018061162904845358, |
|
"loss": 1.1059, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.28446920923241886, |
|
"grad_norm": 0.07666015625, |
|
"learning_rate": 0.0001800010609093298, |
|
"loss": 1.1062, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.28740188149254686, |
|
"grad_norm": 0.07763671875, |
|
"learning_rate": 0.00017938209338709123, |
|
"loss": 1.115, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.2903345537526749, |
|
"grad_norm": 0.076171875, |
|
"learning_rate": 0.00017875479146769305, |
|
"loss": 1.0987, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.293267226012803, |
|
"grad_norm": 0.08056640625, |
|
"learning_rate": 0.0001781192210121262, |
|
"loss": 1.1072, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.296199898272931, |
|
"grad_norm": 0.07421875, |
|
"learning_rate": 0.00017747544874950272, |
|
"loss": 1.1027, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 0.29913257053305903, |
|
"grad_norm": 0.08837890625, |
|
"learning_rate": 0.00017682354227004963, |
|
"loss": 1.1004, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.302065242793187, |
|
"grad_norm": 0.07763671875, |
|
"learning_rate": 0.0001761635700180127, |
|
"loss": 1.1012, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 0.3049979150533151, |
|
"grad_norm": 0.07861328125, |
|
"learning_rate": 0.00017549560128447047, |
|
"loss": 1.0992, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.3079305873134431, |
|
"grad_norm": 0.10888671875, |
|
"learning_rate": 0.00017481970620005912, |
|
"loss": 1.1039, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.31086325957357114, |
|
"grad_norm": 0.07421875, |
|
"learning_rate": 0.00017413595572760961, |
|
"loss": 1.1154, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.31379593183369914, |
|
"grad_norm": 0.07861328125, |
|
"learning_rate": 0.00017344442165469714, |
|
"loss": 1.0995, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 0.3167286040938272, |
|
"grad_norm": 0.07861328125, |
|
"learning_rate": 0.00017274517658610398, |
|
"loss": 1.0934, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.3196612763539552, |
|
"grad_norm": 0.08154296875, |
|
"learning_rate": 0.0001720382939361969, |
|
"loss": 1.0801, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 0.32259394861408325, |
|
"grad_norm": 0.07763671875, |
|
"learning_rate": 0.00017132384792121905, |
|
"loss": 1.0932, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.32552662087421125, |
|
"grad_norm": 0.076171875, |
|
"learning_rate": 0.0001706019135514982, |
|
"loss": 1.12, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 0.3284592931343393, |
|
"grad_norm": 0.0927734375, |
|
"learning_rate": 0.00016987256662357106, |
|
"loss": 1.129, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.33139196539446736, |
|
"grad_norm": 0.0869140625, |
|
"learning_rate": 0.00016913588371222557, |
|
"loss": 1.1059, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 0.33432463765459536, |
|
"grad_norm": 0.08642578125, |
|
"learning_rate": 0.00016839194216246108, |
|
"loss": 1.0923, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.3372573099147234, |
|
"grad_norm": 0.0849609375, |
|
"learning_rate": 0.00016764082008136795, |
|
"loss": 1.0909, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 0.3401899821748514, |
|
"grad_norm": 0.0771484375, |
|
"learning_rate": 0.00016688259632992693, |
|
"loss": 1.096, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.34312265443497947, |
|
"grad_norm": 0.08056640625, |
|
"learning_rate": 0.0001661173505147295, |
|
"loss": 1.091, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 0.34605532669510747, |
|
"grad_norm": 0.07470703125, |
|
"learning_rate": 0.00016534516297961996, |
|
"loss": 1.0937, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.3489879989552355, |
|
"grad_norm": 0.0869140625, |
|
"learning_rate": 0.00016456611479725996, |
|
"loss": 1.1003, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 0.3519206712153635, |
|
"grad_norm": 0.07763671875, |
|
"learning_rate": 0.00016378028776061667, |
|
"loss": 1.0941, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.3548533434754916, |
|
"grad_norm": 0.07763671875, |
|
"learning_rate": 0.00016298776437437523, |
|
"loss": 1.1158, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 0.3577860157356196, |
|
"grad_norm": 0.09228515625, |
|
"learning_rate": 0.00016218862784627658, |
|
"loss": 1.1004, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.36071868799574763, |
|
"grad_norm": 0.08251953125, |
|
"learning_rate": 0.00016138296207838127, |
|
"loss": 1.079, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 0.36365136025587563, |
|
"grad_norm": 0.08349609375, |
|
"learning_rate": 0.00016057085165826072, |
|
"loss": 1.0907, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.3665840325160037, |
|
"grad_norm": 0.07861328125, |
|
"learning_rate": 0.00015975238185011602, |
|
"loss": 1.0869, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 0.3695167047761317, |
|
"grad_norm": 0.0771484375, |
|
"learning_rate": 0.0001589276385858262, |
|
"loss": 1.0919, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.37244937703625974, |
|
"grad_norm": 0.07421875, |
|
"learning_rate": 0.00015809670845592604, |
|
"loss": 1.095, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 0.3753820492963878, |
|
"grad_norm": 0.07470703125, |
|
"learning_rate": 0.0001572596787005149, |
|
"loss": 1.0989, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.3783147215565158, |
|
"grad_norm": 0.0791015625, |
|
"learning_rate": 0.00015641663720009733, |
|
"loss": 1.0984, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 0.38124739381664385, |
|
"grad_norm": 0.07470703125, |
|
"learning_rate": 0.00015556767246635626, |
|
"loss": 1.1086, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.38418006607677185, |
|
"grad_norm": 0.08203125, |
|
"learning_rate": 0.00015471287363286038, |
|
"loss": 1.0858, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 0.3871127383368999, |
|
"grad_norm": 0.16796875, |
|
"learning_rate": 0.00015385233044570555, |
|
"loss": 1.096, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.3900454105970279, |
|
"grad_norm": 0.07958984375, |
|
"learning_rate": 0.00015298613325409263, |
|
"loss": 1.089, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 0.39297808285715596, |
|
"grad_norm": 0.07861328125, |
|
"learning_rate": 0.00015211437300084136, |
|
"loss": 1.0853, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.39591075511728396, |
|
"grad_norm": 0.07470703125, |
|
"learning_rate": 0.0001512371412128424, |
|
"loss": 1.1072, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 0.398843427377412, |
|
"grad_norm": 0.07666015625, |
|
"learning_rate": 0.00015035452999144762, |
|
"loss": 1.0836, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.40177609963754, |
|
"grad_norm": 0.0859375, |
|
"learning_rate": 0.00014946663200280063, |
|
"loss": 1.0942, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 0.4047087718976681, |
|
"grad_norm": 0.072265625, |
|
"learning_rate": 0.00014857354046810732, |
|
"loss": 1.093, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.4076414441577961, |
|
"grad_norm": 0.0712890625, |
|
"learning_rate": 0.00014767534915384865, |
|
"loss": 1.0874, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 0.41057411641792413, |
|
"grad_norm": 0.07568359375, |
|
"learning_rate": 0.00014677215236193604, |
|
"loss": 1.0787, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.41350678867805213, |
|
"grad_norm": 0.07568359375, |
|
"learning_rate": 0.00014586404491981052, |
|
"loss": 1.1116, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 0.4164394609381802, |
|
"grad_norm": 0.07373046875, |
|
"learning_rate": 0.00014495112217048658, |
|
"loss": 1.1018, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.41937213319830824, |
|
"grad_norm": 0.07470703125, |
|
"learning_rate": 0.00014403347996254232, |
|
"loss": 1.0945, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 0.42230480545843624, |
|
"grad_norm": 0.07275390625, |
|
"learning_rate": 0.00014311121464005583, |
|
"loss": 1.0836, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.4252374777185643, |
|
"grad_norm": 0.076171875, |
|
"learning_rate": 0.00014218442303249026, |
|
"loss": 1.0765, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 0.4281701499786923, |
|
"grad_norm": 0.0791015625, |
|
"learning_rate": 0.0001412532024445275, |
|
"loss": 1.0928, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.43110282223882035, |
|
"grad_norm": 0.07763671875, |
|
"learning_rate": 0.00014031765064585197, |
|
"loss": 1.0855, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 0.43403549449894835, |
|
"grad_norm": 0.07275390625, |
|
"learning_rate": 0.00013937786586088583, |
|
"loss": 1.0841, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.4369681667590764, |
|
"grad_norm": 0.07421875, |
|
"learning_rate": 0.00013843394675847634, |
|
"loss": 1.0571, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 0.4399008390192044, |
|
"grad_norm": 0.0810546875, |
|
"learning_rate": 0.00013748599244153633, |
|
"loss": 1.0945, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.44283351127933246, |
|
"grad_norm": 0.0771484375, |
|
"learning_rate": 0.00013653410243663952, |
|
"loss": 1.0922, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 0.44576618353946046, |
|
"grad_norm": 0.0771484375, |
|
"learning_rate": 0.000135578376683571, |
|
"loss": 1.097, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.4486988557995885, |
|
"grad_norm": 0.08056640625, |
|
"learning_rate": 0.00013461891552483444, |
|
"loss": 1.09, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 0.4516315280597165, |
|
"grad_norm": 0.07666015625, |
|
"learning_rate": 0.00013365581969511725, |
|
"loss": 1.0844, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.45456420031984457, |
|
"grad_norm": 0.080078125, |
|
"learning_rate": 0.00013268919031071406, |
|
"loss": 1.099, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 0.4574968725799726, |
|
"grad_norm": 0.07861328125, |
|
"learning_rate": 0.00013171912885891063, |
|
"loss": 1.0919, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.4604295448401006, |
|
"grad_norm": 0.07470703125, |
|
"learning_rate": 0.00013074573718732858, |
|
"loss": 1.0743, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 0.4633622171002287, |
|
"grad_norm": 0.076171875, |
|
"learning_rate": 0.0001297691174932322, |
|
"loss": 1.0849, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.4662948893603567, |
|
"grad_norm": 0.07421875, |
|
"learning_rate": 0.00012878937231279892, |
|
"loss": 1.0769, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 0.46922756162048473, |
|
"grad_norm": 0.08154296875, |
|
"learning_rate": 0.0001278066045103536, |
|
"loss": 1.0656, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.47216023388061273, |
|
"grad_norm": 0.0791015625, |
|
"learning_rate": 0.00012682091726756904, |
|
"loss": 1.0847, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 0.4750929061407408, |
|
"grad_norm": 0.078125, |
|
"learning_rate": 0.0001258324140726326, |
|
"loss": 1.0789, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.4780255784008688, |
|
"grad_norm": 0.0732421875, |
|
"learning_rate": 0.00012484119870938103, |
|
"loss": 1.1031, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 0.48095825066099684, |
|
"grad_norm": 0.0751953125, |
|
"learning_rate": 0.00012384737524640405, |
|
"loss": 1.0838, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.48389092292112484, |
|
"grad_norm": 0.07275390625, |
|
"learning_rate": 0.00012285104802611812, |
|
"loss": 1.0795, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 0.4868235951812529, |
|
"grad_norm": 0.07763671875, |
|
"learning_rate": 0.00012185232165381141, |
|
"loss": 1.0936, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.4897562674413809, |
|
"grad_norm": 0.07666015625, |
|
"learning_rate": 0.00012085130098666124, |
|
"loss": 1.0845, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 0.49268893970150895, |
|
"grad_norm": 0.09130859375, |
|
"learning_rate": 0.00011984809112272495, |
|
"loss": 1.0807, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.49562161196163695, |
|
"grad_norm": 0.07666015625, |
|
"learning_rate": 0.00011884279738990565, |
|
"loss": 1.0834, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 0.498554284221765, |
|
"grad_norm": 0.08203125, |
|
"learning_rate": 0.00011783552533489372, |
|
"loss": 1.0944, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.501486956481893, |
|
"grad_norm": 0.0771484375, |
|
"learning_rate": 0.00011682638071208533, |
|
"loss": 1.095, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 0.5044196287420211, |
|
"grad_norm": 0.08056640625, |
|
"learning_rate": 0.00011581546947247927, |
|
"loss": 1.0819, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.5073523010021491, |
|
"grad_norm": 0.0732421875, |
|
"learning_rate": 0.00011480289775255295, |
|
"loss": 1.0875, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 0.5102849732622772, |
|
"grad_norm": 0.07666015625, |
|
"learning_rate": 0.00011378877186311912, |
|
"loss": 1.0879, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.5132176455224051, |
|
"grad_norm": 0.0751953125, |
|
"learning_rate": 0.00011277319827816423, |
|
"loss": 1.1074, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 0.5161503177825332, |
|
"grad_norm": 0.0771484375, |
|
"learning_rate": 0.0001117562836236695, |
|
"loss": 1.0842, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.5190829900426612, |
|
"grad_norm": 0.078125, |
|
"learning_rate": 0.00011073813466641632, |
|
"loss": 1.0812, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 0.5220156623027893, |
|
"grad_norm": 0.0732421875, |
|
"learning_rate": 0.00010971885830277657, |
|
"loss": 1.108, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.5249483345629172, |
|
"grad_norm": 0.0849609375, |
|
"learning_rate": 0.00010869856154748956, |
|
"loss": 1.0808, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 0.5278810068230453, |
|
"grad_norm": 0.076171875, |
|
"learning_rate": 0.00010767735152242649, |
|
"loss": 1.0734, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.5308136790831733, |
|
"grad_norm": 0.07470703125, |
|
"learning_rate": 0.00010665533544534343, |
|
"loss": 1.0865, |
|
"step": 905 |
|
}, |
|
{ |
|
"epoch": 0.5337463513433014, |
|
"grad_norm": 0.078125, |
|
"learning_rate": 0.00010563262061862471, |
|
"loss": 1.0818, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.5366790236034293, |
|
"grad_norm": 0.07421875, |
|
"learning_rate": 0.000104609314418017, |
|
"loss": 1.0719, |
|
"step": 915 |
|
}, |
|
{ |
|
"epoch": 0.5396116958635574, |
|
"grad_norm": 0.07763671875, |
|
"learning_rate": 0.00010358552428135575, |
|
"loss": 1.0836, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.5425443681236854, |
|
"grad_norm": 0.07421875, |
|
"learning_rate": 0.00010256135769728539, |
|
"loss": 1.0967, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 0.5454770403838135, |
|
"grad_norm": 0.0732421875, |
|
"learning_rate": 0.00010153692219397387, |
|
"loss": 1.0691, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.5484097126439416, |
|
"grad_norm": 0.07275390625, |
|
"learning_rate": 0.00010051232532782313, |
|
"loss": 1.0766, |
|
"step": 935 |
|
}, |
|
{ |
|
"epoch": 0.5513423849040695, |
|
"grad_norm": 0.07666015625, |
|
"learning_rate": 9.94876746721769e-05, |
|
"loss": 1.0727, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.5542750571641976, |
|
"grad_norm": 0.07568359375, |
|
"learning_rate": 9.84630778060262e-05, |
|
"loss": 1.0784, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 0.5572077294243256, |
|
"grad_norm": 0.0791015625, |
|
"learning_rate": 9.743864230271465e-05, |
|
"loss": 1.0809, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.5601404016844537, |
|
"grad_norm": 0.078125, |
|
"learning_rate": 9.641447571864429e-05, |
|
"loss": 1.0809, |
|
"step": 955 |
|
}, |
|
{ |
|
"epoch": 0.5630730739445816, |
|
"grad_norm": 0.07470703125, |
|
"learning_rate": 9.539068558198304e-05, |
|
"loss": 1.0784, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.5660057462047097, |
|
"grad_norm": 0.0751953125, |
|
"learning_rate": 9.436737938137531e-05, |
|
"loss": 1.0707, |
|
"step": 965 |
|
}, |
|
{ |
|
"epoch": 0.5689384184648377, |
|
"grad_norm": 0.0751953125, |
|
"learning_rate": 9.33446645546566e-05, |
|
"loss": 1.0821, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.5718710907249658, |
|
"grad_norm": 0.07666015625, |
|
"learning_rate": 9.232264847757357e-05, |
|
"loss": 1.0715, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 0.5748037629850937, |
|
"grad_norm": 0.078125, |
|
"learning_rate": 9.130143845251046e-05, |
|
"loss": 1.0746, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.5777364352452218, |
|
"grad_norm": 0.0732421875, |
|
"learning_rate": 9.028114169722347e-05, |
|
"loss": 1.0987, |
|
"step": 985 |
|
}, |
|
{ |
|
"epoch": 0.5806691075053498, |
|
"grad_norm": 0.07666015625, |
|
"learning_rate": 8.92618653335837e-05, |
|
"loss": 1.0758, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.5836017797654779, |
|
"grad_norm": 0.07373046875, |
|
"learning_rate": 8.824371637633053e-05, |
|
"loss": 1.0523, |
|
"step": 995 |
|
}, |
|
{ |
|
"epoch": 0.586534452025606, |
|
"grad_norm": 0.07666015625, |
|
"learning_rate": 8.722680172183578e-05, |
|
"loss": 1.0797, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.5894671242857339, |
|
"grad_norm": 0.07470703125, |
|
"learning_rate": 8.62112281368809e-05, |
|
"loss": 1.0951, |
|
"step": 1005 |
|
}, |
|
{ |
|
"epoch": 0.592399796545862, |
|
"grad_norm": 0.076171875, |
|
"learning_rate": 8.519710224744709e-05, |
|
"loss": 1.0833, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.59533246880599, |
|
"grad_norm": 0.07861328125, |
|
"learning_rate": 8.418453052752076e-05, |
|
"loss": 1.0717, |
|
"step": 1015 |
|
}, |
|
{ |
|
"epoch": 0.5982651410661181, |
|
"grad_norm": 0.078125, |
|
"learning_rate": 8.317361928791469e-05, |
|
"loss": 1.0757, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.601197813326246, |
|
"grad_norm": 0.07861328125, |
|
"learning_rate": 8.216447466510631e-05, |
|
"loss": 1.0819, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 0.604130485586374, |
|
"grad_norm": 0.076171875, |
|
"learning_rate": 8.115720261009437e-05, |
|
"loss": 1.075, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.6070631578465021, |
|
"grad_norm": 0.0751953125, |
|
"learning_rate": 8.015190887727509e-05, |
|
"loss": 1.0778, |
|
"step": 1035 |
|
}, |
|
{ |
|
"epoch": 0.6099958301066302, |
|
"grad_norm": 0.07666015625, |
|
"learning_rate": 7.914869901333877e-05, |
|
"loss": 1.0564, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.6129285023667581, |
|
"grad_norm": 0.07470703125, |
|
"learning_rate": 7.81476783461886e-05, |
|
"loss": 1.0812, |
|
"step": 1045 |
|
}, |
|
{ |
|
"epoch": 0.6158611746268862, |
|
"grad_norm": 0.0751953125, |
|
"learning_rate": 7.714895197388189e-05, |
|
"loss": 1.0919, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.6187938468870142, |
|
"grad_norm": 0.0732421875, |
|
"learning_rate": 7.615262475359597e-05, |
|
"loss": 1.0833, |
|
"step": 1055 |
|
}, |
|
{ |
|
"epoch": 0.6217265191471423, |
|
"grad_norm": 0.07275390625, |
|
"learning_rate": 7.5158801290619e-05, |
|
"loss": 1.081, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.6246591914072703, |
|
"grad_norm": 0.07568359375, |
|
"learning_rate": 7.416758592736744e-05, |
|
"loss": 1.0686, |
|
"step": 1065 |
|
}, |
|
{ |
|
"epoch": 0.6275918636673983, |
|
"grad_norm": 0.0732421875, |
|
"learning_rate": 7.3179082732431e-05, |
|
"loss": 1.0728, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.6305245359275263, |
|
"grad_norm": 0.0771484375, |
|
"learning_rate": 7.219339548964644e-05, |
|
"loss": 1.0735, |
|
"step": 1075 |
|
}, |
|
{ |
|
"epoch": 0.6334572081876544, |
|
"grad_norm": 0.0771484375, |
|
"learning_rate": 7.12106276872011e-05, |
|
"loss": 1.0621, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.6363898804477824, |
|
"grad_norm": 0.0751953125, |
|
"learning_rate": 7.023088250676784e-05, |
|
"loss": 1.071, |
|
"step": 1085 |
|
}, |
|
{ |
|
"epoch": 0.6393225527079104, |
|
"grad_norm": 0.07763671875, |
|
"learning_rate": 6.925426281267147e-05, |
|
"loss": 1.0794, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.6422552249680384, |
|
"grad_norm": 0.0791015625, |
|
"learning_rate": 6.82808711410894e-05, |
|
"loss": 1.0637, |
|
"step": 1095 |
|
}, |
|
{ |
|
"epoch": 0.6451878972281665, |
|
"grad_norm": 0.07275390625, |
|
"learning_rate": 6.731080968928599e-05, |
|
"loss": 1.0665, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.6481205694882946, |
|
"grad_norm": 0.0732421875, |
|
"learning_rate": 6.63441803048828e-05, |
|
"loss": 1.0773, |
|
"step": 1105 |
|
}, |
|
{ |
|
"epoch": 0.6510532417484225, |
|
"grad_norm": 0.07470703125, |
|
"learning_rate": 6.538108447516558e-05, |
|
"loss": 1.0689, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.6539859140085506, |
|
"grad_norm": 0.0751953125, |
|
"learning_rate": 6.442162331642907e-05, |
|
"loss": 1.0894, |
|
"step": 1115 |
|
}, |
|
{ |
|
"epoch": 0.6569185862686786, |
|
"grad_norm": 0.072265625, |
|
"learning_rate": 6.34658975633605e-05, |
|
"loss": 1.0772, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.6598512585288067, |
|
"grad_norm": 0.07470703125, |
|
"learning_rate": 6.251400755846372e-05, |
|
"loss": 1.0653, |
|
"step": 1125 |
|
}, |
|
{ |
|
"epoch": 0.6627839307889347, |
|
"grad_norm": 0.07568359375, |
|
"learning_rate": 6.15660532415237e-05, |
|
"loss": 1.0827, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.6657166030490627, |
|
"grad_norm": 0.076171875, |
|
"learning_rate": 6.0622134139114194e-05, |
|
"loss": 1.0833, |
|
"step": 1135 |
|
}, |
|
{ |
|
"epoch": 0.6686492753091907, |
|
"grad_norm": 0.07373046875, |
|
"learning_rate": 5.968234935414807e-05, |
|
"loss": 1.081, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.6715819475693188, |
|
"grad_norm": 0.0771484375, |
|
"learning_rate": 5.874679755547254e-05, |
|
"loss": 1.0955, |
|
"step": 1145 |
|
}, |
|
{ |
|
"epoch": 0.6745146198294468, |
|
"grad_norm": 0.07470703125, |
|
"learning_rate": 5.7815576967509733e-05, |
|
"loss": 1.0862, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.6774472920895748, |
|
"grad_norm": 0.07763671875, |
|
"learning_rate": 5.688878535994421e-05, |
|
"loss": 1.0885, |
|
"step": 1155 |
|
}, |
|
{ |
|
"epoch": 0.6803799643497028, |
|
"grad_norm": 0.078125, |
|
"learning_rate": 5.5966520037457716e-05, |
|
"loss": 1.0578, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.6833126366098309, |
|
"grad_norm": 0.0751953125, |
|
"learning_rate": 5.5048877829513424e-05, |
|
"loss": 1.0914, |
|
"step": 1165 |
|
}, |
|
{ |
|
"epoch": 0.6862453088699589, |
|
"grad_norm": 0.07470703125, |
|
"learning_rate": 5.413595508018952e-05, |
|
"loss": 1.0837, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.6891779811300869, |
|
"grad_norm": 0.07568359375, |
|
"learning_rate": 5.3227847638064e-05, |
|
"loss": 1.0665, |
|
"step": 1175 |
|
}, |
|
{ |
|
"epoch": 0.6921106533902149, |
|
"grad_norm": 0.078125, |
|
"learning_rate": 5.232465084615135e-05, |
|
"loss": 1.0815, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.695043325650343, |
|
"grad_norm": 0.07470703125, |
|
"learning_rate": 5.1426459531892714e-05, |
|
"loss": 1.1066, |
|
"step": 1185 |
|
}, |
|
{ |
|
"epoch": 0.697975997910471, |
|
"grad_norm": 0.07666015625, |
|
"learning_rate": 5.0533367997199376e-05, |
|
"loss": 1.0758, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.700908670170599, |
|
"grad_norm": 0.076171875, |
|
"learning_rate": 4.964547000855237e-05, |
|
"loss": 1.0704, |
|
"step": 1195 |
|
}, |
|
{ |
|
"epoch": 0.703841342430727, |
|
"grad_norm": 0.07470703125, |
|
"learning_rate": 4.876285878715764e-05, |
|
"loss": 1.067, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.7067740146908551, |
|
"grad_norm": 0.078125, |
|
"learning_rate": 4.7885626999158695e-05, |
|
"loss": 1.0796, |
|
"step": 1205 |
|
}, |
|
{ |
|
"epoch": 0.7097066869509832, |
|
"grad_norm": 0.07568359375, |
|
"learning_rate": 4.701386674590742e-05, |
|
"loss": 1.0572, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.7126393592111112, |
|
"grad_norm": 0.07666015625, |
|
"learning_rate": 4.614766955429447e-05, |
|
"loss": 1.0896, |
|
"step": 1215 |
|
}, |
|
{ |
|
"epoch": 0.7155720314712392, |
|
"grad_norm": 0.07568359375, |
|
"learning_rate": 4.528712636713964e-05, |
|
"loss": 1.0699, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.7185047037313672, |
|
"grad_norm": 0.07470703125, |
|
"learning_rate": 4.443232753364377e-05, |
|
"loss": 1.081, |
|
"step": 1225 |
|
}, |
|
{ |
|
"epoch": 0.7214373759914953, |
|
"grad_norm": 0.0751953125, |
|
"learning_rate": 4.358336279990268e-05, |
|
"loss": 1.0733, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.7243700482516233, |
|
"grad_norm": 0.07666015625, |
|
"learning_rate": 4.274032129948512e-05, |
|
"loss": 1.0983, |
|
"step": 1235 |
|
}, |
|
{ |
|
"epoch": 0.7273027205117513, |
|
"grad_norm": 0.07861328125, |
|
"learning_rate": 4.1903291544073986e-05, |
|
"loss": 1.0868, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.7302353927718793, |
|
"grad_norm": 0.07861328125, |
|
"learning_rate": 4.107236141417382e-05, |
|
"loss": 1.0871, |
|
"step": 1245 |
|
}, |
|
{ |
|
"epoch": 0.7331680650320074, |
|
"grad_norm": 0.07373046875, |
|
"learning_rate": 4.024761814988399e-05, |
|
"loss": 1.0787, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.7361007372921354, |
|
"grad_norm": 0.07421875, |
|
"learning_rate": 3.942914834173932e-05, |
|
"loss": 1.0823, |
|
"step": 1255 |
|
}, |
|
{ |
|
"epoch": 0.7390334095522634, |
|
"grad_norm": 0.0771484375, |
|
"learning_rate": 3.8617037921618705e-05, |
|
"loss": 1.0632, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.7419660818123914, |
|
"grad_norm": 0.07470703125, |
|
"learning_rate": 3.781137215372345e-05, |
|
"loss": 1.0764, |
|
"step": 1265 |
|
}, |
|
{ |
|
"epoch": 0.7448987540725195, |
|
"grad_norm": 0.07421875, |
|
"learning_rate": 3.701223562562478e-05, |
|
"loss": 1.0878, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 0.7478314263326475, |
|
"grad_norm": 0.07373046875, |
|
"learning_rate": 3.621971223938334e-05, |
|
"loss": 1.0832, |
|
"step": 1275 |
|
}, |
|
{ |
|
"epoch": 0.7507640985927756, |
|
"grad_norm": 0.076171875, |
|
"learning_rate": 3.5433885202740045e-05, |
|
"loss": 1.0745, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.7536967708529035, |
|
"grad_norm": 0.07470703125, |
|
"learning_rate": 3.4654837020380074e-05, |
|
"loss": 1.0719, |
|
"step": 1285 |
|
}, |
|
{ |
|
"epoch": 0.7566294431130316, |
|
"grad_norm": 0.07373046875, |
|
"learning_rate": 3.388264948527052e-05, |
|
"loss": 1.0929, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.7595621153731597, |
|
"grad_norm": 0.07373046875, |
|
"learning_rate": 3.311740367007311e-05, |
|
"loss": 1.067, |
|
"step": 1295 |
|
}, |
|
{ |
|
"epoch": 0.7624947876332877, |
|
"grad_norm": 0.07421875, |
|
"learning_rate": 3.2359179918632076e-05, |
|
"loss": 1.062, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.7654274598934157, |
|
"grad_norm": 0.07275390625, |
|
"learning_rate": 3.160805783753897e-05, |
|
"loss": 1.06, |
|
"step": 1305 |
|
}, |
|
{ |
|
"epoch": 0.7683601321535437, |
|
"grad_norm": 0.0751953125, |
|
"learning_rate": 3.086411628777445e-05, |
|
"loss": 1.084, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 0.7712928044136718, |
|
"grad_norm": 0.078125, |
|
"learning_rate": 3.0127433376428983e-05, |
|
"loss": 1.0578, |
|
"step": 1315 |
|
}, |
|
{ |
|
"epoch": 0.7742254766737998, |
|
"grad_norm": 0.0732421875, |
|
"learning_rate": 2.939808644850184e-05, |
|
"loss": 1.0637, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.7771581489339278, |
|
"grad_norm": 0.07177734375, |
|
"learning_rate": 2.867615207878096e-05, |
|
"loss": 1.0661, |
|
"step": 1325 |
|
}, |
|
{ |
|
"epoch": 0.7800908211940558, |
|
"grad_norm": 0.0751953125, |
|
"learning_rate": 2.796170606380313e-05, |
|
"loss": 1.0768, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 0.7830234934541839, |
|
"grad_norm": 0.07470703125, |
|
"learning_rate": 2.7254823413896058e-05, |
|
"loss": 1.068, |
|
"step": 1335 |
|
}, |
|
{ |
|
"epoch": 0.7859561657143119, |
|
"grad_norm": 0.07470703125, |
|
"learning_rate": 2.6555578345302878e-05, |
|
"loss": 1.0674, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.78888883797444, |
|
"grad_norm": 0.07373046875, |
|
"learning_rate": 2.58640442723904e-05, |
|
"loss": 1.0798, |
|
"step": 1345 |
|
}, |
|
{ |
|
"epoch": 0.7918215102345679, |
|
"grad_norm": 0.0751953125, |
|
"learning_rate": 2.518029379994089e-05, |
|
"loss": 1.0694, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.794754182494696, |
|
"grad_norm": 0.0771484375, |
|
"learning_rate": 2.4504398715529554e-05, |
|
"loss": 1.0752, |
|
"step": 1355 |
|
}, |
|
{ |
|
"epoch": 0.797686854754824, |
|
"grad_norm": 0.07373046875, |
|
"learning_rate": 2.383642998198731e-05, |
|
"loss": 1.066, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.8006195270149521, |
|
"grad_norm": 0.0732421875, |
|
"learning_rate": 2.317645772995042e-05, |
|
"loss": 1.0752, |
|
"step": 1365 |
|
}, |
|
{ |
|
"epoch": 0.80355219927508, |
|
"grad_norm": 0.0751953125, |
|
"learning_rate": 2.2524551250497316e-05, |
|
"loss": 1.0738, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 0.8064848715352081, |
|
"grad_norm": 0.07373046875, |
|
"learning_rate": 2.1880778987873807e-05, |
|
"loss": 1.0717, |
|
"step": 1375 |
|
}, |
|
{ |
|
"epoch": 0.8094175437953361, |
|
"grad_norm": 0.07470703125, |
|
"learning_rate": 2.124520853230697e-05, |
|
"loss": 1.0781, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.8123502160554642, |
|
"grad_norm": 0.07666015625, |
|
"learning_rate": 2.061790661290881e-05, |
|
"loss": 1.0774, |
|
"step": 1385 |
|
}, |
|
{ |
|
"epoch": 0.8152828883155921, |
|
"grad_norm": 0.07568359375, |
|
"learning_rate": 1.999893909067021e-05, |
|
"loss": 1.0817, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 0.8182155605757202, |
|
"grad_norm": 0.0771484375, |
|
"learning_rate": 1.9388370951546432e-05, |
|
"loss": 1.0899, |
|
"step": 1395 |
|
}, |
|
{ |
|
"epoch": 0.8211482328358483, |
|
"grad_norm": 0.076171875, |
|
"learning_rate": 1.8786266299633738e-05, |
|
"loss": 1.0724, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.8240809050959763, |
|
"grad_norm": 0.07421875, |
|
"learning_rate": 1.8192688350439424e-05, |
|
"loss": 1.0676, |
|
"step": 1405 |
|
}, |
|
{ |
|
"epoch": 0.8270135773561043, |
|
"grad_norm": 0.0732421875, |
|
"learning_rate": 1.7607699424244585e-05, |
|
"loss": 1.0751, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 0.8299462496162323, |
|
"grad_norm": 0.07373046875, |
|
"learning_rate": 1.7031360939561103e-05, |
|
"loss": 1.0992, |
|
"step": 1415 |
|
}, |
|
{ |
|
"epoch": 0.8328789218763604, |
|
"grad_norm": 0.07421875, |
|
"learning_rate": 1.646373340668319e-05, |
|
"loss": 1.0735, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 0.8358115941364884, |
|
"grad_norm": 0.0751953125, |
|
"learning_rate": 1.5904876421334536e-05, |
|
"loss": 1.0686, |
|
"step": 1425 |
|
}, |
|
{ |
|
"epoch": 0.8387442663966165, |
|
"grad_norm": 0.07861328125, |
|
"learning_rate": 1.5354848658411048e-05, |
|
"loss": 1.079, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 0.8416769386567444, |
|
"grad_norm": 0.07470703125, |
|
"learning_rate": 1.4813707865820747e-05, |
|
"loss": 1.1048, |
|
"step": 1435 |
|
}, |
|
{ |
|
"epoch": 0.8446096109168725, |
|
"grad_norm": 0.0751953125, |
|
"learning_rate": 1.4281510858420632e-05, |
|
"loss": 1.0656, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.8475422831770005, |
|
"grad_norm": 0.07470703125, |
|
"learning_rate": 1.3758313512051702e-05, |
|
"loss": 1.067, |
|
"step": 1445 |
|
}, |
|
{ |
|
"epoch": 0.8504749554371286, |
|
"grad_norm": 0.07470703125, |
|
"learning_rate": 1.3244170757672425e-05, |
|
"loss": 1.0804, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.8534076276972565, |
|
"grad_norm": 0.0751953125, |
|
"learning_rate": 1.2739136575591581e-05, |
|
"loss": 1.0829, |
|
"step": 1455 |
|
}, |
|
{ |
|
"epoch": 0.8563402999573846, |
|
"grad_norm": 0.07421875, |
|
"learning_rate": 1.2243263989800768e-05, |
|
"loss": 1.0792, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 0.8592729722175126, |
|
"grad_norm": 0.0751953125, |
|
"learning_rate": 1.17566050624074e-05, |
|
"loss": 1.0776, |
|
"step": 1465 |
|
}, |
|
{ |
|
"epoch": 0.8622056444776407, |
|
"grad_norm": 0.0732421875, |
|
"learning_rate": 1.1279210888168546e-05, |
|
"loss": 1.0771, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 0.8651383167377686, |
|
"grad_norm": 0.0732421875, |
|
"learning_rate": 1.0811131589126667e-05, |
|
"loss": 1.0872, |
|
"step": 1475 |
|
}, |
|
{ |
|
"epoch": 0.8680709889978967, |
|
"grad_norm": 0.0732421875, |
|
"learning_rate": 1.0352416309347001e-05, |
|
"loss": 1.0668, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 0.8710036612580248, |
|
"grad_norm": 0.07373046875, |
|
"learning_rate": 9.903113209758096e-06, |
|
"loss": 1.0598, |
|
"step": 1485 |
|
}, |
|
{ |
|
"epoch": 0.8739363335181528, |
|
"grad_norm": 0.07275390625, |
|
"learning_rate": 9.463269463095203e-06, |
|
"loss": 1.0664, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 0.8768690057782809, |
|
"grad_norm": 0.07421875, |
|
"learning_rate": 9.032931248947685e-06, |
|
"loss": 1.0709, |
|
"step": 1495 |
|
}, |
|
{ |
|
"epoch": 0.8798016780384088, |
|
"grad_norm": 0.0732421875, |
|
"learning_rate": 8.612143748910451e-06, |
|
"loss": 1.0585, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.8827343502985369, |
|
"grad_norm": 0.0771484375, |
|
"learning_rate": 8.20095114184044e-06, |
|
"loss": 1.0641, |
|
"step": 1505 |
|
}, |
|
{ |
|
"epoch": 0.8856670225586649, |
|
"grad_norm": 0.07275390625, |
|
"learning_rate": 7.799396599218133e-06, |
|
"loss": 1.0674, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 0.888599694818793, |
|
"grad_norm": 0.07373046875, |
|
"learning_rate": 7.40752228061502e-06, |
|
"loss": 1.0844, |
|
"step": 1515 |
|
}, |
|
{ |
|
"epoch": 0.8915323670789209, |
|
"grad_norm": 0.07373046875, |
|
"learning_rate": 7.0253693292671505e-06, |
|
"loss": 1.0832, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 0.894465039339049, |
|
"grad_norm": 0.07373046875, |
|
"learning_rate": 6.65297786775555e-06, |
|
"loss": 1.0834, |
|
"step": 1525 |
|
}, |
|
{ |
|
"epoch": 0.897397711599177, |
|
"grad_norm": 0.0771484375, |
|
"learning_rate": 6.290386993793618e-06, |
|
"loss": 1.0561, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 0.9003303838593051, |
|
"grad_norm": 0.07373046875, |
|
"learning_rate": 5.937634776122348e-06, |
|
"loss": 1.0663, |
|
"step": 1535 |
|
}, |
|
{ |
|
"epoch": 0.903263056119433, |
|
"grad_norm": 0.07666015625, |
|
"learning_rate": 5.594758250513333e-06, |
|
"loss": 1.0691, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 0.9061957283795611, |
|
"grad_norm": 0.07666015625, |
|
"learning_rate": 5.261793415880456e-06, |
|
"loss": 1.0672, |
|
"step": 1545 |
|
}, |
|
{ |
|
"epoch": 0.9091284006396891, |
|
"grad_norm": 0.07568359375, |
|
"learning_rate": 4.938775230500192e-06, |
|
"loss": 1.0781, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.9120610728998172, |
|
"grad_norm": 0.07470703125, |
|
"learning_rate": 4.625737608341507e-06, |
|
"loss": 1.0764, |
|
"step": 1555 |
|
}, |
|
{ |
|
"epoch": 0.9149937451599452, |
|
"grad_norm": 0.07421875, |
|
"learning_rate": 4.322713415504975e-06, |
|
"loss": 1.0858, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 0.9179264174200732, |
|
"grad_norm": 0.0732421875, |
|
"learning_rate": 4.029734466772328e-06, |
|
"loss": 1.0722, |
|
"step": 1565 |
|
}, |
|
{ |
|
"epoch": 0.9208590896802012, |
|
"grad_norm": 0.07421875, |
|
"learning_rate": 3.7468315222660586e-06, |
|
"loss": 1.0667, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 0.9237917619403293, |
|
"grad_norm": 0.07421875, |
|
"learning_rate": 3.4740342842199956e-06, |
|
"loss": 1.0658, |
|
"step": 1575 |
|
}, |
|
{ |
|
"epoch": 0.9267244342004574, |
|
"grad_norm": 0.07421875, |
|
"learning_rate": 3.211371393860718e-06, |
|
"loss": 1.0764, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 0.9296571064605853, |
|
"grad_norm": 0.07470703125, |
|
"learning_rate": 2.9588704284006174e-06, |
|
"loss": 1.0804, |
|
"step": 1585 |
|
}, |
|
{ |
|
"epoch": 0.9325897787207134, |
|
"grad_norm": 0.0732421875, |
|
"learning_rate": 2.7165578981424357e-06, |
|
"loss": 1.0749, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 0.9355224509808414, |
|
"grad_norm": 0.07421875, |
|
"learning_rate": 2.484459243695991e-06, |
|
"loss": 1.0805, |
|
"step": 1595 |
|
}, |
|
{ |
|
"epoch": 0.9384551232409695, |
|
"grad_norm": 0.07568359375, |
|
"learning_rate": 2.262598833307128e-06, |
|
"loss": 1.07, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.9413877955010974, |
|
"grad_norm": 0.07470703125, |
|
"learning_rate": 2.0509999602992493e-06, |
|
"loss": 1.0834, |
|
"step": 1605 |
|
}, |
|
{ |
|
"epoch": 0.9443204677612255, |
|
"grad_norm": 0.0751953125, |
|
"learning_rate": 1.849684840627741e-06, |
|
"loss": 1.0646, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 0.9472531400213535, |
|
"grad_norm": 0.07373046875, |
|
"learning_rate": 1.6586746105475281e-06, |
|
"loss": 1.0626, |
|
"step": 1615 |
|
}, |
|
{ |
|
"epoch": 0.9501858122814816, |
|
"grad_norm": 0.0712890625, |
|
"learning_rate": 1.4779893243939359e-06, |
|
"loss": 1.0706, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 0.9531184845416096, |
|
"grad_norm": 0.076171875, |
|
"learning_rate": 1.3076479524771644e-06, |
|
"loss": 1.0747, |
|
"step": 1625 |
|
}, |
|
{ |
|
"epoch": 0.9560511568017376, |
|
"grad_norm": 0.07470703125, |
|
"learning_rate": 1.1476683790905495e-06, |
|
"loss": 1.0859, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 0.9589838290618656, |
|
"grad_norm": 0.07421875, |
|
"learning_rate": 9.98067400632985e-07, |
|
"loss": 1.0916, |
|
"step": 1635 |
|
}, |
|
{ |
|
"epoch": 0.9619165013219937, |
|
"grad_norm": 0.0751953125, |
|
"learning_rate": 8.588607238453006e-07, |
|
"loss": 1.0644, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 0.9648491735821217, |
|
"grad_norm": 0.07421875, |
|
"learning_rate": 7.300629641613154e-07, |
|
"loss": 1.0783, |
|
"step": 1645 |
|
}, |
|
{ |
|
"epoch": 0.9677818458422497, |
|
"grad_norm": 0.07421875, |
|
"learning_rate": 6.116876441733088e-07, |
|
"loss": 1.0885, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.9707145181023777, |
|
"grad_norm": 0.07373046875, |
|
"learning_rate": 5.037471922122561e-07, |
|
"loss": 1.0759, |
|
"step": 1655 |
|
}, |
|
{ |
|
"epoch": 0.9736471903625058, |
|
"grad_norm": 0.07470703125, |
|
"learning_rate": 4.062529410429949e-07, |
|
"loss": 1.0633, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 0.9765798626226339, |
|
"grad_norm": 0.07373046875, |
|
"learning_rate": 3.192151266743548e-07, |
|
"loss": 1.0794, |
|
"step": 1665 |
|
}, |
|
{ |
|
"epoch": 0.9795125348827618, |
|
"grad_norm": 0.072265625, |
|
"learning_rate": 2.4264288728451657e-07, |
|
"loss": 1.0704, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 0.9824452071428899, |
|
"grad_norm": 0.0751953125, |
|
"learning_rate": 1.7654426226155763e-07, |
|
"loss": 1.0678, |
|
"step": 1675 |
|
}, |
|
{ |
|
"epoch": 0.9853778794030179, |
|
"grad_norm": 0.07275390625, |
|
"learning_rate": 1.2092619135937177e-07, |
|
"loss": 1.067, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 0.988310551663146, |
|
"grad_norm": 0.07373046875, |
|
"learning_rate": 7.579451396908521e-08, |
|
"loss": 1.0654, |
|
"step": 1685 |
|
}, |
|
{ |
|
"epoch": 0.9912432239232739, |
|
"grad_norm": 0.0751953125, |
|
"learning_rate": 4.1153968505991406e-08, |
|
"loss": 1.0512, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 0.994175896183402, |
|
"grad_norm": 0.07568359375, |
|
"learning_rate": 1.7008191912004646e-08, |
|
"loss": 1.075, |
|
"step": 1695 |
|
}, |
|
{ |
|
"epoch": 0.99710856844353, |
|
"grad_norm": 0.07568359375, |
|
"learning_rate": 3.359719273865469e-09, |
|
"loss": 1.0838, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.9994547062516325, |
|
"eval_loss": 1.1863435506820679, |
|
"eval_runtime": 1954.7038, |
|
"eval_samples_per_second": 8.263, |
|
"eval_steps_per_second": 8.263, |
|
"step": 1704 |
|
}, |
|
{ |
|
"epoch": 0.9994547062516325, |
|
"step": 1704, |
|
"total_flos": 4.734725057610252e+18, |
|
"train_loss": 0.12850537406446788, |
|
"train_runtime": 13780.2764, |
|
"train_samples_per_second": 15.836, |
|
"train_steps_per_second": 0.124 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 1704, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 20, |
|
"total_flos": 4.734725057610252e+18, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|