{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9994547062516325, "eval_steps": 500, "global_step": 1704, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005865344520256058, "grad_norm": 0.134765625, "learning_rate": 1.1695906432748538e-06, "loss": 1.5828, "step": 1 }, { "epoch": 0.0029326722601280297, "grad_norm": 0.1298828125, "learning_rate": 5.8479532163742686e-06, "loss": 1.5339, "step": 5 }, { "epoch": 0.005865344520256059, "grad_norm": 0.12890625, "learning_rate": 1.1695906432748537e-05, "loss": 1.5449, "step": 10 }, { "epoch": 0.008798016780384088, "grad_norm": 0.1328125, "learning_rate": 1.7543859649122806e-05, "loss": 1.5426, "step": 15 }, { "epoch": 0.011730689040512119, "grad_norm": 0.1279296875, "learning_rate": 2.3391812865497074e-05, "loss": 1.5446, "step": 20 }, { "epoch": 0.014663361300640148, "grad_norm": 0.142578125, "learning_rate": 2.9239766081871346e-05, "loss": 1.5188, "step": 25 }, { "epoch": 0.017596033560768175, "grad_norm": 0.1298828125, "learning_rate": 3.508771929824561e-05, "loss": 1.5175, "step": 30 }, { "epoch": 0.020528705820896206, "grad_norm": 0.119140625, "learning_rate": 4.093567251461988e-05, "loss": 1.4769, "step": 35 }, { "epoch": 0.023461378081024237, "grad_norm": 0.1171875, "learning_rate": 4.678362573099415e-05, "loss": 1.4503, "step": 40 }, { "epoch": 0.026394050341152265, "grad_norm": 0.0947265625, "learning_rate": 5.2631578947368424e-05, "loss": 1.4078, "step": 45 }, { "epoch": 0.029326722601280296, "grad_norm": 0.08642578125, "learning_rate": 5.847953216374269e-05, "loss": 1.3803, "step": 50 }, { "epoch": 0.03225939486140832, "grad_norm": 0.07373046875, "learning_rate": 6.432748538011695e-05, "loss": 1.365, "step": 55 }, { "epoch": 0.03519206712153635, "grad_norm": 0.06689453125, "learning_rate": 7.017543859649122e-05, "loss": 1.3275, "step": 60 }, { "epoch": 0.038124739381664385, "grad_norm": 0.0634765625, "learning_rate": 7.602339181286549e-05, "loss": 1.3026, "step": 65 }, { "epoch": 0.04105741164179241, "grad_norm": 0.06005859375, "learning_rate": 8.187134502923976e-05, "loss": 1.2994, "step": 70 }, { "epoch": 0.04399008390192044, "grad_norm": 0.0634765625, "learning_rate": 8.771929824561403e-05, "loss": 1.2961, "step": 75 }, { "epoch": 0.046922756162048475, "grad_norm": 0.06201171875, "learning_rate": 9.35672514619883e-05, "loss": 1.2742, "step": 80 }, { "epoch": 0.0498554284221765, "grad_norm": 0.04833984375, "learning_rate": 9.941520467836257e-05, "loss": 1.2694, "step": 85 }, { "epoch": 0.05278810068230453, "grad_norm": 0.0537109375, "learning_rate": 0.00010526315789473685, "loss": 1.2601, "step": 90 }, { "epoch": 0.05572077294243256, "grad_norm": 0.05419921875, "learning_rate": 0.00011111111111111112, "loss": 1.2419, "step": 95 }, { "epoch": 0.05865344520256059, "grad_norm": 0.053466796875, "learning_rate": 0.00011695906432748539, "loss": 1.2357, "step": 100 }, { "epoch": 0.06158611746268862, "grad_norm": 0.060302734375, "learning_rate": 0.00012280701754385965, "loss": 1.2227, "step": 105 }, { "epoch": 0.06451878972281665, "grad_norm": 0.052978515625, "learning_rate": 0.0001286549707602339, "loss": 1.2327, "step": 110 }, { "epoch": 0.06745146198294467, "grad_norm": 0.061767578125, "learning_rate": 0.0001345029239766082, "loss": 1.2221, "step": 115 }, { "epoch": 0.0703841342430727, "grad_norm": 0.05859375, "learning_rate": 0.00014035087719298245, "loss": 1.209, "step": 120 }, { "epoch": 0.07331680650320074, "grad_norm": 0.064453125, "learning_rate": 0.00014619883040935673, "loss": 1.2159, "step": 125 }, { "epoch": 0.07624947876332877, "grad_norm": 0.0791015625, "learning_rate": 0.00015204678362573098, "loss": 1.1813, "step": 130 }, { "epoch": 0.0791821510234568, "grad_norm": 0.0693359375, "learning_rate": 0.00015789473684210527, "loss": 1.2059, "step": 135 }, { "epoch": 0.08211482328358483, "grad_norm": 0.06396484375, "learning_rate": 0.00016374269005847952, "loss": 1.2146, "step": 140 }, { "epoch": 0.08504749554371285, "grad_norm": 0.09326171875, "learning_rate": 0.0001695906432748538, "loss": 1.1965, "step": 145 }, { "epoch": 0.08798016780384088, "grad_norm": 0.0791015625, "learning_rate": 0.00017543859649122806, "loss": 1.1822, "step": 150 }, { "epoch": 0.09091284006396891, "grad_norm": 0.08349609375, "learning_rate": 0.00018128654970760234, "loss": 1.1824, "step": 155 }, { "epoch": 0.09384551232409695, "grad_norm": 0.078125, "learning_rate": 0.0001871345029239766, "loss": 1.1992, "step": 160 }, { "epoch": 0.09677818458422498, "grad_norm": 0.0791015625, "learning_rate": 0.00019298245614035088, "loss": 1.1806, "step": 165 }, { "epoch": 0.099710856844353, "grad_norm": 0.07666015625, "learning_rate": 0.00019883040935672513, "loss": 1.1926, "step": 170 }, { "epoch": 0.10264352910448103, "grad_norm": 0.07666015625, "learning_rate": 0.00019999664028072614, "loss": 1.1773, "step": 175 }, { "epoch": 0.10557620136460906, "grad_norm": 0.0830078125, "learning_rate": 0.000199982991808088, "loss": 1.1422, "step": 180 }, { "epoch": 0.10850887362473709, "grad_norm": 0.11279296875, "learning_rate": 0.00019995884603149402, "loss": 1.1739, "step": 185 }, { "epoch": 0.11144154588486511, "grad_norm": 0.0859375, "learning_rate": 0.00019992420548603092, "loss": 1.1652, "step": 190 }, { "epoch": 0.11437421814499316, "grad_norm": 0.08154296875, "learning_rate": 0.00019987907380864062, "loss": 1.1597, "step": 195 }, { "epoch": 0.11730689040512118, "grad_norm": 0.07861328125, "learning_rate": 0.00019982345573773844, "loss": 1.1497, "step": 200 }, { "epoch": 0.12023956266524921, "grad_norm": 0.08544921875, "learning_rate": 0.00019975735711271552, "loss": 1.1419, "step": 205 }, { "epoch": 0.12317223492537724, "grad_norm": 0.080078125, "learning_rate": 0.00019968078487332566, "loss": 1.1583, "step": 210 }, { "epoch": 0.12610490718550527, "grad_norm": 0.08203125, "learning_rate": 0.000199593747058957, "loss": 1.1501, "step": 215 }, { "epoch": 0.1290375794456333, "grad_norm": 0.0830078125, "learning_rate": 0.00019949625280778777, "loss": 1.1552, "step": 220 }, { "epoch": 0.13197025170576132, "grad_norm": 0.080078125, "learning_rate": 0.00019938831235582672, "loss": 1.1285, "step": 225 }, { "epoch": 0.13490292396588935, "grad_norm": 0.07861328125, "learning_rate": 0.0001992699370358387, "loss": 1.1525, "step": 230 }, { "epoch": 0.13783559622601738, "grad_norm": 0.0830078125, "learning_rate": 0.00019914113927615472, "loss": 1.1405, "step": 235 }, { "epoch": 0.1407682684861454, "grad_norm": 0.07861328125, "learning_rate": 0.00019900193259936704, "loss": 1.1432, "step": 240 }, { "epoch": 0.14370094074627343, "grad_norm": 0.08056640625, "learning_rate": 0.00019885233162090946, "loss": 1.1523, "step": 245 }, { "epoch": 0.1466336130064015, "grad_norm": 0.0751953125, "learning_rate": 0.00019869235204752285, "loss": 1.1592, "step": 250 }, { "epoch": 0.14956628526652951, "grad_norm": 0.08544921875, "learning_rate": 0.00019852201067560606, "loss": 1.1535, "step": 255 }, { "epoch": 0.15249895752665754, "grad_norm": 0.0771484375, "learning_rate": 0.00019834132538945246, "loss": 1.1429, "step": 260 }, { "epoch": 0.15543162978678557, "grad_norm": 0.07763671875, "learning_rate": 0.00019815031515937225, "loss": 1.1385, "step": 265 }, { "epoch": 0.1583643020469136, "grad_norm": 0.08642578125, "learning_rate": 0.00019794900003970077, "loss": 1.1356, "step": 270 }, { "epoch": 0.16129697430704162, "grad_norm": 0.0849609375, "learning_rate": 0.00019773740116669288, "loss": 1.1311, "step": 275 }, { "epoch": 0.16422964656716965, "grad_norm": 0.0869140625, "learning_rate": 0.00019751554075630404, "loss": 1.1243, "step": 280 }, { "epoch": 0.16716231882729768, "grad_norm": 0.0908203125, "learning_rate": 0.0001972834421018576, "loss": 1.1475, "step": 285 }, { "epoch": 0.1700949910874257, "grad_norm": 0.08056640625, "learning_rate": 0.0001970411295715994, "loss": 1.1338, "step": 290 }, { "epoch": 0.17302766334755373, "grad_norm": 0.0810546875, "learning_rate": 0.0001967886286061393, "loss": 1.1238, "step": 295 }, { "epoch": 0.17596033560768176, "grad_norm": 0.08642578125, "learning_rate": 0.00019652596571578004, "loss": 1.1322, "step": 300 }, { "epoch": 0.1788930078678098, "grad_norm": 0.0712890625, "learning_rate": 0.00019625316847773395, "loss": 1.1305, "step": 305 }, { "epoch": 0.18182568012793782, "grad_norm": 0.07666015625, "learning_rate": 0.0001959702655332277, "loss": 1.1273, "step": 310 }, { "epoch": 0.18475835238806584, "grad_norm": 0.08349609375, "learning_rate": 0.00019567728658449504, "loss": 1.1306, "step": 315 }, { "epoch": 0.1876910246481939, "grad_norm": 0.0830078125, "learning_rate": 0.00019537426239165853, "loss": 1.1239, "step": 320 }, { "epoch": 0.19062369690832193, "grad_norm": 0.0869140625, "learning_rate": 0.00019506122476949981, "loss": 1.1234, "step": 325 }, { "epoch": 0.19355636916844995, "grad_norm": 0.09228515625, "learning_rate": 0.00019473820658411957, "loss": 1.1115, "step": 330 }, { "epoch": 0.19648904142857798, "grad_norm": 0.09033203125, "learning_rate": 0.0001944052417494867, "loss": 1.1127, "step": 335 }, { "epoch": 0.199421713688706, "grad_norm": 0.08740234375, "learning_rate": 0.0001940623652238777, "loss": 1.1315, "step": 340 }, { "epoch": 0.20235438594883404, "grad_norm": 0.0771484375, "learning_rate": 0.00019370961300620637, "loss": 1.1211, "step": 345 }, { "epoch": 0.20528705820896206, "grad_norm": 0.07421875, "learning_rate": 0.00019334702213224446, "loss": 1.1162, "step": 350 }, { "epoch": 0.2082197304690901, "grad_norm": 0.07421875, "learning_rate": 0.00019297463067073287, "loss": 1.129, "step": 355 }, { "epoch": 0.21115240272921812, "grad_norm": 0.0791015625, "learning_rate": 0.000192592477719385, "loss": 1.108, "step": 360 }, { "epoch": 0.21408507498934615, "grad_norm": 0.080078125, "learning_rate": 0.00019220060340078188, "loss": 1.1167, "step": 365 }, { "epoch": 0.21701774724947417, "grad_norm": 0.0869140625, "learning_rate": 0.00019179904885815958, "loss": 1.1485, "step": 370 }, { "epoch": 0.2199504195096022, "grad_norm": 0.0751953125, "learning_rate": 0.00019138785625108957, "loss": 1.1183, "step": 375 }, { "epoch": 0.22288309176973023, "grad_norm": 0.0869140625, "learning_rate": 0.00019096706875105235, "loss": 1.1176, "step": 380 }, { "epoch": 0.22581576402985826, "grad_norm": 0.08203125, "learning_rate": 0.0001905367305369048, "loss": 1.1184, "step": 385 }, { "epoch": 0.2287484362899863, "grad_norm": 0.0732421875, "learning_rate": 0.0001900968867902419, "loss": 1.1197, "step": 390 }, { "epoch": 0.23168110855011434, "grad_norm": 0.09521484375, "learning_rate": 0.000189647583690653, "loss": 1.1265, "step": 395 }, { "epoch": 0.23461378081024237, "grad_norm": 0.08251953125, "learning_rate": 0.00018918886841087334, "loss": 1.1209, "step": 400 }, { "epoch": 0.2375464530703704, "grad_norm": 0.0732421875, "learning_rate": 0.00018872078911183146, "loss": 1.1084, "step": 405 }, { "epoch": 0.24047912533049842, "grad_norm": 0.08447265625, "learning_rate": 0.00018824339493759263, "loss": 1.132, "step": 410 }, { "epoch": 0.24341179759062645, "grad_norm": 0.07666015625, "learning_rate": 0.00018775673601019923, "loss": 1.1122, "step": 415 }, { "epoch": 0.24634446985075448, "grad_norm": 0.07666015625, "learning_rate": 0.00018726086342440846, "loss": 1.1161, "step": 420 }, { "epoch": 0.2492771421108825, "grad_norm": 0.07275390625, "learning_rate": 0.00018675582924232762, "loss": 1.1183, "step": 425 }, { "epoch": 0.25220981437101053, "grad_norm": 0.078125, "learning_rate": 0.00018624168648794832, "loss": 1.1029, "step": 430 }, { "epoch": 0.2551424866311386, "grad_norm": 0.07666015625, "learning_rate": 0.0001857184891415794, "loss": 1.1236, "step": 435 }, { "epoch": 0.2580751588912666, "grad_norm": 0.0810546875, "learning_rate": 0.00018518629213417929, "loss": 1.1036, "step": 440 }, { "epoch": 0.26100783115139464, "grad_norm": 0.07275390625, "learning_rate": 0.00018464515134158896, "loss": 1.1047, "step": 445 }, { "epoch": 0.26394050341152264, "grad_norm": 0.07470703125, "learning_rate": 0.00018409512357866548, "loss": 1.1153, "step": 450 }, { "epoch": 0.2668731756716507, "grad_norm": 0.07666015625, "learning_rate": 0.00018353626659331683, "loss": 1.1238, "step": 455 }, { "epoch": 0.2698058479317787, "grad_norm": 0.07568359375, "learning_rate": 0.00018296863906043894, "loss": 1.1149, "step": 460 }, { "epoch": 0.27273852019190675, "grad_norm": 0.08056640625, "learning_rate": 0.00018239230057575542, "loss": 1.1112, "step": 465 }, { "epoch": 0.27567119245203475, "grad_norm": 0.0751953125, "learning_rate": 0.0001818073116495606, "loss": 1.1043, "step": 470 }, { "epoch": 0.2786038647121628, "grad_norm": 0.0732421875, "learning_rate": 0.0001812137337003663, "loss": 1.1055, "step": 475 }, { "epoch": 0.2815365369722908, "grad_norm": 0.0732421875, "learning_rate": 0.00018061162904845358, "loss": 1.1059, "step": 480 }, { "epoch": 0.28446920923241886, "grad_norm": 0.07666015625, "learning_rate": 0.0001800010609093298, "loss": 1.1062, "step": 485 }, { "epoch": 0.28740188149254686, "grad_norm": 0.07763671875, "learning_rate": 0.00017938209338709123, "loss": 1.115, "step": 490 }, { "epoch": 0.2903345537526749, "grad_norm": 0.076171875, "learning_rate": 0.00017875479146769305, "loss": 1.0987, "step": 495 }, { "epoch": 0.293267226012803, "grad_norm": 0.08056640625, "learning_rate": 0.0001781192210121262, "loss": 1.1072, "step": 500 }, { "epoch": 0.296199898272931, "grad_norm": 0.07421875, "learning_rate": 0.00017747544874950272, "loss": 1.1027, "step": 505 }, { "epoch": 0.29913257053305903, "grad_norm": 0.08837890625, "learning_rate": 0.00017682354227004963, "loss": 1.1004, "step": 510 }, { "epoch": 0.302065242793187, "grad_norm": 0.07763671875, "learning_rate": 0.0001761635700180127, "loss": 1.1012, "step": 515 }, { "epoch": 0.3049979150533151, "grad_norm": 0.07861328125, "learning_rate": 0.00017549560128447047, "loss": 1.0992, "step": 520 }, { "epoch": 0.3079305873134431, "grad_norm": 0.10888671875, "learning_rate": 0.00017481970620005912, "loss": 1.1039, "step": 525 }, { "epoch": 0.31086325957357114, "grad_norm": 0.07421875, "learning_rate": 0.00017413595572760961, "loss": 1.1154, "step": 530 }, { "epoch": 0.31379593183369914, "grad_norm": 0.07861328125, "learning_rate": 0.00017344442165469714, "loss": 1.0995, "step": 535 }, { "epoch": 0.3167286040938272, "grad_norm": 0.07861328125, "learning_rate": 0.00017274517658610398, "loss": 1.0934, "step": 540 }, { "epoch": 0.3196612763539552, "grad_norm": 0.08154296875, "learning_rate": 0.0001720382939361969, "loss": 1.0801, "step": 545 }, { "epoch": 0.32259394861408325, "grad_norm": 0.07763671875, "learning_rate": 0.00017132384792121905, "loss": 1.0932, "step": 550 }, { "epoch": 0.32552662087421125, "grad_norm": 0.076171875, "learning_rate": 0.0001706019135514982, "loss": 1.12, "step": 555 }, { "epoch": 0.3284592931343393, "grad_norm": 0.0927734375, "learning_rate": 0.00016987256662357106, "loss": 1.129, "step": 560 }, { "epoch": 0.33139196539446736, "grad_norm": 0.0869140625, "learning_rate": 0.00016913588371222557, "loss": 1.1059, "step": 565 }, { "epoch": 0.33432463765459536, "grad_norm": 0.08642578125, "learning_rate": 0.00016839194216246108, "loss": 1.0923, "step": 570 }, { "epoch": 0.3372573099147234, "grad_norm": 0.0849609375, "learning_rate": 0.00016764082008136795, "loss": 1.0909, "step": 575 }, { "epoch": 0.3401899821748514, "grad_norm": 0.0771484375, "learning_rate": 0.00016688259632992693, "loss": 1.096, "step": 580 }, { "epoch": 0.34312265443497947, "grad_norm": 0.08056640625, "learning_rate": 0.0001661173505147295, "loss": 1.091, "step": 585 }, { "epoch": 0.34605532669510747, "grad_norm": 0.07470703125, "learning_rate": 0.00016534516297961996, "loss": 1.0937, "step": 590 }, { "epoch": 0.3489879989552355, "grad_norm": 0.0869140625, "learning_rate": 0.00016456611479725996, "loss": 1.1003, "step": 595 }, { "epoch": 0.3519206712153635, "grad_norm": 0.07763671875, "learning_rate": 0.00016378028776061667, "loss": 1.0941, "step": 600 }, { "epoch": 0.3548533434754916, "grad_norm": 0.07763671875, "learning_rate": 0.00016298776437437523, "loss": 1.1158, "step": 605 }, { "epoch": 0.3577860157356196, "grad_norm": 0.09228515625, "learning_rate": 0.00016218862784627658, "loss": 1.1004, "step": 610 }, { "epoch": 0.36071868799574763, "grad_norm": 0.08251953125, "learning_rate": 0.00016138296207838127, "loss": 1.079, "step": 615 }, { "epoch": 0.36365136025587563, "grad_norm": 0.08349609375, "learning_rate": 0.00016057085165826072, "loss": 1.0907, "step": 620 }, { "epoch": 0.3665840325160037, "grad_norm": 0.07861328125, "learning_rate": 0.00015975238185011602, "loss": 1.0869, "step": 625 }, { "epoch": 0.3695167047761317, "grad_norm": 0.0771484375, "learning_rate": 0.0001589276385858262, "loss": 1.0919, "step": 630 }, { "epoch": 0.37244937703625974, "grad_norm": 0.07421875, "learning_rate": 0.00015809670845592604, "loss": 1.095, "step": 635 }, { "epoch": 0.3753820492963878, "grad_norm": 0.07470703125, "learning_rate": 0.0001572596787005149, "loss": 1.0989, "step": 640 }, { "epoch": 0.3783147215565158, "grad_norm": 0.0791015625, "learning_rate": 0.00015641663720009733, "loss": 1.0984, "step": 645 }, { "epoch": 0.38124739381664385, "grad_norm": 0.07470703125, "learning_rate": 0.00015556767246635626, "loss": 1.1086, "step": 650 }, { "epoch": 0.38418006607677185, "grad_norm": 0.08203125, "learning_rate": 0.00015471287363286038, "loss": 1.0858, "step": 655 }, { "epoch": 0.3871127383368999, "grad_norm": 0.16796875, "learning_rate": 0.00015385233044570555, "loss": 1.096, "step": 660 }, { "epoch": 0.3900454105970279, "grad_norm": 0.07958984375, "learning_rate": 0.00015298613325409263, "loss": 1.089, "step": 665 }, { "epoch": 0.39297808285715596, "grad_norm": 0.07861328125, "learning_rate": 0.00015211437300084136, "loss": 1.0853, "step": 670 }, { "epoch": 0.39591075511728396, "grad_norm": 0.07470703125, "learning_rate": 0.0001512371412128424, "loss": 1.1072, "step": 675 }, { "epoch": 0.398843427377412, "grad_norm": 0.07666015625, "learning_rate": 0.00015035452999144762, "loss": 1.0836, "step": 680 }, { "epoch": 0.40177609963754, "grad_norm": 0.0859375, "learning_rate": 0.00014946663200280063, "loss": 1.0942, "step": 685 }, { "epoch": 0.4047087718976681, "grad_norm": 0.072265625, "learning_rate": 0.00014857354046810732, "loss": 1.093, "step": 690 }, { "epoch": 0.4076414441577961, "grad_norm": 0.0712890625, "learning_rate": 0.00014767534915384865, "loss": 1.0874, "step": 695 }, { "epoch": 0.41057411641792413, "grad_norm": 0.07568359375, "learning_rate": 0.00014677215236193604, "loss": 1.0787, "step": 700 }, { "epoch": 0.41350678867805213, "grad_norm": 0.07568359375, "learning_rate": 0.00014586404491981052, "loss": 1.1116, "step": 705 }, { "epoch": 0.4164394609381802, "grad_norm": 0.07373046875, "learning_rate": 0.00014495112217048658, "loss": 1.1018, "step": 710 }, { "epoch": 0.41937213319830824, "grad_norm": 0.07470703125, "learning_rate": 0.00014403347996254232, "loss": 1.0945, "step": 715 }, { "epoch": 0.42230480545843624, "grad_norm": 0.07275390625, "learning_rate": 0.00014311121464005583, "loss": 1.0836, "step": 720 }, { "epoch": 0.4252374777185643, "grad_norm": 0.076171875, "learning_rate": 0.00014218442303249026, "loss": 1.0765, "step": 725 }, { "epoch": 0.4281701499786923, "grad_norm": 0.0791015625, "learning_rate": 0.0001412532024445275, "loss": 1.0928, "step": 730 }, { "epoch": 0.43110282223882035, "grad_norm": 0.07763671875, "learning_rate": 0.00014031765064585197, "loss": 1.0855, "step": 735 }, { "epoch": 0.43403549449894835, "grad_norm": 0.07275390625, "learning_rate": 0.00013937786586088583, "loss": 1.0841, "step": 740 }, { "epoch": 0.4369681667590764, "grad_norm": 0.07421875, "learning_rate": 0.00013843394675847634, "loss": 1.0571, "step": 745 }, { "epoch": 0.4399008390192044, "grad_norm": 0.0810546875, "learning_rate": 0.00013748599244153633, "loss": 1.0945, "step": 750 }, { "epoch": 0.44283351127933246, "grad_norm": 0.0771484375, "learning_rate": 0.00013653410243663952, "loss": 1.0922, "step": 755 }, { "epoch": 0.44576618353946046, "grad_norm": 0.0771484375, "learning_rate": 0.000135578376683571, "loss": 1.097, "step": 760 }, { "epoch": 0.4486988557995885, "grad_norm": 0.08056640625, "learning_rate": 0.00013461891552483444, "loss": 1.09, "step": 765 }, { "epoch": 0.4516315280597165, "grad_norm": 0.07666015625, "learning_rate": 0.00013365581969511725, "loss": 1.0844, "step": 770 }, { "epoch": 0.45456420031984457, "grad_norm": 0.080078125, "learning_rate": 0.00013268919031071406, "loss": 1.099, "step": 775 }, { "epoch": 0.4574968725799726, "grad_norm": 0.07861328125, "learning_rate": 0.00013171912885891063, "loss": 1.0919, "step": 780 }, { "epoch": 0.4604295448401006, "grad_norm": 0.07470703125, "learning_rate": 0.00013074573718732858, "loss": 1.0743, "step": 785 }, { "epoch": 0.4633622171002287, "grad_norm": 0.076171875, "learning_rate": 0.0001297691174932322, "loss": 1.0849, "step": 790 }, { "epoch": 0.4662948893603567, "grad_norm": 0.07421875, "learning_rate": 0.00012878937231279892, "loss": 1.0769, "step": 795 }, { "epoch": 0.46922756162048473, "grad_norm": 0.08154296875, "learning_rate": 0.0001278066045103536, "loss": 1.0656, "step": 800 }, { "epoch": 0.47216023388061273, "grad_norm": 0.0791015625, "learning_rate": 0.00012682091726756904, "loss": 1.0847, "step": 805 }, { "epoch": 0.4750929061407408, "grad_norm": 0.078125, "learning_rate": 0.0001258324140726326, "loss": 1.0789, "step": 810 }, { "epoch": 0.4780255784008688, "grad_norm": 0.0732421875, "learning_rate": 0.00012484119870938103, "loss": 1.1031, "step": 815 }, { "epoch": 0.48095825066099684, "grad_norm": 0.0751953125, "learning_rate": 0.00012384737524640405, "loss": 1.0838, "step": 820 }, { "epoch": 0.48389092292112484, "grad_norm": 0.07275390625, "learning_rate": 0.00012285104802611812, "loss": 1.0795, "step": 825 }, { "epoch": 0.4868235951812529, "grad_norm": 0.07763671875, "learning_rate": 0.00012185232165381141, "loss": 1.0936, "step": 830 }, { "epoch": 0.4897562674413809, "grad_norm": 0.07666015625, "learning_rate": 0.00012085130098666124, "loss": 1.0845, "step": 835 }, { "epoch": 0.49268893970150895, "grad_norm": 0.09130859375, "learning_rate": 0.00011984809112272495, "loss": 1.0807, "step": 840 }, { "epoch": 0.49562161196163695, "grad_norm": 0.07666015625, "learning_rate": 0.00011884279738990565, "loss": 1.0834, "step": 845 }, { "epoch": 0.498554284221765, "grad_norm": 0.08203125, "learning_rate": 0.00011783552533489372, "loss": 1.0944, "step": 850 }, { "epoch": 0.501486956481893, "grad_norm": 0.0771484375, "learning_rate": 0.00011682638071208533, "loss": 1.095, "step": 855 }, { "epoch": 0.5044196287420211, "grad_norm": 0.08056640625, "learning_rate": 0.00011581546947247927, "loss": 1.0819, "step": 860 }, { "epoch": 0.5073523010021491, "grad_norm": 0.0732421875, "learning_rate": 0.00011480289775255295, "loss": 1.0875, "step": 865 }, { "epoch": 0.5102849732622772, "grad_norm": 0.07666015625, "learning_rate": 0.00011378877186311912, "loss": 1.0879, "step": 870 }, { "epoch": 0.5132176455224051, "grad_norm": 0.0751953125, "learning_rate": 0.00011277319827816423, "loss": 1.1074, "step": 875 }, { "epoch": 0.5161503177825332, "grad_norm": 0.0771484375, "learning_rate": 0.0001117562836236695, "loss": 1.0842, "step": 880 }, { "epoch": 0.5190829900426612, "grad_norm": 0.078125, "learning_rate": 0.00011073813466641632, "loss": 1.0812, "step": 885 }, { "epoch": 0.5220156623027893, "grad_norm": 0.0732421875, "learning_rate": 0.00010971885830277657, "loss": 1.108, "step": 890 }, { "epoch": 0.5249483345629172, "grad_norm": 0.0849609375, "learning_rate": 0.00010869856154748956, "loss": 1.0808, "step": 895 }, { "epoch": 0.5278810068230453, "grad_norm": 0.076171875, "learning_rate": 0.00010767735152242649, "loss": 1.0734, "step": 900 }, { "epoch": 0.5308136790831733, "grad_norm": 0.07470703125, "learning_rate": 0.00010665533544534343, "loss": 1.0865, "step": 905 }, { "epoch": 0.5337463513433014, "grad_norm": 0.078125, "learning_rate": 0.00010563262061862471, "loss": 1.0818, "step": 910 }, { "epoch": 0.5366790236034293, "grad_norm": 0.07421875, "learning_rate": 0.000104609314418017, "loss": 1.0719, "step": 915 }, { "epoch": 0.5396116958635574, "grad_norm": 0.07763671875, "learning_rate": 0.00010358552428135575, "loss": 1.0836, "step": 920 }, { "epoch": 0.5425443681236854, "grad_norm": 0.07421875, "learning_rate": 0.00010256135769728539, "loss": 1.0967, "step": 925 }, { "epoch": 0.5454770403838135, "grad_norm": 0.0732421875, "learning_rate": 0.00010153692219397387, "loss": 1.0691, "step": 930 }, { "epoch": 0.5484097126439416, "grad_norm": 0.07275390625, "learning_rate": 0.00010051232532782313, "loss": 1.0766, "step": 935 }, { "epoch": 0.5513423849040695, "grad_norm": 0.07666015625, "learning_rate": 9.94876746721769e-05, "loss": 1.0727, "step": 940 }, { "epoch": 0.5542750571641976, "grad_norm": 0.07568359375, "learning_rate": 9.84630778060262e-05, "loss": 1.0784, "step": 945 }, { "epoch": 0.5572077294243256, "grad_norm": 0.0791015625, "learning_rate": 9.743864230271465e-05, "loss": 1.0809, "step": 950 }, { "epoch": 0.5601404016844537, "grad_norm": 0.078125, "learning_rate": 9.641447571864429e-05, "loss": 1.0809, "step": 955 }, { "epoch": 0.5630730739445816, "grad_norm": 0.07470703125, "learning_rate": 9.539068558198304e-05, "loss": 1.0784, "step": 960 }, { "epoch": 0.5660057462047097, "grad_norm": 0.0751953125, "learning_rate": 9.436737938137531e-05, "loss": 1.0707, "step": 965 }, { "epoch": 0.5689384184648377, "grad_norm": 0.0751953125, "learning_rate": 9.33446645546566e-05, "loss": 1.0821, "step": 970 }, { "epoch": 0.5718710907249658, "grad_norm": 0.07666015625, "learning_rate": 9.232264847757357e-05, "loss": 1.0715, "step": 975 }, { "epoch": 0.5748037629850937, "grad_norm": 0.078125, "learning_rate": 9.130143845251046e-05, "loss": 1.0746, "step": 980 }, { "epoch": 0.5777364352452218, "grad_norm": 0.0732421875, "learning_rate": 9.028114169722347e-05, "loss": 1.0987, "step": 985 }, { "epoch": 0.5806691075053498, "grad_norm": 0.07666015625, "learning_rate": 8.92618653335837e-05, "loss": 1.0758, "step": 990 }, { "epoch": 0.5836017797654779, "grad_norm": 0.07373046875, "learning_rate": 8.824371637633053e-05, "loss": 1.0523, "step": 995 }, { "epoch": 0.586534452025606, "grad_norm": 0.07666015625, "learning_rate": 8.722680172183578e-05, "loss": 1.0797, "step": 1000 }, { "epoch": 0.5894671242857339, "grad_norm": 0.07470703125, "learning_rate": 8.62112281368809e-05, "loss": 1.0951, "step": 1005 }, { "epoch": 0.592399796545862, "grad_norm": 0.076171875, "learning_rate": 8.519710224744709e-05, "loss": 1.0833, "step": 1010 }, { "epoch": 0.59533246880599, "grad_norm": 0.07861328125, "learning_rate": 8.418453052752076e-05, "loss": 1.0717, "step": 1015 }, { "epoch": 0.5982651410661181, "grad_norm": 0.078125, "learning_rate": 8.317361928791469e-05, "loss": 1.0757, "step": 1020 }, { "epoch": 0.601197813326246, "grad_norm": 0.07861328125, "learning_rate": 8.216447466510631e-05, "loss": 1.0819, "step": 1025 }, { "epoch": 0.604130485586374, "grad_norm": 0.076171875, "learning_rate": 8.115720261009437e-05, "loss": 1.075, "step": 1030 }, { "epoch": 0.6070631578465021, "grad_norm": 0.0751953125, "learning_rate": 8.015190887727509e-05, "loss": 1.0778, "step": 1035 }, { "epoch": 0.6099958301066302, "grad_norm": 0.07666015625, "learning_rate": 7.914869901333877e-05, "loss": 1.0564, "step": 1040 }, { "epoch": 0.6129285023667581, "grad_norm": 0.07470703125, "learning_rate": 7.81476783461886e-05, "loss": 1.0812, "step": 1045 }, { "epoch": 0.6158611746268862, "grad_norm": 0.0751953125, "learning_rate": 7.714895197388189e-05, "loss": 1.0919, "step": 1050 }, { "epoch": 0.6187938468870142, "grad_norm": 0.0732421875, "learning_rate": 7.615262475359597e-05, "loss": 1.0833, "step": 1055 }, { "epoch": 0.6217265191471423, "grad_norm": 0.07275390625, "learning_rate": 7.5158801290619e-05, "loss": 1.081, "step": 1060 }, { "epoch": 0.6246591914072703, "grad_norm": 0.07568359375, "learning_rate": 7.416758592736744e-05, "loss": 1.0686, "step": 1065 }, { "epoch": 0.6275918636673983, "grad_norm": 0.0732421875, "learning_rate": 7.3179082732431e-05, "loss": 1.0728, "step": 1070 }, { "epoch": 0.6305245359275263, "grad_norm": 0.0771484375, "learning_rate": 7.219339548964644e-05, "loss": 1.0735, "step": 1075 }, { "epoch": 0.6334572081876544, "grad_norm": 0.0771484375, "learning_rate": 7.12106276872011e-05, "loss": 1.0621, "step": 1080 }, { "epoch": 0.6363898804477824, "grad_norm": 0.0751953125, "learning_rate": 7.023088250676784e-05, "loss": 1.071, "step": 1085 }, { "epoch": 0.6393225527079104, "grad_norm": 0.07763671875, "learning_rate": 6.925426281267147e-05, "loss": 1.0794, "step": 1090 }, { "epoch": 0.6422552249680384, "grad_norm": 0.0791015625, "learning_rate": 6.82808711410894e-05, "loss": 1.0637, "step": 1095 }, { "epoch": 0.6451878972281665, "grad_norm": 0.07275390625, "learning_rate": 6.731080968928599e-05, "loss": 1.0665, "step": 1100 }, { "epoch": 0.6481205694882946, "grad_norm": 0.0732421875, "learning_rate": 6.63441803048828e-05, "loss": 1.0773, "step": 1105 }, { "epoch": 0.6510532417484225, "grad_norm": 0.07470703125, "learning_rate": 6.538108447516558e-05, "loss": 1.0689, "step": 1110 }, { "epoch": 0.6539859140085506, "grad_norm": 0.0751953125, "learning_rate": 6.442162331642907e-05, "loss": 1.0894, "step": 1115 }, { "epoch": 0.6569185862686786, "grad_norm": 0.072265625, "learning_rate": 6.34658975633605e-05, "loss": 1.0772, "step": 1120 }, { "epoch": 0.6598512585288067, "grad_norm": 0.07470703125, "learning_rate": 6.251400755846372e-05, "loss": 1.0653, "step": 1125 }, { "epoch": 0.6627839307889347, "grad_norm": 0.07568359375, "learning_rate": 6.15660532415237e-05, "loss": 1.0827, "step": 1130 }, { "epoch": 0.6657166030490627, "grad_norm": 0.076171875, "learning_rate": 6.0622134139114194e-05, "loss": 1.0833, "step": 1135 }, { "epoch": 0.6686492753091907, "grad_norm": 0.07373046875, "learning_rate": 5.968234935414807e-05, "loss": 1.081, "step": 1140 }, { "epoch": 0.6715819475693188, "grad_norm": 0.0771484375, "learning_rate": 5.874679755547254e-05, "loss": 1.0955, "step": 1145 }, { "epoch": 0.6745146198294468, "grad_norm": 0.07470703125, "learning_rate": 5.7815576967509733e-05, "loss": 1.0862, "step": 1150 }, { "epoch": 0.6774472920895748, "grad_norm": 0.07763671875, "learning_rate": 5.688878535994421e-05, "loss": 1.0885, "step": 1155 }, { "epoch": 0.6803799643497028, "grad_norm": 0.078125, "learning_rate": 5.5966520037457716e-05, "loss": 1.0578, "step": 1160 }, { "epoch": 0.6833126366098309, "grad_norm": 0.0751953125, "learning_rate": 5.5048877829513424e-05, "loss": 1.0914, "step": 1165 }, { "epoch": 0.6862453088699589, "grad_norm": 0.07470703125, "learning_rate": 5.413595508018952e-05, "loss": 1.0837, "step": 1170 }, { "epoch": 0.6891779811300869, "grad_norm": 0.07568359375, "learning_rate": 5.3227847638064e-05, "loss": 1.0665, "step": 1175 }, { "epoch": 0.6921106533902149, "grad_norm": 0.078125, "learning_rate": 5.232465084615135e-05, "loss": 1.0815, "step": 1180 }, { "epoch": 0.695043325650343, "grad_norm": 0.07470703125, "learning_rate": 5.1426459531892714e-05, "loss": 1.1066, "step": 1185 }, { "epoch": 0.697975997910471, "grad_norm": 0.07666015625, "learning_rate": 5.0533367997199376e-05, "loss": 1.0758, "step": 1190 }, { "epoch": 0.700908670170599, "grad_norm": 0.076171875, "learning_rate": 4.964547000855237e-05, "loss": 1.0704, "step": 1195 }, { "epoch": 0.703841342430727, "grad_norm": 0.07470703125, "learning_rate": 4.876285878715764e-05, "loss": 1.067, "step": 1200 }, { "epoch": 0.7067740146908551, "grad_norm": 0.078125, "learning_rate": 4.7885626999158695e-05, "loss": 1.0796, "step": 1205 }, { "epoch": 0.7097066869509832, "grad_norm": 0.07568359375, "learning_rate": 4.701386674590742e-05, "loss": 1.0572, "step": 1210 }, { "epoch": 0.7126393592111112, "grad_norm": 0.07666015625, "learning_rate": 4.614766955429447e-05, "loss": 1.0896, "step": 1215 }, { "epoch": 0.7155720314712392, "grad_norm": 0.07568359375, "learning_rate": 4.528712636713964e-05, "loss": 1.0699, "step": 1220 }, { "epoch": 0.7185047037313672, "grad_norm": 0.07470703125, "learning_rate": 4.443232753364377e-05, "loss": 1.081, "step": 1225 }, { "epoch": 0.7214373759914953, "grad_norm": 0.0751953125, "learning_rate": 4.358336279990268e-05, "loss": 1.0733, "step": 1230 }, { "epoch": 0.7243700482516233, "grad_norm": 0.07666015625, "learning_rate": 4.274032129948512e-05, "loss": 1.0983, "step": 1235 }, { "epoch": 0.7273027205117513, "grad_norm": 0.07861328125, "learning_rate": 4.1903291544073986e-05, "loss": 1.0868, "step": 1240 }, { "epoch": 0.7302353927718793, "grad_norm": 0.07861328125, "learning_rate": 4.107236141417382e-05, "loss": 1.0871, "step": 1245 }, { "epoch": 0.7331680650320074, "grad_norm": 0.07373046875, "learning_rate": 4.024761814988399e-05, "loss": 1.0787, "step": 1250 }, { "epoch": 0.7361007372921354, "grad_norm": 0.07421875, "learning_rate": 3.942914834173932e-05, "loss": 1.0823, "step": 1255 }, { "epoch": 0.7390334095522634, "grad_norm": 0.0771484375, "learning_rate": 3.8617037921618705e-05, "loss": 1.0632, "step": 1260 }, { "epoch": 0.7419660818123914, "grad_norm": 0.07470703125, "learning_rate": 3.781137215372345e-05, "loss": 1.0764, "step": 1265 }, { "epoch": 0.7448987540725195, "grad_norm": 0.07421875, "learning_rate": 3.701223562562478e-05, "loss": 1.0878, "step": 1270 }, { "epoch": 0.7478314263326475, "grad_norm": 0.07373046875, "learning_rate": 3.621971223938334e-05, "loss": 1.0832, "step": 1275 }, { "epoch": 0.7507640985927756, "grad_norm": 0.076171875, "learning_rate": 3.5433885202740045e-05, "loss": 1.0745, "step": 1280 }, { "epoch": 0.7536967708529035, "grad_norm": 0.07470703125, "learning_rate": 3.4654837020380074e-05, "loss": 1.0719, "step": 1285 }, { "epoch": 0.7566294431130316, "grad_norm": 0.07373046875, "learning_rate": 3.388264948527052e-05, "loss": 1.0929, "step": 1290 }, { "epoch": 0.7595621153731597, "grad_norm": 0.07373046875, "learning_rate": 3.311740367007311e-05, "loss": 1.067, "step": 1295 }, { "epoch": 0.7624947876332877, "grad_norm": 0.07421875, "learning_rate": 3.2359179918632076e-05, "loss": 1.062, "step": 1300 }, { "epoch": 0.7654274598934157, "grad_norm": 0.07275390625, "learning_rate": 3.160805783753897e-05, "loss": 1.06, "step": 1305 }, { "epoch": 0.7683601321535437, "grad_norm": 0.0751953125, "learning_rate": 3.086411628777445e-05, "loss": 1.084, "step": 1310 }, { "epoch": 0.7712928044136718, "grad_norm": 0.078125, "learning_rate": 3.0127433376428983e-05, "loss": 1.0578, "step": 1315 }, { "epoch": 0.7742254766737998, "grad_norm": 0.0732421875, "learning_rate": 2.939808644850184e-05, "loss": 1.0637, "step": 1320 }, { "epoch": 0.7771581489339278, "grad_norm": 0.07177734375, "learning_rate": 2.867615207878096e-05, "loss": 1.0661, "step": 1325 }, { "epoch": 0.7800908211940558, "grad_norm": 0.0751953125, "learning_rate": 2.796170606380313e-05, "loss": 1.0768, "step": 1330 }, { "epoch": 0.7830234934541839, "grad_norm": 0.07470703125, "learning_rate": 2.7254823413896058e-05, "loss": 1.068, "step": 1335 }, { "epoch": 0.7859561657143119, "grad_norm": 0.07470703125, "learning_rate": 2.6555578345302878e-05, "loss": 1.0674, "step": 1340 }, { "epoch": 0.78888883797444, "grad_norm": 0.07373046875, "learning_rate": 2.58640442723904e-05, "loss": 1.0798, "step": 1345 }, { "epoch": 0.7918215102345679, "grad_norm": 0.0751953125, "learning_rate": 2.518029379994089e-05, "loss": 1.0694, "step": 1350 }, { "epoch": 0.794754182494696, "grad_norm": 0.0771484375, "learning_rate": 2.4504398715529554e-05, "loss": 1.0752, "step": 1355 }, { "epoch": 0.797686854754824, "grad_norm": 0.07373046875, "learning_rate": 2.383642998198731e-05, "loss": 1.066, "step": 1360 }, { "epoch": 0.8006195270149521, "grad_norm": 0.0732421875, "learning_rate": 2.317645772995042e-05, "loss": 1.0752, "step": 1365 }, { "epoch": 0.80355219927508, "grad_norm": 0.0751953125, "learning_rate": 2.2524551250497316e-05, "loss": 1.0738, "step": 1370 }, { "epoch": 0.8064848715352081, "grad_norm": 0.07373046875, "learning_rate": 2.1880778987873807e-05, "loss": 1.0717, "step": 1375 }, { "epoch": 0.8094175437953361, "grad_norm": 0.07470703125, "learning_rate": 2.124520853230697e-05, "loss": 1.0781, "step": 1380 }, { "epoch": 0.8123502160554642, "grad_norm": 0.07666015625, "learning_rate": 2.061790661290881e-05, "loss": 1.0774, "step": 1385 }, { "epoch": 0.8152828883155921, "grad_norm": 0.07568359375, "learning_rate": 1.999893909067021e-05, "loss": 1.0817, "step": 1390 }, { "epoch": 0.8182155605757202, "grad_norm": 0.0771484375, "learning_rate": 1.9388370951546432e-05, "loss": 1.0899, "step": 1395 }, { "epoch": 0.8211482328358483, "grad_norm": 0.076171875, "learning_rate": 1.8786266299633738e-05, "loss": 1.0724, "step": 1400 }, { "epoch": 0.8240809050959763, "grad_norm": 0.07421875, "learning_rate": 1.8192688350439424e-05, "loss": 1.0676, "step": 1405 }, { "epoch": 0.8270135773561043, "grad_norm": 0.0732421875, "learning_rate": 1.7607699424244585e-05, "loss": 1.0751, "step": 1410 }, { "epoch": 0.8299462496162323, "grad_norm": 0.07373046875, "learning_rate": 1.7031360939561103e-05, "loss": 1.0992, "step": 1415 }, { "epoch": 0.8328789218763604, "grad_norm": 0.07421875, "learning_rate": 1.646373340668319e-05, "loss": 1.0735, "step": 1420 }, { "epoch": 0.8358115941364884, "grad_norm": 0.0751953125, "learning_rate": 1.5904876421334536e-05, "loss": 1.0686, "step": 1425 }, { "epoch": 0.8387442663966165, "grad_norm": 0.07861328125, "learning_rate": 1.5354848658411048e-05, "loss": 1.079, "step": 1430 }, { "epoch": 0.8416769386567444, "grad_norm": 0.07470703125, "learning_rate": 1.4813707865820747e-05, "loss": 1.1048, "step": 1435 }, { "epoch": 0.8446096109168725, "grad_norm": 0.0751953125, "learning_rate": 1.4281510858420632e-05, "loss": 1.0656, "step": 1440 }, { "epoch": 0.8475422831770005, "grad_norm": 0.07470703125, "learning_rate": 1.3758313512051702e-05, "loss": 1.067, "step": 1445 }, { "epoch": 0.8504749554371286, "grad_norm": 0.07470703125, "learning_rate": 1.3244170757672425e-05, "loss": 1.0804, "step": 1450 }, { "epoch": 0.8534076276972565, "grad_norm": 0.0751953125, "learning_rate": 1.2739136575591581e-05, "loss": 1.0829, "step": 1455 }, { "epoch": 0.8563402999573846, "grad_norm": 0.07421875, "learning_rate": 1.2243263989800768e-05, "loss": 1.0792, "step": 1460 }, { "epoch": 0.8592729722175126, "grad_norm": 0.0751953125, "learning_rate": 1.17566050624074e-05, "loss": 1.0776, "step": 1465 }, { "epoch": 0.8622056444776407, "grad_norm": 0.0732421875, "learning_rate": 1.1279210888168546e-05, "loss": 1.0771, "step": 1470 }, { "epoch": 0.8651383167377686, "grad_norm": 0.0732421875, "learning_rate": 1.0811131589126667e-05, "loss": 1.0872, "step": 1475 }, { "epoch": 0.8680709889978967, "grad_norm": 0.0732421875, "learning_rate": 1.0352416309347001e-05, "loss": 1.0668, "step": 1480 }, { "epoch": 0.8710036612580248, "grad_norm": 0.07373046875, "learning_rate": 9.903113209758096e-06, "loss": 1.0598, "step": 1485 }, { "epoch": 0.8739363335181528, "grad_norm": 0.07275390625, "learning_rate": 9.463269463095203e-06, "loss": 1.0664, "step": 1490 }, { "epoch": 0.8768690057782809, "grad_norm": 0.07421875, "learning_rate": 9.032931248947685e-06, "loss": 1.0709, "step": 1495 }, { "epoch": 0.8798016780384088, "grad_norm": 0.0732421875, "learning_rate": 8.612143748910451e-06, "loss": 1.0585, "step": 1500 }, { "epoch": 0.8827343502985369, "grad_norm": 0.0771484375, "learning_rate": 8.20095114184044e-06, "loss": 1.0641, "step": 1505 }, { "epoch": 0.8856670225586649, "grad_norm": 0.07275390625, "learning_rate": 7.799396599218133e-06, "loss": 1.0674, "step": 1510 }, { "epoch": 0.888599694818793, "grad_norm": 0.07373046875, "learning_rate": 7.40752228061502e-06, "loss": 1.0844, "step": 1515 }, { "epoch": 0.8915323670789209, "grad_norm": 0.07373046875, "learning_rate": 7.0253693292671505e-06, "loss": 1.0832, "step": 1520 }, { "epoch": 0.894465039339049, "grad_norm": 0.07373046875, "learning_rate": 6.65297786775555e-06, "loss": 1.0834, "step": 1525 }, { "epoch": 0.897397711599177, "grad_norm": 0.0771484375, "learning_rate": 6.290386993793618e-06, "loss": 1.0561, "step": 1530 }, { "epoch": 0.9003303838593051, "grad_norm": 0.07373046875, "learning_rate": 5.937634776122348e-06, "loss": 1.0663, "step": 1535 }, { "epoch": 0.903263056119433, "grad_norm": 0.07666015625, "learning_rate": 5.594758250513333e-06, "loss": 1.0691, "step": 1540 }, { "epoch": 0.9061957283795611, "grad_norm": 0.07666015625, "learning_rate": 5.261793415880456e-06, "loss": 1.0672, "step": 1545 }, { "epoch": 0.9091284006396891, "grad_norm": 0.07568359375, "learning_rate": 4.938775230500192e-06, "loss": 1.0781, "step": 1550 }, { "epoch": 0.9120610728998172, "grad_norm": 0.07470703125, "learning_rate": 4.625737608341507e-06, "loss": 1.0764, "step": 1555 }, { "epoch": 0.9149937451599452, "grad_norm": 0.07421875, "learning_rate": 4.322713415504975e-06, "loss": 1.0858, "step": 1560 }, { "epoch": 0.9179264174200732, "grad_norm": 0.0732421875, "learning_rate": 4.029734466772328e-06, "loss": 1.0722, "step": 1565 }, { "epoch": 0.9208590896802012, "grad_norm": 0.07421875, "learning_rate": 3.7468315222660586e-06, "loss": 1.0667, "step": 1570 }, { "epoch": 0.9237917619403293, "grad_norm": 0.07421875, "learning_rate": 3.4740342842199956e-06, "loss": 1.0658, "step": 1575 }, { "epoch": 0.9267244342004574, "grad_norm": 0.07421875, "learning_rate": 3.211371393860718e-06, "loss": 1.0764, "step": 1580 }, { "epoch": 0.9296571064605853, "grad_norm": 0.07470703125, "learning_rate": 2.9588704284006174e-06, "loss": 1.0804, "step": 1585 }, { "epoch": 0.9325897787207134, "grad_norm": 0.0732421875, "learning_rate": 2.7165578981424357e-06, "loss": 1.0749, "step": 1590 }, { "epoch": 0.9355224509808414, "grad_norm": 0.07421875, "learning_rate": 2.484459243695991e-06, "loss": 1.0805, "step": 1595 }, { "epoch": 0.9384551232409695, "grad_norm": 0.07568359375, "learning_rate": 2.262598833307128e-06, "loss": 1.07, "step": 1600 }, { "epoch": 0.9413877955010974, "grad_norm": 0.07470703125, "learning_rate": 2.0509999602992493e-06, "loss": 1.0834, "step": 1605 }, { "epoch": 0.9443204677612255, "grad_norm": 0.0751953125, "learning_rate": 1.849684840627741e-06, "loss": 1.0646, "step": 1610 }, { "epoch": 0.9472531400213535, "grad_norm": 0.07373046875, "learning_rate": 1.6586746105475281e-06, "loss": 1.0626, "step": 1615 }, { "epoch": 0.9501858122814816, "grad_norm": 0.0712890625, "learning_rate": 1.4779893243939359e-06, "loss": 1.0706, "step": 1620 }, { "epoch": 0.9531184845416096, "grad_norm": 0.076171875, "learning_rate": 1.3076479524771644e-06, "loss": 1.0747, "step": 1625 }, { "epoch": 0.9560511568017376, "grad_norm": 0.07470703125, "learning_rate": 1.1476683790905495e-06, "loss": 1.0859, "step": 1630 }, { "epoch": 0.9589838290618656, "grad_norm": 0.07421875, "learning_rate": 9.98067400632985e-07, "loss": 1.0916, "step": 1635 }, { "epoch": 0.9619165013219937, "grad_norm": 0.0751953125, "learning_rate": 8.588607238453006e-07, "loss": 1.0644, "step": 1640 }, { "epoch": 0.9648491735821217, "grad_norm": 0.07421875, "learning_rate": 7.300629641613154e-07, "loss": 1.0783, "step": 1645 }, { "epoch": 0.9677818458422497, "grad_norm": 0.07421875, "learning_rate": 6.116876441733088e-07, "loss": 1.0885, "step": 1650 }, { "epoch": 0.9707145181023777, "grad_norm": 0.07373046875, "learning_rate": 5.037471922122561e-07, "loss": 1.0759, "step": 1655 }, { "epoch": 0.9736471903625058, "grad_norm": 0.07470703125, "learning_rate": 4.062529410429949e-07, "loss": 1.0633, "step": 1660 }, { "epoch": 0.9765798626226339, "grad_norm": 0.07373046875, "learning_rate": 3.192151266743548e-07, "loss": 1.0794, "step": 1665 }, { "epoch": 0.9795125348827618, "grad_norm": 0.072265625, "learning_rate": 2.4264288728451657e-07, "loss": 1.0704, "step": 1670 }, { "epoch": 0.9824452071428899, "grad_norm": 0.0751953125, "learning_rate": 1.7654426226155763e-07, "loss": 1.0678, "step": 1675 }, { "epoch": 0.9853778794030179, "grad_norm": 0.07275390625, "learning_rate": 1.2092619135937177e-07, "loss": 1.067, "step": 1680 }, { "epoch": 0.988310551663146, "grad_norm": 0.07373046875, "learning_rate": 7.579451396908521e-08, "loss": 1.0654, "step": 1685 }, { "epoch": 0.9912432239232739, "grad_norm": 0.0751953125, "learning_rate": 4.1153968505991406e-08, "loss": 1.0512, "step": 1690 }, { "epoch": 0.994175896183402, "grad_norm": 0.07568359375, "learning_rate": 1.7008191912004646e-08, "loss": 1.075, "step": 1695 }, { "epoch": 0.99710856844353, "grad_norm": 0.07568359375, "learning_rate": 3.359719273865469e-09, "loss": 1.0838, "step": 1700 }, { "epoch": 0.9994547062516325, "eval_loss": 1.1863435506820679, "eval_runtime": 1954.7038, "eval_samples_per_second": 8.263, "eval_steps_per_second": 8.263, "step": 1704 }, { "epoch": 0.9994547062516325, "step": 1704, "total_flos": 4.734725057610252e+18, "train_loss": 0.12850537406446788, "train_runtime": 13780.2764, "train_samples_per_second": 15.836, "train_steps_per_second": 0.124 } ], "logging_steps": 5, "max_steps": 1704, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 20, "total_flos": 4.734725057610252e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }