|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 8.492822966507177, |
|
"eval_steps": 500, |
|
"global_step": 10650, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.011961722488038277, |
|
"grad_norm": 1.9270328283309937, |
|
"learning_rate": 4.998999599839936e-05, |
|
"loss": 2.3527, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.023923444976076555, |
|
"grad_norm": 1.8812594413757324, |
|
"learning_rate": 4.995998399359744e-05, |
|
"loss": 2.3692, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.03588516746411483, |
|
"grad_norm": 1.3909887075424194, |
|
"learning_rate": 4.9929971988795524e-05, |
|
"loss": 2.3111, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.04784688995215311, |
|
"grad_norm": 2.809542417526245, |
|
"learning_rate": 4.98999599839936e-05, |
|
"loss": 2.2284, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.05980861244019139, |
|
"grad_norm": 3.4008336067199707, |
|
"learning_rate": 4.986994797919168e-05, |
|
"loss": 2.2569, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.07177033492822966, |
|
"grad_norm": 1.2219867706298828, |
|
"learning_rate": 4.983993597438976e-05, |
|
"loss": 2.1762, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.08373205741626795, |
|
"grad_norm": 1.3036127090454102, |
|
"learning_rate": 4.9809923969587836e-05, |
|
"loss": 2.1956, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.09569377990430622, |
|
"grad_norm": 1.468847393989563, |
|
"learning_rate": 4.977991196478592e-05, |
|
"loss": 2.2729, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.1076555023923445, |
|
"grad_norm": 1.2088780403137207, |
|
"learning_rate": 4.9749899959984e-05, |
|
"loss": 2.2164, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.11961722488038277, |
|
"grad_norm": 1.197135090827942, |
|
"learning_rate": 4.9719887955182076e-05, |
|
"loss": 2.1056, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.13157894736842105, |
|
"grad_norm": 1.309010624885559, |
|
"learning_rate": 4.9689875950380154e-05, |
|
"loss": 2.171, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.14354066985645933, |
|
"grad_norm": 1.3516101837158203, |
|
"learning_rate": 4.965986394557823e-05, |
|
"loss": 2.1898, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.15550239234449761, |
|
"grad_norm": 1.186513900756836, |
|
"learning_rate": 4.962985194077631e-05, |
|
"loss": 2.1427, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.1674641148325359, |
|
"grad_norm": 1.159972906112671, |
|
"learning_rate": 4.959983993597439e-05, |
|
"loss": 2.2603, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.17942583732057416, |
|
"grad_norm": 1.188928484916687, |
|
"learning_rate": 4.956982793117247e-05, |
|
"loss": 2.2281, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.19138755980861244, |
|
"grad_norm": 2.18959903717041, |
|
"learning_rate": 4.953981592637055e-05, |
|
"loss": 2.2187, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.20334928229665072, |
|
"grad_norm": 1.267388939857483, |
|
"learning_rate": 4.9509803921568634e-05, |
|
"loss": 2.1898, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.215311004784689, |
|
"grad_norm": 1.5959223508834839, |
|
"learning_rate": 4.947979191676671e-05, |
|
"loss": 2.1488, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.22727272727272727, |
|
"grad_norm": 1.081666350364685, |
|
"learning_rate": 4.944977991196479e-05, |
|
"loss": 2.2176, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.23923444976076555, |
|
"grad_norm": 1.1691621541976929, |
|
"learning_rate": 4.941976790716287e-05, |
|
"loss": 2.1297, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.2511961722488038, |
|
"grad_norm": 1.4069727659225464, |
|
"learning_rate": 4.9389755902360946e-05, |
|
"loss": 2.2035, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.2631578947368421, |
|
"grad_norm": 1.135937213897705, |
|
"learning_rate": 4.9359743897559024e-05, |
|
"loss": 2.1925, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.2751196172248804, |
|
"grad_norm": 1.0926917791366577, |
|
"learning_rate": 4.93297318927571e-05, |
|
"loss": 2.1679, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.28708133971291866, |
|
"grad_norm": 1.0808637142181396, |
|
"learning_rate": 4.9299719887955186e-05, |
|
"loss": 2.2122, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.29904306220095694, |
|
"grad_norm": 1.2694952487945557, |
|
"learning_rate": 4.9269707883153264e-05, |
|
"loss": 2.1643, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.31100478468899523, |
|
"grad_norm": 1.1682099103927612, |
|
"learning_rate": 4.923969587835134e-05, |
|
"loss": 2.2263, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.3229665071770335, |
|
"grad_norm": 1.1954610347747803, |
|
"learning_rate": 4.920968387354942e-05, |
|
"loss": 2.1555, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.3349282296650718, |
|
"grad_norm": 1.0608245134353638, |
|
"learning_rate": 4.9179671868747504e-05, |
|
"loss": 2.1918, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.34688995215311, |
|
"grad_norm": 1.2034133672714233, |
|
"learning_rate": 4.914965986394558e-05, |
|
"loss": 2.1101, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.3588516746411483, |
|
"grad_norm": 1.0936003923416138, |
|
"learning_rate": 4.911964785914366e-05, |
|
"loss": 2.137, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.3708133971291866, |
|
"grad_norm": 1.188496708869934, |
|
"learning_rate": 4.908963585434174e-05, |
|
"loss": 2.1864, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.3827751196172249, |
|
"grad_norm": 1.350693941116333, |
|
"learning_rate": 4.905962384953982e-05, |
|
"loss": 2.1491, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.39473684210526316, |
|
"grad_norm": 1.2483429908752441, |
|
"learning_rate": 4.90296118447379e-05, |
|
"loss": 2.1868, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.40669856459330145, |
|
"grad_norm": 1.1137944459915161, |
|
"learning_rate": 4.899959983993598e-05, |
|
"loss": 2.191, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.41866028708133973, |
|
"grad_norm": 1.3261072635650635, |
|
"learning_rate": 4.8969587835134056e-05, |
|
"loss": 2.1336, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.430622009569378, |
|
"grad_norm": 1.6815850734710693, |
|
"learning_rate": 4.8939575830332134e-05, |
|
"loss": 2.1524, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.44258373205741625, |
|
"grad_norm": 1.080824851989746, |
|
"learning_rate": 4.890956382553021e-05, |
|
"loss": 2.2056, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 0.45454545454545453, |
|
"grad_norm": 1.2140378952026367, |
|
"learning_rate": 4.887955182072829e-05, |
|
"loss": 2.2114, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.4665071770334928, |
|
"grad_norm": 1.1290125846862793, |
|
"learning_rate": 4.884953981592637e-05, |
|
"loss": 2.101, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 0.4784688995215311, |
|
"grad_norm": 1.171129822731018, |
|
"learning_rate": 4.881952781112445e-05, |
|
"loss": 2.2186, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.4904306220095694, |
|
"grad_norm": 1.99854576587677, |
|
"learning_rate": 4.878951580632253e-05, |
|
"loss": 2.154, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 0.5023923444976076, |
|
"grad_norm": 1.1021254062652588, |
|
"learning_rate": 4.8759503801520615e-05, |
|
"loss": 2.1066, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.5143540669856459, |
|
"grad_norm": 1.022976040840149, |
|
"learning_rate": 4.872949179671869e-05, |
|
"loss": 2.1642, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 0.5263157894736842, |
|
"grad_norm": 1.110926866531372, |
|
"learning_rate": 4.869947979191677e-05, |
|
"loss": 2.1592, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.5382775119617225, |
|
"grad_norm": 1.096807599067688, |
|
"learning_rate": 4.866946778711485e-05, |
|
"loss": 2.2171, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 0.5502392344497608, |
|
"grad_norm": 1.2465318441390991, |
|
"learning_rate": 4.8639455782312926e-05, |
|
"loss": 2.1794, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.562200956937799, |
|
"grad_norm": 1.6367931365966797, |
|
"learning_rate": 4.8609443777511004e-05, |
|
"loss": 2.1405, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 0.5741626794258373, |
|
"grad_norm": 1.3877207040786743, |
|
"learning_rate": 4.857943177270909e-05, |
|
"loss": 2.1781, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.5861244019138756, |
|
"grad_norm": 1.1698716878890991, |
|
"learning_rate": 4.8549419767907166e-05, |
|
"loss": 2.2076, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 0.5980861244019139, |
|
"grad_norm": 1.1922690868377686, |
|
"learning_rate": 4.8519407763105244e-05, |
|
"loss": 2.1515, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.6100478468899522, |
|
"grad_norm": 1.1112874746322632, |
|
"learning_rate": 4.848939575830332e-05, |
|
"loss": 2.0535, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 0.6220095693779905, |
|
"grad_norm": 1.3220607042312622, |
|
"learning_rate": 4.84593837535014e-05, |
|
"loss": 2.1978, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.6339712918660287, |
|
"grad_norm": 1.2560738325119019, |
|
"learning_rate": 4.8429371748699484e-05, |
|
"loss": 2.245, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 0.645933014354067, |
|
"grad_norm": 1.1312100887298584, |
|
"learning_rate": 4.839935974389756e-05, |
|
"loss": 2.1252, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.6578947368421053, |
|
"grad_norm": 1.2060538530349731, |
|
"learning_rate": 4.836934773909564e-05, |
|
"loss": 2.1268, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 0.6698564593301436, |
|
"grad_norm": 2.0435290336608887, |
|
"learning_rate": 4.8339335734293725e-05, |
|
"loss": 2.2091, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.6818181818181818, |
|
"grad_norm": 2.7680532932281494, |
|
"learning_rate": 4.83093237294918e-05, |
|
"loss": 2.0631, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 0.69377990430622, |
|
"grad_norm": 1.1256909370422363, |
|
"learning_rate": 4.827931172468988e-05, |
|
"loss": 2.1396, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.7057416267942583, |
|
"grad_norm": 1.1224644184112549, |
|
"learning_rate": 4.824929971988796e-05, |
|
"loss": 2.107, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 0.7177033492822966, |
|
"grad_norm": 1.2712397575378418, |
|
"learning_rate": 4.8219287715086036e-05, |
|
"loss": 2.1332, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.7296650717703349, |
|
"grad_norm": 1.2399568557739258, |
|
"learning_rate": 4.8189275710284114e-05, |
|
"loss": 2.1198, |
|
"step": 915 |
|
}, |
|
{ |
|
"epoch": 0.7416267942583732, |
|
"grad_norm": 1.0852080583572388, |
|
"learning_rate": 4.815926370548219e-05, |
|
"loss": 2.1436, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.7535885167464115, |
|
"grad_norm": 1.3282052278518677, |
|
"learning_rate": 4.812925170068027e-05, |
|
"loss": 2.1763, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 0.7655502392344498, |
|
"grad_norm": 1.8598517179489136, |
|
"learning_rate": 4.809923969587835e-05, |
|
"loss": 2.1188, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.777511961722488, |
|
"grad_norm": 1.1602433919906616, |
|
"learning_rate": 4.806922769107643e-05, |
|
"loss": 2.2234, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 0.7894736842105263, |
|
"grad_norm": 1.3578499555587769, |
|
"learning_rate": 4.803921568627452e-05, |
|
"loss": 2.1404, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.8014354066985646, |
|
"grad_norm": 1.4764407873153687, |
|
"learning_rate": 4.8009203681472595e-05, |
|
"loss": 2.1582, |
|
"step": 1005 |
|
}, |
|
{ |
|
"epoch": 0.8133971291866029, |
|
"grad_norm": 1.083958387374878, |
|
"learning_rate": 4.797919167667067e-05, |
|
"loss": 2.1156, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.8253588516746412, |
|
"grad_norm": 1.2568596601486206, |
|
"learning_rate": 4.794917967186875e-05, |
|
"loss": 2.1341, |
|
"step": 1035 |
|
}, |
|
{ |
|
"epoch": 0.8373205741626795, |
|
"grad_norm": 1.1657259464263916, |
|
"learning_rate": 4.791916766706683e-05, |
|
"loss": 2.1245, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.8492822966507177, |
|
"grad_norm": 2.355947256088257, |
|
"learning_rate": 4.7889155662264906e-05, |
|
"loss": 2.1975, |
|
"step": 1065 |
|
}, |
|
{ |
|
"epoch": 0.861244019138756, |
|
"grad_norm": 2.6566946506500244, |
|
"learning_rate": 4.7859143657462984e-05, |
|
"loss": 2.1263, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.8732057416267942, |
|
"grad_norm": 1.2993121147155762, |
|
"learning_rate": 4.782913165266107e-05, |
|
"loss": 2.1481, |
|
"step": 1095 |
|
}, |
|
{ |
|
"epoch": 0.8851674641148325, |
|
"grad_norm": 1.129744291305542, |
|
"learning_rate": 4.7799119647859146e-05, |
|
"loss": 2.1574, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.8971291866028708, |
|
"grad_norm": 1.1695717573165894, |
|
"learning_rate": 4.7769107643057224e-05, |
|
"loss": 2.0916, |
|
"step": 1125 |
|
}, |
|
{ |
|
"epoch": 0.9090909090909091, |
|
"grad_norm": 1.159279465675354, |
|
"learning_rate": 4.77390956382553e-05, |
|
"loss": 2.1265, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.9210526315789473, |
|
"grad_norm": 1.2150417566299438, |
|
"learning_rate": 4.770908363345338e-05, |
|
"loss": 2.1351, |
|
"step": 1155 |
|
}, |
|
{ |
|
"epoch": 0.9330143540669856, |
|
"grad_norm": 1.2673773765563965, |
|
"learning_rate": 4.7679071628651465e-05, |
|
"loss": 2.2444, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.9449760765550239, |
|
"grad_norm": 1.1746214628219604, |
|
"learning_rate": 4.764905962384954e-05, |
|
"loss": 2.1371, |
|
"step": 1185 |
|
}, |
|
{ |
|
"epoch": 0.9569377990430622, |
|
"grad_norm": 1.3716073036193848, |
|
"learning_rate": 4.761904761904762e-05, |
|
"loss": 2.1414, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.9688995215311005, |
|
"grad_norm": 1.1066573858261108, |
|
"learning_rate": 4.7589035614245705e-05, |
|
"loss": 2.0949, |
|
"step": 1215 |
|
}, |
|
{ |
|
"epoch": 0.9808612440191388, |
|
"grad_norm": 1.1547194719314575, |
|
"learning_rate": 4.755902360944378e-05, |
|
"loss": 2.1023, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.992822966507177, |
|
"grad_norm": 1.5456453561782837, |
|
"learning_rate": 4.752901160464186e-05, |
|
"loss": 2.1542, |
|
"step": 1245 |
|
}, |
|
{ |
|
"epoch": 1.0047846889952152, |
|
"grad_norm": 1.7362697124481201, |
|
"learning_rate": 4.749899959983994e-05, |
|
"loss": 2.0444, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 1.0167464114832536, |
|
"grad_norm": 5.408290386199951, |
|
"learning_rate": 4.7468987595038016e-05, |
|
"loss": 1.8079, |
|
"step": 1275 |
|
}, |
|
{ |
|
"epoch": 1.0287081339712918, |
|
"grad_norm": 3.33227276802063, |
|
"learning_rate": 4.7438975590236094e-05, |
|
"loss": 1.9851, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 1.0406698564593302, |
|
"grad_norm": 1.4184224605560303, |
|
"learning_rate": 4.740896358543417e-05, |
|
"loss": 1.8732, |
|
"step": 1305 |
|
}, |
|
{ |
|
"epoch": 1.0526315789473684, |
|
"grad_norm": 1.5775929689407349, |
|
"learning_rate": 4.737895158063225e-05, |
|
"loss": 1.9714, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 1.0645933014354068, |
|
"grad_norm": 1.4744929075241089, |
|
"learning_rate": 4.7348939575830335e-05, |
|
"loss": 1.8901, |
|
"step": 1335 |
|
}, |
|
{ |
|
"epoch": 1.076555023923445, |
|
"grad_norm": 1.5280168056488037, |
|
"learning_rate": 4.731892757102841e-05, |
|
"loss": 1.9348, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 1.0885167464114833, |
|
"grad_norm": 1.2531495094299316, |
|
"learning_rate": 4.72889155662265e-05, |
|
"loss": 1.83, |
|
"step": 1365 |
|
}, |
|
{ |
|
"epoch": 1.1004784688995215, |
|
"grad_norm": 1.3821693658828735, |
|
"learning_rate": 4.7258903561424575e-05, |
|
"loss": 1.7183, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 1.11244019138756, |
|
"grad_norm": 1.3789594173431396, |
|
"learning_rate": 4.722889155662265e-05, |
|
"loss": 1.8931, |
|
"step": 1395 |
|
}, |
|
{ |
|
"epoch": 1.124401913875598, |
|
"grad_norm": 1.2702490091323853, |
|
"learning_rate": 4.719887955182073e-05, |
|
"loss": 1.7617, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 1.1363636363636362, |
|
"grad_norm": 1.4505800008773804, |
|
"learning_rate": 4.716886754701881e-05, |
|
"loss": 1.9103, |
|
"step": 1425 |
|
}, |
|
{ |
|
"epoch": 1.1483253588516746, |
|
"grad_norm": 1.612985610961914, |
|
"learning_rate": 4.7138855542216886e-05, |
|
"loss": 1.9471, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 1.160287081339713, |
|
"grad_norm": 1.2852972745895386, |
|
"learning_rate": 4.710884353741497e-05, |
|
"loss": 1.9249, |
|
"step": 1455 |
|
}, |
|
{ |
|
"epoch": 1.1722488038277512, |
|
"grad_norm": 1.385501503944397, |
|
"learning_rate": 4.707883153261305e-05, |
|
"loss": 1.8883, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 1.1842105263157894, |
|
"grad_norm": 1.4401298761367798, |
|
"learning_rate": 4.704881952781113e-05, |
|
"loss": 1.94, |
|
"step": 1485 |
|
}, |
|
{ |
|
"epoch": 1.1961722488038278, |
|
"grad_norm": 3.9501471519470215, |
|
"learning_rate": 4.7018807523009204e-05, |
|
"loss": 1.893, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 1.208133971291866, |
|
"grad_norm": 1.3335622549057007, |
|
"learning_rate": 4.698879551820728e-05, |
|
"loss": 1.7215, |
|
"step": 1515 |
|
}, |
|
{ |
|
"epoch": 1.2200956937799043, |
|
"grad_norm": 1.6928309202194214, |
|
"learning_rate": 4.695878351340536e-05, |
|
"loss": 1.8889, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 1.2320574162679425, |
|
"grad_norm": 1.2327487468719482, |
|
"learning_rate": 4.6928771508603445e-05, |
|
"loss": 1.8503, |
|
"step": 1545 |
|
}, |
|
{ |
|
"epoch": 1.244019138755981, |
|
"grad_norm": 1.3527581691741943, |
|
"learning_rate": 4.689875950380152e-05, |
|
"loss": 1.7963, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 1.255980861244019, |
|
"grad_norm": 1.4024996757507324, |
|
"learning_rate": 4.686874749899961e-05, |
|
"loss": 1.8679, |
|
"step": 1575 |
|
}, |
|
{ |
|
"epoch": 1.2679425837320575, |
|
"grad_norm": 1.6798954010009766, |
|
"learning_rate": 4.6838735494197685e-05, |
|
"loss": 1.8944, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 1.2799043062200957, |
|
"grad_norm": 1.4541043043136597, |
|
"learning_rate": 4.680872348939576e-05, |
|
"loss": 1.9555, |
|
"step": 1605 |
|
}, |
|
{ |
|
"epoch": 1.291866028708134, |
|
"grad_norm": 1.503612756729126, |
|
"learning_rate": 4.677871148459384e-05, |
|
"loss": 1.8223, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 1.3038277511961722, |
|
"grad_norm": 1.4559051990509033, |
|
"learning_rate": 4.674869947979192e-05, |
|
"loss": 1.8442, |
|
"step": 1635 |
|
}, |
|
{ |
|
"epoch": 1.3157894736842106, |
|
"grad_norm": 1.3559598922729492, |
|
"learning_rate": 4.6718687474989997e-05, |
|
"loss": 1.933, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 1.3277511961722488, |
|
"grad_norm": 1.3937571048736572, |
|
"learning_rate": 4.6688675470188074e-05, |
|
"loss": 1.864, |
|
"step": 1665 |
|
}, |
|
{ |
|
"epoch": 1.339712918660287, |
|
"grad_norm": 1.356520175933838, |
|
"learning_rate": 4.665866346538615e-05, |
|
"loss": 1.856, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 1.3516746411483254, |
|
"grad_norm": 1.6281076669692993, |
|
"learning_rate": 4.662865146058424e-05, |
|
"loss": 1.8623, |
|
"step": 1695 |
|
}, |
|
{ |
|
"epoch": 1.3636363636363638, |
|
"grad_norm": 1.390368103981018, |
|
"learning_rate": 4.6598639455782315e-05, |
|
"loss": 1.8775, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 1.375598086124402, |
|
"grad_norm": 1.575172781944275, |
|
"learning_rate": 4.656862745098039e-05, |
|
"loss": 1.9558, |
|
"step": 1725 |
|
}, |
|
{ |
|
"epoch": 1.38755980861244, |
|
"grad_norm": 1.6121597290039062, |
|
"learning_rate": 4.653861544617848e-05, |
|
"loss": 1.8698, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 1.3995215311004785, |
|
"grad_norm": 1.4013128280639648, |
|
"learning_rate": 4.6508603441376555e-05, |
|
"loss": 1.8567, |
|
"step": 1755 |
|
}, |
|
{ |
|
"epoch": 1.4114832535885167, |
|
"grad_norm": 1.636841893196106, |
|
"learning_rate": 4.647859143657463e-05, |
|
"loss": 1.8708, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 1.423444976076555, |
|
"grad_norm": 1.6554105281829834, |
|
"learning_rate": 4.644857943177271e-05, |
|
"loss": 1.9281, |
|
"step": 1785 |
|
}, |
|
{ |
|
"epoch": 1.4354066985645932, |
|
"grad_norm": 1.7569769620895386, |
|
"learning_rate": 4.641856742697079e-05, |
|
"loss": 1.8563, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 1.4473684210526316, |
|
"grad_norm": 1.5896693468093872, |
|
"learning_rate": 4.638855542216887e-05, |
|
"loss": 1.8764, |
|
"step": 1815 |
|
}, |
|
{ |
|
"epoch": 1.4593301435406698, |
|
"grad_norm": 1.3887263536453247, |
|
"learning_rate": 4.635854341736695e-05, |
|
"loss": 1.8871, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 1.4712918660287082, |
|
"grad_norm": 1.6596853733062744, |
|
"learning_rate": 4.632853141256503e-05, |
|
"loss": 1.9176, |
|
"step": 1845 |
|
}, |
|
{ |
|
"epoch": 1.4832535885167464, |
|
"grad_norm": 1.6174405813217163, |
|
"learning_rate": 4.629851940776311e-05, |
|
"loss": 1.8109, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 1.4952153110047846, |
|
"grad_norm": 1.3717613220214844, |
|
"learning_rate": 4.6268507402961185e-05, |
|
"loss": 1.867, |
|
"step": 1875 |
|
}, |
|
{ |
|
"epoch": 1.507177033492823, |
|
"grad_norm": 1.4477450847625732, |
|
"learning_rate": 4.623849539815926e-05, |
|
"loss": 1.929, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 1.5191387559808613, |
|
"grad_norm": 1.4237533807754517, |
|
"learning_rate": 4.620848339335734e-05, |
|
"loss": 1.8444, |
|
"step": 1905 |
|
}, |
|
{ |
|
"epoch": 1.5311004784688995, |
|
"grad_norm": 1.41818106174469, |
|
"learning_rate": 4.6178471388555425e-05, |
|
"loss": 1.8505, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 1.5430622009569377, |
|
"grad_norm": 1.5824397802352905, |
|
"learning_rate": 4.61484593837535e-05, |
|
"loss": 1.773, |
|
"step": 1935 |
|
}, |
|
{ |
|
"epoch": 1.555023923444976, |
|
"grad_norm": 1.6391881704330444, |
|
"learning_rate": 4.611844737895159e-05, |
|
"loss": 1.9057, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 1.5669856459330145, |
|
"grad_norm": 1.5484305620193481, |
|
"learning_rate": 4.6088435374149665e-05, |
|
"loss": 1.9141, |
|
"step": 1965 |
|
}, |
|
{ |
|
"epoch": 1.5789473684210527, |
|
"grad_norm": 1.4594415426254272, |
|
"learning_rate": 4.605842336934774e-05, |
|
"loss": 1.8732, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 1.5909090909090908, |
|
"grad_norm": 1.3924568891525269, |
|
"learning_rate": 4.602841136454582e-05, |
|
"loss": 1.9441, |
|
"step": 1995 |
|
}, |
|
{ |
|
"epoch": 1.6028708133971292, |
|
"grad_norm": 1.523986577987671, |
|
"learning_rate": 4.59983993597439e-05, |
|
"loss": 1.9101, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 1.6148325358851676, |
|
"grad_norm": 1.369285225868225, |
|
"learning_rate": 4.596838735494198e-05, |
|
"loss": 1.8829, |
|
"step": 2025 |
|
}, |
|
{ |
|
"epoch": 1.6267942583732058, |
|
"grad_norm": 1.4909306764602661, |
|
"learning_rate": 4.5938375350140055e-05, |
|
"loss": 1.9204, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 1.638755980861244, |
|
"grad_norm": 1.5464478731155396, |
|
"learning_rate": 4.590836334533814e-05, |
|
"loss": 1.8064, |
|
"step": 2055 |
|
}, |
|
{ |
|
"epoch": 1.6507177033492821, |
|
"grad_norm": 1.5255078077316284, |
|
"learning_rate": 4.587835134053622e-05, |
|
"loss": 1.9518, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 1.6626794258373205, |
|
"grad_norm": 1.3710672855377197, |
|
"learning_rate": 4.5848339335734295e-05, |
|
"loss": 1.8957, |
|
"step": 2085 |
|
}, |
|
{ |
|
"epoch": 1.674641148325359, |
|
"grad_norm": 1.4883019924163818, |
|
"learning_rate": 4.581832733093237e-05, |
|
"loss": 1.8884, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 1.686602870813397, |
|
"grad_norm": 1.383284091949463, |
|
"learning_rate": 4.578831532613046e-05, |
|
"loss": 1.8924, |
|
"step": 2115 |
|
}, |
|
{ |
|
"epoch": 1.6985645933014353, |
|
"grad_norm": 1.5126210451126099, |
|
"learning_rate": 4.5758303321328535e-05, |
|
"loss": 1.9423, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 1.7105263157894737, |
|
"grad_norm": 1.4830104112625122, |
|
"learning_rate": 4.572829131652661e-05, |
|
"loss": 1.9377, |
|
"step": 2145 |
|
}, |
|
{ |
|
"epoch": 1.722488038277512, |
|
"grad_norm": 1.578748106956482, |
|
"learning_rate": 4.569827931172469e-05, |
|
"loss": 1.8532, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 1.7344497607655502, |
|
"grad_norm": 3.1164207458496094, |
|
"learning_rate": 4.5668267306922776e-05, |
|
"loss": 1.9072, |
|
"step": 2175 |
|
}, |
|
{ |
|
"epoch": 1.7464114832535884, |
|
"grad_norm": 1.5984658002853394, |
|
"learning_rate": 4.5638255302120853e-05, |
|
"loss": 1.9674, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 1.7583732057416268, |
|
"grad_norm": 1.5007200241088867, |
|
"learning_rate": 4.560824329731893e-05, |
|
"loss": 1.93, |
|
"step": 2205 |
|
}, |
|
{ |
|
"epoch": 1.7703349282296652, |
|
"grad_norm": 2.623798131942749, |
|
"learning_rate": 4.557823129251701e-05, |
|
"loss": 1.9068, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 1.7822966507177034, |
|
"grad_norm": 2.1396572589874268, |
|
"learning_rate": 4.554821928771509e-05, |
|
"loss": 1.886, |
|
"step": 2235 |
|
}, |
|
{ |
|
"epoch": 1.7942583732057416, |
|
"grad_norm": 1.5055629014968872, |
|
"learning_rate": 4.5518207282913165e-05, |
|
"loss": 1.8678, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 1.80622009569378, |
|
"grad_norm": 1.4418485164642334, |
|
"learning_rate": 4.548819527811124e-05, |
|
"loss": 1.984, |
|
"step": 2265 |
|
}, |
|
{ |
|
"epoch": 1.8181818181818183, |
|
"grad_norm": 1.5159984827041626, |
|
"learning_rate": 4.545818327330932e-05, |
|
"loss": 1.9688, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 1.8301435406698565, |
|
"grad_norm": 1.299607753753662, |
|
"learning_rate": 4.5428171268507405e-05, |
|
"loss": 1.9347, |
|
"step": 2295 |
|
}, |
|
{ |
|
"epoch": 1.8421052631578947, |
|
"grad_norm": 1.4144442081451416, |
|
"learning_rate": 4.539815926370549e-05, |
|
"loss": 1.8877, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 1.8540669856459329, |
|
"grad_norm": 1.5180310010910034, |
|
"learning_rate": 4.536814725890357e-05, |
|
"loss": 1.9392, |
|
"step": 2325 |
|
}, |
|
{ |
|
"epoch": 1.8660287081339713, |
|
"grad_norm": 1.475977897644043, |
|
"learning_rate": 4.5338135254101645e-05, |
|
"loss": 1.8535, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 1.8779904306220097, |
|
"grad_norm": 1.4614003896713257, |
|
"learning_rate": 4.530812324929972e-05, |
|
"loss": 1.9246, |
|
"step": 2355 |
|
}, |
|
{ |
|
"epoch": 1.8899521531100478, |
|
"grad_norm": 1.4736562967300415, |
|
"learning_rate": 4.52781112444978e-05, |
|
"loss": 1.9095, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 1.901913875598086, |
|
"grad_norm": 1.3201289176940918, |
|
"learning_rate": 4.524809923969588e-05, |
|
"loss": 1.8479, |
|
"step": 2385 |
|
}, |
|
{ |
|
"epoch": 1.9138755980861244, |
|
"grad_norm": 1.4976378679275513, |
|
"learning_rate": 4.521808723489396e-05, |
|
"loss": 1.8262, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 1.9258373205741628, |
|
"grad_norm": 1.5323299169540405, |
|
"learning_rate": 4.5188075230092035e-05, |
|
"loss": 1.8989, |
|
"step": 2415 |
|
}, |
|
{ |
|
"epoch": 1.937799043062201, |
|
"grad_norm": 2.050426483154297, |
|
"learning_rate": 4.515806322529012e-05, |
|
"loss": 1.8958, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 1.9497607655502391, |
|
"grad_norm": 1.822324514389038, |
|
"learning_rate": 4.51280512204882e-05, |
|
"loss": 1.99, |
|
"step": 2445 |
|
}, |
|
{ |
|
"epoch": 1.9617224880382775, |
|
"grad_norm": 1.5009537935256958, |
|
"learning_rate": 4.5098039215686275e-05, |
|
"loss": 1.8561, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 1.973684210526316, |
|
"grad_norm": 1.3751215934753418, |
|
"learning_rate": 4.506802721088435e-05, |
|
"loss": 1.9033, |
|
"step": 2475 |
|
}, |
|
{ |
|
"epoch": 1.985645933014354, |
|
"grad_norm": 1.6106884479522705, |
|
"learning_rate": 4.503801520608244e-05, |
|
"loss": 1.9555, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 1.9976076555023923, |
|
"grad_norm": 1.5378204584121704, |
|
"learning_rate": 4.5008003201280515e-05, |
|
"loss": 2.0009, |
|
"step": 2505 |
|
}, |
|
{ |
|
"epoch": 2.0095693779904304, |
|
"grad_norm": 2.0536139011383057, |
|
"learning_rate": 4.497799119647859e-05, |
|
"loss": 1.7212, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 2.021531100478469, |
|
"grad_norm": 1.7498282194137573, |
|
"learning_rate": 4.494797919167667e-05, |
|
"loss": 1.5574, |
|
"step": 2535 |
|
}, |
|
{ |
|
"epoch": 2.0334928229665072, |
|
"grad_norm": 1.7728687524795532, |
|
"learning_rate": 4.4917967186874756e-05, |
|
"loss": 1.4411, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 2.0454545454545454, |
|
"grad_norm": 1.8067642450332642, |
|
"learning_rate": 4.4887955182072834e-05, |
|
"loss": 1.5242, |
|
"step": 2565 |
|
}, |
|
{ |
|
"epoch": 2.0574162679425836, |
|
"grad_norm": 1.924641489982605, |
|
"learning_rate": 4.485794317727091e-05, |
|
"loss": 1.5415, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 2.069377990430622, |
|
"grad_norm": 1.9768836498260498, |
|
"learning_rate": 4.482793117246899e-05, |
|
"loss": 1.6774, |
|
"step": 2595 |
|
}, |
|
{ |
|
"epoch": 2.0813397129186604, |
|
"grad_norm": 1.943829894065857, |
|
"learning_rate": 4.479791916766707e-05, |
|
"loss": 1.6263, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 2.0933014354066986, |
|
"grad_norm": 2.1001622676849365, |
|
"learning_rate": 4.4767907162865145e-05, |
|
"loss": 1.6304, |
|
"step": 2625 |
|
}, |
|
{ |
|
"epoch": 2.1052631578947367, |
|
"grad_norm": 2.0388505458831787, |
|
"learning_rate": 4.473789515806322e-05, |
|
"loss": 1.4718, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 2.117224880382775, |
|
"grad_norm": 1.884468913078308, |
|
"learning_rate": 4.47078831532613e-05, |
|
"loss": 1.5752, |
|
"step": 2655 |
|
}, |
|
{ |
|
"epoch": 2.1291866028708135, |
|
"grad_norm": 1.9775267839431763, |
|
"learning_rate": 4.4677871148459385e-05, |
|
"loss": 1.478, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 2.1411483253588517, |
|
"grad_norm": 1.8365753889083862, |
|
"learning_rate": 4.464785914365747e-05, |
|
"loss": 1.5408, |
|
"step": 2685 |
|
}, |
|
{ |
|
"epoch": 2.15311004784689, |
|
"grad_norm": 1.8778951168060303, |
|
"learning_rate": 4.461784713885555e-05, |
|
"loss": 1.6373, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 2.165071770334928, |
|
"grad_norm": 1.9629762172698975, |
|
"learning_rate": 4.4587835134053626e-05, |
|
"loss": 1.5741, |
|
"step": 2715 |
|
}, |
|
{ |
|
"epoch": 2.1770334928229667, |
|
"grad_norm": 2.0409107208251953, |
|
"learning_rate": 4.4557823129251704e-05, |
|
"loss": 1.6216, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 2.188995215311005, |
|
"grad_norm": 2.1008028984069824, |
|
"learning_rate": 4.452781112444978e-05, |
|
"loss": 1.5515, |
|
"step": 2745 |
|
}, |
|
{ |
|
"epoch": 2.200956937799043, |
|
"grad_norm": 2.2391457557678223, |
|
"learning_rate": 4.449779911964786e-05, |
|
"loss": 1.6279, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 2.212918660287081, |
|
"grad_norm": 2.294734239578247, |
|
"learning_rate": 4.446778711484594e-05, |
|
"loss": 1.5232, |
|
"step": 2775 |
|
}, |
|
{ |
|
"epoch": 2.22488038277512, |
|
"grad_norm": 1.6631484031677246, |
|
"learning_rate": 4.443777511004402e-05, |
|
"loss": 1.5113, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 2.236842105263158, |
|
"grad_norm": 1.9847686290740967, |
|
"learning_rate": 4.44077631052421e-05, |
|
"loss": 1.5006, |
|
"step": 2805 |
|
}, |
|
{ |
|
"epoch": 2.248803827751196, |
|
"grad_norm": 1.8953202962875366, |
|
"learning_rate": 4.437775110044018e-05, |
|
"loss": 1.5853, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 2.2607655502392343, |
|
"grad_norm": 1.9015896320343018, |
|
"learning_rate": 4.4347739095638255e-05, |
|
"loss": 1.6078, |
|
"step": 2835 |
|
}, |
|
{ |
|
"epoch": 2.2727272727272725, |
|
"grad_norm": 1.900415301322937, |
|
"learning_rate": 4.431772709083633e-05, |
|
"loss": 1.5399, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 2.284688995215311, |
|
"grad_norm": 1.9138609170913696, |
|
"learning_rate": 4.428771508603442e-05, |
|
"loss": 1.589, |
|
"step": 2865 |
|
}, |
|
{ |
|
"epoch": 2.2966507177033493, |
|
"grad_norm": 1.7661852836608887, |
|
"learning_rate": 4.4257703081232496e-05, |
|
"loss": 1.6258, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 2.3086124401913874, |
|
"grad_norm": 1.9043537378311157, |
|
"learning_rate": 4.4227691076430573e-05, |
|
"loss": 1.6243, |
|
"step": 2895 |
|
}, |
|
{ |
|
"epoch": 2.320574162679426, |
|
"grad_norm": 1.8166050910949707, |
|
"learning_rate": 4.419767907162866e-05, |
|
"loss": 1.5999, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 2.3325358851674642, |
|
"grad_norm": 1.7325972318649292, |
|
"learning_rate": 4.4167667066826736e-05, |
|
"loss": 1.586, |
|
"step": 2925 |
|
}, |
|
{ |
|
"epoch": 2.3444976076555024, |
|
"grad_norm": 1.8609052896499634, |
|
"learning_rate": 4.4137655062024814e-05, |
|
"loss": 1.5466, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 2.3564593301435406, |
|
"grad_norm": 3.3115549087524414, |
|
"learning_rate": 4.410764305722289e-05, |
|
"loss": 1.5816, |
|
"step": 2955 |
|
}, |
|
{ |
|
"epoch": 2.3684210526315788, |
|
"grad_norm": 2.2015438079833984, |
|
"learning_rate": 4.407763105242097e-05, |
|
"loss": 1.5162, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 2.3803827751196174, |
|
"grad_norm": 1.7339051961898804, |
|
"learning_rate": 4.404761904761905e-05, |
|
"loss": 1.5764, |
|
"step": 2985 |
|
}, |
|
{ |
|
"epoch": 2.3923444976076556, |
|
"grad_norm": 2.817207098007202, |
|
"learning_rate": 4.4017607042817125e-05, |
|
"loss": 1.5633, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 2.4043062200956937, |
|
"grad_norm": 2.063880681991577, |
|
"learning_rate": 4.39875950380152e-05, |
|
"loss": 1.604, |
|
"step": 3015 |
|
}, |
|
{ |
|
"epoch": 2.416267942583732, |
|
"grad_norm": 1.8153194189071655, |
|
"learning_rate": 4.395758303321329e-05, |
|
"loss": 1.6417, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 2.4282296650717705, |
|
"grad_norm": 3.646466016769409, |
|
"learning_rate": 4.3927571028411365e-05, |
|
"loss": 1.6325, |
|
"step": 3045 |
|
}, |
|
{ |
|
"epoch": 2.4401913875598087, |
|
"grad_norm": 1.9638229608535767, |
|
"learning_rate": 4.389755902360945e-05, |
|
"loss": 1.6393, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 2.452153110047847, |
|
"grad_norm": 2.549917697906494, |
|
"learning_rate": 4.386754701880753e-05, |
|
"loss": 1.6231, |
|
"step": 3075 |
|
}, |
|
{ |
|
"epoch": 2.464114832535885, |
|
"grad_norm": 1.8698160648345947, |
|
"learning_rate": 4.3837535014005606e-05, |
|
"loss": 1.4995, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 2.4760765550239237, |
|
"grad_norm": 1.8844027519226074, |
|
"learning_rate": 4.3807523009203684e-05, |
|
"loss": 1.6133, |
|
"step": 3105 |
|
}, |
|
{ |
|
"epoch": 2.488038277511962, |
|
"grad_norm": 2.275132417678833, |
|
"learning_rate": 4.377751100440176e-05, |
|
"loss": 1.6124, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"grad_norm": 1.729272723197937, |
|
"learning_rate": 4.374749899959984e-05, |
|
"loss": 1.6766, |
|
"step": 3135 |
|
}, |
|
{ |
|
"epoch": 2.511961722488038, |
|
"grad_norm": 1.9503229856491089, |
|
"learning_rate": 4.3717486994797924e-05, |
|
"loss": 1.6937, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 2.5239234449760763, |
|
"grad_norm": 1.8774380683898926, |
|
"learning_rate": 4.3687474989996e-05, |
|
"loss": 1.6159, |
|
"step": 3165 |
|
}, |
|
{ |
|
"epoch": 2.535885167464115, |
|
"grad_norm": 2.066387176513672, |
|
"learning_rate": 4.365746298519408e-05, |
|
"loss": 1.6234, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 2.547846889952153, |
|
"grad_norm": 2.7428183555603027, |
|
"learning_rate": 4.362745098039216e-05, |
|
"loss": 1.5469, |
|
"step": 3195 |
|
}, |
|
{ |
|
"epoch": 2.5598086124401913, |
|
"grad_norm": 1.9833886623382568, |
|
"learning_rate": 4.3597438975590235e-05, |
|
"loss": 1.5982, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 2.57177033492823, |
|
"grad_norm": 1.7080726623535156, |
|
"learning_rate": 4.356742697078831e-05, |
|
"loss": 1.5975, |
|
"step": 3225 |
|
}, |
|
{ |
|
"epoch": 2.583732057416268, |
|
"grad_norm": 1.9213649034500122, |
|
"learning_rate": 4.35374149659864e-05, |
|
"loss": 1.5921, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 2.5956937799043063, |
|
"grad_norm": 2.0085928440093994, |
|
"learning_rate": 4.3507402961184476e-05, |
|
"loss": 1.5904, |
|
"step": 3255 |
|
}, |
|
{ |
|
"epoch": 2.6076555023923444, |
|
"grad_norm": 1.903548002243042, |
|
"learning_rate": 4.347739095638256e-05, |
|
"loss": 1.5794, |
|
"step": 3270 |
|
}, |
|
{ |
|
"epoch": 2.6196172248803826, |
|
"grad_norm": 1.8258320093154907, |
|
"learning_rate": 4.344737895158064e-05, |
|
"loss": 1.6408, |
|
"step": 3285 |
|
}, |
|
{ |
|
"epoch": 2.6315789473684212, |
|
"grad_norm": 2.0597989559173584, |
|
"learning_rate": 4.3417366946778716e-05, |
|
"loss": 1.5868, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 2.6435406698564594, |
|
"grad_norm": 2.0705902576446533, |
|
"learning_rate": 4.3387354941976794e-05, |
|
"loss": 1.6906, |
|
"step": 3315 |
|
}, |
|
{ |
|
"epoch": 2.6555023923444976, |
|
"grad_norm": 1.9880789518356323, |
|
"learning_rate": 4.335734293717487e-05, |
|
"loss": 1.5963, |
|
"step": 3330 |
|
}, |
|
{ |
|
"epoch": 2.6674641148325358, |
|
"grad_norm": 2.0182063579559326, |
|
"learning_rate": 4.332733093237295e-05, |
|
"loss": 1.6478, |
|
"step": 3345 |
|
}, |
|
{ |
|
"epoch": 2.679425837320574, |
|
"grad_norm": 1.9995989799499512, |
|
"learning_rate": 4.329731892757103e-05, |
|
"loss": 1.653, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 2.6913875598086126, |
|
"grad_norm": 2.738987922668457, |
|
"learning_rate": 4.3267306922769105e-05, |
|
"loss": 1.6505, |
|
"step": 3375 |
|
}, |
|
{ |
|
"epoch": 2.7033492822966507, |
|
"grad_norm": 2.058044672012329, |
|
"learning_rate": 4.323729491796719e-05, |
|
"loss": 1.5528, |
|
"step": 3390 |
|
}, |
|
{ |
|
"epoch": 2.715311004784689, |
|
"grad_norm": 2.0416853427886963, |
|
"learning_rate": 4.320728291316527e-05, |
|
"loss": 1.5553, |
|
"step": 3405 |
|
}, |
|
{ |
|
"epoch": 2.7272727272727275, |
|
"grad_norm": 1.9002925157546997, |
|
"learning_rate": 4.3177270908363346e-05, |
|
"loss": 1.5736, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 2.7392344497607657, |
|
"grad_norm": 1.8847737312316895, |
|
"learning_rate": 4.314725890356143e-05, |
|
"loss": 1.6232, |
|
"step": 3435 |
|
}, |
|
{ |
|
"epoch": 2.751196172248804, |
|
"grad_norm": 1.9627894163131714, |
|
"learning_rate": 4.311724689875951e-05, |
|
"loss": 1.6496, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 2.763157894736842, |
|
"grad_norm": 1.823258638381958, |
|
"learning_rate": 4.3087234893957586e-05, |
|
"loss": 1.584, |
|
"step": 3465 |
|
}, |
|
{ |
|
"epoch": 2.77511961722488, |
|
"grad_norm": 3.361528158187866, |
|
"learning_rate": 4.3057222889155664e-05, |
|
"loss": 1.6163, |
|
"step": 3480 |
|
}, |
|
{ |
|
"epoch": 2.787081339712919, |
|
"grad_norm": 2.01798677444458, |
|
"learning_rate": 4.302721088435374e-05, |
|
"loss": 1.4596, |
|
"step": 3495 |
|
}, |
|
{ |
|
"epoch": 2.799043062200957, |
|
"grad_norm": 1.9381790161132812, |
|
"learning_rate": 4.2997198879551826e-05, |
|
"loss": 1.6621, |
|
"step": 3510 |
|
}, |
|
{ |
|
"epoch": 2.811004784688995, |
|
"grad_norm": 2.0217368602752686, |
|
"learning_rate": 4.2967186874749904e-05, |
|
"loss": 1.6089, |
|
"step": 3525 |
|
}, |
|
{ |
|
"epoch": 2.8229665071770333, |
|
"grad_norm": 1.7677721977233887, |
|
"learning_rate": 4.293717486994798e-05, |
|
"loss": 1.6052, |
|
"step": 3540 |
|
}, |
|
{ |
|
"epoch": 2.8349282296650715, |
|
"grad_norm": 1.9464062452316284, |
|
"learning_rate": 4.290716286514606e-05, |
|
"loss": 1.6751, |
|
"step": 3555 |
|
}, |
|
{ |
|
"epoch": 2.84688995215311, |
|
"grad_norm": 1.9557422399520874, |
|
"learning_rate": 4.287715086034414e-05, |
|
"loss": 1.5964, |
|
"step": 3570 |
|
}, |
|
{ |
|
"epoch": 2.8588516746411483, |
|
"grad_norm": 3.1278235912323, |
|
"learning_rate": 4.2847138855542216e-05, |
|
"loss": 1.6272, |
|
"step": 3585 |
|
}, |
|
{ |
|
"epoch": 2.8708133971291865, |
|
"grad_norm": 1.8671112060546875, |
|
"learning_rate": 4.2817126850740293e-05, |
|
"loss": 1.6573, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 2.882775119617225, |
|
"grad_norm": 1.9375852346420288, |
|
"learning_rate": 4.278711484593838e-05, |
|
"loss": 1.6407, |
|
"step": 3615 |
|
}, |
|
{ |
|
"epoch": 2.8947368421052633, |
|
"grad_norm": 1.907958984375, |
|
"learning_rate": 4.275710284113646e-05, |
|
"loss": 1.6272, |
|
"step": 3630 |
|
}, |
|
{ |
|
"epoch": 2.9066985645933014, |
|
"grad_norm": 2.1269607543945312, |
|
"learning_rate": 4.272709083633454e-05, |
|
"loss": 1.5664, |
|
"step": 3645 |
|
}, |
|
{ |
|
"epoch": 2.9186602870813396, |
|
"grad_norm": 1.766072392463684, |
|
"learning_rate": 4.269707883153262e-05, |
|
"loss": 1.6766, |
|
"step": 3660 |
|
}, |
|
{ |
|
"epoch": 2.930622009569378, |
|
"grad_norm": 2.157346248626709, |
|
"learning_rate": 4.2667066826730696e-05, |
|
"loss": 1.6374, |
|
"step": 3675 |
|
}, |
|
{ |
|
"epoch": 2.9425837320574164, |
|
"grad_norm": 3.1585512161254883, |
|
"learning_rate": 4.2637054821928774e-05, |
|
"loss": 1.6082, |
|
"step": 3690 |
|
}, |
|
{ |
|
"epoch": 2.9545454545454546, |
|
"grad_norm": 2.0836970806121826, |
|
"learning_rate": 4.260704281712685e-05, |
|
"loss": 1.6703, |
|
"step": 3705 |
|
}, |
|
{ |
|
"epoch": 2.9665071770334928, |
|
"grad_norm": 1.729893445968628, |
|
"learning_rate": 4.257703081232493e-05, |
|
"loss": 1.6557, |
|
"step": 3720 |
|
}, |
|
{ |
|
"epoch": 2.9784688995215314, |
|
"grad_norm": 3.384397268295288, |
|
"learning_rate": 4.254701880752301e-05, |
|
"loss": 1.643, |
|
"step": 3735 |
|
}, |
|
{ |
|
"epoch": 2.990430622009569, |
|
"grad_norm": 1.8642953634262085, |
|
"learning_rate": 4.2517006802721085e-05, |
|
"loss": 1.6524, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 3.0023923444976077, |
|
"grad_norm": 1.9247709512710571, |
|
"learning_rate": 4.248699479791917e-05, |
|
"loss": 1.484, |
|
"step": 3765 |
|
}, |
|
{ |
|
"epoch": 3.014354066985646, |
|
"grad_norm": 2.0377817153930664, |
|
"learning_rate": 4.245698279311725e-05, |
|
"loss": 1.2241, |
|
"step": 3780 |
|
}, |
|
{ |
|
"epoch": 3.026315789473684, |
|
"grad_norm": 2.2331552505493164, |
|
"learning_rate": 4.2426970788315326e-05, |
|
"loss": 1.1948, |
|
"step": 3795 |
|
}, |
|
{ |
|
"epoch": 3.0382775119617227, |
|
"grad_norm": 2.3499271869659424, |
|
"learning_rate": 4.239695878351341e-05, |
|
"loss": 1.2828, |
|
"step": 3810 |
|
}, |
|
{ |
|
"epoch": 3.050239234449761, |
|
"grad_norm": 2.445600748062134, |
|
"learning_rate": 4.236694677871149e-05, |
|
"loss": 1.1715, |
|
"step": 3825 |
|
}, |
|
{ |
|
"epoch": 3.062200956937799, |
|
"grad_norm": 2.801543951034546, |
|
"learning_rate": 4.2336934773909566e-05, |
|
"loss": 1.2167, |
|
"step": 3840 |
|
}, |
|
{ |
|
"epoch": 3.074162679425837, |
|
"grad_norm": 2.515307664871216, |
|
"learning_rate": 4.2306922769107644e-05, |
|
"loss": 1.1451, |
|
"step": 3855 |
|
}, |
|
{ |
|
"epoch": 3.0861244019138754, |
|
"grad_norm": 2.6123640537261963, |
|
"learning_rate": 4.227691076430572e-05, |
|
"loss": 1.256, |
|
"step": 3870 |
|
}, |
|
{ |
|
"epoch": 3.098086124401914, |
|
"grad_norm": 2.602388381958008, |
|
"learning_rate": 4.2246898759503806e-05, |
|
"loss": 1.1867, |
|
"step": 3885 |
|
}, |
|
{ |
|
"epoch": 3.110047846889952, |
|
"grad_norm": 2.552335739135742, |
|
"learning_rate": 4.2216886754701884e-05, |
|
"loss": 1.1845, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 3.1220095693779903, |
|
"grad_norm": 2.6270079612731934, |
|
"learning_rate": 4.218687474989996e-05, |
|
"loss": 1.2479, |
|
"step": 3915 |
|
}, |
|
{ |
|
"epoch": 3.1339712918660285, |
|
"grad_norm": 2.490518808364868, |
|
"learning_rate": 4.215686274509804e-05, |
|
"loss": 1.2386, |
|
"step": 3930 |
|
}, |
|
{ |
|
"epoch": 3.145933014354067, |
|
"grad_norm": 2.348869800567627, |
|
"learning_rate": 4.212685074029612e-05, |
|
"loss": 1.2285, |
|
"step": 3945 |
|
}, |
|
{ |
|
"epoch": 3.1578947368421053, |
|
"grad_norm": 2.3546955585479736, |
|
"learning_rate": 4.2096838735494196e-05, |
|
"loss": 1.206, |
|
"step": 3960 |
|
}, |
|
{ |
|
"epoch": 3.1698564593301435, |
|
"grad_norm": 2.4429666996002197, |
|
"learning_rate": 4.2066826730692274e-05, |
|
"loss": 1.335, |
|
"step": 3975 |
|
}, |
|
{ |
|
"epoch": 3.1818181818181817, |
|
"grad_norm": 2.397874355316162, |
|
"learning_rate": 4.203681472589036e-05, |
|
"loss": 1.2252, |
|
"step": 3990 |
|
}, |
|
{ |
|
"epoch": 3.1937799043062203, |
|
"grad_norm": 2.526556968688965, |
|
"learning_rate": 4.200680272108844e-05, |
|
"loss": 1.2811, |
|
"step": 4005 |
|
}, |
|
{ |
|
"epoch": 3.2057416267942584, |
|
"grad_norm": 2.7083089351654053, |
|
"learning_rate": 4.197679071628652e-05, |
|
"loss": 1.3154, |
|
"step": 4020 |
|
}, |
|
{ |
|
"epoch": 3.2177033492822966, |
|
"grad_norm": 2.426650285720825, |
|
"learning_rate": 4.19467787114846e-05, |
|
"loss": 1.2251, |
|
"step": 4035 |
|
}, |
|
{ |
|
"epoch": 3.229665071770335, |
|
"grad_norm": 3.1592352390289307, |
|
"learning_rate": 4.1916766706682676e-05, |
|
"loss": 1.232, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 3.2416267942583734, |
|
"grad_norm": 2.4699387550354004, |
|
"learning_rate": 4.1886754701880754e-05, |
|
"loss": 1.3075, |
|
"step": 4065 |
|
}, |
|
{ |
|
"epoch": 3.2535885167464116, |
|
"grad_norm": 2.410412311553955, |
|
"learning_rate": 4.185674269707883e-05, |
|
"loss": 1.2583, |
|
"step": 4080 |
|
}, |
|
{ |
|
"epoch": 3.2655502392344498, |
|
"grad_norm": 2.3662848472595215, |
|
"learning_rate": 4.182673069227691e-05, |
|
"loss": 1.2718, |
|
"step": 4095 |
|
}, |
|
{ |
|
"epoch": 3.277511961722488, |
|
"grad_norm": 2.241677761077881, |
|
"learning_rate": 4.179671868747499e-05, |
|
"loss": 1.2293, |
|
"step": 4110 |
|
}, |
|
{ |
|
"epoch": 3.2894736842105265, |
|
"grad_norm": 2.289928674697876, |
|
"learning_rate": 4.176670668267307e-05, |
|
"loss": 1.2369, |
|
"step": 4125 |
|
}, |
|
{ |
|
"epoch": 3.3014354066985647, |
|
"grad_norm": 2.9561991691589355, |
|
"learning_rate": 4.173669467787115e-05, |
|
"loss": 1.1936, |
|
"step": 4140 |
|
}, |
|
{ |
|
"epoch": 3.313397129186603, |
|
"grad_norm": 2.6181890964508057, |
|
"learning_rate": 4.170668267306923e-05, |
|
"loss": 1.2791, |
|
"step": 4155 |
|
}, |
|
{ |
|
"epoch": 3.325358851674641, |
|
"grad_norm": 2.208653688430786, |
|
"learning_rate": 4.1676670668267306e-05, |
|
"loss": 1.3175, |
|
"step": 4170 |
|
}, |
|
{ |
|
"epoch": 3.3373205741626792, |
|
"grad_norm": 2.460291624069214, |
|
"learning_rate": 4.164665866346539e-05, |
|
"loss": 1.255, |
|
"step": 4185 |
|
}, |
|
{ |
|
"epoch": 3.349282296650718, |
|
"grad_norm": 2.2541019916534424, |
|
"learning_rate": 4.161664665866347e-05, |
|
"loss": 1.2815, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 3.361244019138756, |
|
"grad_norm": 2.543994903564453, |
|
"learning_rate": 4.1586634653861546e-05, |
|
"loss": 1.2888, |
|
"step": 4215 |
|
}, |
|
{ |
|
"epoch": 3.373205741626794, |
|
"grad_norm": 2.7568411827087402, |
|
"learning_rate": 4.1556622649059624e-05, |
|
"loss": 1.2894, |
|
"step": 4230 |
|
}, |
|
{ |
|
"epoch": 3.3851674641148324, |
|
"grad_norm": 2.5805466175079346, |
|
"learning_rate": 4.152661064425771e-05, |
|
"loss": 1.3434, |
|
"step": 4245 |
|
}, |
|
{ |
|
"epoch": 3.397129186602871, |
|
"grad_norm": 2.409097194671631, |
|
"learning_rate": 4.149659863945579e-05, |
|
"loss": 1.2903, |
|
"step": 4260 |
|
}, |
|
{ |
|
"epoch": 3.409090909090909, |
|
"grad_norm": 4.126059532165527, |
|
"learning_rate": 4.1466586634653865e-05, |
|
"loss": 1.2764, |
|
"step": 4275 |
|
}, |
|
{ |
|
"epoch": 3.4210526315789473, |
|
"grad_norm": 3.106367826461792, |
|
"learning_rate": 4.143657462985194e-05, |
|
"loss": 1.3184, |
|
"step": 4290 |
|
}, |
|
{ |
|
"epoch": 3.4330143540669855, |
|
"grad_norm": 2.195138454437256, |
|
"learning_rate": 4.140656262505002e-05, |
|
"loss": 1.2636, |
|
"step": 4305 |
|
}, |
|
{ |
|
"epoch": 3.444976076555024, |
|
"grad_norm": 2.7023708820343018, |
|
"learning_rate": 4.13765506202481e-05, |
|
"loss": 1.3316, |
|
"step": 4320 |
|
}, |
|
{ |
|
"epoch": 3.4569377990430623, |
|
"grad_norm": 2.262626886367798, |
|
"learning_rate": 4.1346538615446176e-05, |
|
"loss": 1.2847, |
|
"step": 4335 |
|
}, |
|
{ |
|
"epoch": 3.4688995215311005, |
|
"grad_norm": 2.5416321754455566, |
|
"learning_rate": 4.131652661064426e-05, |
|
"loss": 1.3254, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 3.4808612440191387, |
|
"grad_norm": 2.868903875350952, |
|
"learning_rate": 4.128651460584234e-05, |
|
"loss": 1.2778, |
|
"step": 4365 |
|
}, |
|
{ |
|
"epoch": 3.492822966507177, |
|
"grad_norm": 2.347463607788086, |
|
"learning_rate": 4.125650260104042e-05, |
|
"loss": 1.34, |
|
"step": 4380 |
|
}, |
|
{ |
|
"epoch": 3.5047846889952154, |
|
"grad_norm": 2.644416332244873, |
|
"learning_rate": 4.12264905962385e-05, |
|
"loss": 1.2862, |
|
"step": 4395 |
|
}, |
|
{ |
|
"epoch": 3.5167464114832536, |
|
"grad_norm": 2.8803160190582275, |
|
"learning_rate": 4.119647859143658e-05, |
|
"loss": 1.3538, |
|
"step": 4410 |
|
}, |
|
{ |
|
"epoch": 3.528708133971292, |
|
"grad_norm": 2.643848180770874, |
|
"learning_rate": 4.1166466586634657e-05, |
|
"loss": 1.3566, |
|
"step": 4425 |
|
}, |
|
{ |
|
"epoch": 3.5406698564593304, |
|
"grad_norm": 2.555978298187256, |
|
"learning_rate": 4.1136454581832734e-05, |
|
"loss": 1.284, |
|
"step": 4440 |
|
}, |
|
{ |
|
"epoch": 3.5526315789473686, |
|
"grad_norm": 2.4635751247406006, |
|
"learning_rate": 4.110644257703081e-05, |
|
"loss": 1.3229, |
|
"step": 4455 |
|
}, |
|
{ |
|
"epoch": 3.5645933014354068, |
|
"grad_norm": 2.804314374923706, |
|
"learning_rate": 4.107643057222889e-05, |
|
"loss": 1.2931, |
|
"step": 4470 |
|
}, |
|
{ |
|
"epoch": 3.576555023923445, |
|
"grad_norm": 2.5955514907836914, |
|
"learning_rate": 4.1046418567426975e-05, |
|
"loss": 1.3153, |
|
"step": 4485 |
|
}, |
|
{ |
|
"epoch": 3.588516746411483, |
|
"grad_norm": 2.4464356899261475, |
|
"learning_rate": 4.101640656262505e-05, |
|
"loss": 1.2963, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 3.6004784688995217, |
|
"grad_norm": 2.8158469200134277, |
|
"learning_rate": 4.098639455782313e-05, |
|
"loss": 1.333, |
|
"step": 4515 |
|
}, |
|
{ |
|
"epoch": 3.61244019138756, |
|
"grad_norm": 2.324192523956299, |
|
"learning_rate": 4.095638255302121e-05, |
|
"loss": 1.3438, |
|
"step": 4530 |
|
}, |
|
{ |
|
"epoch": 3.624401913875598, |
|
"grad_norm": 2.5822291374206543, |
|
"learning_rate": 4.0926370548219286e-05, |
|
"loss": 1.381, |
|
"step": 4545 |
|
}, |
|
{ |
|
"epoch": 3.6363636363636362, |
|
"grad_norm": 2.3783419132232666, |
|
"learning_rate": 4.089635854341737e-05, |
|
"loss": 1.321, |
|
"step": 4560 |
|
}, |
|
{ |
|
"epoch": 3.6483253588516744, |
|
"grad_norm": 2.453040361404419, |
|
"learning_rate": 4.086634653861545e-05, |
|
"loss": 1.35, |
|
"step": 4575 |
|
}, |
|
{ |
|
"epoch": 3.660287081339713, |
|
"grad_norm": 2.694587230682373, |
|
"learning_rate": 4.0836334533813526e-05, |
|
"loss": 1.3342, |
|
"step": 4590 |
|
}, |
|
{ |
|
"epoch": 3.672248803827751, |
|
"grad_norm": 2.4545223712921143, |
|
"learning_rate": 4.080632252901161e-05, |
|
"loss": 1.4238, |
|
"step": 4605 |
|
}, |
|
{ |
|
"epoch": 3.6842105263157894, |
|
"grad_norm": 2.5401089191436768, |
|
"learning_rate": 4.077631052420969e-05, |
|
"loss": 1.3699, |
|
"step": 4620 |
|
}, |
|
{ |
|
"epoch": 3.696172248803828, |
|
"grad_norm": 2.4257302284240723, |
|
"learning_rate": 4.074629851940777e-05, |
|
"loss": 1.3569, |
|
"step": 4635 |
|
}, |
|
{ |
|
"epoch": 3.708133971291866, |
|
"grad_norm": 2.7543747425079346, |
|
"learning_rate": 4.0716286514605845e-05, |
|
"loss": 1.2967, |
|
"step": 4650 |
|
}, |
|
{ |
|
"epoch": 3.7200956937799043, |
|
"grad_norm": 2.4614686965942383, |
|
"learning_rate": 4.068627450980392e-05, |
|
"loss": 1.2982, |
|
"step": 4665 |
|
}, |
|
{ |
|
"epoch": 3.7320574162679425, |
|
"grad_norm": 3.7613461017608643, |
|
"learning_rate": 4.0656262505002e-05, |
|
"loss": 1.3812, |
|
"step": 4680 |
|
}, |
|
{ |
|
"epoch": 3.7440191387559807, |
|
"grad_norm": 2.60383939743042, |
|
"learning_rate": 4.062625050020008e-05, |
|
"loss": 1.3526, |
|
"step": 4695 |
|
}, |
|
{ |
|
"epoch": 3.7559808612440193, |
|
"grad_norm": 2.3789987564086914, |
|
"learning_rate": 4.0596238495398156e-05, |
|
"loss": 1.3502, |
|
"step": 4710 |
|
}, |
|
{ |
|
"epoch": 3.7679425837320575, |
|
"grad_norm": 2.6684768199920654, |
|
"learning_rate": 4.056622649059624e-05, |
|
"loss": 1.4723, |
|
"step": 4725 |
|
}, |
|
{ |
|
"epoch": 3.7799043062200957, |
|
"grad_norm": 2.480144500732422, |
|
"learning_rate": 4.053621448579432e-05, |
|
"loss": 1.3716, |
|
"step": 4740 |
|
}, |
|
{ |
|
"epoch": 3.791866028708134, |
|
"grad_norm": 2.429513454437256, |
|
"learning_rate": 4.05062024809924e-05, |
|
"loss": 1.2895, |
|
"step": 4755 |
|
}, |
|
{ |
|
"epoch": 3.803827751196172, |
|
"grad_norm": 2.4947898387908936, |
|
"learning_rate": 4.047619047619048e-05, |
|
"loss": 1.4147, |
|
"step": 4770 |
|
}, |
|
{ |
|
"epoch": 3.8157894736842106, |
|
"grad_norm": 2.351773500442505, |
|
"learning_rate": 4.044617847138856e-05, |
|
"loss": 1.3712, |
|
"step": 4785 |
|
}, |
|
{ |
|
"epoch": 3.827751196172249, |
|
"grad_norm": 2.4937288761138916, |
|
"learning_rate": 4.041616646658664e-05, |
|
"loss": 1.3342, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 3.839712918660287, |
|
"grad_norm": 3.4912281036376953, |
|
"learning_rate": 4.0386154461784715e-05, |
|
"loss": 1.3403, |
|
"step": 4815 |
|
}, |
|
{ |
|
"epoch": 3.8516746411483256, |
|
"grad_norm": 2.2786455154418945, |
|
"learning_rate": 4.035614245698279e-05, |
|
"loss": 1.335, |
|
"step": 4830 |
|
}, |
|
{ |
|
"epoch": 3.8636363636363638, |
|
"grad_norm": 2.7752015590667725, |
|
"learning_rate": 4.032613045218088e-05, |
|
"loss": 1.3739, |
|
"step": 4845 |
|
}, |
|
{ |
|
"epoch": 3.875598086124402, |
|
"grad_norm": 2.510052442550659, |
|
"learning_rate": 4.0296118447378955e-05, |
|
"loss": 1.3793, |
|
"step": 4860 |
|
}, |
|
{ |
|
"epoch": 3.88755980861244, |
|
"grad_norm": 4.657649517059326, |
|
"learning_rate": 4.026610644257703e-05, |
|
"loss": 1.3914, |
|
"step": 4875 |
|
}, |
|
{ |
|
"epoch": 3.8995215311004783, |
|
"grad_norm": 2.437033176422119, |
|
"learning_rate": 4.023609443777511e-05, |
|
"loss": 1.3793, |
|
"step": 4890 |
|
}, |
|
{ |
|
"epoch": 3.911483253588517, |
|
"grad_norm": 2.7319986820220947, |
|
"learning_rate": 4.020608243297319e-05, |
|
"loss": 1.437, |
|
"step": 4905 |
|
}, |
|
{ |
|
"epoch": 3.923444976076555, |
|
"grad_norm": 2.553680896759033, |
|
"learning_rate": 4.0176070428171266e-05, |
|
"loss": 1.3613, |
|
"step": 4920 |
|
}, |
|
{ |
|
"epoch": 3.9354066985645932, |
|
"grad_norm": 2.379471778869629, |
|
"learning_rate": 4.014605842336935e-05, |
|
"loss": 1.3638, |
|
"step": 4935 |
|
}, |
|
{ |
|
"epoch": 3.9473684210526314, |
|
"grad_norm": 2.8651113510131836, |
|
"learning_rate": 4.011604641856743e-05, |
|
"loss": 1.3265, |
|
"step": 4950 |
|
}, |
|
{ |
|
"epoch": 3.9593301435406696, |
|
"grad_norm": 2.366116762161255, |
|
"learning_rate": 4.0086034413765513e-05, |
|
"loss": 1.2701, |
|
"step": 4965 |
|
}, |
|
{ |
|
"epoch": 3.971291866028708, |
|
"grad_norm": 2.60257625579834, |
|
"learning_rate": 4.005602240896359e-05, |
|
"loss": 1.305, |
|
"step": 4980 |
|
}, |
|
{ |
|
"epoch": 3.9832535885167464, |
|
"grad_norm": 2.544235944747925, |
|
"learning_rate": 4.002601040416167e-05, |
|
"loss": 1.3632, |
|
"step": 4995 |
|
}, |
|
{ |
|
"epoch": 3.9952153110047846, |
|
"grad_norm": 2.541198253631592, |
|
"learning_rate": 3.999599839935975e-05, |
|
"loss": 1.4154, |
|
"step": 5010 |
|
}, |
|
{ |
|
"epoch": 4.007177033492823, |
|
"grad_norm": 3.7236313819885254, |
|
"learning_rate": 3.9965986394557825e-05, |
|
"loss": 1.1803, |
|
"step": 5025 |
|
}, |
|
{ |
|
"epoch": 4.019138755980861, |
|
"grad_norm": 3.206791877746582, |
|
"learning_rate": 3.99359743897559e-05, |
|
"loss": 0.9466, |
|
"step": 5040 |
|
}, |
|
{ |
|
"epoch": 4.0311004784688995, |
|
"grad_norm": 2.9792520999908447, |
|
"learning_rate": 3.990596238495398e-05, |
|
"loss": 0.8937, |
|
"step": 5055 |
|
}, |
|
{ |
|
"epoch": 4.043062200956938, |
|
"grad_norm": 3.3796586990356445, |
|
"learning_rate": 3.987595038015206e-05, |
|
"loss": 0.9352, |
|
"step": 5070 |
|
}, |
|
{ |
|
"epoch": 4.055023923444976, |
|
"grad_norm": 2.383775472640991, |
|
"learning_rate": 3.984593837535014e-05, |
|
"loss": 0.8506, |
|
"step": 5085 |
|
}, |
|
{ |
|
"epoch": 4.0669856459330145, |
|
"grad_norm": 2.6192071437835693, |
|
"learning_rate": 3.981592637054822e-05, |
|
"loss": 0.8886, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 4.078947368421052, |
|
"grad_norm": 3.329030990600586, |
|
"learning_rate": 3.97859143657463e-05, |
|
"loss": 0.9639, |
|
"step": 5115 |
|
}, |
|
{ |
|
"epoch": 4.090909090909091, |
|
"grad_norm": 3.970484733581543, |
|
"learning_rate": 3.975590236094438e-05, |
|
"loss": 0.9112, |
|
"step": 5130 |
|
}, |
|
{ |
|
"epoch": 4.1028708133971294, |
|
"grad_norm": 3.082409381866455, |
|
"learning_rate": 3.972589035614246e-05, |
|
"loss": 0.8825, |
|
"step": 5145 |
|
}, |
|
{ |
|
"epoch": 4.114832535885167, |
|
"grad_norm": 2.9433696269989014, |
|
"learning_rate": 3.969587835134054e-05, |
|
"loss": 0.9384, |
|
"step": 5160 |
|
}, |
|
{ |
|
"epoch": 4.126794258373206, |
|
"grad_norm": 3.1707279682159424, |
|
"learning_rate": 3.966586634653862e-05, |
|
"loss": 0.9025, |
|
"step": 5175 |
|
}, |
|
{ |
|
"epoch": 4.138755980861244, |
|
"grad_norm": 3.336472988128662, |
|
"learning_rate": 3.9635854341736695e-05, |
|
"loss": 0.9228, |
|
"step": 5190 |
|
}, |
|
{ |
|
"epoch": 4.150717703349282, |
|
"grad_norm": 3.4995670318603516, |
|
"learning_rate": 3.960584233693477e-05, |
|
"loss": 0.9847, |
|
"step": 5205 |
|
}, |
|
{ |
|
"epoch": 4.162679425837321, |
|
"grad_norm": 3.3354713916778564, |
|
"learning_rate": 3.957583033213286e-05, |
|
"loss": 0.9717, |
|
"step": 5220 |
|
}, |
|
{ |
|
"epoch": 4.1746411483253585, |
|
"grad_norm": 3.2553207874298096, |
|
"learning_rate": 3.9545818327330935e-05, |
|
"loss": 0.9973, |
|
"step": 5235 |
|
}, |
|
{ |
|
"epoch": 4.186602870813397, |
|
"grad_norm": 3.007181406021118, |
|
"learning_rate": 3.951580632252901e-05, |
|
"loss": 0.919, |
|
"step": 5250 |
|
}, |
|
{ |
|
"epoch": 4.198564593301436, |
|
"grad_norm": 2.7252211570739746, |
|
"learning_rate": 3.948579431772709e-05, |
|
"loss": 0.8914, |
|
"step": 5265 |
|
}, |
|
{ |
|
"epoch": 4.2105263157894735, |
|
"grad_norm": 3.078258514404297, |
|
"learning_rate": 3.945578231292517e-05, |
|
"loss": 1.0353, |
|
"step": 5280 |
|
}, |
|
{ |
|
"epoch": 4.222488038277512, |
|
"grad_norm": 3.0154271125793457, |
|
"learning_rate": 3.942577030812325e-05, |
|
"loss": 0.9594, |
|
"step": 5295 |
|
}, |
|
{ |
|
"epoch": 4.23444976076555, |
|
"grad_norm": 3.7115094661712646, |
|
"learning_rate": 3.939575830332133e-05, |
|
"loss": 0.9248, |
|
"step": 5310 |
|
}, |
|
{ |
|
"epoch": 4.246411483253588, |
|
"grad_norm": 3.135359048843384, |
|
"learning_rate": 3.936574629851941e-05, |
|
"loss": 0.9918, |
|
"step": 5325 |
|
}, |
|
{ |
|
"epoch": 4.258373205741627, |
|
"grad_norm": 2.8541269302368164, |
|
"learning_rate": 3.9335734293717494e-05, |
|
"loss": 0.974, |
|
"step": 5340 |
|
}, |
|
{ |
|
"epoch": 4.270334928229665, |
|
"grad_norm": 3.1880204677581787, |
|
"learning_rate": 3.930572228891557e-05, |
|
"loss": 1.0267, |
|
"step": 5355 |
|
}, |
|
{ |
|
"epoch": 4.282296650717703, |
|
"grad_norm": 4.082556247711182, |
|
"learning_rate": 3.927571028411365e-05, |
|
"loss": 0.9764, |
|
"step": 5370 |
|
}, |
|
{ |
|
"epoch": 4.294258373205742, |
|
"grad_norm": 3.121758460998535, |
|
"learning_rate": 3.924569827931173e-05, |
|
"loss": 1.0353, |
|
"step": 5385 |
|
}, |
|
{ |
|
"epoch": 4.30622009569378, |
|
"grad_norm": 3.3821141719818115, |
|
"learning_rate": 3.9215686274509805e-05, |
|
"loss": 1.0219, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 4.318181818181818, |
|
"grad_norm": 3.336914300918579, |
|
"learning_rate": 3.918567426970788e-05, |
|
"loss": 1.0427, |
|
"step": 5415 |
|
}, |
|
{ |
|
"epoch": 4.330143540669856, |
|
"grad_norm": 3.1878132820129395, |
|
"learning_rate": 3.915566226490596e-05, |
|
"loss": 1.0125, |
|
"step": 5430 |
|
}, |
|
{ |
|
"epoch": 4.342105263157895, |
|
"grad_norm": 3.5293705463409424, |
|
"learning_rate": 3.912565026010404e-05, |
|
"loss": 0.9655, |
|
"step": 5445 |
|
}, |
|
{ |
|
"epoch": 4.354066985645933, |
|
"grad_norm": 2.9817090034484863, |
|
"learning_rate": 3.909563825530212e-05, |
|
"loss": 0.9854, |
|
"step": 5460 |
|
}, |
|
{ |
|
"epoch": 4.366028708133971, |
|
"grad_norm": 3.0998663902282715, |
|
"learning_rate": 3.90656262505002e-05, |
|
"loss": 0.951, |
|
"step": 5475 |
|
}, |
|
{ |
|
"epoch": 4.37799043062201, |
|
"grad_norm": 3.541856050491333, |
|
"learning_rate": 3.903561424569828e-05, |
|
"loss": 1.0302, |
|
"step": 5490 |
|
}, |
|
{ |
|
"epoch": 4.389952153110048, |
|
"grad_norm": 3.180595636367798, |
|
"learning_rate": 3.9005602240896364e-05, |
|
"loss": 0.9434, |
|
"step": 5505 |
|
}, |
|
{ |
|
"epoch": 4.401913875598086, |
|
"grad_norm": 3.341787099838257, |
|
"learning_rate": 3.897559023609444e-05, |
|
"loss": 1.0062, |
|
"step": 5520 |
|
}, |
|
{ |
|
"epoch": 4.413875598086125, |
|
"grad_norm": 3.4445912837982178, |
|
"learning_rate": 3.894557823129252e-05, |
|
"loss": 0.9558, |
|
"step": 5535 |
|
}, |
|
{ |
|
"epoch": 4.425837320574162, |
|
"grad_norm": 2.839120388031006, |
|
"learning_rate": 3.89155662264906e-05, |
|
"loss": 1.0152, |
|
"step": 5550 |
|
}, |
|
{ |
|
"epoch": 4.437799043062201, |
|
"grad_norm": 3.482067108154297, |
|
"learning_rate": 3.8885554221688675e-05, |
|
"loss": 1.0234, |
|
"step": 5565 |
|
}, |
|
{ |
|
"epoch": 4.44976076555024, |
|
"grad_norm": 2.869065761566162, |
|
"learning_rate": 3.885554221688676e-05, |
|
"loss": 1.0045, |
|
"step": 5580 |
|
}, |
|
{ |
|
"epoch": 4.461722488038277, |
|
"grad_norm": 3.366964101791382, |
|
"learning_rate": 3.882553021208484e-05, |
|
"loss": 1.0086, |
|
"step": 5595 |
|
}, |
|
{ |
|
"epoch": 4.473684210526316, |
|
"grad_norm": 3.8538451194763184, |
|
"learning_rate": 3.8795518207282915e-05, |
|
"loss": 1.0727, |
|
"step": 5610 |
|
}, |
|
{ |
|
"epoch": 4.485645933014354, |
|
"grad_norm": 3.1612632274627686, |
|
"learning_rate": 3.876550620248099e-05, |
|
"loss": 1.1, |
|
"step": 5625 |
|
}, |
|
{ |
|
"epoch": 4.497607655502392, |
|
"grad_norm": 3.4518115520477295, |
|
"learning_rate": 3.873549419767907e-05, |
|
"loss": 0.9788, |
|
"step": 5640 |
|
}, |
|
{ |
|
"epoch": 4.509569377990431, |
|
"grad_norm": 2.8597676753997803, |
|
"learning_rate": 3.870548219287715e-05, |
|
"loss": 1.0111, |
|
"step": 5655 |
|
}, |
|
{ |
|
"epoch": 4.521531100478469, |
|
"grad_norm": 3.2637124061584473, |
|
"learning_rate": 3.8675470188075233e-05, |
|
"loss": 0.9647, |
|
"step": 5670 |
|
}, |
|
{ |
|
"epoch": 4.533492822966507, |
|
"grad_norm": 3.176473379135132, |
|
"learning_rate": 3.864545818327331e-05, |
|
"loss": 1.0303, |
|
"step": 5685 |
|
}, |
|
{ |
|
"epoch": 4.545454545454545, |
|
"grad_norm": 3.1555211544036865, |
|
"learning_rate": 3.8615446178471396e-05, |
|
"loss": 0.9983, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 4.557416267942584, |
|
"grad_norm": 3.690917730331421, |
|
"learning_rate": 3.8585434173669474e-05, |
|
"loss": 1.0843, |
|
"step": 5715 |
|
}, |
|
{ |
|
"epoch": 4.569377990430622, |
|
"grad_norm": 3.4356346130371094, |
|
"learning_rate": 3.855542216886755e-05, |
|
"loss": 1.0957, |
|
"step": 5730 |
|
}, |
|
{ |
|
"epoch": 4.58133971291866, |
|
"grad_norm": 3.0207927227020264, |
|
"learning_rate": 3.852541016406563e-05, |
|
"loss": 0.9877, |
|
"step": 5745 |
|
}, |
|
{ |
|
"epoch": 4.5933014354066986, |
|
"grad_norm": 3.256007194519043, |
|
"learning_rate": 3.849539815926371e-05, |
|
"loss": 0.9934, |
|
"step": 5760 |
|
}, |
|
{ |
|
"epoch": 4.605263157894737, |
|
"grad_norm": 4.417782783508301, |
|
"learning_rate": 3.8465386154461785e-05, |
|
"loss": 1.0612, |
|
"step": 5775 |
|
}, |
|
{ |
|
"epoch": 4.617224880382775, |
|
"grad_norm": 2.802917242050171, |
|
"learning_rate": 3.843537414965986e-05, |
|
"loss": 1.0714, |
|
"step": 5790 |
|
}, |
|
{ |
|
"epoch": 4.6291866028708135, |
|
"grad_norm": 2.9113950729370117, |
|
"learning_rate": 3.840536214485794e-05, |
|
"loss": 1.0637, |
|
"step": 5805 |
|
}, |
|
{ |
|
"epoch": 4.641148325358852, |
|
"grad_norm": 3.0320019721984863, |
|
"learning_rate": 3.8375350140056026e-05, |
|
"loss": 1.0407, |
|
"step": 5820 |
|
}, |
|
{ |
|
"epoch": 4.65311004784689, |
|
"grad_norm": 2.9705982208251953, |
|
"learning_rate": 3.83453381352541e-05, |
|
"loss": 1.118, |
|
"step": 5835 |
|
}, |
|
{ |
|
"epoch": 4.6650717703349285, |
|
"grad_norm": 3.1082069873809814, |
|
"learning_rate": 3.831532613045218e-05, |
|
"loss": 1.1102, |
|
"step": 5850 |
|
}, |
|
{ |
|
"epoch": 4.677033492822966, |
|
"grad_norm": 3.2098066806793213, |
|
"learning_rate": 3.828531412565026e-05, |
|
"loss": 1.1063, |
|
"step": 5865 |
|
}, |
|
{ |
|
"epoch": 4.688995215311005, |
|
"grad_norm": 3.18621826171875, |
|
"learning_rate": 3.8255302120848344e-05, |
|
"loss": 1.0772, |
|
"step": 5880 |
|
}, |
|
{ |
|
"epoch": 4.7009569377990434, |
|
"grad_norm": 3.3197460174560547, |
|
"learning_rate": 3.822529011604642e-05, |
|
"loss": 1.0054, |
|
"step": 5895 |
|
}, |
|
{ |
|
"epoch": 4.712918660287081, |
|
"grad_norm": 2.8657805919647217, |
|
"learning_rate": 3.81952781112445e-05, |
|
"loss": 1.073, |
|
"step": 5910 |
|
}, |
|
{ |
|
"epoch": 4.72488038277512, |
|
"grad_norm": 2.897557497024536, |
|
"learning_rate": 3.816526610644258e-05, |
|
"loss": 1.0991, |
|
"step": 5925 |
|
}, |
|
{ |
|
"epoch": 4.7368421052631575, |
|
"grad_norm": 2.881815195083618, |
|
"learning_rate": 3.813525410164066e-05, |
|
"loss": 1.1037, |
|
"step": 5940 |
|
}, |
|
{ |
|
"epoch": 4.748803827751196, |
|
"grad_norm": 3.131378412246704, |
|
"learning_rate": 3.810524209683874e-05, |
|
"loss": 1.149, |
|
"step": 5955 |
|
}, |
|
{ |
|
"epoch": 4.760765550239235, |
|
"grad_norm": 3.3418426513671875, |
|
"learning_rate": 3.807523009203682e-05, |
|
"loss": 1.0799, |
|
"step": 5970 |
|
}, |
|
{ |
|
"epoch": 4.7727272727272725, |
|
"grad_norm": 2.759793519973755, |
|
"learning_rate": 3.8045218087234895e-05, |
|
"loss": 1.1026, |
|
"step": 5985 |
|
}, |
|
{ |
|
"epoch": 4.784688995215311, |
|
"grad_norm": 3.082688808441162, |
|
"learning_rate": 3.801520608243297e-05, |
|
"loss": 1.0911, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 4.796650717703349, |
|
"grad_norm": 3.788597583770752, |
|
"learning_rate": 3.798519407763105e-05, |
|
"loss": 1.1133, |
|
"step": 6015 |
|
}, |
|
{ |
|
"epoch": 4.8086124401913874, |
|
"grad_norm": 3.0609753131866455, |
|
"learning_rate": 3.795518207282913e-05, |
|
"loss": 1.0023, |
|
"step": 6030 |
|
}, |
|
{ |
|
"epoch": 4.820574162679426, |
|
"grad_norm": 3.5260090827941895, |
|
"learning_rate": 3.7925170068027214e-05, |
|
"loss": 1.105, |
|
"step": 6045 |
|
}, |
|
{ |
|
"epoch": 4.832535885167464, |
|
"grad_norm": 3.1473610401153564, |
|
"learning_rate": 3.789515806322529e-05, |
|
"loss": 1.1896, |
|
"step": 6060 |
|
}, |
|
{ |
|
"epoch": 4.844497607655502, |
|
"grad_norm": 3.2314066886901855, |
|
"learning_rate": 3.7865146058423376e-05, |
|
"loss": 1.1403, |
|
"step": 6075 |
|
}, |
|
{ |
|
"epoch": 4.856459330143541, |
|
"grad_norm": 3.1266963481903076, |
|
"learning_rate": 3.7835134053621454e-05, |
|
"loss": 1.123, |
|
"step": 6090 |
|
}, |
|
{ |
|
"epoch": 4.868421052631579, |
|
"grad_norm": 3.1995601654052734, |
|
"learning_rate": 3.780512204881953e-05, |
|
"loss": 1.1833, |
|
"step": 6105 |
|
}, |
|
{ |
|
"epoch": 4.880382775119617, |
|
"grad_norm": 3.251296043395996, |
|
"learning_rate": 3.777511004401761e-05, |
|
"loss": 1.1502, |
|
"step": 6120 |
|
}, |
|
{ |
|
"epoch": 4.892344497607656, |
|
"grad_norm": 3.1420419216156006, |
|
"learning_rate": 3.774509803921569e-05, |
|
"loss": 1.1207, |
|
"step": 6135 |
|
}, |
|
{ |
|
"epoch": 4.904306220095694, |
|
"grad_norm": 2.992222785949707, |
|
"learning_rate": 3.7715086034413765e-05, |
|
"loss": 1.1347, |
|
"step": 6150 |
|
}, |
|
{ |
|
"epoch": 4.916267942583732, |
|
"grad_norm": 3.03808856010437, |
|
"learning_rate": 3.768507402961184e-05, |
|
"loss": 1.131, |
|
"step": 6165 |
|
}, |
|
{ |
|
"epoch": 4.92822966507177, |
|
"grad_norm": 3.9193668365478516, |
|
"learning_rate": 3.765506202480993e-05, |
|
"loss": 1.0749, |
|
"step": 6180 |
|
}, |
|
{ |
|
"epoch": 4.940191387559809, |
|
"grad_norm": 3.3145644664764404, |
|
"learning_rate": 3.7625050020008006e-05, |
|
"loss": 1.0406, |
|
"step": 6195 |
|
}, |
|
{ |
|
"epoch": 4.952153110047847, |
|
"grad_norm": 3.134812116622925, |
|
"learning_rate": 3.7595038015206084e-05, |
|
"loss": 1.1243, |
|
"step": 6210 |
|
}, |
|
{ |
|
"epoch": 4.964114832535885, |
|
"grad_norm": 3.403087854385376, |
|
"learning_rate": 3.756502601040416e-05, |
|
"loss": 1.0429, |
|
"step": 6225 |
|
}, |
|
{ |
|
"epoch": 4.976076555023924, |
|
"grad_norm": 3.0964858531951904, |
|
"learning_rate": 3.753501400560224e-05, |
|
"loss": 1.112, |
|
"step": 6240 |
|
}, |
|
{ |
|
"epoch": 4.988038277511961, |
|
"grad_norm": 4.416729927062988, |
|
"learning_rate": 3.7505002000800324e-05, |
|
"loss": 1.1144, |
|
"step": 6255 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"grad_norm": 4.442926406860352, |
|
"learning_rate": 3.74749899959984e-05, |
|
"loss": 1.0938, |
|
"step": 6270 |
|
}, |
|
{ |
|
"epoch": 5.011961722488039, |
|
"grad_norm": 3.0728983879089355, |
|
"learning_rate": 3.744497799119648e-05, |
|
"loss": 0.6816, |
|
"step": 6285 |
|
}, |
|
{ |
|
"epoch": 5.023923444976076, |
|
"grad_norm": 3.4252402782440186, |
|
"learning_rate": 3.7414965986394564e-05, |
|
"loss": 0.7263, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 5.035885167464115, |
|
"grad_norm": 4.501566410064697, |
|
"learning_rate": 3.738495398159264e-05, |
|
"loss": 0.6744, |
|
"step": 6315 |
|
}, |
|
{ |
|
"epoch": 5.047846889952153, |
|
"grad_norm": 3.8966481685638428, |
|
"learning_rate": 3.735494197679072e-05, |
|
"loss": 0.645, |
|
"step": 6330 |
|
}, |
|
{ |
|
"epoch": 5.059808612440191, |
|
"grad_norm": 3.794740915298462, |
|
"learning_rate": 3.73249299719888e-05, |
|
"loss": 0.6894, |
|
"step": 6345 |
|
}, |
|
{ |
|
"epoch": 5.07177033492823, |
|
"grad_norm": 3.1294026374816895, |
|
"learning_rate": 3.7294917967186876e-05, |
|
"loss": 0.6101, |
|
"step": 6360 |
|
}, |
|
{ |
|
"epoch": 5.083732057416268, |
|
"grad_norm": 3.1900405883789062, |
|
"learning_rate": 3.7264905962384953e-05, |
|
"loss": 0.6731, |
|
"step": 6375 |
|
}, |
|
{ |
|
"epoch": 5.095693779904306, |
|
"grad_norm": 3.9348907470703125, |
|
"learning_rate": 3.723489395758303e-05, |
|
"loss": 0.7257, |
|
"step": 6390 |
|
}, |
|
{ |
|
"epoch": 5.107655502392345, |
|
"grad_norm": 3.5655553340911865, |
|
"learning_rate": 3.720488195278111e-05, |
|
"loss": 0.6219, |
|
"step": 6405 |
|
}, |
|
{ |
|
"epoch": 5.119617224880383, |
|
"grad_norm": 3.678565740585327, |
|
"learning_rate": 3.7174869947979194e-05, |
|
"loss": 0.6896, |
|
"step": 6420 |
|
}, |
|
{ |
|
"epoch": 5.131578947368421, |
|
"grad_norm": 3.041287422180176, |
|
"learning_rate": 3.714485794317727e-05, |
|
"loss": 0.7084, |
|
"step": 6435 |
|
}, |
|
{ |
|
"epoch": 5.143540669856459, |
|
"grad_norm": 3.382601737976074, |
|
"learning_rate": 3.7114845938375356e-05, |
|
"loss": 0.6298, |
|
"step": 6450 |
|
}, |
|
{ |
|
"epoch": 5.155502392344498, |
|
"grad_norm": 3.4510035514831543, |
|
"learning_rate": 3.7084833933573434e-05, |
|
"loss": 0.6882, |
|
"step": 6465 |
|
}, |
|
{ |
|
"epoch": 5.167464114832536, |
|
"grad_norm": 4.204371929168701, |
|
"learning_rate": 3.705482192877151e-05, |
|
"loss": 0.7478, |
|
"step": 6480 |
|
}, |
|
{ |
|
"epoch": 5.179425837320574, |
|
"grad_norm": 3.669754981994629, |
|
"learning_rate": 3.702480992396959e-05, |
|
"loss": 0.7159, |
|
"step": 6495 |
|
}, |
|
{ |
|
"epoch": 5.1913875598086126, |
|
"grad_norm": 3.454606056213379, |
|
"learning_rate": 3.699479791916767e-05, |
|
"loss": 0.7049, |
|
"step": 6510 |
|
}, |
|
{ |
|
"epoch": 5.203349282296651, |
|
"grad_norm": 3.548112154006958, |
|
"learning_rate": 3.6964785914365746e-05, |
|
"loss": 0.7279, |
|
"step": 6525 |
|
}, |
|
{ |
|
"epoch": 5.215311004784689, |
|
"grad_norm": 4.184609413146973, |
|
"learning_rate": 3.693477390956382e-05, |
|
"loss": 0.7747, |
|
"step": 6540 |
|
}, |
|
{ |
|
"epoch": 5.2272727272727275, |
|
"grad_norm": 3.418808937072754, |
|
"learning_rate": 3.690476190476191e-05, |
|
"loss": 0.7833, |
|
"step": 6555 |
|
}, |
|
{ |
|
"epoch": 5.239234449760765, |
|
"grad_norm": 3.444638729095459, |
|
"learning_rate": 3.6874749899959986e-05, |
|
"loss": 0.81, |
|
"step": 6570 |
|
}, |
|
{ |
|
"epoch": 5.251196172248804, |
|
"grad_norm": 3.960958242416382, |
|
"learning_rate": 3.6844737895158064e-05, |
|
"loss": 0.6915, |
|
"step": 6585 |
|
}, |
|
{ |
|
"epoch": 5.2631578947368425, |
|
"grad_norm": 3.772879123687744, |
|
"learning_rate": 3.681472589035614e-05, |
|
"loss": 0.7157, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 5.27511961722488, |
|
"grad_norm": 4.02428674697876, |
|
"learning_rate": 3.6784713885554226e-05, |
|
"loss": 0.7383, |
|
"step": 6615 |
|
}, |
|
{ |
|
"epoch": 5.287081339712919, |
|
"grad_norm": 3.4093050956726074, |
|
"learning_rate": 3.6754701880752304e-05, |
|
"loss": 0.7163, |
|
"step": 6630 |
|
}, |
|
{ |
|
"epoch": 5.2990430622009566, |
|
"grad_norm": 3.6924562454223633, |
|
"learning_rate": 3.672468987595038e-05, |
|
"loss": 0.7022, |
|
"step": 6645 |
|
}, |
|
{ |
|
"epoch": 5.311004784688995, |
|
"grad_norm": 3.356632947921753, |
|
"learning_rate": 3.669467787114846e-05, |
|
"loss": 0.737, |
|
"step": 6660 |
|
}, |
|
{ |
|
"epoch": 5.322966507177034, |
|
"grad_norm": 3.501210927963257, |
|
"learning_rate": 3.6664665866346544e-05, |
|
"loss": 0.7474, |
|
"step": 6675 |
|
}, |
|
{ |
|
"epoch": 5.3349282296650715, |
|
"grad_norm": 3.852551221847534, |
|
"learning_rate": 3.663465386154462e-05, |
|
"loss": 0.779, |
|
"step": 6690 |
|
}, |
|
{ |
|
"epoch": 5.34688995215311, |
|
"grad_norm": 3.4461312294006348, |
|
"learning_rate": 3.66046418567427e-05, |
|
"loss": 0.6816, |
|
"step": 6705 |
|
}, |
|
{ |
|
"epoch": 5.358851674641148, |
|
"grad_norm": 2.9088375568389893, |
|
"learning_rate": 3.657462985194078e-05, |
|
"loss": 0.7619, |
|
"step": 6720 |
|
}, |
|
{ |
|
"epoch": 5.3708133971291865, |
|
"grad_norm": 3.4227547645568848, |
|
"learning_rate": 3.6544617847138856e-05, |
|
"loss": 0.7646, |
|
"step": 6735 |
|
}, |
|
{ |
|
"epoch": 5.382775119617225, |
|
"grad_norm": 4.553009986877441, |
|
"learning_rate": 3.6514605842336934e-05, |
|
"loss": 0.7907, |
|
"step": 6750 |
|
}, |
|
{ |
|
"epoch": 5.394736842105263, |
|
"grad_norm": 3.965406656265259, |
|
"learning_rate": 3.648459383753501e-05, |
|
"loss": 0.7901, |
|
"step": 6765 |
|
}, |
|
{ |
|
"epoch": 5.4066985645933014, |
|
"grad_norm": 3.7064077854156494, |
|
"learning_rate": 3.645458183273309e-05, |
|
"loss": 0.758, |
|
"step": 6780 |
|
}, |
|
{ |
|
"epoch": 5.41866028708134, |
|
"grad_norm": 3.4479455947875977, |
|
"learning_rate": 3.6424569827931174e-05, |
|
"loss": 0.7439, |
|
"step": 6795 |
|
}, |
|
{ |
|
"epoch": 5.430622009569378, |
|
"grad_norm": 3.9599294662475586, |
|
"learning_rate": 3.639455782312925e-05, |
|
"loss": 0.8257, |
|
"step": 6810 |
|
}, |
|
{ |
|
"epoch": 5.442583732057416, |
|
"grad_norm": 3.7063801288604736, |
|
"learning_rate": 3.6364545818327336e-05, |
|
"loss": 0.7717, |
|
"step": 6825 |
|
}, |
|
{ |
|
"epoch": 5.454545454545454, |
|
"grad_norm": 4.6955060958862305, |
|
"learning_rate": 3.6334533813525414e-05, |
|
"loss": 0.7575, |
|
"step": 6840 |
|
}, |
|
{ |
|
"epoch": 5.466507177033493, |
|
"grad_norm": 3.915292501449585, |
|
"learning_rate": 3.630452180872349e-05, |
|
"loss": 0.7989, |
|
"step": 6855 |
|
}, |
|
{ |
|
"epoch": 5.478468899521531, |
|
"grad_norm": 3.974541664123535, |
|
"learning_rate": 3.627450980392157e-05, |
|
"loss": 0.8685, |
|
"step": 6870 |
|
}, |
|
{ |
|
"epoch": 5.490430622009569, |
|
"grad_norm": 3.9493520259857178, |
|
"learning_rate": 3.624449779911965e-05, |
|
"loss": 0.8111, |
|
"step": 6885 |
|
}, |
|
{ |
|
"epoch": 5.502392344497608, |
|
"grad_norm": 3.7138257026672363, |
|
"learning_rate": 3.6214485794317726e-05, |
|
"loss": 0.8086, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 5.514354066985646, |
|
"grad_norm": 3.838562250137329, |
|
"learning_rate": 3.618447378951581e-05, |
|
"loss": 0.8, |
|
"step": 6915 |
|
}, |
|
{ |
|
"epoch": 5.526315789473684, |
|
"grad_norm": 3.5369865894317627, |
|
"learning_rate": 3.615446178471389e-05, |
|
"loss": 0.7449, |
|
"step": 6930 |
|
}, |
|
{ |
|
"epoch": 5.538277511961723, |
|
"grad_norm": 3.607936382293701, |
|
"learning_rate": 3.6124449779911966e-05, |
|
"loss": 0.7974, |
|
"step": 6945 |
|
}, |
|
{ |
|
"epoch": 5.55023923444976, |
|
"grad_norm": 4.021537780761719, |
|
"learning_rate": 3.6094437775110044e-05, |
|
"loss": 0.6972, |
|
"step": 6960 |
|
}, |
|
{ |
|
"epoch": 5.562200956937799, |
|
"grad_norm": 4.086754322052002, |
|
"learning_rate": 3.606442577030812e-05, |
|
"loss": 0.8349, |
|
"step": 6975 |
|
}, |
|
{ |
|
"epoch": 5.574162679425838, |
|
"grad_norm": 3.385819673538208, |
|
"learning_rate": 3.6034413765506206e-05, |
|
"loss": 0.8016, |
|
"step": 6990 |
|
}, |
|
{ |
|
"epoch": 5.586124401913875, |
|
"grad_norm": 3.3851637840270996, |
|
"learning_rate": 3.6004401760704284e-05, |
|
"loss": 0.8013, |
|
"step": 7005 |
|
}, |
|
{ |
|
"epoch": 5.598086124401914, |
|
"grad_norm": 3.6127657890319824, |
|
"learning_rate": 3.597438975590236e-05, |
|
"loss": 0.889, |
|
"step": 7020 |
|
}, |
|
{ |
|
"epoch": 5.610047846889952, |
|
"grad_norm": 3.7455716133117676, |
|
"learning_rate": 3.594437775110045e-05, |
|
"loss": 0.8244, |
|
"step": 7035 |
|
}, |
|
{ |
|
"epoch": 5.62200956937799, |
|
"grad_norm": 3.5797011852264404, |
|
"learning_rate": 3.5914365746298525e-05, |
|
"loss": 0.8794, |
|
"step": 7050 |
|
}, |
|
{ |
|
"epoch": 5.633971291866029, |
|
"grad_norm": 3.6951963901519775, |
|
"learning_rate": 3.58843537414966e-05, |
|
"loss": 0.8377, |
|
"step": 7065 |
|
}, |
|
{ |
|
"epoch": 5.645933014354067, |
|
"grad_norm": 4.805546283721924, |
|
"learning_rate": 3.585434173669468e-05, |
|
"loss": 0.7658, |
|
"step": 7080 |
|
}, |
|
{ |
|
"epoch": 5.657894736842105, |
|
"grad_norm": 3.3476104736328125, |
|
"learning_rate": 3.582432973189276e-05, |
|
"loss": 0.8535, |
|
"step": 7095 |
|
}, |
|
{ |
|
"epoch": 5.669856459330144, |
|
"grad_norm": 3.7429189682006836, |
|
"learning_rate": 3.5794317727090836e-05, |
|
"loss": 0.7698, |
|
"step": 7110 |
|
}, |
|
{ |
|
"epoch": 5.681818181818182, |
|
"grad_norm": 3.6189913749694824, |
|
"learning_rate": 3.5764305722288914e-05, |
|
"loss": 0.8843, |
|
"step": 7125 |
|
}, |
|
{ |
|
"epoch": 5.69377990430622, |
|
"grad_norm": 3.614164113998413, |
|
"learning_rate": 3.573429371748699e-05, |
|
"loss": 0.7855, |
|
"step": 7140 |
|
}, |
|
{ |
|
"epoch": 5.705741626794258, |
|
"grad_norm": 3.9962081909179688, |
|
"learning_rate": 3.5704281712685076e-05, |
|
"loss": 0.8501, |
|
"step": 7155 |
|
}, |
|
{ |
|
"epoch": 5.717703349282297, |
|
"grad_norm": 3.6668338775634766, |
|
"learning_rate": 3.5674269707883154e-05, |
|
"loss": 0.7866, |
|
"step": 7170 |
|
}, |
|
{ |
|
"epoch": 5.729665071770335, |
|
"grad_norm": 3.9314942359924316, |
|
"learning_rate": 3.564425770308123e-05, |
|
"loss": 0.8003, |
|
"step": 7185 |
|
}, |
|
{ |
|
"epoch": 5.741626794258373, |
|
"grad_norm": 4.32262659072876, |
|
"learning_rate": 3.5614245698279317e-05, |
|
"loss": 0.8137, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 5.753588516746412, |
|
"grad_norm": 5.040790557861328, |
|
"learning_rate": 3.5584233693477394e-05, |
|
"loss": 0.8354, |
|
"step": 7215 |
|
}, |
|
{ |
|
"epoch": 5.76555023923445, |
|
"grad_norm": 3.7755401134490967, |
|
"learning_rate": 3.555422168867547e-05, |
|
"loss": 0.8574, |
|
"step": 7230 |
|
}, |
|
{ |
|
"epoch": 5.777511961722488, |
|
"grad_norm": 3.8143343925476074, |
|
"learning_rate": 3.552420968387355e-05, |
|
"loss": 0.8091, |
|
"step": 7245 |
|
}, |
|
{ |
|
"epoch": 5.7894736842105265, |
|
"grad_norm": 3.4861605167388916, |
|
"learning_rate": 3.549419767907163e-05, |
|
"loss": 0.8304, |
|
"step": 7260 |
|
}, |
|
{ |
|
"epoch": 5.801435406698564, |
|
"grad_norm": 3.5389742851257324, |
|
"learning_rate": 3.546418567426971e-05, |
|
"loss": 0.8676, |
|
"step": 7275 |
|
}, |
|
{ |
|
"epoch": 5.813397129186603, |
|
"grad_norm": 3.465071439743042, |
|
"learning_rate": 3.543417366946779e-05, |
|
"loss": 0.8296, |
|
"step": 7290 |
|
}, |
|
{ |
|
"epoch": 5.8253588516746415, |
|
"grad_norm": 3.9034931659698486, |
|
"learning_rate": 3.540416166466587e-05, |
|
"loss": 0.8398, |
|
"step": 7305 |
|
}, |
|
{ |
|
"epoch": 5.837320574162679, |
|
"grad_norm": 3.817934989929199, |
|
"learning_rate": 3.5374149659863946e-05, |
|
"loss": 0.8602, |
|
"step": 7320 |
|
}, |
|
{ |
|
"epoch": 5.849282296650718, |
|
"grad_norm": 4.706762790679932, |
|
"learning_rate": 3.5344137655062024e-05, |
|
"loss": 0.8684, |
|
"step": 7335 |
|
}, |
|
{ |
|
"epoch": 5.861244019138756, |
|
"grad_norm": 3.3008809089660645, |
|
"learning_rate": 3.53141256502601e-05, |
|
"loss": 0.8182, |
|
"step": 7350 |
|
}, |
|
{ |
|
"epoch": 5.873205741626794, |
|
"grad_norm": 3.5898377895355225, |
|
"learning_rate": 3.5284113645458186e-05, |
|
"loss": 0.8512, |
|
"step": 7365 |
|
}, |
|
{ |
|
"epoch": 5.885167464114833, |
|
"grad_norm": 3.8670029640197754, |
|
"learning_rate": 3.5254101640656264e-05, |
|
"loss": 0.8412, |
|
"step": 7380 |
|
}, |
|
{ |
|
"epoch": 5.8971291866028706, |
|
"grad_norm": 3.6071064472198486, |
|
"learning_rate": 3.522408963585435e-05, |
|
"loss": 0.8578, |
|
"step": 7395 |
|
}, |
|
{ |
|
"epoch": 5.909090909090909, |
|
"grad_norm": 4.674183368682861, |
|
"learning_rate": 3.519407763105243e-05, |
|
"loss": 0.8554, |
|
"step": 7410 |
|
}, |
|
{ |
|
"epoch": 5.921052631578947, |
|
"grad_norm": 3.45503306388855, |
|
"learning_rate": 3.5164065626250505e-05, |
|
"loss": 0.9224, |
|
"step": 7425 |
|
}, |
|
{ |
|
"epoch": 5.9330143540669855, |
|
"grad_norm": 3.4863317012786865, |
|
"learning_rate": 3.513405362144858e-05, |
|
"loss": 0.8177, |
|
"step": 7440 |
|
}, |
|
{ |
|
"epoch": 5.944976076555024, |
|
"grad_norm": 3.9804773330688477, |
|
"learning_rate": 3.510404161664666e-05, |
|
"loss": 0.8379, |
|
"step": 7455 |
|
}, |
|
{ |
|
"epoch": 5.956937799043062, |
|
"grad_norm": 3.6782078742980957, |
|
"learning_rate": 3.507402961184474e-05, |
|
"loss": 0.8634, |
|
"step": 7470 |
|
}, |
|
{ |
|
"epoch": 5.9688995215311005, |
|
"grad_norm": 3.7234580516815186, |
|
"learning_rate": 3.5044017607042816e-05, |
|
"loss": 0.9142, |
|
"step": 7485 |
|
}, |
|
{ |
|
"epoch": 5.980861244019139, |
|
"grad_norm": 3.6034648418426514, |
|
"learning_rate": 3.5014005602240894e-05, |
|
"loss": 0.8777, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 5.992822966507177, |
|
"grad_norm": 3.407047748565674, |
|
"learning_rate": 3.498399359743898e-05, |
|
"loss": 0.8191, |
|
"step": 7515 |
|
}, |
|
{ |
|
"epoch": 6.0047846889952154, |
|
"grad_norm": 4.2239508628845215, |
|
"learning_rate": 3.4953981592637056e-05, |
|
"loss": 0.7896, |
|
"step": 7530 |
|
}, |
|
{ |
|
"epoch": 6.016746411483253, |
|
"grad_norm": 2.516592502593994, |
|
"learning_rate": 3.4923969587835134e-05, |
|
"loss": 0.5012, |
|
"step": 7545 |
|
}, |
|
{ |
|
"epoch": 6.028708133971292, |
|
"grad_norm": 3.366042375564575, |
|
"learning_rate": 3.489395758303321e-05, |
|
"loss": 0.4626, |
|
"step": 7560 |
|
}, |
|
{ |
|
"epoch": 6.04066985645933, |
|
"grad_norm": 4.176771640777588, |
|
"learning_rate": 3.48639455782313e-05, |
|
"loss": 0.4813, |
|
"step": 7575 |
|
}, |
|
{ |
|
"epoch": 6.052631578947368, |
|
"grad_norm": 3.807236671447754, |
|
"learning_rate": 3.4833933573429375e-05, |
|
"loss": 0.4928, |
|
"step": 7590 |
|
}, |
|
{ |
|
"epoch": 6.064593301435407, |
|
"grad_norm": 3.5176925659179688, |
|
"learning_rate": 3.480392156862745e-05, |
|
"loss": 0.4474, |
|
"step": 7605 |
|
}, |
|
{ |
|
"epoch": 6.076555023923445, |
|
"grad_norm": 3.860903739929199, |
|
"learning_rate": 3.477390956382553e-05, |
|
"loss": 0.5181, |
|
"step": 7620 |
|
}, |
|
{ |
|
"epoch": 6.088516746411483, |
|
"grad_norm": 3.883094072341919, |
|
"learning_rate": 3.4743897559023615e-05, |
|
"loss": 0.497, |
|
"step": 7635 |
|
}, |
|
{ |
|
"epoch": 6.100478468899522, |
|
"grad_norm": 3.299124240875244, |
|
"learning_rate": 3.471388555422169e-05, |
|
"loss": 0.5023, |
|
"step": 7650 |
|
}, |
|
{ |
|
"epoch": 6.1124401913875595, |
|
"grad_norm": 3.780906915664673, |
|
"learning_rate": 3.468387354941977e-05, |
|
"loss": 0.4938, |
|
"step": 7665 |
|
}, |
|
{ |
|
"epoch": 6.124401913875598, |
|
"grad_norm": 3.906473159790039, |
|
"learning_rate": 3.465386154461785e-05, |
|
"loss": 0.52, |
|
"step": 7680 |
|
}, |
|
{ |
|
"epoch": 6.136363636363637, |
|
"grad_norm": 3.7031853199005127, |
|
"learning_rate": 3.4623849539815926e-05, |
|
"loss": 0.4922, |
|
"step": 7695 |
|
}, |
|
{ |
|
"epoch": 6.148325358851674, |
|
"grad_norm": 4.119719505310059, |
|
"learning_rate": 3.4593837535014004e-05, |
|
"loss": 0.4726, |
|
"step": 7710 |
|
}, |
|
{ |
|
"epoch": 6.160287081339713, |
|
"grad_norm": 3.637122869491577, |
|
"learning_rate": 3.456382553021208e-05, |
|
"loss": 0.4522, |
|
"step": 7725 |
|
}, |
|
{ |
|
"epoch": 6.172248803827751, |
|
"grad_norm": 3.6455516815185547, |
|
"learning_rate": 3.453381352541017e-05, |
|
"loss": 0.497, |
|
"step": 7740 |
|
}, |
|
{ |
|
"epoch": 6.184210526315789, |
|
"grad_norm": 3.90136981010437, |
|
"learning_rate": 3.4503801520608245e-05, |
|
"loss": 0.5286, |
|
"step": 7755 |
|
}, |
|
{ |
|
"epoch": 6.196172248803828, |
|
"grad_norm": 3.776540994644165, |
|
"learning_rate": 3.447378951580633e-05, |
|
"loss": 0.5407, |
|
"step": 7770 |
|
}, |
|
{ |
|
"epoch": 6.208133971291866, |
|
"grad_norm": 4.160264015197754, |
|
"learning_rate": 3.444377751100441e-05, |
|
"loss": 0.4874, |
|
"step": 7785 |
|
}, |
|
{ |
|
"epoch": 6.220095693779904, |
|
"grad_norm": 3.5366413593292236, |
|
"learning_rate": 3.4413765506202485e-05, |
|
"loss": 0.4708, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 6.232057416267943, |
|
"grad_norm": 3.604766368865967, |
|
"learning_rate": 3.438375350140056e-05, |
|
"loss": 0.5326, |
|
"step": 7815 |
|
}, |
|
{ |
|
"epoch": 6.244019138755981, |
|
"grad_norm": 3.5916519165039062, |
|
"learning_rate": 3.435374149659864e-05, |
|
"loss": 0.5411, |
|
"step": 7830 |
|
}, |
|
{ |
|
"epoch": 6.255980861244019, |
|
"grad_norm": 3.626094102859497, |
|
"learning_rate": 3.432372949179672e-05, |
|
"loss": 0.5142, |
|
"step": 7845 |
|
}, |
|
{ |
|
"epoch": 6.267942583732057, |
|
"grad_norm": 4.346883296966553, |
|
"learning_rate": 3.4293717486994796e-05, |
|
"loss": 0.5135, |
|
"step": 7860 |
|
}, |
|
{ |
|
"epoch": 6.279904306220096, |
|
"grad_norm": 4.123327732086182, |
|
"learning_rate": 3.426370548219288e-05, |
|
"loss": 0.5403, |
|
"step": 7875 |
|
}, |
|
{ |
|
"epoch": 6.291866028708134, |
|
"grad_norm": 4.1574482917785645, |
|
"learning_rate": 3.423369347739096e-05, |
|
"loss": 0.5373, |
|
"step": 7890 |
|
}, |
|
{ |
|
"epoch": 6.303827751196172, |
|
"grad_norm": 3.9462273120880127, |
|
"learning_rate": 3.4203681472589037e-05, |
|
"loss": 0.5223, |
|
"step": 7905 |
|
}, |
|
{ |
|
"epoch": 6.315789473684211, |
|
"grad_norm": 4.356924533843994, |
|
"learning_rate": 3.4173669467787114e-05, |
|
"loss": 0.5857, |
|
"step": 7920 |
|
}, |
|
{ |
|
"epoch": 6.327751196172249, |
|
"grad_norm": 3.8217930793762207, |
|
"learning_rate": 3.41436574629852e-05, |
|
"loss": 0.5272, |
|
"step": 7935 |
|
}, |
|
{ |
|
"epoch": 6.339712918660287, |
|
"grad_norm": 3.689328908920288, |
|
"learning_rate": 3.411364545818328e-05, |
|
"loss": 0.5162, |
|
"step": 7950 |
|
}, |
|
{ |
|
"epoch": 6.351674641148326, |
|
"grad_norm": 3.6850223541259766, |
|
"learning_rate": 3.4083633453381355e-05, |
|
"loss": 0.582, |
|
"step": 7965 |
|
}, |
|
{ |
|
"epoch": 6.363636363636363, |
|
"grad_norm": 4.063047885894775, |
|
"learning_rate": 3.405362144857943e-05, |
|
"loss": 0.5642, |
|
"step": 7980 |
|
}, |
|
{ |
|
"epoch": 6.375598086124402, |
|
"grad_norm": 3.6065573692321777, |
|
"learning_rate": 3.402360944377751e-05, |
|
"loss": 0.5225, |
|
"step": 7995 |
|
}, |
|
{ |
|
"epoch": 6.3875598086124405, |
|
"grad_norm": 4.188450336456299, |
|
"learning_rate": 3.3993597438975595e-05, |
|
"loss": 0.5911, |
|
"step": 8010 |
|
}, |
|
{ |
|
"epoch": 6.399521531100478, |
|
"grad_norm": 3.9791886806488037, |
|
"learning_rate": 3.396358543417367e-05, |
|
"loss": 0.5178, |
|
"step": 8025 |
|
}, |
|
{ |
|
"epoch": 6.411483253588517, |
|
"grad_norm": 4.381253719329834, |
|
"learning_rate": 3.393357342937175e-05, |
|
"loss": 0.5344, |
|
"step": 8040 |
|
}, |
|
{ |
|
"epoch": 6.423444976076555, |
|
"grad_norm": 3.810927152633667, |
|
"learning_rate": 3.390356142456983e-05, |
|
"loss": 0.4915, |
|
"step": 8055 |
|
}, |
|
{ |
|
"epoch": 6.435406698564593, |
|
"grad_norm": 4.254152774810791, |
|
"learning_rate": 3.3873549419767907e-05, |
|
"loss": 0.601, |
|
"step": 8070 |
|
}, |
|
{ |
|
"epoch": 6.447368421052632, |
|
"grad_norm": 4.086537837982178, |
|
"learning_rate": 3.3843537414965984e-05, |
|
"loss": 0.5944, |
|
"step": 8085 |
|
}, |
|
{ |
|
"epoch": 6.45933014354067, |
|
"grad_norm": 4.881983280181885, |
|
"learning_rate": 3.381352541016406e-05, |
|
"loss": 0.5789, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 6.471291866028708, |
|
"grad_norm": 4.15606689453125, |
|
"learning_rate": 3.378351340536215e-05, |
|
"loss": 0.5397, |
|
"step": 8115 |
|
}, |
|
{ |
|
"epoch": 6.483253588516747, |
|
"grad_norm": 3.6769986152648926, |
|
"learning_rate": 3.3753501400560225e-05, |
|
"loss": 0.5449, |
|
"step": 8130 |
|
}, |
|
{ |
|
"epoch": 6.4952153110047846, |
|
"grad_norm": 3.846041440963745, |
|
"learning_rate": 3.372348939575831e-05, |
|
"loss": 0.5555, |
|
"step": 8145 |
|
}, |
|
{ |
|
"epoch": 6.507177033492823, |
|
"grad_norm": 4.353069305419922, |
|
"learning_rate": 3.369347739095639e-05, |
|
"loss": 0.608, |
|
"step": 8160 |
|
}, |
|
{ |
|
"epoch": 6.519138755980861, |
|
"grad_norm": 4.087284564971924, |
|
"learning_rate": 3.3663465386154465e-05, |
|
"loss": 0.5741, |
|
"step": 8175 |
|
}, |
|
{ |
|
"epoch": 6.5311004784688995, |
|
"grad_norm": 4.356995582580566, |
|
"learning_rate": 3.363345338135254e-05, |
|
"loss": 0.6432, |
|
"step": 8190 |
|
}, |
|
{ |
|
"epoch": 6.543062200956938, |
|
"grad_norm": 3.855937957763672, |
|
"learning_rate": 3.360344137655062e-05, |
|
"loss": 0.5783, |
|
"step": 8205 |
|
}, |
|
{ |
|
"epoch": 6.555023923444976, |
|
"grad_norm": 3.820133686065674, |
|
"learning_rate": 3.35734293717487e-05, |
|
"loss": 0.5814, |
|
"step": 8220 |
|
}, |
|
{ |
|
"epoch": 6.5669856459330145, |
|
"grad_norm": 4.873568058013916, |
|
"learning_rate": 3.3543417366946776e-05, |
|
"loss": 0.6264, |
|
"step": 8235 |
|
}, |
|
{ |
|
"epoch": 6.578947368421053, |
|
"grad_norm": 3.8670310974121094, |
|
"learning_rate": 3.351340536214486e-05, |
|
"loss": 0.6271, |
|
"step": 8250 |
|
}, |
|
{ |
|
"epoch": 6.590909090909091, |
|
"grad_norm": 4.838265895843506, |
|
"learning_rate": 3.348339335734294e-05, |
|
"loss": 0.6346, |
|
"step": 8265 |
|
}, |
|
{ |
|
"epoch": 6.6028708133971294, |
|
"grad_norm": 4.0044403076171875, |
|
"learning_rate": 3.345338135254102e-05, |
|
"loss": 0.5724, |
|
"step": 8280 |
|
}, |
|
{ |
|
"epoch": 6.614832535885167, |
|
"grad_norm": 3.866497039794922, |
|
"learning_rate": 3.3423369347739095e-05, |
|
"loss": 0.6172, |
|
"step": 8295 |
|
}, |
|
{ |
|
"epoch": 6.626794258373206, |
|
"grad_norm": 4.213998317718506, |
|
"learning_rate": 3.339335734293718e-05, |
|
"loss": 0.6246, |
|
"step": 8310 |
|
}, |
|
{ |
|
"epoch": 6.638755980861244, |
|
"grad_norm": 4.162674427032471, |
|
"learning_rate": 3.336334533813526e-05, |
|
"loss": 0.6301, |
|
"step": 8325 |
|
}, |
|
{ |
|
"epoch": 6.650717703349282, |
|
"grad_norm": 4.032559394836426, |
|
"learning_rate": 3.3333333333333335e-05, |
|
"loss": 0.6557, |
|
"step": 8340 |
|
}, |
|
{ |
|
"epoch": 6.662679425837321, |
|
"grad_norm": 4.416426658630371, |
|
"learning_rate": 3.330332132853141e-05, |
|
"loss": 0.6592, |
|
"step": 8355 |
|
}, |
|
{ |
|
"epoch": 6.6746411483253585, |
|
"grad_norm": 4.758429527282715, |
|
"learning_rate": 3.32733093237295e-05, |
|
"loss": 0.6753, |
|
"step": 8370 |
|
}, |
|
{ |
|
"epoch": 6.686602870813397, |
|
"grad_norm": 4.513240337371826, |
|
"learning_rate": 3.3243297318927575e-05, |
|
"loss": 0.5713, |
|
"step": 8385 |
|
}, |
|
{ |
|
"epoch": 6.698564593301436, |
|
"grad_norm": 4.007817268371582, |
|
"learning_rate": 3.321328531412565e-05, |
|
"loss": 0.596, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 6.7105263157894735, |
|
"grad_norm": 4.17065954208374, |
|
"learning_rate": 3.318327330932373e-05, |
|
"loss": 0.5975, |
|
"step": 8415 |
|
}, |
|
{ |
|
"epoch": 6.722488038277512, |
|
"grad_norm": 3.68249773979187, |
|
"learning_rate": 3.315326130452181e-05, |
|
"loss": 0.563, |
|
"step": 8430 |
|
}, |
|
{ |
|
"epoch": 6.73444976076555, |
|
"grad_norm": 4.292535781860352, |
|
"learning_rate": 3.312324929971989e-05, |
|
"loss": 0.6413, |
|
"step": 8445 |
|
}, |
|
{ |
|
"epoch": 6.746411483253588, |
|
"grad_norm": 4.380221843719482, |
|
"learning_rate": 3.3093237294917965e-05, |
|
"loss": 0.5637, |
|
"step": 8460 |
|
}, |
|
{ |
|
"epoch": 6.758373205741627, |
|
"grad_norm": 3.799266815185547, |
|
"learning_rate": 3.306322529011604e-05, |
|
"loss": 0.6653, |
|
"step": 8475 |
|
}, |
|
{ |
|
"epoch": 6.770334928229665, |
|
"grad_norm": 4.119513034820557, |
|
"learning_rate": 3.303321328531413e-05, |
|
"loss": 0.6436, |
|
"step": 8490 |
|
}, |
|
{ |
|
"epoch": 6.782296650717703, |
|
"grad_norm": 4.17624044418335, |
|
"learning_rate": 3.3003201280512205e-05, |
|
"loss": 0.6778, |
|
"step": 8505 |
|
}, |
|
{ |
|
"epoch": 6.794258373205742, |
|
"grad_norm": 4.3085761070251465, |
|
"learning_rate": 3.297318927571029e-05, |
|
"loss": 0.6298, |
|
"step": 8520 |
|
}, |
|
{ |
|
"epoch": 6.80622009569378, |
|
"grad_norm": 3.8202457427978516, |
|
"learning_rate": 3.294317727090837e-05, |
|
"loss": 0.5924, |
|
"step": 8535 |
|
}, |
|
{ |
|
"epoch": 6.818181818181818, |
|
"grad_norm": 4.103767395019531, |
|
"learning_rate": 3.2913165266106445e-05, |
|
"loss": 0.5925, |
|
"step": 8550 |
|
}, |
|
{ |
|
"epoch": 6.830143540669856, |
|
"grad_norm": 4.139376640319824, |
|
"learning_rate": 3.288315326130452e-05, |
|
"loss": 0.6656, |
|
"step": 8565 |
|
}, |
|
{ |
|
"epoch": 6.842105263157895, |
|
"grad_norm": 4.039120674133301, |
|
"learning_rate": 3.28531412565026e-05, |
|
"loss": 0.6807, |
|
"step": 8580 |
|
}, |
|
{ |
|
"epoch": 6.854066985645933, |
|
"grad_norm": 4.153085708618164, |
|
"learning_rate": 3.282312925170068e-05, |
|
"loss": 0.6194, |
|
"step": 8595 |
|
}, |
|
{ |
|
"epoch": 6.866028708133971, |
|
"grad_norm": 4.125678539276123, |
|
"learning_rate": 3.279311724689876e-05, |
|
"loss": 0.6333, |
|
"step": 8610 |
|
}, |
|
{ |
|
"epoch": 6.87799043062201, |
|
"grad_norm": 4.25078821182251, |
|
"learning_rate": 3.276310524209684e-05, |
|
"loss": 0.6895, |
|
"step": 8625 |
|
}, |
|
{ |
|
"epoch": 6.889952153110048, |
|
"grad_norm": 3.782094955444336, |
|
"learning_rate": 3.273309323729492e-05, |
|
"loss": 0.6789, |
|
"step": 8640 |
|
}, |
|
{ |
|
"epoch": 6.901913875598086, |
|
"grad_norm": 4.739928245544434, |
|
"learning_rate": 3.2703081232493e-05, |
|
"loss": 0.6427, |
|
"step": 8655 |
|
}, |
|
{ |
|
"epoch": 6.913875598086125, |
|
"grad_norm": 4.479592800140381, |
|
"learning_rate": 3.2673069227691075e-05, |
|
"loss": 0.6309, |
|
"step": 8670 |
|
}, |
|
{ |
|
"epoch": 6.925837320574162, |
|
"grad_norm": 4.018124580383301, |
|
"learning_rate": 3.264305722288916e-05, |
|
"loss": 0.6828, |
|
"step": 8685 |
|
}, |
|
{ |
|
"epoch": 6.937799043062201, |
|
"grad_norm": 3.8505430221557617, |
|
"learning_rate": 3.261304521808724e-05, |
|
"loss": 0.6361, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 6.94976076555024, |
|
"grad_norm": 3.596605062484741, |
|
"learning_rate": 3.2583033213285315e-05, |
|
"loss": 0.6297, |
|
"step": 8715 |
|
}, |
|
{ |
|
"epoch": 6.961722488038277, |
|
"grad_norm": 4.318160533905029, |
|
"learning_rate": 3.25530212084834e-05, |
|
"loss": 0.6403, |
|
"step": 8730 |
|
}, |
|
{ |
|
"epoch": 6.973684210526316, |
|
"grad_norm": 4.0697431564331055, |
|
"learning_rate": 3.252300920368148e-05, |
|
"loss": 0.6154, |
|
"step": 8745 |
|
}, |
|
{ |
|
"epoch": 6.985645933014354, |
|
"grad_norm": 4.358625411987305, |
|
"learning_rate": 3.2492997198879555e-05, |
|
"loss": 0.6782, |
|
"step": 8760 |
|
}, |
|
{ |
|
"epoch": 6.997607655502392, |
|
"grad_norm": 4.264054298400879, |
|
"learning_rate": 3.246298519407763e-05, |
|
"loss": 0.6498, |
|
"step": 8775 |
|
}, |
|
{ |
|
"epoch": 7.009569377990431, |
|
"grad_norm": 3.4622254371643066, |
|
"learning_rate": 3.243297318927571e-05, |
|
"loss": 0.4423, |
|
"step": 8790 |
|
}, |
|
{ |
|
"epoch": 7.021531100478469, |
|
"grad_norm": 4.359318733215332, |
|
"learning_rate": 3.240296118447379e-05, |
|
"loss": 0.2942, |
|
"step": 8805 |
|
}, |
|
{ |
|
"epoch": 7.033492822966507, |
|
"grad_norm": 4.986384391784668, |
|
"learning_rate": 3.237294917967187e-05, |
|
"loss": 0.3187, |
|
"step": 8820 |
|
}, |
|
{ |
|
"epoch": 7.045454545454546, |
|
"grad_norm": 4.1987104415893555, |
|
"learning_rate": 3.2342937174869945e-05, |
|
"loss": 0.3306, |
|
"step": 8835 |
|
}, |
|
{ |
|
"epoch": 7.057416267942584, |
|
"grad_norm": 4.6675028800964355, |
|
"learning_rate": 3.231292517006803e-05, |
|
"loss": 0.349, |
|
"step": 8850 |
|
}, |
|
{ |
|
"epoch": 7.069377990430622, |
|
"grad_norm": 4.357269763946533, |
|
"learning_rate": 3.228291316526611e-05, |
|
"loss": 0.3153, |
|
"step": 8865 |
|
}, |
|
{ |
|
"epoch": 7.08133971291866, |
|
"grad_norm": 3.168750762939453, |
|
"learning_rate": 3.225290116046419e-05, |
|
"loss": 0.3223, |
|
"step": 8880 |
|
}, |
|
{ |
|
"epoch": 7.0933014354066986, |
|
"grad_norm": 4.13469934463501, |
|
"learning_rate": 3.222288915566227e-05, |
|
"loss": 0.3289, |
|
"step": 8895 |
|
}, |
|
{ |
|
"epoch": 7.105263157894737, |
|
"grad_norm": 3.306483507156372, |
|
"learning_rate": 3.219287715086035e-05, |
|
"loss": 0.3357, |
|
"step": 8910 |
|
}, |
|
{ |
|
"epoch": 7.117224880382775, |
|
"grad_norm": 3.830190896987915, |
|
"learning_rate": 3.2162865146058425e-05, |
|
"loss": 0.3425, |
|
"step": 8925 |
|
}, |
|
{ |
|
"epoch": 7.1291866028708135, |
|
"grad_norm": 3.848161220550537, |
|
"learning_rate": 3.21328531412565e-05, |
|
"loss": 0.3412, |
|
"step": 8940 |
|
}, |
|
{ |
|
"epoch": 7.141148325358851, |
|
"grad_norm": 4.058409214019775, |
|
"learning_rate": 3.210284113645458e-05, |
|
"loss": 0.3333, |
|
"step": 8955 |
|
}, |
|
{ |
|
"epoch": 7.15311004784689, |
|
"grad_norm": 3.780856132507324, |
|
"learning_rate": 3.2072829131652666e-05, |
|
"loss": 0.3205, |
|
"step": 8970 |
|
}, |
|
{ |
|
"epoch": 7.1650717703349285, |
|
"grad_norm": 3.9334750175476074, |
|
"learning_rate": 3.2042817126850744e-05, |
|
"loss": 0.3546, |
|
"step": 8985 |
|
}, |
|
{ |
|
"epoch": 7.177033492822966, |
|
"grad_norm": 4.092038631439209, |
|
"learning_rate": 3.201280512204882e-05, |
|
"loss": 0.3295, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 7.188995215311005, |
|
"grad_norm": 4.35646390914917, |
|
"learning_rate": 3.19827931172469e-05, |
|
"loss": 0.3593, |
|
"step": 9015 |
|
}, |
|
{ |
|
"epoch": 7.2009569377990434, |
|
"grad_norm": 3.8881773948669434, |
|
"learning_rate": 3.195278111244498e-05, |
|
"loss": 0.3541, |
|
"step": 9030 |
|
}, |
|
{ |
|
"epoch": 7.212918660287081, |
|
"grad_norm": 4.399089336395264, |
|
"learning_rate": 3.1922769107643055e-05, |
|
"loss": 0.3271, |
|
"step": 9045 |
|
}, |
|
{ |
|
"epoch": 7.22488038277512, |
|
"grad_norm": 4.376395225524902, |
|
"learning_rate": 3.189275710284114e-05, |
|
"loss": 0.4131, |
|
"step": 9060 |
|
}, |
|
{ |
|
"epoch": 7.2368421052631575, |
|
"grad_norm": 4.1286468505859375, |
|
"learning_rate": 3.186274509803922e-05, |
|
"loss": 0.3824, |
|
"step": 9075 |
|
}, |
|
{ |
|
"epoch": 7.248803827751196, |
|
"grad_norm": 4.728172302246094, |
|
"learning_rate": 3.18327330932373e-05, |
|
"loss": 0.3706, |
|
"step": 9090 |
|
}, |
|
{ |
|
"epoch": 7.260765550239235, |
|
"grad_norm": 3.76225209236145, |
|
"learning_rate": 3.180272108843538e-05, |
|
"loss": 0.3568, |
|
"step": 9105 |
|
}, |
|
{ |
|
"epoch": 7.2727272727272725, |
|
"grad_norm": 3.939035415649414, |
|
"learning_rate": 3.177270908363346e-05, |
|
"loss": 0.4092, |
|
"step": 9120 |
|
}, |
|
{ |
|
"epoch": 7.284688995215311, |
|
"grad_norm": 4.537744045257568, |
|
"learning_rate": 3.1742697078831536e-05, |
|
"loss": 0.3606, |
|
"step": 9135 |
|
}, |
|
{ |
|
"epoch": 7.296650717703349, |
|
"grad_norm": 4.309103965759277, |
|
"learning_rate": 3.1712685074029613e-05, |
|
"loss": 0.406, |
|
"step": 9150 |
|
}, |
|
{ |
|
"epoch": 7.3086124401913874, |
|
"grad_norm": 4.298764228820801, |
|
"learning_rate": 3.168267306922769e-05, |
|
"loss": 0.373, |
|
"step": 9165 |
|
}, |
|
{ |
|
"epoch": 7.320574162679426, |
|
"grad_norm": 4.205005645751953, |
|
"learning_rate": 3.165266106442577e-05, |
|
"loss": 0.3567, |
|
"step": 9180 |
|
}, |
|
{ |
|
"epoch": 7.332535885167464, |
|
"grad_norm": 4.051873207092285, |
|
"learning_rate": 3.162264905962385e-05, |
|
"loss": 0.377, |
|
"step": 9195 |
|
}, |
|
{ |
|
"epoch": 7.344497607655502, |
|
"grad_norm": 4.320316314697266, |
|
"learning_rate": 3.159263705482193e-05, |
|
"loss": 0.4071, |
|
"step": 9210 |
|
}, |
|
{ |
|
"epoch": 7.356459330143541, |
|
"grad_norm": 4.617473125457764, |
|
"learning_rate": 3.156262505002001e-05, |
|
"loss": 0.4048, |
|
"step": 9225 |
|
}, |
|
{ |
|
"epoch": 7.368421052631579, |
|
"grad_norm": 4.013522148132324, |
|
"learning_rate": 3.153261304521809e-05, |
|
"loss": 0.3792, |
|
"step": 9240 |
|
}, |
|
{ |
|
"epoch": 7.380382775119617, |
|
"grad_norm": 4.339334487915039, |
|
"learning_rate": 3.150260104041617e-05, |
|
"loss": 0.4172, |
|
"step": 9255 |
|
}, |
|
{ |
|
"epoch": 7.392344497607655, |
|
"grad_norm": 4.555285453796387, |
|
"learning_rate": 3.147258903561425e-05, |
|
"loss": 0.397, |
|
"step": 9270 |
|
}, |
|
{ |
|
"epoch": 7.404306220095694, |
|
"grad_norm": 3.832693576812744, |
|
"learning_rate": 3.144257703081233e-05, |
|
"loss": 0.3784, |
|
"step": 9285 |
|
}, |
|
{ |
|
"epoch": 7.416267942583732, |
|
"grad_norm": 4.14719295501709, |
|
"learning_rate": 3.1412565026010406e-05, |
|
"loss": 0.3979, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 7.42822966507177, |
|
"grad_norm": 3.914750337600708, |
|
"learning_rate": 3.138255302120848e-05, |
|
"loss": 0.3848, |
|
"step": 9315 |
|
}, |
|
{ |
|
"epoch": 7.440191387559809, |
|
"grad_norm": 4.9536967277526855, |
|
"learning_rate": 3.135254101640656e-05, |
|
"loss": 0.4144, |
|
"step": 9330 |
|
}, |
|
{ |
|
"epoch": 7.452153110047847, |
|
"grad_norm": 4.35673713684082, |
|
"learning_rate": 3.1322529011604646e-05, |
|
"loss": 0.4446, |
|
"step": 9345 |
|
}, |
|
{ |
|
"epoch": 7.464114832535885, |
|
"grad_norm": 4.106342315673828, |
|
"learning_rate": 3.1292517006802724e-05, |
|
"loss": 0.4056, |
|
"step": 9360 |
|
}, |
|
{ |
|
"epoch": 7.476076555023924, |
|
"grad_norm": 4.211533546447754, |
|
"learning_rate": 3.12625050020008e-05, |
|
"loss": 0.4072, |
|
"step": 9375 |
|
}, |
|
{ |
|
"epoch": 7.488038277511961, |
|
"grad_norm": 3.965963840484619, |
|
"learning_rate": 3.123249299719888e-05, |
|
"loss": 0.4329, |
|
"step": 9390 |
|
}, |
|
{ |
|
"epoch": 7.5, |
|
"grad_norm": 4.13434362411499, |
|
"learning_rate": 3.120248099239696e-05, |
|
"loss": 0.4161, |
|
"step": 9405 |
|
}, |
|
{ |
|
"epoch": 7.511961722488039, |
|
"grad_norm": 6.448205947875977, |
|
"learning_rate": 3.1172468987595035e-05, |
|
"loss": 0.3927, |
|
"step": 9420 |
|
}, |
|
{ |
|
"epoch": 7.523923444976076, |
|
"grad_norm": 4.125397682189941, |
|
"learning_rate": 3.114245698279312e-05, |
|
"loss": 0.4021, |
|
"step": 9435 |
|
}, |
|
{ |
|
"epoch": 7.535885167464115, |
|
"grad_norm": 4.477077007293701, |
|
"learning_rate": 3.11124449779912e-05, |
|
"loss": 0.4195, |
|
"step": 9450 |
|
}, |
|
{ |
|
"epoch": 7.547846889952153, |
|
"grad_norm": 3.9981772899627686, |
|
"learning_rate": 3.108243297318928e-05, |
|
"loss": 0.4473, |
|
"step": 9465 |
|
}, |
|
{ |
|
"epoch": 7.559808612440191, |
|
"grad_norm": 4.3731689453125, |
|
"learning_rate": 3.105242096838736e-05, |
|
"loss": 0.4264, |
|
"step": 9480 |
|
}, |
|
{ |
|
"epoch": 7.57177033492823, |
|
"grad_norm": 4.046823501586914, |
|
"learning_rate": 3.102240896358544e-05, |
|
"loss": 0.4151, |
|
"step": 9495 |
|
}, |
|
{ |
|
"epoch": 7.583732057416268, |
|
"grad_norm": 4.526839733123779, |
|
"learning_rate": 3.0992396958783516e-05, |
|
"loss": 0.4426, |
|
"step": 9510 |
|
}, |
|
{ |
|
"epoch": 7.595693779904306, |
|
"grad_norm": 4.215605735778809, |
|
"learning_rate": 3.0962384953981594e-05, |
|
"loss": 0.4376, |
|
"step": 9525 |
|
}, |
|
{ |
|
"epoch": 7.607655502392344, |
|
"grad_norm": 4.018391132354736, |
|
"learning_rate": 3.093237294917967e-05, |
|
"loss": 0.4385, |
|
"step": 9540 |
|
}, |
|
{ |
|
"epoch": 7.619617224880383, |
|
"grad_norm": 5.19038200378418, |
|
"learning_rate": 3.090236094437775e-05, |
|
"loss": 0.4379, |
|
"step": 9555 |
|
}, |
|
{ |
|
"epoch": 7.631578947368421, |
|
"grad_norm": 4.6209611892700195, |
|
"learning_rate": 3.087234893957583e-05, |
|
"loss": 0.4445, |
|
"step": 9570 |
|
}, |
|
{ |
|
"epoch": 7.643540669856459, |
|
"grad_norm": 4.700253486633301, |
|
"learning_rate": 3.084233693477391e-05, |
|
"loss": 0.4309, |
|
"step": 9585 |
|
}, |
|
{ |
|
"epoch": 7.655502392344498, |
|
"grad_norm": 4.6337761878967285, |
|
"learning_rate": 3.081232492997199e-05, |
|
"loss": 0.4256, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 7.667464114832536, |
|
"grad_norm": 4.5144734382629395, |
|
"learning_rate": 3.078231292517007e-05, |
|
"loss": 0.4685, |
|
"step": 9615 |
|
}, |
|
{ |
|
"epoch": 7.679425837320574, |
|
"grad_norm": 4.41657829284668, |
|
"learning_rate": 3.075230092036815e-05, |
|
"loss": 0.4455, |
|
"step": 9630 |
|
}, |
|
{ |
|
"epoch": 7.6913875598086126, |
|
"grad_norm": 4.547213554382324, |
|
"learning_rate": 3.072228891556623e-05, |
|
"loss": 0.4935, |
|
"step": 9645 |
|
}, |
|
{ |
|
"epoch": 7.703349282296651, |
|
"grad_norm": 4.367729187011719, |
|
"learning_rate": 3.069227691076431e-05, |
|
"loss": 0.4636, |
|
"step": 9660 |
|
}, |
|
{ |
|
"epoch": 7.715311004784689, |
|
"grad_norm": 4.459219932556152, |
|
"learning_rate": 3.0662264905962386e-05, |
|
"loss": 0.4668, |
|
"step": 9675 |
|
}, |
|
{ |
|
"epoch": 7.7272727272727275, |
|
"grad_norm": 4.355218887329102, |
|
"learning_rate": 3.0632252901160464e-05, |
|
"loss": 0.4296, |
|
"step": 9690 |
|
}, |
|
{ |
|
"epoch": 7.739234449760765, |
|
"grad_norm": 3.960000514984131, |
|
"learning_rate": 3.060224089635855e-05, |
|
"loss": 0.4429, |
|
"step": 9705 |
|
}, |
|
{ |
|
"epoch": 7.751196172248804, |
|
"grad_norm": 4.526662349700928, |
|
"learning_rate": 3.0572228891556626e-05, |
|
"loss": 0.4751, |
|
"step": 9720 |
|
}, |
|
{ |
|
"epoch": 7.7631578947368425, |
|
"grad_norm": 4.3358259201049805, |
|
"learning_rate": 3.0542216886754704e-05, |
|
"loss": 0.4885, |
|
"step": 9735 |
|
}, |
|
{ |
|
"epoch": 7.77511961722488, |
|
"grad_norm": 4.190465927124023, |
|
"learning_rate": 3.0512204881952782e-05, |
|
"loss": 0.4633, |
|
"step": 9750 |
|
}, |
|
{ |
|
"epoch": 7.787081339712919, |
|
"grad_norm": 4.320166110992432, |
|
"learning_rate": 3.0482192877150863e-05, |
|
"loss": 0.4926, |
|
"step": 9765 |
|
}, |
|
{ |
|
"epoch": 7.7990430622009566, |
|
"grad_norm": 3.990604877471924, |
|
"learning_rate": 3.045218087234894e-05, |
|
"loss": 0.4516, |
|
"step": 9780 |
|
}, |
|
{ |
|
"epoch": 7.811004784688995, |
|
"grad_norm": 5.037746906280518, |
|
"learning_rate": 3.042216886754702e-05, |
|
"loss": 0.4121, |
|
"step": 9795 |
|
}, |
|
{ |
|
"epoch": 7.822966507177034, |
|
"grad_norm": 5.006950855255127, |
|
"learning_rate": 3.0392156862745097e-05, |
|
"loss": 0.4643, |
|
"step": 9810 |
|
}, |
|
{ |
|
"epoch": 7.8349282296650715, |
|
"grad_norm": 4.678879261016846, |
|
"learning_rate": 3.036214485794318e-05, |
|
"loss": 0.4733, |
|
"step": 9825 |
|
}, |
|
{ |
|
"epoch": 7.84688995215311, |
|
"grad_norm": 4.293395042419434, |
|
"learning_rate": 3.033213285314126e-05, |
|
"loss": 0.4866, |
|
"step": 9840 |
|
}, |
|
{ |
|
"epoch": 7.858851674641148, |
|
"grad_norm": 4.712632656097412, |
|
"learning_rate": 3.0302120848339337e-05, |
|
"loss": 0.4878, |
|
"step": 9855 |
|
}, |
|
{ |
|
"epoch": 7.8708133971291865, |
|
"grad_norm": 4.51541805267334, |
|
"learning_rate": 3.0272108843537418e-05, |
|
"loss": 0.4721, |
|
"step": 9870 |
|
}, |
|
{ |
|
"epoch": 7.882775119617225, |
|
"grad_norm": 4.705857753753662, |
|
"learning_rate": 3.0242096838735496e-05, |
|
"loss": 0.4849, |
|
"step": 9885 |
|
}, |
|
{ |
|
"epoch": 7.894736842105263, |
|
"grad_norm": 4.610105037689209, |
|
"learning_rate": 3.0212084833933574e-05, |
|
"loss": 0.4974, |
|
"step": 9900 |
|
}, |
|
{ |
|
"epoch": 7.9066985645933014, |
|
"grad_norm": 4.228977680206299, |
|
"learning_rate": 3.018207282913165e-05, |
|
"loss": 0.468, |
|
"step": 9915 |
|
}, |
|
{ |
|
"epoch": 7.91866028708134, |
|
"grad_norm": 4.514330863952637, |
|
"learning_rate": 3.015206082432973e-05, |
|
"loss": 0.4857, |
|
"step": 9930 |
|
}, |
|
{ |
|
"epoch": 7.930622009569378, |
|
"grad_norm": 4.639202117919922, |
|
"learning_rate": 3.0122048819527814e-05, |
|
"loss": 0.3874, |
|
"step": 9945 |
|
}, |
|
{ |
|
"epoch": 7.942583732057416, |
|
"grad_norm": 4.870967864990234, |
|
"learning_rate": 3.0092036814725892e-05, |
|
"loss": 0.4849, |
|
"step": 9960 |
|
}, |
|
{ |
|
"epoch": 7.954545454545455, |
|
"grad_norm": 4.402018070220947, |
|
"learning_rate": 3.0062024809923973e-05, |
|
"loss": 0.492, |
|
"step": 9975 |
|
}, |
|
{ |
|
"epoch": 7.966507177033493, |
|
"grad_norm": 4.405611991882324, |
|
"learning_rate": 3.003201280512205e-05, |
|
"loss": 0.4874, |
|
"step": 9990 |
|
}, |
|
{ |
|
"epoch": 7.978468899521531, |
|
"grad_norm": 4.78075647354126, |
|
"learning_rate": 3.000200080032013e-05, |
|
"loss": 0.5089, |
|
"step": 10005 |
|
}, |
|
{ |
|
"epoch": 7.990430622009569, |
|
"grad_norm": 4.583403587341309, |
|
"learning_rate": 2.9971988795518207e-05, |
|
"loss": 0.4791, |
|
"step": 10020 |
|
}, |
|
{ |
|
"epoch": 8.002392344497608, |
|
"grad_norm": 3.6340909004211426, |
|
"learning_rate": 2.9941976790716285e-05, |
|
"loss": 0.4022, |
|
"step": 10035 |
|
}, |
|
{ |
|
"epoch": 8.014354066985646, |
|
"grad_norm": 3.58935809135437, |
|
"learning_rate": 2.9911964785914366e-05, |
|
"loss": 0.2033, |
|
"step": 10050 |
|
}, |
|
{ |
|
"epoch": 8.026315789473685, |
|
"grad_norm": 4.309442520141602, |
|
"learning_rate": 2.988195278111245e-05, |
|
"loss": 0.209, |
|
"step": 10065 |
|
}, |
|
{ |
|
"epoch": 8.038277511961722, |
|
"grad_norm": 3.540694236755371, |
|
"learning_rate": 2.985194077631053e-05, |
|
"loss": 0.2269, |
|
"step": 10080 |
|
}, |
|
{ |
|
"epoch": 8.05023923444976, |
|
"grad_norm": 4.051588535308838, |
|
"learning_rate": 2.9821928771508606e-05, |
|
"loss": 0.225, |
|
"step": 10095 |
|
}, |
|
{ |
|
"epoch": 8.062200956937799, |
|
"grad_norm": 3.8642947673797607, |
|
"learning_rate": 2.9791916766706684e-05, |
|
"loss": 0.2408, |
|
"step": 10110 |
|
}, |
|
{ |
|
"epoch": 8.074162679425838, |
|
"grad_norm": 4.4070539474487305, |
|
"learning_rate": 2.9761904761904762e-05, |
|
"loss": 0.2131, |
|
"step": 10125 |
|
}, |
|
{ |
|
"epoch": 8.086124401913876, |
|
"grad_norm": 3.5634195804595947, |
|
"learning_rate": 2.9731892757102843e-05, |
|
"loss": 0.2253, |
|
"step": 10140 |
|
}, |
|
{ |
|
"epoch": 8.098086124401913, |
|
"grad_norm": 4.4950737953186035, |
|
"learning_rate": 2.970188075230092e-05, |
|
"loss": 0.2438, |
|
"step": 10155 |
|
}, |
|
{ |
|
"epoch": 8.110047846889952, |
|
"grad_norm": 4.489715576171875, |
|
"learning_rate": 2.9671868747499e-05, |
|
"loss": 0.2151, |
|
"step": 10170 |
|
}, |
|
{ |
|
"epoch": 8.12200956937799, |
|
"grad_norm": 4.503179550170898, |
|
"learning_rate": 2.9641856742697083e-05, |
|
"loss": 0.2375, |
|
"step": 10185 |
|
}, |
|
{ |
|
"epoch": 8.133971291866029, |
|
"grad_norm": 4.019615173339844, |
|
"learning_rate": 2.961184473789516e-05, |
|
"loss": 0.253, |
|
"step": 10200 |
|
}, |
|
{ |
|
"epoch": 8.145933014354068, |
|
"grad_norm": 3.398512601852417, |
|
"learning_rate": 2.958183273309324e-05, |
|
"loss": 0.2437, |
|
"step": 10215 |
|
}, |
|
{ |
|
"epoch": 8.157894736842104, |
|
"grad_norm": 2.8724753856658936, |
|
"learning_rate": 2.9551820728291317e-05, |
|
"loss": 0.2236, |
|
"step": 10230 |
|
}, |
|
{ |
|
"epoch": 8.169856459330143, |
|
"grad_norm": 3.7883143424987793, |
|
"learning_rate": 2.9521808723489398e-05, |
|
"loss": 0.2164, |
|
"step": 10245 |
|
}, |
|
{ |
|
"epoch": 8.181818181818182, |
|
"grad_norm": 4.483898639678955, |
|
"learning_rate": 2.9491796718687476e-05, |
|
"loss": 0.231, |
|
"step": 10260 |
|
}, |
|
{ |
|
"epoch": 8.19377990430622, |
|
"grad_norm": 4.909805774688721, |
|
"learning_rate": 2.9461784713885554e-05, |
|
"loss": 0.2511, |
|
"step": 10275 |
|
}, |
|
{ |
|
"epoch": 8.205741626794259, |
|
"grad_norm": 4.415759563446045, |
|
"learning_rate": 2.9431772709083632e-05, |
|
"loss": 0.2259, |
|
"step": 10290 |
|
}, |
|
{ |
|
"epoch": 8.217703349282298, |
|
"grad_norm": 3.9223194122314453, |
|
"learning_rate": 2.9401760704281716e-05, |
|
"loss": 0.2479, |
|
"step": 10305 |
|
}, |
|
{ |
|
"epoch": 8.229665071770334, |
|
"grad_norm": 3.4528160095214844, |
|
"learning_rate": 2.9371748699479794e-05, |
|
"loss": 0.2275, |
|
"step": 10320 |
|
}, |
|
{ |
|
"epoch": 8.241626794258373, |
|
"grad_norm": 4.239967346191406, |
|
"learning_rate": 2.9341736694677872e-05, |
|
"loss": 0.2316, |
|
"step": 10335 |
|
}, |
|
{ |
|
"epoch": 8.253588516746412, |
|
"grad_norm": 4.16427755355835, |
|
"learning_rate": 2.9311724689875953e-05, |
|
"loss": 0.2818, |
|
"step": 10350 |
|
}, |
|
{ |
|
"epoch": 8.26555023923445, |
|
"grad_norm": 4.7562994956970215, |
|
"learning_rate": 2.928171268507403e-05, |
|
"loss": 0.2658, |
|
"step": 10365 |
|
}, |
|
{ |
|
"epoch": 8.277511961722489, |
|
"grad_norm": 4.450767517089844, |
|
"learning_rate": 2.925170068027211e-05, |
|
"loss": 0.2792, |
|
"step": 10380 |
|
}, |
|
{ |
|
"epoch": 8.289473684210526, |
|
"grad_norm": 4.766055583953857, |
|
"learning_rate": 2.9221688675470187e-05, |
|
"loss": 0.2926, |
|
"step": 10395 |
|
}, |
|
{ |
|
"epoch": 8.301435406698564, |
|
"grad_norm": 4.053709030151367, |
|
"learning_rate": 2.9191676670668268e-05, |
|
"loss": 0.2418, |
|
"step": 10410 |
|
}, |
|
{ |
|
"epoch": 8.313397129186603, |
|
"grad_norm": 4.844228267669678, |
|
"learning_rate": 2.916166466586635e-05, |
|
"loss": 0.2426, |
|
"step": 10425 |
|
}, |
|
{ |
|
"epoch": 8.325358851674642, |
|
"grad_norm": 3.6860673427581787, |
|
"learning_rate": 2.913165266106443e-05, |
|
"loss": 0.2542, |
|
"step": 10440 |
|
}, |
|
{ |
|
"epoch": 8.33732057416268, |
|
"grad_norm": 3.938351631164551, |
|
"learning_rate": 2.910164065626251e-05, |
|
"loss": 0.2769, |
|
"step": 10455 |
|
}, |
|
{ |
|
"epoch": 8.349282296650717, |
|
"grad_norm": 4.569359302520752, |
|
"learning_rate": 2.9071628651460586e-05, |
|
"loss": 0.2456, |
|
"step": 10470 |
|
}, |
|
{ |
|
"epoch": 8.361244019138756, |
|
"grad_norm": 3.8243377208709717, |
|
"learning_rate": 2.9041616646658664e-05, |
|
"loss": 0.2666, |
|
"step": 10485 |
|
}, |
|
{ |
|
"epoch": 8.373205741626794, |
|
"grad_norm": 4.553408145904541, |
|
"learning_rate": 2.9011604641856742e-05, |
|
"loss": 0.2891, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 8.385167464114833, |
|
"grad_norm": 4.640753746032715, |
|
"learning_rate": 2.8981592637054823e-05, |
|
"loss": 0.2912, |
|
"step": 10515 |
|
}, |
|
{ |
|
"epoch": 8.397129186602871, |
|
"grad_norm": 4.968740940093994, |
|
"learning_rate": 2.89515806322529e-05, |
|
"loss": 0.2761, |
|
"step": 10530 |
|
}, |
|
{ |
|
"epoch": 8.409090909090908, |
|
"grad_norm": 4.833539962768555, |
|
"learning_rate": 2.8921568627450986e-05, |
|
"loss": 0.2915, |
|
"step": 10545 |
|
}, |
|
{ |
|
"epoch": 8.421052631578947, |
|
"grad_norm": 4.913358211517334, |
|
"learning_rate": 2.8891556622649064e-05, |
|
"loss": 0.2703, |
|
"step": 10560 |
|
}, |
|
{ |
|
"epoch": 8.433014354066986, |
|
"grad_norm": 3.7276763916015625, |
|
"learning_rate": 2.886154461784714e-05, |
|
"loss": 0.2705, |
|
"step": 10575 |
|
}, |
|
{ |
|
"epoch": 8.444976076555024, |
|
"grad_norm": 4.225296974182129, |
|
"learning_rate": 2.883153261304522e-05, |
|
"loss": 0.2944, |
|
"step": 10590 |
|
}, |
|
{ |
|
"epoch": 8.456937799043063, |
|
"grad_norm": 4.071160793304443, |
|
"learning_rate": 2.8801520608243297e-05, |
|
"loss": 0.3017, |
|
"step": 10605 |
|
}, |
|
{ |
|
"epoch": 8.4688995215311, |
|
"grad_norm": 4.818964958190918, |
|
"learning_rate": 2.877150860344138e-05, |
|
"loss": 0.3057, |
|
"step": 10620 |
|
}, |
|
{ |
|
"epoch": 8.480861244019138, |
|
"grad_norm": 4.391495704650879, |
|
"learning_rate": 2.8741496598639456e-05, |
|
"loss": 0.2854, |
|
"step": 10635 |
|
}, |
|
{ |
|
"epoch": 8.492822966507177, |
|
"grad_norm": 4.263548374176025, |
|
"learning_rate": 2.8711484593837534e-05, |
|
"loss": 0.2604, |
|
"step": 10650 |
|
} |
|
], |
|
"logging_steps": 15, |
|
"max_steps": 25000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 20, |
|
"save_steps": 15, |
|
"total_flos": 7.844568831858917e+18, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|