{ "best_metric": null, "best_model_checkpoint": null, "epoch": 8.99947367395076, "eval_steps": 500, "global_step": 28854, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 1.4185574054718018, "learning_rate": 1.9999998518172358e-05, "loss": 0.3131, "step": 5 }, { "epoch": 0.0, "grad_norm": 1.2674533128738403, "learning_rate": 1.9999994072689862e-05, "loss": 0.1822, "step": 10 }, { "epoch": 0.0, "grad_norm": 1.4997848272323608, "learning_rate": 1.999998666355383e-05, "loss": 0.2147, "step": 15 }, { "epoch": 0.01, "grad_norm": 1.265012502670288, "learning_rate": 1.9999976290766464e-05, "loss": 0.2305, "step": 20 }, { "epoch": 0.01, "grad_norm": 1.29109525680542, "learning_rate": 1.9999962954330833e-05, "loss": 0.2123, "step": 25 }, { "epoch": 0.01, "grad_norm": 1.475464940071106, "learning_rate": 1.9999946654250892e-05, "loss": 0.1856, "step": 30 }, { "epoch": 0.01, "grad_norm": 1.054665207862854, "learning_rate": 1.9999927390531468e-05, "loss": 0.1843, "step": 35 }, { "epoch": 0.01, "grad_norm": 1.248856544494629, "learning_rate": 1.9999905163178273e-05, "loss": 0.1667, "step": 40 }, { "epoch": 0.01, "grad_norm": 1.092337965965271, "learning_rate": 1.9999879972197895e-05, "loss": 0.1828, "step": 45 }, { "epoch": 0.02, "grad_norm": 1.166669249534607, "learning_rate": 1.99998518175978e-05, "loss": 0.1984, "step": 50 }, { "epoch": 0.02, "grad_norm": 1.5460426807403564, "learning_rate": 1.999982069938633e-05, "loss": 0.1694, "step": 55 }, { "epoch": 0.02, "grad_norm": 1.0065215826034546, "learning_rate": 1.999978661757271e-05, "loss": 0.1549, "step": 60 }, { "epoch": 0.02, "grad_norm": 1.2282688617706299, "learning_rate": 1.9999749572167037e-05, "loss": 0.1621, "step": 65 }, { "epoch": 0.02, "grad_norm": 0.9245144724845886, "learning_rate": 1.9999709563180293e-05, "loss": 0.1693, "step": 70 }, { "epoch": 0.02, "grad_norm": 1.2537037134170532, "learning_rate": 1.9999666590624332e-05, "loss": 0.2242, "step": 75 }, { "epoch": 0.02, "grad_norm": 1.1063697338104248, "learning_rate": 1.9999620654511895e-05, "loss": 0.1641, "step": 80 }, { "epoch": 0.03, "grad_norm": 1.0977532863616943, "learning_rate": 1.9999571754856594e-05, "loss": 0.2038, "step": 85 }, { "epoch": 0.03, "grad_norm": 1.041774034500122, "learning_rate": 1.9999519891672918e-05, "loss": 0.2041, "step": 90 }, { "epoch": 0.03, "grad_norm": 0.9609395861625671, "learning_rate": 1.9999465064976234e-05, "loss": 0.1759, "step": 95 }, { "epoch": 0.03, "grad_norm": 0.7936742305755615, "learning_rate": 1.9999407274782804e-05, "loss": 0.1415, "step": 100 }, { "epoch": 0.03, "grad_norm": 1.1698766946792603, "learning_rate": 1.9999346521109742e-05, "loss": 0.1801, "step": 105 }, { "epoch": 0.03, "grad_norm": 1.1854974031448364, "learning_rate": 1.9999282803975058e-05, "loss": 0.1927, "step": 110 }, { "epoch": 0.04, "grad_norm": 1.0320738554000854, "learning_rate": 1.999921612339764e-05, "loss": 0.1966, "step": 115 }, { "epoch": 0.04, "grad_norm": 0.8534027338027954, "learning_rate": 1.9999146479397245e-05, "loss": 0.1739, "step": 120 }, { "epoch": 0.04, "grad_norm": 0.8952992558479309, "learning_rate": 1.9999073871994513e-05, "loss": 0.151, "step": 125 }, { "epoch": 0.04, "grad_norm": 0.941814661026001, "learning_rate": 1.9998998301210965e-05, "loss": 0.1678, "step": 130 }, { "epoch": 0.04, "grad_norm": 1.2861160039901733, "learning_rate": 1.9998919767068993e-05, "loss": 0.2027, "step": 135 }, { "epoch": 0.04, "grad_norm": 1.0701113939285278, "learning_rate": 1.9998838269591877e-05, "loss": 0.2, "step": 140 }, { "epoch": 0.05, "grad_norm": 1.2391180992126465, "learning_rate": 1.9998753808803766e-05, "loss": 0.1855, "step": 145 }, { "epoch": 0.05, "grad_norm": 0.7739980220794678, "learning_rate": 1.9998666384729692e-05, "loss": 0.1495, "step": 150 }, { "epoch": 0.05, "grad_norm": 1.0521777868270874, "learning_rate": 1.9998575997395566e-05, "loss": 0.1829, "step": 155 }, { "epoch": 0.05, "grad_norm": 1.0798540115356445, "learning_rate": 1.9998482646828178e-05, "loss": 0.1714, "step": 160 }, { "epoch": 0.05, "grad_norm": 0.8348418474197388, "learning_rate": 1.9998386333055188e-05, "loss": 0.135, "step": 165 }, { "epoch": 0.05, "grad_norm": 0.9550169706344604, "learning_rate": 1.999828705610514e-05, "loss": 0.1329, "step": 170 }, { "epoch": 0.05, "grad_norm": 1.1403224468231201, "learning_rate": 1.9998184816007466e-05, "loss": 0.1536, "step": 175 }, { "epoch": 0.06, "grad_norm": 0.7705615758895874, "learning_rate": 1.9998079612792457e-05, "loss": 0.152, "step": 180 }, { "epoch": 0.06, "grad_norm": 1.1348049640655518, "learning_rate": 1.9997971446491295e-05, "loss": 0.183, "step": 185 }, { "epoch": 0.06, "grad_norm": 1.1489228010177612, "learning_rate": 1.9997860317136035e-05, "loss": 0.2018, "step": 190 }, { "epoch": 0.06, "grad_norm": 0.8085460662841797, "learning_rate": 1.9997746224759612e-05, "loss": 0.1793, "step": 195 }, { "epoch": 0.06, "grad_norm": 0.7113868594169617, "learning_rate": 1.999762916939584e-05, "loss": 0.1468, "step": 200 }, { "epoch": 0.06, "grad_norm": 1.2538015842437744, "learning_rate": 1.9997509151079413e-05, "loss": 0.1668, "step": 205 }, { "epoch": 0.07, "grad_norm": 1.0342391729354858, "learning_rate": 1.9997386169845896e-05, "loss": 0.1892, "step": 210 }, { "epoch": 0.07, "grad_norm": 1.0485116243362427, "learning_rate": 1.999726022573174e-05, "loss": 0.1544, "step": 215 }, { "epoch": 0.07, "grad_norm": 1.0436828136444092, "learning_rate": 1.9997131318774267e-05, "loss": 0.2145, "step": 220 }, { "epoch": 0.07, "grad_norm": 0.9429926872253418, "learning_rate": 1.999699944901168e-05, "loss": 0.1212, "step": 225 }, { "epoch": 0.07, "grad_norm": 0.61878901720047, "learning_rate": 1.9996864616483064e-05, "loss": 0.1253, "step": 230 }, { "epoch": 0.07, "grad_norm": 0.9540243148803711, "learning_rate": 1.999672682122838e-05, "loss": 0.1656, "step": 235 }, { "epoch": 0.07, "grad_norm": 0.9568746089935303, "learning_rate": 1.9996586063288458e-05, "loss": 0.1676, "step": 240 }, { "epoch": 0.08, "grad_norm": 0.7145164012908936, "learning_rate": 1.9996442342705023e-05, "loss": 0.1354, "step": 245 }, { "epoch": 0.08, "grad_norm": 0.8123164176940918, "learning_rate": 1.9996295659520663e-05, "loss": 0.1439, "step": 250 }, { "epoch": 0.08, "grad_norm": 0.9031246900558472, "learning_rate": 1.9996146013778853e-05, "loss": 0.1539, "step": 255 }, { "epoch": 0.08, "grad_norm": 0.7666079998016357, "learning_rate": 1.9995993405523943e-05, "loss": 0.137, "step": 260 }, { "epoch": 0.08, "grad_norm": 1.2642663717269897, "learning_rate": 1.9995837834801155e-05, "loss": 0.1893, "step": 265 }, { "epoch": 0.08, "grad_norm": 0.8628743886947632, "learning_rate": 1.99956793016566e-05, "loss": 0.1766, "step": 270 }, { "epoch": 0.09, "grad_norm": 0.9469863772392273, "learning_rate": 1.9995517806137263e-05, "loss": 0.1306, "step": 275 }, { "epoch": 0.09, "grad_norm": 0.8292585611343384, "learning_rate": 1.9995353348291003e-05, "loss": 0.1885, "step": 280 }, { "epoch": 0.09, "grad_norm": 0.8164292573928833, "learning_rate": 1.999518592816656e-05, "loss": 0.1484, "step": 285 }, { "epoch": 0.09, "grad_norm": 0.9202820658683777, "learning_rate": 1.9995015545813552e-05, "loss": 0.1488, "step": 290 }, { "epoch": 0.09, "grad_norm": 0.8424696326255798, "learning_rate": 1.999484220128247e-05, "loss": 0.1545, "step": 295 }, { "epoch": 0.09, "grad_norm": 0.9172386527061462, "learning_rate": 1.9994665894624697e-05, "loss": 0.1279, "step": 300 }, { "epoch": 0.1, "grad_norm": 0.8169159889221191, "learning_rate": 1.9994486625892477e-05, "loss": 0.1475, "step": 305 }, { "epoch": 0.1, "grad_norm": 0.902919352054596, "learning_rate": 1.9994304395138942e-05, "loss": 0.1662, "step": 310 }, { "epoch": 0.1, "grad_norm": 0.8908593654632568, "learning_rate": 1.99941192024181e-05, "loss": 0.1976, "step": 315 }, { "epoch": 0.1, "grad_norm": 0.8870375752449036, "learning_rate": 1.999393104778483e-05, "loss": 0.1145, "step": 320 }, { "epoch": 0.1, "grad_norm": 0.7934074401855469, "learning_rate": 1.9993739931294895e-05, "loss": 0.1481, "step": 325 }, { "epoch": 0.1, "grad_norm": 0.6324104070663452, "learning_rate": 1.9993545853004944e-05, "loss": 0.1619, "step": 330 }, { "epoch": 0.1, "grad_norm": 1.1757112741470337, "learning_rate": 1.9993348812972484e-05, "loss": 0.1661, "step": 335 }, { "epoch": 0.11, "grad_norm": 0.7598108649253845, "learning_rate": 1.999314881125592e-05, "loss": 0.1254, "step": 340 }, { "epoch": 0.11, "grad_norm": 0.7880079746246338, "learning_rate": 1.999294584791452e-05, "loss": 0.1378, "step": 345 }, { "epoch": 0.11, "grad_norm": 0.9844062328338623, "learning_rate": 1.999273992300844e-05, "loss": 0.1837, "step": 350 }, { "epoch": 0.11, "grad_norm": 0.672408938407898, "learning_rate": 1.9992531036598705e-05, "loss": 0.1289, "step": 355 }, { "epoch": 0.11, "grad_norm": 0.6497504711151123, "learning_rate": 1.9992319188747225e-05, "loss": 0.1444, "step": 360 }, { "epoch": 0.11, "grad_norm": 0.8867689371109009, "learning_rate": 1.999210437951678e-05, "loss": 0.1401, "step": 365 }, { "epoch": 0.12, "grad_norm": 0.939816415309906, "learning_rate": 1.9991886608971037e-05, "loss": 0.1773, "step": 370 }, { "epoch": 0.12, "grad_norm": 0.9581828117370605, "learning_rate": 1.9991665877174528e-05, "loss": 0.164, "step": 375 }, { "epoch": 0.12, "grad_norm": 0.9168484210968018, "learning_rate": 1.999144218419268e-05, "loss": 0.1992, "step": 380 }, { "epoch": 0.12, "grad_norm": 0.9601644277572632, "learning_rate": 1.9991215530091782e-05, "loss": 0.1664, "step": 385 }, { "epoch": 0.12, "grad_norm": 1.1929875612258911, "learning_rate": 1.999098591493901e-05, "loss": 0.154, "step": 390 }, { "epoch": 0.12, "grad_norm": 1.0054327249526978, "learning_rate": 1.9990753338802408e-05, "loss": 0.139, "step": 395 }, { "epoch": 0.12, "grad_norm": 0.8742963075637817, "learning_rate": 1.999051780175091e-05, "loss": 0.1881, "step": 400 }, { "epoch": 0.13, "grad_norm": 0.6974807977676392, "learning_rate": 1.999027930385432e-05, "loss": 0.148, "step": 405 }, { "epoch": 0.13, "grad_norm": 0.7910093665122986, "learning_rate": 1.9990037845183313e-05, "loss": 0.136, "step": 410 }, { "epoch": 0.13, "grad_norm": 0.8378175497055054, "learning_rate": 1.9989793425809463e-05, "loss": 0.1005, "step": 415 }, { "epoch": 0.13, "grad_norm": 0.7058895230293274, "learning_rate": 1.9989546045805195e-05, "loss": 0.1394, "step": 420 }, { "epoch": 0.13, "grad_norm": 0.6101320385932922, "learning_rate": 1.998929570524383e-05, "loss": 0.1416, "step": 425 }, { "epoch": 0.13, "grad_norm": 0.8287069201469421, "learning_rate": 1.9989042404199558e-05, "loss": 0.1251, "step": 430 }, { "epoch": 0.14, "grad_norm": 0.8602848052978516, "learning_rate": 1.998878614274745e-05, "loss": 0.1405, "step": 435 }, { "epoch": 0.14, "grad_norm": 0.8005295991897583, "learning_rate": 1.998852692096345e-05, "loss": 0.1436, "step": 440 }, { "epoch": 0.14, "grad_norm": 1.1535545587539673, "learning_rate": 1.998826473892439e-05, "loss": 0.1675, "step": 445 }, { "epoch": 0.14, "grad_norm": 0.8452690839767456, "learning_rate": 1.998799959670796e-05, "loss": 0.1592, "step": 450 }, { "epoch": 0.14, "grad_norm": 0.7784842252731323, "learning_rate": 1.998773149439275e-05, "loss": 0.1079, "step": 455 }, { "epoch": 0.14, "grad_norm": 0.6018524765968323, "learning_rate": 1.9987460432058213e-05, "loss": 0.1295, "step": 460 }, { "epoch": 0.15, "grad_norm": 0.880659282207489, "learning_rate": 1.998718640978468e-05, "loss": 0.1467, "step": 465 }, { "epoch": 0.15, "grad_norm": 0.8606876134872437, "learning_rate": 1.998690942765336e-05, "loss": 0.1585, "step": 470 }, { "epoch": 0.15, "grad_norm": 0.73162841796875, "learning_rate": 1.998662948574635e-05, "loss": 0.1589, "step": 475 }, { "epoch": 0.15, "grad_norm": 0.6358553171157837, "learning_rate": 1.9986346584146607e-05, "loss": 0.1754, "step": 480 }, { "epoch": 0.15, "grad_norm": 0.804429829120636, "learning_rate": 1.9986060722937974e-05, "loss": 0.1178, "step": 485 }, { "epoch": 0.15, "grad_norm": 0.834313690662384, "learning_rate": 1.9985771902205172e-05, "loss": 0.1394, "step": 490 }, { "epoch": 0.15, "grad_norm": 0.9056774973869324, "learning_rate": 1.99854801220338e-05, "loss": 0.1521, "step": 495 }, { "epoch": 0.16, "grad_norm": 0.8872849345207214, "learning_rate": 1.9985185382510327e-05, "loss": 0.1537, "step": 500 }, { "epoch": 0.16, "grad_norm": 0.8164851069450378, "learning_rate": 1.998488768372211e-05, "loss": 0.1508, "step": 505 }, { "epoch": 0.16, "grad_norm": 0.8496866226196289, "learning_rate": 1.9984587025757367e-05, "loss": 0.1372, "step": 510 }, { "epoch": 0.16, "grad_norm": 0.8128467798233032, "learning_rate": 1.998428340870521e-05, "loss": 0.1409, "step": 515 }, { "epoch": 0.16, "grad_norm": 0.8530411720275879, "learning_rate": 1.9983976832655618e-05, "loss": 0.1313, "step": 520 }, { "epoch": 0.16, "grad_norm": 0.7878673076629639, "learning_rate": 1.9983667297699453e-05, "loss": 0.1747, "step": 525 }, { "epoch": 0.17, "grad_norm": 0.6734103560447693, "learning_rate": 1.9983354803928443e-05, "loss": 0.1228, "step": 530 }, { "epoch": 0.17, "grad_norm": 0.9592450261116028, "learning_rate": 1.9983039351435208e-05, "loss": 0.1349, "step": 535 }, { "epoch": 0.17, "grad_norm": 0.7146002054214478, "learning_rate": 1.9982720940313232e-05, "loss": 0.1335, "step": 540 }, { "epoch": 0.17, "grad_norm": 0.8208376169204712, "learning_rate": 1.998239957065689e-05, "loss": 0.1627, "step": 545 }, { "epoch": 0.17, "grad_norm": 0.7010489702224731, "learning_rate": 1.998207524256141e-05, "loss": 0.1125, "step": 550 }, { "epoch": 0.17, "grad_norm": 0.6489346027374268, "learning_rate": 1.9981747956122928e-05, "loss": 0.1302, "step": 555 }, { "epoch": 0.17, "grad_norm": 0.8398034572601318, "learning_rate": 1.9981417711438427e-05, "loss": 0.1863, "step": 560 }, { "epoch": 0.18, "grad_norm": 0.7627323269844055, "learning_rate": 1.998108450860579e-05, "loss": 0.1263, "step": 565 }, { "epoch": 0.18, "grad_norm": 0.8145926594734192, "learning_rate": 1.9980748347723757e-05, "loss": 0.1588, "step": 570 }, { "epoch": 0.18, "grad_norm": 0.75948166847229, "learning_rate": 1.9980409228891962e-05, "loss": 0.1425, "step": 575 }, { "epoch": 0.18, "grad_norm": 1.1241673231124878, "learning_rate": 1.9980067152210908e-05, "loss": 0.1381, "step": 580 }, { "epoch": 0.18, "grad_norm": 0.7214015126228333, "learning_rate": 1.9979722117781972e-05, "loss": 0.1329, "step": 585 }, { "epoch": 0.18, "grad_norm": 0.6891055703163147, "learning_rate": 1.997937412570741e-05, "loss": 0.1164, "step": 590 }, { "epoch": 0.19, "grad_norm": 0.8613958954811096, "learning_rate": 1.9979023176090356e-05, "loss": 0.1379, "step": 595 }, { "epoch": 0.19, "grad_norm": 0.7776545286178589, "learning_rate": 1.997866926903482e-05, "loss": 0.1524, "step": 600 }, { "epoch": 0.19, "grad_norm": 0.943812906742096, "learning_rate": 1.9978312404645685e-05, "loss": 0.1449, "step": 605 }, { "epoch": 0.19, "grad_norm": 0.75004643201828, "learning_rate": 1.9977952583028718e-05, "loss": 0.1315, "step": 610 }, { "epoch": 0.19, "grad_norm": 0.6595169305801392, "learning_rate": 1.9977589804290557e-05, "loss": 0.109, "step": 615 }, { "epoch": 0.19, "grad_norm": 0.7484739422798157, "learning_rate": 1.9977224068538713e-05, "loss": 0.1217, "step": 620 }, { "epoch": 0.19, "grad_norm": 0.9818029999732971, "learning_rate": 1.997685537588158e-05, "loss": 0.1321, "step": 625 }, { "epoch": 0.2, "grad_norm": 0.8451599478721619, "learning_rate": 1.9976483726428423e-05, "loss": 0.1375, "step": 630 }, { "epoch": 0.2, "grad_norm": 0.7618489861488342, "learning_rate": 1.997610912028939e-05, "loss": 0.1446, "step": 635 }, { "epoch": 0.2, "grad_norm": 0.6438541412353516, "learning_rate": 1.99757315575755e-05, "loss": 0.1501, "step": 640 }, { "epoch": 0.2, "grad_norm": 0.8832077383995056, "learning_rate": 1.9975351038398654e-05, "loss": 0.122, "step": 645 }, { "epoch": 0.2, "grad_norm": 0.9104491472244263, "learning_rate": 1.9974967562871616e-05, "loss": 0.147, "step": 650 }, { "epoch": 0.2, "grad_norm": 0.9494624733924866, "learning_rate": 1.997458113110804e-05, "loss": 0.1319, "step": 655 }, { "epoch": 0.21, "grad_norm": 0.7751930952072144, "learning_rate": 1.997419174322245e-05, "loss": 0.1488, "step": 660 }, { "epoch": 0.21, "grad_norm": 0.80446857213974, "learning_rate": 1.997379939933025e-05, "loss": 0.1691, "step": 665 }, { "epoch": 0.21, "grad_norm": 0.8367449641227722, "learning_rate": 1.997340409954771e-05, "loss": 0.1514, "step": 670 }, { "epoch": 0.21, "grad_norm": 0.8535647392272949, "learning_rate": 1.997300584399199e-05, "loss": 0.1436, "step": 675 }, { "epoch": 0.21, "grad_norm": 0.7626618146896362, "learning_rate": 1.997260463278112e-05, "loss": 0.1323, "step": 680 }, { "epoch": 0.21, "grad_norm": 0.9637759923934937, "learning_rate": 1.9972200466034e-05, "loss": 0.1571, "step": 685 }, { "epoch": 0.22, "grad_norm": 0.8359416723251343, "learning_rate": 1.9971793343870414e-05, "loss": 0.1406, "step": 690 }, { "epoch": 0.22, "grad_norm": 0.7154871821403503, "learning_rate": 1.9971383266411015e-05, "loss": 0.1425, "step": 695 }, { "epoch": 0.22, "grad_norm": 0.6426644325256348, "learning_rate": 1.9970970233777343e-05, "loss": 0.1401, "step": 700 }, { "epoch": 0.22, "grad_norm": 0.8010983467102051, "learning_rate": 1.99705542460918e-05, "loss": 0.1159, "step": 705 }, { "epoch": 0.22, "grad_norm": 0.8396361470222473, "learning_rate": 1.9970135303477674e-05, "loss": 0.1577, "step": 710 }, { "epoch": 0.22, "grad_norm": 0.8315424919128418, "learning_rate": 1.9969713406059126e-05, "loss": 0.1614, "step": 715 }, { "epoch": 0.22, "grad_norm": 0.8752270340919495, "learning_rate": 1.996928855396119e-05, "loss": 0.1679, "step": 720 }, { "epoch": 0.23, "grad_norm": 0.558314859867096, "learning_rate": 1.9968860747309778e-05, "loss": 0.1395, "step": 725 }, { "epoch": 0.23, "grad_norm": 0.7923253178596497, "learning_rate": 1.9968429986231678e-05, "loss": 0.1326, "step": 730 }, { "epoch": 0.23, "grad_norm": 0.8511756062507629, "learning_rate": 1.996799627085455e-05, "loss": 0.1192, "step": 735 }, { "epoch": 0.23, "grad_norm": 0.7928483486175537, "learning_rate": 1.996755960130693e-05, "loss": 0.1247, "step": 740 }, { "epoch": 0.23, "grad_norm": 0.922551691532135, "learning_rate": 1.9967119977718245e-05, "loss": 0.18, "step": 745 }, { "epoch": 0.23, "grad_norm": 0.4644975960254669, "learning_rate": 1.9966677400218768e-05, "loss": 0.1349, "step": 750 }, { "epoch": 0.24, "grad_norm": 0.6948322653770447, "learning_rate": 1.9966231868939672e-05, "loss": 0.1393, "step": 755 }, { "epoch": 0.24, "grad_norm": 0.6136431097984314, "learning_rate": 1.9965783384012996e-05, "loss": 0.1236, "step": 760 }, { "epoch": 0.24, "grad_norm": 0.776211678981781, "learning_rate": 1.9965331945571656e-05, "loss": 0.1378, "step": 765 }, { "epoch": 0.24, "grad_norm": 0.8530936241149902, "learning_rate": 1.996487755374944e-05, "loss": 0.1007, "step": 770 }, { "epoch": 0.24, "grad_norm": 0.8924842476844788, "learning_rate": 1.9964420208681016e-05, "loss": 0.126, "step": 775 }, { "epoch": 0.24, "grad_norm": 1.0084704160690308, "learning_rate": 1.9963959910501927e-05, "loss": 0.1524, "step": 780 }, { "epoch": 0.24, "grad_norm": 0.7858508825302124, "learning_rate": 1.9963496659348588e-05, "loss": 0.12, "step": 785 }, { "epoch": 0.25, "grad_norm": 1.0238721370697021, "learning_rate": 1.9963030455358293e-05, "loss": 0.1261, "step": 790 }, { "epoch": 0.25, "grad_norm": 0.6840950846672058, "learning_rate": 1.9962561298669202e-05, "loss": 0.178, "step": 795 }, { "epoch": 0.25, "grad_norm": 0.777533233165741, "learning_rate": 1.9962089189420365e-05, "loss": 0.1601, "step": 800 }, { "epoch": 0.25, "grad_norm": 1.1069340705871582, "learning_rate": 1.9961614127751695e-05, "loss": 0.131, "step": 805 }, { "epoch": 0.25, "grad_norm": 0.6931054592132568, "learning_rate": 1.9961136113803982e-05, "loss": 0.1377, "step": 810 }, { "epoch": 0.25, "grad_norm": 0.7552673816680908, "learning_rate": 1.9960655147718894e-05, "loss": 0.1438, "step": 815 }, { "epoch": 0.26, "grad_norm": 0.6302088499069214, "learning_rate": 1.9960171229638977e-05, "loss": 0.1128, "step": 820 }, { "epoch": 0.26, "grad_norm": 0.6855728626251221, "learning_rate": 1.9959684359707643e-05, "loss": 0.1254, "step": 825 }, { "epoch": 0.26, "grad_norm": 0.8626918196678162, "learning_rate": 1.9959194538069186e-05, "loss": 0.1077, "step": 830 }, { "epoch": 0.26, "grad_norm": 0.5936461687088013, "learning_rate": 1.9958701764868768e-05, "loss": 0.1824, "step": 835 }, { "epoch": 0.26, "grad_norm": 0.9230837821960449, "learning_rate": 1.9958206040252437e-05, "loss": 0.0893, "step": 840 }, { "epoch": 0.26, "grad_norm": 0.7443073391914368, "learning_rate": 1.9957707364367102e-05, "loss": 0.1643, "step": 845 }, { "epoch": 0.27, "grad_norm": 0.9582584500312805, "learning_rate": 1.9957205737360557e-05, "loss": 0.1704, "step": 850 }, { "epoch": 0.27, "grad_norm": 0.6180398464202881, "learning_rate": 1.9956701159381468e-05, "loss": 0.1821, "step": 855 }, { "epoch": 0.27, "grad_norm": 0.7253844141960144, "learning_rate": 1.9956193630579368e-05, "loss": 0.136, "step": 860 }, { "epoch": 0.27, "grad_norm": 0.7296341061592102, "learning_rate": 1.995568315110468e-05, "loss": 0.1712, "step": 865 }, { "epoch": 0.27, "grad_norm": 0.7956708073616028, "learning_rate": 1.9955169721108686e-05, "loss": 0.136, "step": 870 }, { "epoch": 0.27, "grad_norm": 0.6485627293586731, "learning_rate": 1.9954653340743552e-05, "loss": 0.132, "step": 875 }, { "epoch": 0.27, "grad_norm": 0.5205156207084656, "learning_rate": 1.9954134010162317e-05, "loss": 0.1179, "step": 880 }, { "epoch": 0.28, "grad_norm": 0.5457340478897095, "learning_rate": 1.9953611729518884e-05, "loss": 0.1473, "step": 885 }, { "epoch": 0.28, "grad_norm": 0.93240886926651, "learning_rate": 1.995308649896805e-05, "loss": 0.1205, "step": 890 }, { "epoch": 0.28, "grad_norm": 0.6138172149658203, "learning_rate": 1.9952558318665468e-05, "loss": 0.1397, "step": 895 }, { "epoch": 0.28, "grad_norm": 0.7555159330368042, "learning_rate": 1.9952027188767674e-05, "loss": 0.1041, "step": 900 }, { "epoch": 0.28, "grad_norm": 0.7118566036224365, "learning_rate": 1.995149310943208e-05, "loss": 0.1189, "step": 905 }, { "epoch": 0.28, "grad_norm": 0.5578028559684753, "learning_rate": 1.9950956080816964e-05, "loss": 0.1069, "step": 910 }, { "epoch": 0.29, "grad_norm": 0.8313916325569153, "learning_rate": 1.9950416103081486e-05, "loss": 0.1292, "step": 915 }, { "epoch": 0.29, "grad_norm": 0.6720024347305298, "learning_rate": 1.9949873176385676e-05, "loss": 0.1212, "step": 920 }, { "epoch": 0.29, "grad_norm": 0.831915020942688, "learning_rate": 1.994932730089044e-05, "loss": 0.1134, "step": 925 }, { "epoch": 0.29, "grad_norm": 0.8834342956542969, "learning_rate": 1.9948778476757554e-05, "loss": 0.1652, "step": 930 }, { "epoch": 0.29, "grad_norm": 0.7775338888168335, "learning_rate": 1.9948226704149668e-05, "loss": 0.1344, "step": 935 }, { "epoch": 0.29, "grad_norm": 0.7776153683662415, "learning_rate": 1.9947671983230315e-05, "loss": 0.1366, "step": 940 }, { "epoch": 0.29, "grad_norm": 0.7737155556678772, "learning_rate": 1.9947114314163892e-05, "loss": 0.1685, "step": 945 }, { "epoch": 0.3, "grad_norm": 0.6648966073989868, "learning_rate": 1.9946553697115672e-05, "loss": 0.1584, "step": 950 }, { "epoch": 0.3, "grad_norm": 0.8067174553871155, "learning_rate": 1.9945990132251805e-05, "loss": 0.147, "step": 955 }, { "epoch": 0.3, "grad_norm": 0.740641713142395, "learning_rate": 1.994542361973931e-05, "loss": 0.1128, "step": 960 }, { "epoch": 0.3, "grad_norm": 0.716967761516571, "learning_rate": 1.9944854159746084e-05, "loss": 0.1438, "step": 965 }, { "epoch": 0.3, "grad_norm": 0.7191652059555054, "learning_rate": 1.994428175244089e-05, "loss": 0.1037, "step": 970 }, { "epoch": 0.3, "grad_norm": 0.8301136493682861, "learning_rate": 1.9943706397993382e-05, "loss": 0.1434, "step": 975 }, { "epoch": 0.31, "grad_norm": 0.7556055784225464, "learning_rate": 1.994312809657406e-05, "loss": 0.1145, "step": 980 }, { "epoch": 0.31, "grad_norm": 0.7464662194252014, "learning_rate": 1.9942546848354322e-05, "loss": 0.1387, "step": 985 }, { "epoch": 0.31, "grad_norm": 0.8151674866676331, "learning_rate": 1.9941962653506426e-05, "loss": 0.1253, "step": 990 }, { "epoch": 0.31, "grad_norm": 0.7346523404121399, "learning_rate": 1.994137551220351e-05, "loss": 0.1459, "step": 995 }, { "epoch": 0.31, "grad_norm": 0.6668190360069275, "learning_rate": 1.9940785424619578e-05, "loss": 0.1105, "step": 1000 }, { "epoch": 0.31, "grad_norm": 0.6598778963088989, "learning_rate": 1.9940192390929518e-05, "loss": 0.133, "step": 1005 }, { "epoch": 0.32, "grad_norm": 0.5903392434120178, "learning_rate": 1.9939596411309083e-05, "loss": 0.1344, "step": 1010 }, { "epoch": 0.32, "grad_norm": 0.7818740010261536, "learning_rate": 1.9938997485934896e-05, "loss": 0.1178, "step": 1015 }, { "epoch": 0.32, "grad_norm": 0.5331541895866394, "learning_rate": 1.993839561498446e-05, "loss": 0.1117, "step": 1020 }, { "epoch": 0.32, "grad_norm": 0.5732979774475098, "learning_rate": 1.9937790798636155e-05, "loss": 0.0965, "step": 1025 }, { "epoch": 0.32, "grad_norm": 0.6993948817253113, "learning_rate": 1.9937183037069217e-05, "loss": 0.1001, "step": 1030 }, { "epoch": 0.32, "grad_norm": 0.7085026502609253, "learning_rate": 1.9936572330463777e-05, "loss": 0.1209, "step": 1035 }, { "epoch": 0.32, "grad_norm": 0.5701314806938171, "learning_rate": 1.9935958679000816e-05, "loss": 0.1394, "step": 1040 }, { "epoch": 0.33, "grad_norm": 0.8212972283363342, "learning_rate": 1.9935342082862208e-05, "loss": 0.1148, "step": 1045 }, { "epoch": 0.33, "grad_norm": 0.6771918535232544, "learning_rate": 1.9934722542230687e-05, "loss": 0.1649, "step": 1050 }, { "epoch": 0.33, "grad_norm": 0.712510883808136, "learning_rate": 1.9934100057289866e-05, "loss": 0.1188, "step": 1055 }, { "epoch": 0.33, "grad_norm": 0.7547973990440369, "learning_rate": 1.9933474628224222e-05, "loss": 0.1336, "step": 1060 }, { "epoch": 0.33, "grad_norm": 0.6179800629615784, "learning_rate": 1.9932846255219117e-05, "loss": 0.1277, "step": 1065 }, { "epoch": 0.33, "grad_norm": 0.9826509356498718, "learning_rate": 1.993221493846078e-05, "loss": 0.1138, "step": 1070 }, { "epoch": 0.34, "grad_norm": 1.0901100635528564, "learning_rate": 1.9931580678136305e-05, "loss": 0.1467, "step": 1075 }, { "epoch": 0.34, "grad_norm": 0.727664589881897, "learning_rate": 1.9930943474433667e-05, "loss": 0.0986, "step": 1080 }, { "epoch": 0.34, "grad_norm": 0.5122860670089722, "learning_rate": 1.9930303327541714e-05, "loss": 0.1246, "step": 1085 }, { "epoch": 0.34, "grad_norm": 0.5919774174690247, "learning_rate": 1.9929660237650164e-05, "loss": 0.1011, "step": 1090 }, { "epoch": 0.34, "grad_norm": 1.0322712659835815, "learning_rate": 1.9929014204949603e-05, "loss": 0.1939, "step": 1095 }, { "epoch": 0.34, "grad_norm": 0.8237559199333191, "learning_rate": 1.9928365229631494e-05, "loss": 0.1441, "step": 1100 }, { "epoch": 0.34, "grad_norm": 0.7380067706108093, "learning_rate": 1.992771331188817e-05, "loss": 0.1517, "step": 1105 }, { "epoch": 0.35, "grad_norm": 0.5965713858604431, "learning_rate": 1.9927058451912843e-05, "loss": 0.0884, "step": 1110 }, { "epoch": 0.35, "grad_norm": 0.7433302402496338, "learning_rate": 1.9926400649899584e-05, "loss": 0.1443, "step": 1115 }, { "epoch": 0.35, "grad_norm": 0.6493307948112488, "learning_rate": 1.9925739906043345e-05, "loss": 0.1296, "step": 1120 }, { "epoch": 0.35, "grad_norm": 0.8302630186080933, "learning_rate": 1.9925076220539947e-05, "loss": 0.1202, "step": 1125 }, { "epoch": 0.35, "grad_norm": 0.7189318537712097, "learning_rate": 1.9924409593586086e-05, "loss": 0.152, "step": 1130 }, { "epoch": 0.35, "grad_norm": 0.7712987065315247, "learning_rate": 1.9923740025379326e-05, "loss": 0.1146, "step": 1135 }, { "epoch": 0.36, "grad_norm": 0.9559170603752136, "learning_rate": 1.99230675161181e-05, "loss": 0.1279, "step": 1140 }, { "epoch": 0.36, "grad_norm": 0.9394153356552124, "learning_rate": 1.9922392066001724e-05, "loss": 0.0898, "step": 1145 }, { "epoch": 0.36, "grad_norm": 0.8282424211502075, "learning_rate": 1.992171367523037e-05, "loss": 0.1355, "step": 1150 }, { "epoch": 0.36, "grad_norm": 0.8344165682792664, "learning_rate": 1.9921032344005097e-05, "loss": 0.158, "step": 1155 }, { "epoch": 0.36, "grad_norm": 0.5974935293197632, "learning_rate": 1.9920348072527824e-05, "loss": 0.1349, "step": 1160 }, { "epoch": 0.36, "grad_norm": 0.8082500696182251, "learning_rate": 1.9919660861001346e-05, "loss": 0.1336, "step": 1165 }, { "epoch": 0.36, "grad_norm": 0.6113331913948059, "learning_rate": 1.991897070962933e-05, "loss": 0.1098, "step": 1170 }, { "epoch": 0.37, "grad_norm": 0.8488194942474365, "learning_rate": 1.9918277618616312e-05, "loss": 0.1632, "step": 1175 }, { "epoch": 0.37, "grad_norm": 0.8501783609390259, "learning_rate": 1.99175815881677e-05, "loss": 0.1106, "step": 1180 }, { "epoch": 0.37, "grad_norm": 0.7749114036560059, "learning_rate": 1.9916882618489777e-05, "loss": 0.1325, "step": 1185 }, { "epoch": 0.37, "grad_norm": 0.5864267349243164, "learning_rate": 1.991618070978969e-05, "loss": 0.1081, "step": 1190 }, { "epoch": 0.37, "grad_norm": 0.7307422757148743, "learning_rate": 1.991547586227546e-05, "loss": 0.1206, "step": 1195 }, { "epoch": 0.37, "grad_norm": 0.5497795939445496, "learning_rate": 1.9914768076155977e-05, "loss": 0.1195, "step": 1200 }, { "epoch": 0.38, "grad_norm": 0.7037328481674194, "learning_rate": 1.991405735164101e-05, "loss": 0.1272, "step": 1205 }, { "epoch": 0.38, "grad_norm": 0.6973326206207275, "learning_rate": 1.9913343688941192e-05, "loss": 0.1299, "step": 1210 }, { "epoch": 0.38, "grad_norm": 0.7845016717910767, "learning_rate": 1.991262708826803e-05, "loss": 0.1212, "step": 1215 }, { "epoch": 0.38, "grad_norm": 0.6262115836143494, "learning_rate": 1.9911907549833895e-05, "loss": 0.1388, "step": 1220 }, { "epoch": 0.38, "grad_norm": 0.7345215082168579, "learning_rate": 1.9911185073852036e-05, "loss": 0.1271, "step": 1225 }, { "epoch": 0.38, "grad_norm": 0.48333102464675903, "learning_rate": 1.9910459660536567e-05, "loss": 0.1041, "step": 1230 }, { "epoch": 0.39, "grad_norm": 0.6141420006752014, "learning_rate": 1.9909731310102477e-05, "loss": 0.1335, "step": 1235 }, { "epoch": 0.39, "grad_norm": 0.8758596777915955, "learning_rate": 1.990900002276563e-05, "loss": 0.1266, "step": 1240 }, { "epoch": 0.39, "grad_norm": 0.6014870405197144, "learning_rate": 1.9908265798742747e-05, "loss": 0.1311, "step": 1245 }, { "epoch": 0.39, "grad_norm": 0.6260560154914856, "learning_rate": 1.9907528638251427e-05, "loss": 0.116, "step": 1250 }, { "epoch": 0.39, "grad_norm": 0.8104715943336487, "learning_rate": 1.990678854151014e-05, "loss": 0.1237, "step": 1255 }, { "epoch": 0.39, "grad_norm": 0.6168515682220459, "learning_rate": 1.990604550873823e-05, "loss": 0.1125, "step": 1260 }, { "epoch": 0.39, "grad_norm": 0.6618785262107849, "learning_rate": 1.9905299540155897e-05, "loss": 0.0979, "step": 1265 }, { "epoch": 0.4, "grad_norm": 0.7389912605285645, "learning_rate": 1.9904550635984228e-05, "loss": 0.1644, "step": 1270 }, { "epoch": 0.4, "grad_norm": 0.7781631350517273, "learning_rate": 1.9903798796445174e-05, "loss": 0.1228, "step": 1275 }, { "epoch": 0.4, "grad_norm": 0.7687239646911621, "learning_rate": 1.990304402176155e-05, "loss": 0.1313, "step": 1280 }, { "epoch": 0.4, "grad_norm": 0.49261704087257385, "learning_rate": 1.990228631215704e-05, "loss": 0.0842, "step": 1285 }, { "epoch": 0.4, "grad_norm": 0.8993775844573975, "learning_rate": 1.9901525667856213e-05, "loss": 0.123, "step": 1290 }, { "epoch": 0.4, "grad_norm": 0.5044394731521606, "learning_rate": 1.990076208908449e-05, "loss": 0.1053, "step": 1295 }, { "epoch": 0.41, "grad_norm": 0.6558696627616882, "learning_rate": 1.9899995576068177e-05, "loss": 0.1264, "step": 1300 }, { "epoch": 0.41, "grad_norm": 0.6986610889434814, "learning_rate": 1.9899226129034433e-05, "loss": 0.1344, "step": 1305 }, { "epoch": 0.41, "grad_norm": 0.8104435205459595, "learning_rate": 1.9898453748211305e-05, "loss": 0.1053, "step": 1310 }, { "epoch": 0.41, "grad_norm": 0.6717681884765625, "learning_rate": 1.989767843382769e-05, "loss": 0.1071, "step": 1315 }, { "epoch": 0.41, "grad_norm": 1.0109965801239014, "learning_rate": 1.9896900186113375e-05, "loss": 0.1361, "step": 1320 }, { "epoch": 0.41, "grad_norm": 0.5133586525917053, "learning_rate": 1.9896119005299e-05, "loss": 0.0943, "step": 1325 }, { "epoch": 0.41, "grad_norm": 0.6515102386474609, "learning_rate": 1.989533489161608e-05, "loss": 0.1263, "step": 1330 }, { "epoch": 0.42, "grad_norm": 0.6868942975997925, "learning_rate": 1.9894547845297e-05, "loss": 0.0937, "step": 1335 }, { "epoch": 0.42, "grad_norm": 0.8736807107925415, "learning_rate": 1.9893757866575012e-05, "loss": 0.1256, "step": 1340 }, { "epoch": 0.42, "grad_norm": 0.5859454274177551, "learning_rate": 1.989296495568424e-05, "loss": 0.1203, "step": 1345 }, { "epoch": 0.42, "grad_norm": 0.8191548585891724, "learning_rate": 1.9892169112859677e-05, "loss": 0.1654, "step": 1350 }, { "epoch": 0.42, "grad_norm": 0.5680137276649475, "learning_rate": 1.9891370338337182e-05, "loss": 0.1169, "step": 1355 }, { "epoch": 0.42, "grad_norm": 0.8363646864891052, "learning_rate": 1.9890568632353485e-05, "loss": 0.1673, "step": 1360 }, { "epoch": 0.43, "grad_norm": 0.7161871194839478, "learning_rate": 1.9889763995146178e-05, "loss": 0.1491, "step": 1365 }, { "epoch": 0.43, "grad_norm": 0.5407841205596924, "learning_rate": 1.9888956426953735e-05, "loss": 0.0926, "step": 1370 }, { "epoch": 0.43, "grad_norm": 0.9694133996963501, "learning_rate": 1.988814592801549e-05, "loss": 0.0969, "step": 1375 }, { "epoch": 0.43, "grad_norm": 0.7110317349433899, "learning_rate": 1.988733249857165e-05, "loss": 0.1375, "step": 1380 }, { "epoch": 0.43, "grad_norm": 0.8600018620491028, "learning_rate": 1.9886516138863276e-05, "loss": 0.1333, "step": 1385 }, { "epoch": 0.43, "grad_norm": 0.6293290853500366, "learning_rate": 1.9885696849132323e-05, "loss": 0.1053, "step": 1390 }, { "epoch": 0.44, "grad_norm": 0.7569682598114014, "learning_rate": 1.9884874629621588e-05, "loss": 0.1677, "step": 1395 }, { "epoch": 0.44, "grad_norm": 0.6882773041725159, "learning_rate": 1.9884049480574753e-05, "loss": 0.1546, "step": 1400 }, { "epoch": 0.44, "grad_norm": 0.8036536574363708, "learning_rate": 1.988322140223637e-05, "loss": 0.1645, "step": 1405 }, { "epoch": 0.44, "grad_norm": 0.8944361805915833, "learning_rate": 1.9882390394851846e-05, "loss": 0.1588, "step": 1410 }, { "epoch": 0.44, "grad_norm": 0.6535063982009888, "learning_rate": 1.9881556458667463e-05, "loss": 0.1357, "step": 1415 }, { "epoch": 0.44, "grad_norm": 0.7615301609039307, "learning_rate": 1.9880719593930375e-05, "loss": 0.1442, "step": 1420 }, { "epoch": 0.44, "grad_norm": 1.0170069932937622, "learning_rate": 1.9879879800888596e-05, "loss": 0.1655, "step": 1425 }, { "epoch": 0.45, "grad_norm": 0.739402174949646, "learning_rate": 1.9879037079791014e-05, "loss": 0.1644, "step": 1430 }, { "epoch": 0.45, "grad_norm": 0.48603636026382446, "learning_rate": 1.9878191430887382e-05, "loss": 0.1349, "step": 1435 }, { "epoch": 0.45, "grad_norm": 0.512487530708313, "learning_rate": 1.987734285442832e-05, "loss": 0.1102, "step": 1440 }, { "epoch": 0.45, "grad_norm": 0.6387705206871033, "learning_rate": 1.9876491350665315e-05, "loss": 0.1031, "step": 1445 }, { "epoch": 0.45, "grad_norm": 0.597423255443573, "learning_rate": 1.987563691985073e-05, "loss": 0.0989, "step": 1450 }, { "epoch": 0.45, "grad_norm": 0.5606899857521057, "learning_rate": 1.9874779562237784e-05, "loss": 0.0946, "step": 1455 }, { "epoch": 0.46, "grad_norm": 0.8332621455192566, "learning_rate": 1.987391927808057e-05, "loss": 0.147, "step": 1460 }, { "epoch": 0.46, "grad_norm": 0.6586269736289978, "learning_rate": 1.9873056067634043e-05, "loss": 0.1221, "step": 1465 }, { "epoch": 0.46, "grad_norm": 0.6761701703071594, "learning_rate": 1.9872189931154035e-05, "loss": 0.1293, "step": 1470 }, { "epoch": 0.46, "grad_norm": 0.6314623951911926, "learning_rate": 1.987132086889723e-05, "loss": 0.107, "step": 1475 }, { "epoch": 0.46, "grad_norm": 0.9015997052192688, "learning_rate": 1.98704488811212e-05, "loss": 0.1291, "step": 1480 }, { "epoch": 0.46, "grad_norm": 0.7586043477058411, "learning_rate": 1.986957396808436e-05, "loss": 0.0968, "step": 1485 }, { "epoch": 0.46, "grad_norm": 0.6687948107719421, "learning_rate": 1.9868696130046016e-05, "loss": 0.085, "step": 1490 }, { "epoch": 0.47, "grad_norm": 0.7385773062705994, "learning_rate": 1.9867815367266317e-05, "loss": 0.1183, "step": 1495 }, { "epoch": 0.47, "grad_norm": 0.8993258476257324, "learning_rate": 1.98669316800063e-05, "loss": 0.1374, "step": 1500 }, { "epoch": 0.47, "grad_norm": 0.6431350708007812, "learning_rate": 1.986604506852785e-05, "loss": 0.1281, "step": 1505 }, { "epoch": 0.47, "grad_norm": 0.7766661047935486, "learning_rate": 1.986515553309374e-05, "loss": 0.1193, "step": 1510 }, { "epoch": 0.47, "grad_norm": 0.9873250126838684, "learning_rate": 1.986426307396759e-05, "loss": 0.1195, "step": 1515 }, { "epoch": 0.47, "grad_norm": 0.7213776111602783, "learning_rate": 1.9863367691413896e-05, "loss": 0.1338, "step": 1520 }, { "epoch": 0.48, "grad_norm": 0.8704515099525452, "learning_rate": 1.9862469385698015e-05, "loss": 0.1237, "step": 1525 }, { "epoch": 0.48, "grad_norm": 0.8648638725280762, "learning_rate": 1.9861568157086182e-05, "loss": 0.1406, "step": 1530 }, { "epoch": 0.48, "grad_norm": 0.7704949378967285, "learning_rate": 1.9860664005845482e-05, "loss": 0.1264, "step": 1535 }, { "epoch": 0.48, "grad_norm": 0.5187184810638428, "learning_rate": 1.9859756932243878e-05, "loss": 0.1391, "step": 1540 }, { "epoch": 0.48, "grad_norm": 1.0362634658813477, "learning_rate": 1.9858846936550197e-05, "loss": 0.1341, "step": 1545 }, { "epoch": 0.48, "grad_norm": 0.693230152130127, "learning_rate": 1.9857934019034126e-05, "loss": 0.1053, "step": 1550 }, { "epoch": 0.48, "grad_norm": 0.8774945139884949, "learning_rate": 1.9857018179966223e-05, "loss": 0.1829, "step": 1555 }, { "epoch": 0.49, "grad_norm": 0.8253774642944336, "learning_rate": 1.9856099419617915e-05, "loss": 0.0974, "step": 1560 }, { "epoch": 0.49, "grad_norm": 0.5385066270828247, "learning_rate": 1.985517773826149e-05, "loss": 0.1159, "step": 1565 }, { "epoch": 0.49, "grad_norm": 0.5954375267028809, "learning_rate": 1.9854253136170096e-05, "loss": 0.1122, "step": 1570 }, { "epoch": 0.49, "grad_norm": 0.5960990786552429, "learning_rate": 1.985332561361776e-05, "loss": 0.1026, "step": 1575 }, { "epoch": 0.49, "grad_norm": 0.8077450394630432, "learning_rate": 1.9852395170879366e-05, "loss": 0.1304, "step": 1580 }, { "epoch": 0.49, "grad_norm": 0.686133086681366, "learning_rate": 1.9851461808230667e-05, "loss": 0.1017, "step": 1585 }, { "epoch": 0.5, "grad_norm": 0.9486279487609863, "learning_rate": 1.9850525525948277e-05, "loss": 0.1321, "step": 1590 }, { "epoch": 0.5, "grad_norm": 0.7093173265457153, "learning_rate": 1.9849586324309677e-05, "loss": 0.1195, "step": 1595 }, { "epoch": 0.5, "grad_norm": 0.7591297626495361, "learning_rate": 1.9848644203593217e-05, "loss": 0.135, "step": 1600 }, { "epoch": 0.5, "grad_norm": 0.6641188263893127, "learning_rate": 1.9847699164078105e-05, "loss": 0.1013, "step": 1605 }, { "epoch": 0.5, "grad_norm": 0.7843316793441772, "learning_rate": 1.9846751206044424e-05, "loss": 0.1333, "step": 1610 }, { "epoch": 0.5, "grad_norm": 0.6371971368789673, "learning_rate": 1.984580032977311e-05, "loss": 0.1338, "step": 1615 }, { "epoch": 0.51, "grad_norm": 0.5326147079467773, "learning_rate": 1.9844846535545975e-05, "loss": 0.0938, "step": 1620 }, { "epoch": 0.51, "grad_norm": 0.7384750843048096, "learning_rate": 1.9843889823645685e-05, "loss": 0.0977, "step": 1625 }, { "epoch": 0.51, "grad_norm": 0.6112232804298401, "learning_rate": 1.9842930194355784e-05, "loss": 0.1374, "step": 1630 }, { "epoch": 0.51, "grad_norm": 0.7577245831489563, "learning_rate": 1.9841967647960666e-05, "loss": 0.1432, "step": 1635 }, { "epoch": 0.51, "grad_norm": 0.7354856133460999, "learning_rate": 1.9841002184745598e-05, "loss": 0.1481, "step": 1640 }, { "epoch": 0.51, "grad_norm": 0.6963324546813965, "learning_rate": 1.9840033804996715e-05, "loss": 0.0993, "step": 1645 }, { "epoch": 0.51, "grad_norm": 0.8006656169891357, "learning_rate": 1.9839062509001004e-05, "loss": 0.1478, "step": 1650 }, { "epoch": 0.52, "grad_norm": 0.7103011012077332, "learning_rate": 1.983808829704633e-05, "loss": 0.1289, "step": 1655 }, { "epoch": 0.52, "grad_norm": 0.9841951727867126, "learning_rate": 1.9837111169421416e-05, "loss": 0.1455, "step": 1660 }, { "epoch": 0.52, "grad_norm": 0.8196072578430176, "learning_rate": 1.983613112641584e-05, "loss": 0.1225, "step": 1665 }, { "epoch": 0.52, "grad_norm": 0.7359209656715393, "learning_rate": 1.9835148168320064e-05, "loss": 0.1356, "step": 1670 }, { "epoch": 0.52, "grad_norm": 0.6371826529502869, "learning_rate": 1.9834162295425393e-05, "loss": 0.1227, "step": 1675 }, { "epoch": 0.52, "grad_norm": 0.5500133633613586, "learning_rate": 1.9833173508024017e-05, "loss": 0.1214, "step": 1680 }, { "epoch": 0.53, "grad_norm": 0.7776175737380981, "learning_rate": 1.983218180640897e-05, "loss": 0.0967, "step": 1685 }, { "epoch": 0.53, "grad_norm": 0.5735242366790771, "learning_rate": 1.983118719087416e-05, "loss": 0.0962, "step": 1690 }, { "epoch": 0.53, "grad_norm": 0.4979003071784973, "learning_rate": 1.9830189661714358e-05, "loss": 0.1208, "step": 1695 }, { "epoch": 0.53, "grad_norm": 0.6308743953704834, "learning_rate": 1.9829189219225194e-05, "loss": 0.1219, "step": 1700 }, { "epoch": 0.53, "grad_norm": 0.7372656464576721, "learning_rate": 1.9828185863703172e-05, "loss": 0.1446, "step": 1705 }, { "epoch": 0.53, "grad_norm": 0.6539852619171143, "learning_rate": 1.9827179595445644e-05, "loss": 0.1302, "step": 1710 }, { "epoch": 0.53, "grad_norm": 0.5244619846343994, "learning_rate": 1.9826170414750836e-05, "loss": 0.1014, "step": 1715 }, { "epoch": 0.54, "grad_norm": 0.5680856108665466, "learning_rate": 1.9825158321917836e-05, "loss": 0.0799, "step": 1720 }, { "epoch": 0.54, "grad_norm": 0.8987188935279846, "learning_rate": 1.9824143317246595e-05, "loss": 0.156, "step": 1725 }, { "epoch": 0.54, "grad_norm": 0.6499376893043518, "learning_rate": 1.982312540103792e-05, "loss": 0.12, "step": 1730 }, { "epoch": 0.54, "grad_norm": 0.6216092705726624, "learning_rate": 1.9822104573593485e-05, "loss": 0.1271, "step": 1735 }, { "epoch": 0.54, "grad_norm": 0.6452203989028931, "learning_rate": 1.9821080835215836e-05, "loss": 0.0962, "step": 1740 }, { "epoch": 0.54, "grad_norm": 0.6163891553878784, "learning_rate": 1.982005418620837e-05, "loss": 0.0765, "step": 1745 }, { "epoch": 0.55, "grad_norm": 0.6932926774024963, "learning_rate": 1.981902462687535e-05, "loss": 0.1292, "step": 1750 }, { "epoch": 0.55, "grad_norm": 0.7456857562065125, "learning_rate": 1.9817992157521898e-05, "loss": 0.1224, "step": 1755 }, { "epoch": 0.55, "grad_norm": 0.7458961009979248, "learning_rate": 1.9816956778454012e-05, "loss": 0.1101, "step": 1760 }, { "epoch": 0.55, "grad_norm": 1.1397936344146729, "learning_rate": 1.9815918489978535e-05, "loss": 0.1421, "step": 1765 }, { "epoch": 0.55, "grad_norm": 0.6820269227027893, "learning_rate": 1.981487729240318e-05, "loss": 0.0916, "step": 1770 }, { "epoch": 0.55, "grad_norm": 1.1029489040374756, "learning_rate": 1.9813833186036526e-05, "loss": 0.1663, "step": 1775 }, { "epoch": 0.56, "grad_norm": 1.127271056175232, "learning_rate": 1.9812786171188008e-05, "loss": 0.1151, "step": 1780 }, { "epoch": 0.56, "grad_norm": 0.8425104022026062, "learning_rate": 1.981173624816792e-05, "loss": 0.1362, "step": 1785 }, { "epoch": 0.56, "grad_norm": 0.65611332654953, "learning_rate": 1.9810683417287437e-05, "loss": 0.1048, "step": 1790 }, { "epoch": 0.56, "grad_norm": 0.7015208601951599, "learning_rate": 1.980962767885857e-05, "loss": 0.1278, "step": 1795 }, { "epoch": 0.56, "grad_norm": 0.7320536375045776, "learning_rate": 1.9808569033194205e-05, "loss": 0.1443, "step": 1800 }, { "epoch": 0.56, "grad_norm": 0.56165611743927, "learning_rate": 1.980750748060809e-05, "loss": 0.1023, "step": 1805 }, { "epoch": 0.56, "grad_norm": 0.6682567596435547, "learning_rate": 1.980644302141483e-05, "loss": 0.1398, "step": 1810 }, { "epoch": 0.57, "grad_norm": 0.6880092024803162, "learning_rate": 1.98053756559299e-05, "loss": 0.1097, "step": 1815 }, { "epoch": 0.57, "grad_norm": 0.7480456829071045, "learning_rate": 1.9804305384469623e-05, "loss": 0.1643, "step": 1820 }, { "epoch": 0.57, "grad_norm": 0.7221274375915527, "learning_rate": 1.9803232207351197e-05, "loss": 0.1072, "step": 1825 }, { "epoch": 0.57, "grad_norm": 0.6828133463859558, "learning_rate": 1.980215612489267e-05, "loss": 0.1442, "step": 1830 }, { "epoch": 0.57, "grad_norm": 0.8638070821762085, "learning_rate": 1.980107713741296e-05, "loss": 0.1219, "step": 1835 }, { "epoch": 0.57, "grad_norm": 0.6649238467216492, "learning_rate": 1.9799995245231837e-05, "loss": 0.1035, "step": 1840 }, { "epoch": 0.58, "grad_norm": 0.8936854600906372, "learning_rate": 1.9798910448669935e-05, "loss": 0.1316, "step": 1845 }, { "epoch": 0.58, "grad_norm": 0.4825485050678253, "learning_rate": 1.979782274804876e-05, "loss": 0.0937, "step": 1850 }, { "epoch": 0.58, "grad_norm": 0.7550978064537048, "learning_rate": 1.9796732143690663e-05, "loss": 0.1424, "step": 1855 }, { "epoch": 0.58, "grad_norm": 0.7773075699806213, "learning_rate": 1.979563863591886e-05, "loss": 0.1314, "step": 1860 }, { "epoch": 0.58, "grad_norm": 0.9723725914955139, "learning_rate": 1.979454222505743e-05, "loss": 0.1628, "step": 1865 }, { "epoch": 0.58, "grad_norm": 0.4671371877193451, "learning_rate": 1.9793442911431315e-05, "loss": 0.0999, "step": 1870 }, { "epoch": 0.58, "grad_norm": 0.5132455825805664, "learning_rate": 1.9792340695366304e-05, "loss": 0.1186, "step": 1875 }, { "epoch": 0.59, "grad_norm": 0.47127968072891235, "learning_rate": 1.9791235577189067e-05, "loss": 0.1121, "step": 1880 }, { "epoch": 0.59, "grad_norm": 0.725286066532135, "learning_rate": 1.9790127557227115e-05, "loss": 0.1027, "step": 1885 }, { "epoch": 0.59, "grad_norm": 0.6264200210571289, "learning_rate": 1.9789016635808836e-05, "loss": 0.101, "step": 1890 }, { "epoch": 0.59, "grad_norm": 0.6502572894096375, "learning_rate": 1.978790281326346e-05, "loss": 0.089, "step": 1895 }, { "epoch": 0.59, "grad_norm": 0.8432275056838989, "learning_rate": 1.978678608992109e-05, "loss": 0.1644, "step": 1900 }, { "epoch": 0.59, "grad_norm": 0.7738016843795776, "learning_rate": 1.978566646611268e-05, "loss": 0.0792, "step": 1905 }, { "epoch": 0.6, "grad_norm": 0.881592869758606, "learning_rate": 1.9784543942170056e-05, "loss": 0.1287, "step": 1910 }, { "epoch": 0.6, "grad_norm": 0.7415223717689514, "learning_rate": 1.9783418518425885e-05, "loss": 0.0938, "step": 1915 }, { "epoch": 0.6, "grad_norm": 0.86686110496521, "learning_rate": 1.9782290195213713e-05, "loss": 0.1249, "step": 1920 }, { "epoch": 0.6, "grad_norm": 0.6496177315711975, "learning_rate": 1.978115897286793e-05, "loss": 0.1146, "step": 1925 }, { "epoch": 0.6, "grad_norm": 0.8109525442123413, "learning_rate": 1.9780024851723795e-05, "loss": 0.1019, "step": 1930 }, { "epoch": 0.6, "grad_norm": 0.9621292352676392, "learning_rate": 1.9778887832117418e-05, "loss": 0.092, "step": 1935 }, { "epoch": 0.61, "grad_norm": 0.7076414227485657, "learning_rate": 1.9777747914385782e-05, "loss": 0.1485, "step": 1940 }, { "epoch": 0.61, "grad_norm": 0.6960060596466064, "learning_rate": 1.9776605098866705e-05, "loss": 0.104, "step": 1945 }, { "epoch": 0.61, "grad_norm": 0.9698060154914856, "learning_rate": 1.9775459385898892e-05, "loss": 0.09, "step": 1950 }, { "epoch": 0.61, "grad_norm": 0.5515087842941284, "learning_rate": 1.9774310775821883e-05, "loss": 0.0888, "step": 1955 }, { "epoch": 0.61, "grad_norm": 0.644008457660675, "learning_rate": 1.977315926897609e-05, "loss": 0.1185, "step": 1960 }, { "epoch": 0.61, "grad_norm": 0.7313825488090515, "learning_rate": 1.9772004865702784e-05, "loss": 0.1103, "step": 1965 }, { "epoch": 0.61, "grad_norm": 0.7930110096931458, "learning_rate": 1.9770847566344083e-05, "loss": 0.1038, "step": 1970 }, { "epoch": 0.62, "grad_norm": 0.6186053156852722, "learning_rate": 1.9769687371242973e-05, "loss": 0.1131, "step": 1975 }, { "epoch": 0.62, "grad_norm": 0.5100052952766418, "learning_rate": 1.9768524280743298e-05, "loss": 0.0998, "step": 1980 }, { "epoch": 0.62, "grad_norm": 0.6867700815200806, "learning_rate": 1.976735829518976e-05, "loss": 0.1045, "step": 1985 }, { "epoch": 0.62, "grad_norm": 0.7107873558998108, "learning_rate": 1.976618941492791e-05, "loss": 0.1067, "step": 1990 }, { "epoch": 0.62, "grad_norm": 0.6350167393684387, "learning_rate": 1.9765017640304166e-05, "loss": 0.1032, "step": 1995 }, { "epoch": 0.62, "grad_norm": 0.7512570023536682, "learning_rate": 1.976384297166581e-05, "loss": 0.1477, "step": 2000 }, { "epoch": 0.63, "grad_norm": 0.5293204188346863, "learning_rate": 1.9762665409360958e-05, "loss": 0.1131, "step": 2005 }, { "epoch": 0.63, "grad_norm": 0.6789517402648926, "learning_rate": 1.9761484953738614e-05, "loss": 0.1101, "step": 2010 }, { "epoch": 0.63, "grad_norm": 0.7041693329811096, "learning_rate": 1.9760301605148615e-05, "loss": 0.1155, "step": 2015 }, { "epoch": 0.63, "grad_norm": 0.5716454982757568, "learning_rate": 1.9759115363941668e-05, "loss": 0.116, "step": 2020 }, { "epoch": 0.63, "grad_norm": 0.5885083079338074, "learning_rate": 1.9757926230469334e-05, "loss": 0.0912, "step": 2025 }, { "epoch": 0.63, "grad_norm": 0.5873716473579407, "learning_rate": 1.975673420508403e-05, "loss": 0.0879, "step": 2030 }, { "epoch": 0.63, "grad_norm": 0.8442503809928894, "learning_rate": 1.975553928813903e-05, "loss": 0.1104, "step": 2035 }, { "epoch": 0.64, "grad_norm": 0.5353924036026001, "learning_rate": 1.975434147998847e-05, "loss": 0.1202, "step": 2040 }, { "epoch": 0.64, "grad_norm": 0.7577025294303894, "learning_rate": 1.9753140780987334e-05, "loss": 0.113, "step": 2045 }, { "epoch": 0.64, "grad_norm": 0.7398855686187744, "learning_rate": 1.9751937191491474e-05, "loss": 0.1304, "step": 2050 }, { "epoch": 0.64, "grad_norm": 0.6806811690330505, "learning_rate": 1.9750730711857588e-05, "loss": 0.0784, "step": 2055 }, { "epoch": 0.64, "grad_norm": 0.7182321548461914, "learning_rate": 1.974952134244324e-05, "loss": 0.1381, "step": 2060 }, { "epoch": 0.64, "grad_norm": 0.6555871367454529, "learning_rate": 1.9748309083606834e-05, "loss": 0.1076, "step": 2065 }, { "epoch": 0.65, "grad_norm": 0.7293125987052917, "learning_rate": 1.9747093935707658e-05, "loss": 0.1219, "step": 2070 }, { "epoch": 0.65, "grad_norm": 0.8153331875801086, "learning_rate": 1.9745875899105824e-05, "loss": 0.0967, "step": 2075 }, { "epoch": 0.65, "grad_norm": 0.6450050473213196, "learning_rate": 1.974465497416233e-05, "loss": 0.1347, "step": 2080 }, { "epoch": 0.65, "grad_norm": 0.8893457055091858, "learning_rate": 1.9743431161239003e-05, "loss": 0.1311, "step": 2085 }, { "epoch": 0.65, "grad_norm": 0.5727622509002686, "learning_rate": 1.974220446069855e-05, "loss": 0.0853, "step": 2090 }, { "epoch": 0.65, "grad_norm": 0.623758852481842, "learning_rate": 1.9740974872904517e-05, "loss": 0.1195, "step": 2095 }, { "epoch": 0.65, "grad_norm": 0.6649274230003357, "learning_rate": 1.9739742398221314e-05, "loss": 0.1091, "step": 2100 }, { "epoch": 0.66, "grad_norm": 0.7457634806632996, "learning_rate": 1.9738507037014198e-05, "loss": 0.1476, "step": 2105 }, { "epoch": 0.66, "grad_norm": 0.6186316013336182, "learning_rate": 1.9737268789649295e-05, "loss": 0.1125, "step": 2110 }, { "epoch": 0.66, "grad_norm": 0.6898834705352783, "learning_rate": 1.9736027656493576e-05, "loss": 0.1433, "step": 2115 }, { "epoch": 0.66, "grad_norm": 0.5200203061103821, "learning_rate": 1.9734783637914868e-05, "loss": 0.1071, "step": 2120 }, { "epoch": 0.66, "grad_norm": 0.7021803855895996, "learning_rate": 1.9733536734281863e-05, "loss": 0.0774, "step": 2125 }, { "epoch": 0.66, "grad_norm": 0.7065944671630859, "learning_rate": 1.973228694596409e-05, "loss": 0.1142, "step": 2130 }, { "epoch": 0.67, "grad_norm": 0.5829887390136719, "learning_rate": 1.973103427333195e-05, "loss": 0.0856, "step": 2135 }, { "epoch": 0.67, "grad_norm": 0.5470687747001648, "learning_rate": 1.9729778716756687e-05, "loss": 0.1114, "step": 2140 }, { "epoch": 0.67, "grad_norm": 0.6725425720214844, "learning_rate": 1.9728520276610407e-05, "loss": 0.1097, "step": 2145 }, { "epoch": 0.67, "grad_norm": 0.8056449890136719, "learning_rate": 1.972725895326607e-05, "loss": 0.1607, "step": 2150 }, { "epoch": 0.67, "grad_norm": 0.7054765224456787, "learning_rate": 1.9725994747097487e-05, "loss": 0.0967, "step": 2155 }, { "epoch": 0.67, "grad_norm": 0.6087821125984192, "learning_rate": 1.972472765847933e-05, "loss": 0.1179, "step": 2160 }, { "epoch": 0.68, "grad_norm": 0.6627722978591919, "learning_rate": 1.972345768778711e-05, "loss": 0.114, "step": 2165 }, { "epoch": 0.68, "grad_norm": 0.8059279918670654, "learning_rate": 1.972218483539721e-05, "loss": 0.1533, "step": 2170 }, { "epoch": 0.68, "grad_norm": 0.9217575788497925, "learning_rate": 1.972090910168686e-05, "loss": 0.1613, "step": 2175 }, { "epoch": 0.68, "grad_norm": 0.8698978424072266, "learning_rate": 1.9719630487034138e-05, "loss": 0.122, "step": 2180 }, { "epoch": 0.68, "grad_norm": 0.4954151213169098, "learning_rate": 1.9718348991817988e-05, "loss": 0.1374, "step": 2185 }, { "epoch": 0.68, "grad_norm": 0.7494037747383118, "learning_rate": 1.9717064616418195e-05, "loss": 0.1298, "step": 2190 }, { "epoch": 0.68, "grad_norm": 0.5662988424301147, "learning_rate": 1.971577736121541e-05, "loss": 0.0793, "step": 2195 }, { "epoch": 0.69, "grad_norm": 0.7248169779777527, "learning_rate": 1.9714487226591122e-05, "loss": 0.1084, "step": 2200 }, { "epoch": 0.69, "grad_norm": 0.5932888984680176, "learning_rate": 1.9713194212927694e-05, "loss": 0.1141, "step": 2205 }, { "epoch": 0.69, "grad_norm": 0.5105285048484802, "learning_rate": 1.971189832060832e-05, "loss": 0.0972, "step": 2210 }, { "epoch": 0.69, "grad_norm": 0.667962372303009, "learning_rate": 1.9710599550017068e-05, "loss": 0.1347, "step": 2215 }, { "epoch": 0.69, "grad_norm": 0.6687511205673218, "learning_rate": 1.9709297901538836e-05, "loss": 0.1006, "step": 2220 }, { "epoch": 0.69, "grad_norm": 0.7091338038444519, "learning_rate": 1.9707993375559402e-05, "loss": 0.0885, "step": 2225 }, { "epoch": 0.7, "grad_norm": 0.6677641272544861, "learning_rate": 1.9706685972465372e-05, "loss": 0.1057, "step": 2230 }, { "epoch": 0.7, "grad_norm": 0.7264307141304016, "learning_rate": 1.970537569264422e-05, "loss": 0.136, "step": 2235 }, { "epoch": 0.7, "grad_norm": 0.5116482973098755, "learning_rate": 1.9704062536484264e-05, "loss": 0.0875, "step": 2240 }, { "epoch": 0.7, "grad_norm": 0.8132953643798828, "learning_rate": 1.9702746504374682e-05, "loss": 0.1185, "step": 2245 }, { "epoch": 0.7, "grad_norm": 0.6584734916687012, "learning_rate": 1.9701427596705504e-05, "loss": 0.1155, "step": 2250 }, { "epoch": 0.7, "grad_norm": 0.6060574650764465, "learning_rate": 1.9700105813867598e-05, "loss": 0.1025, "step": 2255 }, { "epoch": 0.7, "grad_norm": 0.8468656539916992, "learning_rate": 1.9698781156252703e-05, "loss": 0.1275, "step": 2260 }, { "epoch": 0.71, "grad_norm": 0.7912387847900391, "learning_rate": 1.9697453624253396e-05, "loss": 0.1226, "step": 2265 }, { "epoch": 0.71, "grad_norm": 0.41651102900505066, "learning_rate": 1.969612321826312e-05, "loss": 0.1076, "step": 2270 }, { "epoch": 0.71, "grad_norm": 0.7137461304664612, "learning_rate": 1.9694789938676156e-05, "loss": 0.1335, "step": 2275 }, { "epoch": 0.71, "grad_norm": 0.7056092023849487, "learning_rate": 1.9693453785887643e-05, "loss": 0.1138, "step": 2280 }, { "epoch": 0.71, "grad_norm": 0.7696053385734558, "learning_rate": 1.969211476029357e-05, "loss": 0.1369, "step": 2285 }, { "epoch": 0.71, "grad_norm": 0.8914565443992615, "learning_rate": 1.969077286229078e-05, "loss": 0.1409, "step": 2290 }, { "epoch": 0.72, "grad_norm": 0.8970075845718384, "learning_rate": 1.9689428092276963e-05, "loss": 0.1603, "step": 2295 }, { "epoch": 0.72, "grad_norm": 0.8135008215904236, "learning_rate": 1.9688080450650667e-05, "loss": 0.1301, "step": 2300 }, { "epoch": 0.72, "grad_norm": 0.7445924878120422, "learning_rate": 1.968672993781128e-05, "loss": 0.1204, "step": 2305 }, { "epoch": 0.72, "grad_norm": 0.631936252117157, "learning_rate": 1.9685376554159048e-05, "loss": 0.0874, "step": 2310 }, { "epoch": 0.72, "grad_norm": 0.7985293865203857, "learning_rate": 1.9684020300095074e-05, "loss": 0.1071, "step": 2315 }, { "epoch": 0.72, "grad_norm": 0.6265071630477905, "learning_rate": 1.9682661176021297e-05, "loss": 0.1446, "step": 2320 }, { "epoch": 0.73, "grad_norm": 0.7634604573249817, "learning_rate": 1.9681299182340523e-05, "loss": 0.134, "step": 2325 }, { "epoch": 0.73, "grad_norm": 0.6241299510002136, "learning_rate": 1.9679934319456393e-05, "loss": 0.1438, "step": 2330 }, { "epoch": 0.73, "grad_norm": 0.7445600628852844, "learning_rate": 1.9678566587773406e-05, "loss": 0.1006, "step": 2335 }, { "epoch": 0.73, "grad_norm": 0.737260103225708, "learning_rate": 1.9677195987696912e-05, "loss": 0.0995, "step": 2340 }, { "epoch": 0.73, "grad_norm": 0.8832901120185852, "learning_rate": 1.9675822519633113e-05, "loss": 0.1441, "step": 2345 }, { "epoch": 0.73, "grad_norm": 1.1757749319076538, "learning_rate": 1.967444618398905e-05, "loss": 0.1051, "step": 2350 }, { "epoch": 0.73, "grad_norm": 0.6915484070777893, "learning_rate": 1.967306698117263e-05, "loss": 0.1169, "step": 2355 }, { "epoch": 0.74, "grad_norm": 0.8908982276916504, "learning_rate": 1.9671684911592595e-05, "loss": 0.1209, "step": 2360 }, { "epoch": 0.74, "grad_norm": 0.6741725206375122, "learning_rate": 1.9670299975658544e-05, "loss": 0.1251, "step": 2365 }, { "epoch": 0.74, "grad_norm": 0.8597294688224792, "learning_rate": 1.9668912173780925e-05, "loss": 0.1387, "step": 2370 }, { "epoch": 0.74, "grad_norm": 0.6731261610984802, "learning_rate": 1.9667521506371036e-05, "loss": 0.0912, "step": 2375 }, { "epoch": 0.74, "grad_norm": 0.6857419013977051, "learning_rate": 1.9666127973841022e-05, "loss": 0.093, "step": 2380 }, { "epoch": 0.74, "grad_norm": 0.5474216341972351, "learning_rate": 1.9664731576603872e-05, "loss": 0.0884, "step": 2385 }, { "epoch": 0.75, "grad_norm": 0.5955496430397034, "learning_rate": 1.966333231507344e-05, "loss": 0.0883, "step": 2390 }, { "epoch": 0.75, "grad_norm": 0.9464544653892517, "learning_rate": 1.9661930189664417e-05, "loss": 0.1198, "step": 2395 }, { "epoch": 0.75, "grad_norm": 0.7434287667274475, "learning_rate": 1.9660525200792336e-05, "loss": 0.1051, "step": 2400 }, { "epoch": 0.75, "grad_norm": 0.9932650327682495, "learning_rate": 1.9659117348873595e-05, "loss": 0.1326, "step": 2405 }, { "epoch": 0.75, "grad_norm": 0.6093739867210388, "learning_rate": 1.9657706634325433e-05, "loss": 0.1051, "step": 2410 }, { "epoch": 0.75, "grad_norm": 0.47722938656806946, "learning_rate": 1.9656293057565932e-05, "loss": 0.1202, "step": 2415 }, { "epoch": 0.75, "grad_norm": 0.6692053079605103, "learning_rate": 1.9654876619014034e-05, "loss": 0.1106, "step": 2420 }, { "epoch": 0.76, "grad_norm": 0.8569662570953369, "learning_rate": 1.965345731908952e-05, "loss": 0.1131, "step": 2425 }, { "epoch": 0.76, "grad_norm": 0.7757257223129272, "learning_rate": 1.9652035158213015e-05, "loss": 0.1076, "step": 2430 }, { "epoch": 0.76, "grad_norm": 0.7739999890327454, "learning_rate": 1.965061013680601e-05, "loss": 0.1501, "step": 2435 }, { "epoch": 0.76, "grad_norm": 0.6754980087280273, "learning_rate": 1.9649182255290823e-05, "loss": 0.1372, "step": 2440 }, { "epoch": 0.76, "grad_norm": 0.571395993232727, "learning_rate": 1.9647751514090636e-05, "loss": 0.1411, "step": 2445 }, { "epoch": 0.76, "grad_norm": 0.8778666853904724, "learning_rate": 1.9646317913629464e-05, "loss": 0.1307, "step": 2450 }, { "epoch": 0.77, "grad_norm": 0.6127117276191711, "learning_rate": 1.9644881454332183e-05, "loss": 0.099, "step": 2455 }, { "epoch": 0.77, "grad_norm": 0.852408230304718, "learning_rate": 1.9643442136624507e-05, "loss": 0.0993, "step": 2460 }, { "epoch": 0.77, "grad_norm": 0.8724494576454163, "learning_rate": 1.9641999960932997e-05, "loss": 0.1092, "step": 2465 }, { "epoch": 0.77, "grad_norm": 1.0813883543014526, "learning_rate": 1.964055492768507e-05, "loss": 0.1073, "step": 2470 }, { "epoch": 0.77, "grad_norm": 0.7854318618774414, "learning_rate": 1.9639107037308985e-05, "loss": 0.112, "step": 2475 }, { "epoch": 0.77, "grad_norm": 0.7752684950828552, "learning_rate": 1.963765629023384e-05, "loss": 0.1129, "step": 2480 }, { "epoch": 0.78, "grad_norm": 0.5344607830047607, "learning_rate": 1.963620268688959e-05, "loss": 0.124, "step": 2485 }, { "epoch": 0.78, "grad_norm": 0.5747987627983093, "learning_rate": 1.9634746227707037e-05, "loss": 0.1107, "step": 2490 }, { "epoch": 0.78, "grad_norm": 0.8008213639259338, "learning_rate": 1.9633286913117816e-05, "loss": 0.135, "step": 2495 }, { "epoch": 0.78, "grad_norm": 0.5648581385612488, "learning_rate": 1.9631824743554425e-05, "loss": 0.0976, "step": 2500 }, { "epoch": 0.78, "grad_norm": 0.7081100344657898, "learning_rate": 1.9630359719450197e-05, "loss": 0.1372, "step": 2505 }, { "epoch": 0.78, "grad_norm": 0.71477872133255, "learning_rate": 1.962889184123932e-05, "loss": 0.1119, "step": 2510 }, { "epoch": 0.78, "grad_norm": 0.613400936126709, "learning_rate": 1.9627421109356817e-05, "loss": 0.104, "step": 2515 }, { "epoch": 0.79, "grad_norm": 0.5026811361312866, "learning_rate": 1.9625947524238564e-05, "loss": 0.0951, "step": 2520 }, { "epoch": 0.79, "grad_norm": 0.6399575471878052, "learning_rate": 1.9624471086321276e-05, "loss": 0.0751, "step": 2525 }, { "epoch": 0.79, "grad_norm": 0.7490134835243225, "learning_rate": 1.9622991796042526e-05, "loss": 0.1211, "step": 2530 }, { "epoch": 0.79, "grad_norm": 0.6886721253395081, "learning_rate": 1.9621509653840724e-05, "loss": 0.1058, "step": 2535 }, { "epoch": 0.79, "grad_norm": 0.689605712890625, "learning_rate": 1.962002466015512e-05, "loss": 0.0789, "step": 2540 }, { "epoch": 0.79, "grad_norm": 0.4565759003162384, "learning_rate": 1.9618536815425822e-05, "loss": 0.1147, "step": 2545 }, { "epoch": 0.8, "grad_norm": 0.5247865319252014, "learning_rate": 1.9617046120093772e-05, "loss": 0.1168, "step": 2550 }, { "epoch": 0.8, "grad_norm": 0.6111215353012085, "learning_rate": 1.961555257460076e-05, "loss": 0.1375, "step": 2555 }, { "epoch": 0.8, "grad_norm": 0.9100821614265442, "learning_rate": 1.961405617938942e-05, "loss": 0.1192, "step": 2560 }, { "epoch": 0.8, "grad_norm": 0.7861168384552002, "learning_rate": 1.9612556934903236e-05, "loss": 0.1096, "step": 2565 }, { "epoch": 0.8, "grad_norm": 0.7265927791595459, "learning_rate": 1.9611054841586532e-05, "loss": 0.0869, "step": 2570 }, { "epoch": 0.8, "grad_norm": 0.8864724636077881, "learning_rate": 1.9609549899884476e-05, "loss": 0.1129, "step": 2575 }, { "epoch": 0.8, "grad_norm": 0.7909520864486694, "learning_rate": 1.9608042110243077e-05, "loss": 0.1203, "step": 2580 }, { "epoch": 0.81, "grad_norm": 0.5725612640380859, "learning_rate": 1.96065314731092e-05, "loss": 0.1356, "step": 2585 }, { "epoch": 0.81, "grad_norm": 0.4555843770503998, "learning_rate": 1.9605017988930535e-05, "loss": 0.099, "step": 2590 }, { "epoch": 0.81, "grad_norm": 0.6792924404144287, "learning_rate": 1.9603501658155634e-05, "loss": 0.1184, "step": 2595 }, { "epoch": 0.81, "grad_norm": 2.4023492336273193, "learning_rate": 1.9601982481233885e-05, "loss": 0.0903, "step": 2600 }, { "epoch": 0.81, "grad_norm": 0.8893625140190125, "learning_rate": 1.9600460458615517e-05, "loss": 0.1313, "step": 2605 }, { "epoch": 0.81, "grad_norm": 0.840968132019043, "learning_rate": 1.959893559075161e-05, "loss": 0.1068, "step": 2610 }, { "epoch": 0.82, "grad_norm": 0.8245032429695129, "learning_rate": 1.9597713648187354e-05, "loss": 0.1491, "step": 2615 }, { "epoch": 0.82, "grad_norm": 0.8265867829322815, "learning_rate": 1.9596183660020867e-05, "loss": 0.1156, "step": 2620 }, { "epoch": 0.82, "grad_norm": 0.9567066431045532, "learning_rate": 1.9594650827876332e-05, "loss": 0.1285, "step": 2625 }, { "epoch": 0.82, "grad_norm": 1.0007941722869873, "learning_rate": 1.9593115152208034e-05, "loss": 0.1139, "step": 2630 }, { "epoch": 0.82, "grad_norm": 1.6162289381027222, "learning_rate": 1.9591576633471084e-05, "loss": 0.1314, "step": 2635 }, { "epoch": 0.82, "grad_norm": 0.9753004312515259, "learning_rate": 1.9590035272121453e-05, "loss": 0.1472, "step": 2640 }, { "epoch": 0.82, "grad_norm": 1.2655069828033447, "learning_rate": 1.9588491068615946e-05, "loss": 0.114, "step": 2645 }, { "epoch": 0.83, "grad_norm": 1.1793577671051025, "learning_rate": 1.9586944023412212e-05, "loss": 0.123, "step": 2650 }, { "epoch": 0.83, "grad_norm": 0.7227995991706848, "learning_rate": 1.958539413696874e-05, "loss": 0.1043, "step": 2655 }, { "epoch": 0.83, "grad_norm": 0.6717692017555237, "learning_rate": 1.9583841409744862e-05, "loss": 0.0765, "step": 2660 }, { "epoch": 0.83, "grad_norm": 0.5767033100128174, "learning_rate": 1.9582285842200753e-05, "loss": 0.1184, "step": 2665 }, { "epoch": 0.83, "grad_norm": 0.9931422472000122, "learning_rate": 1.9580727434797432e-05, "loss": 0.0844, "step": 2670 }, { "epoch": 0.83, "grad_norm": 0.6345029473304749, "learning_rate": 1.957916618799676e-05, "loss": 0.1105, "step": 2675 }, { "epoch": 0.84, "grad_norm": 0.7955521941184998, "learning_rate": 1.957760210226143e-05, "loss": 0.1172, "step": 2680 }, { "epoch": 0.84, "grad_norm": 0.7941994071006775, "learning_rate": 1.9576035178054988e-05, "loss": 0.0976, "step": 2685 }, { "epoch": 0.84, "grad_norm": 1.0277880430221558, "learning_rate": 1.957446541584181e-05, "loss": 0.1201, "step": 2690 }, { "epoch": 0.84, "grad_norm": 0.7474991083145142, "learning_rate": 1.9572892816087132e-05, "loss": 0.1154, "step": 2695 }, { "epoch": 0.84, "grad_norm": 0.788794219493866, "learning_rate": 1.9571317379257006e-05, "loss": 0.1399, "step": 2700 }, { "epoch": 0.84, "grad_norm": 0.7396997809410095, "learning_rate": 1.9569739105818342e-05, "loss": 0.0995, "step": 2705 }, { "epoch": 0.85, "grad_norm": 0.9025750756263733, "learning_rate": 1.9568157996238884e-05, "loss": 0.0943, "step": 2710 }, { "epoch": 0.85, "grad_norm": 0.7392351627349854, "learning_rate": 1.956657405098722e-05, "loss": 0.1283, "step": 2715 }, { "epoch": 0.85, "grad_norm": 0.934230625629425, "learning_rate": 1.9564987270532777e-05, "loss": 0.1131, "step": 2720 }, { "epoch": 0.85, "grad_norm": 0.8671979308128357, "learning_rate": 1.9563397655345822e-05, "loss": 0.13, "step": 2725 }, { "epoch": 0.85, "grad_norm": 0.7905526161193848, "learning_rate": 1.956180520589746e-05, "loss": 0.0957, "step": 2730 }, { "epoch": 0.85, "grad_norm": 0.6696571111679077, "learning_rate": 1.956020992265964e-05, "loss": 0.1187, "step": 2735 }, { "epoch": 0.85, "grad_norm": 0.528008759021759, "learning_rate": 1.955861180610515e-05, "loss": 0.0787, "step": 2740 }, { "epoch": 0.86, "grad_norm": 1.014591932296753, "learning_rate": 1.9557010856707615e-05, "loss": 0.1075, "step": 2745 }, { "epoch": 0.86, "grad_norm": 0.8025286197662354, "learning_rate": 1.9555407074941503e-05, "loss": 0.1274, "step": 2750 }, { "epoch": 0.86, "grad_norm": 1.1211886405944824, "learning_rate": 1.9553800461282114e-05, "loss": 0.1212, "step": 2755 }, { "epoch": 0.86, "grad_norm": 1.1589736938476562, "learning_rate": 1.95521910162056e-05, "loss": 0.1171, "step": 2760 }, { "epoch": 0.86, "grad_norm": 0.9360532164573669, "learning_rate": 1.9550578740188945e-05, "loss": 0.1468, "step": 2765 }, { "epoch": 0.86, "grad_norm": 0.8516525626182556, "learning_rate": 1.9548963633709967e-05, "loss": 0.1472, "step": 2770 }, { "epoch": 0.87, "grad_norm": 0.8036392331123352, "learning_rate": 1.954734569724733e-05, "loss": 0.0956, "step": 2775 }, { "epoch": 0.87, "grad_norm": 1.0293813943862915, "learning_rate": 1.9545724931280535e-05, "loss": 0.1474, "step": 2780 }, { "epoch": 0.87, "grad_norm": 0.9267975687980652, "learning_rate": 1.954410133628992e-05, "loss": 0.1416, "step": 2785 }, { "epoch": 0.87, "grad_norm": 0.7071394920349121, "learning_rate": 1.9542474912756663e-05, "loss": 0.0968, "step": 2790 }, { "epoch": 0.87, "grad_norm": 0.9008345007896423, "learning_rate": 1.954084566116278e-05, "loss": 0.143, "step": 2795 }, { "epoch": 0.87, "grad_norm": 0.6870996356010437, "learning_rate": 1.9539213581991127e-05, "loss": 0.133, "step": 2800 }, { "epoch": 0.87, "grad_norm": 0.8136075735092163, "learning_rate": 1.9537578675725393e-05, "loss": 0.1283, "step": 2805 }, { "epoch": 0.88, "grad_norm": 0.7019197344779968, "learning_rate": 1.953594094285011e-05, "loss": 0.1398, "step": 2810 }, { "epoch": 0.88, "grad_norm": 0.6268460750579834, "learning_rate": 1.9534300383850643e-05, "loss": 0.0928, "step": 2815 }, { "epoch": 0.88, "grad_norm": 0.5861523747444153, "learning_rate": 1.95326569992132e-05, "loss": 0.1075, "step": 2820 }, { "epoch": 0.88, "grad_norm": 0.7399869561195374, "learning_rate": 1.953101078942482e-05, "loss": 0.122, "step": 2825 }, { "epoch": 0.88, "grad_norm": 0.8029427528381348, "learning_rate": 1.952936175497339e-05, "loss": 0.109, "step": 2830 }, { "epoch": 0.88, "grad_norm": 0.7381945848464966, "learning_rate": 1.9527709896347623e-05, "loss": 0.1099, "step": 2835 }, { "epoch": 0.89, "grad_norm": 0.680610716342926, "learning_rate": 1.9526055214037067e-05, "loss": 0.1067, "step": 2840 }, { "epoch": 0.89, "grad_norm": 0.5349668264389038, "learning_rate": 1.952439770853212e-05, "loss": 0.0878, "step": 2845 }, { "epoch": 0.89, "grad_norm": 0.8351783156394958, "learning_rate": 1.952273738032401e-05, "loss": 0.0794, "step": 2850 }, { "epoch": 0.89, "grad_norm": 0.887694239616394, "learning_rate": 1.9521074229904797e-05, "loss": 0.118, "step": 2855 }, { "epoch": 0.89, "grad_norm": 0.675528883934021, "learning_rate": 1.9519408257767383e-05, "loss": 0.1485, "step": 2860 }, { "epoch": 0.89, "grad_norm": 1.2613089084625244, "learning_rate": 1.9517739464405503e-05, "loss": 0.0914, "step": 2865 }, { "epoch": 0.9, "grad_norm": 0.6458963751792908, "learning_rate": 1.9516067850313734e-05, "loss": 0.0831, "step": 2870 }, { "epoch": 0.9, "grad_norm": 0.6229132413864136, "learning_rate": 1.9514393415987484e-05, "loss": 0.1205, "step": 2875 }, { "epoch": 0.9, "grad_norm": 0.6624443531036377, "learning_rate": 1.951271616192299e-05, "loss": 0.1246, "step": 2880 }, { "epoch": 0.9, "grad_norm": 0.7545179724693298, "learning_rate": 1.9511036088617344e-05, "loss": 0.1181, "step": 2885 }, { "epoch": 0.9, "grad_norm": 0.689713716506958, "learning_rate": 1.9509353196568454e-05, "loss": 0.117, "step": 2890 }, { "epoch": 0.9, "grad_norm": 0.8974180221557617, "learning_rate": 1.9507667486275072e-05, "loss": 0.1255, "step": 2895 }, { "epoch": 0.9, "grad_norm": 0.8327198028564453, "learning_rate": 1.9505978958236785e-05, "loss": 0.1371, "step": 2900 }, { "epoch": 0.91, "grad_norm": 0.7198882102966309, "learning_rate": 1.9504287612954017e-05, "loss": 0.1352, "step": 2905 }, { "epoch": 0.91, "grad_norm": 0.5859416127204895, "learning_rate": 1.9502593450928025e-05, "loss": 0.0857, "step": 2910 }, { "epoch": 0.91, "grad_norm": 0.7368265986442566, "learning_rate": 1.9500896472660896e-05, "loss": 0.0864, "step": 2915 }, { "epoch": 0.91, "grad_norm": 0.5910233855247498, "learning_rate": 1.9499196678655554e-05, "loss": 0.1122, "step": 2920 }, { "epoch": 0.91, "grad_norm": 0.6884198784828186, "learning_rate": 1.9497494069415767e-05, "loss": 0.1076, "step": 2925 }, { "epoch": 0.91, "grad_norm": 0.8379870057106018, "learning_rate": 1.9495788645446124e-05, "loss": 0.1118, "step": 2930 }, { "epoch": 0.92, "grad_norm": 0.7852750420570374, "learning_rate": 1.9494080407252058e-05, "loss": 0.1028, "step": 2935 }, { "epoch": 0.92, "grad_norm": 0.5889078378677368, "learning_rate": 1.9492369355339827e-05, "loss": 0.1474, "step": 2940 }, { "epoch": 0.92, "grad_norm": 0.8883277177810669, "learning_rate": 1.9490655490216534e-05, "loss": 0.1115, "step": 2945 }, { "epoch": 0.92, "grad_norm": 1.0939563512802124, "learning_rate": 1.9488938812390102e-05, "loss": 0.112, "step": 2950 }, { "epoch": 0.92, "grad_norm": 0.6624184250831604, "learning_rate": 1.94872193223693e-05, "loss": 0.1048, "step": 2955 }, { "epoch": 0.92, "grad_norm": 0.8896359801292419, "learning_rate": 1.9485497020663725e-05, "loss": 0.1351, "step": 2960 }, { "epoch": 0.92, "grad_norm": 0.6318917870521545, "learning_rate": 1.9483771907783806e-05, "loss": 0.1173, "step": 2965 }, { "epoch": 0.93, "grad_norm": 0.7852448225021362, "learning_rate": 1.948204398424081e-05, "loss": 0.1057, "step": 2970 }, { "epoch": 0.93, "grad_norm": 0.6205310225486755, "learning_rate": 1.9480313250546833e-05, "loss": 0.1229, "step": 2975 }, { "epoch": 0.93, "grad_norm": 0.885155439376831, "learning_rate": 1.94785797072148e-05, "loss": 0.0921, "step": 2980 }, { "epoch": 0.93, "grad_norm": 0.7539743185043335, "learning_rate": 1.9476843354758483e-05, "loss": 0.1534, "step": 2985 }, { "epoch": 0.93, "grad_norm": 0.8679502010345459, "learning_rate": 1.947510419369247e-05, "loss": 0.0976, "step": 2990 }, { "epoch": 0.93, "grad_norm": 0.5822016000747681, "learning_rate": 1.947336222453219e-05, "loss": 0.0882, "step": 2995 }, { "epoch": 0.94, "grad_norm": 0.7897557020187378, "learning_rate": 1.9471617447793903e-05, "loss": 0.1242, "step": 3000 }, { "epoch": 0.94, "grad_norm": 1.6063729524612427, "learning_rate": 1.94698698639947e-05, "loss": 0.1531, "step": 3005 }, { "epoch": 0.94, "grad_norm": 0.5307276844978333, "learning_rate": 1.9468119473652506e-05, "loss": 0.1106, "step": 3010 }, { "epoch": 0.94, "grad_norm": 0.6027793884277344, "learning_rate": 1.9466366277286075e-05, "loss": 0.1413, "step": 3015 }, { "epoch": 0.94, "grad_norm": 1.4923043251037598, "learning_rate": 1.9464610275414996e-05, "loss": 0.1398, "step": 3020 }, { "epoch": 0.94, "grad_norm": 0.7162624001502991, "learning_rate": 1.946285146855968e-05, "loss": 0.0702, "step": 3025 }, { "epoch": 0.95, "grad_norm": 0.689167857170105, "learning_rate": 1.9461089857241387e-05, "loss": 0.1331, "step": 3030 }, { "epoch": 0.95, "grad_norm": 0.7341986298561096, "learning_rate": 1.9459325441982192e-05, "loss": 0.062, "step": 3035 }, { "epoch": 0.95, "grad_norm": 0.5668254494667053, "learning_rate": 1.945755822330501e-05, "loss": 0.1277, "step": 3040 }, { "epoch": 0.95, "grad_norm": 0.6565370559692383, "learning_rate": 1.945578820173358e-05, "loss": 0.0937, "step": 3045 }, { "epoch": 0.95, "grad_norm": 0.6408843398094177, "learning_rate": 1.9454015377792478e-05, "loss": 0.1204, "step": 3050 }, { "epoch": 0.95, "grad_norm": 0.683785617351532, "learning_rate": 1.945223975200711e-05, "loss": 0.0826, "step": 3055 }, { "epoch": 0.95, "grad_norm": 0.6759133338928223, "learning_rate": 1.94504613249037e-05, "loss": 0.1165, "step": 3060 }, { "epoch": 0.96, "grad_norm": 0.9206417202949524, "learning_rate": 1.9448680097009326e-05, "loss": 0.1219, "step": 3065 }, { "epoch": 0.96, "grad_norm": 0.5556135773658752, "learning_rate": 1.9446896068851877e-05, "loss": 0.0858, "step": 3070 }, { "epoch": 0.96, "grad_norm": 0.709461510181427, "learning_rate": 1.9445109240960076e-05, "loss": 0.0787, "step": 3075 }, { "epoch": 0.96, "grad_norm": 0.8782566785812378, "learning_rate": 1.9443319613863475e-05, "loss": 0.1239, "step": 3080 }, { "epoch": 0.96, "grad_norm": 0.7591779828071594, "learning_rate": 1.9441527188092462e-05, "loss": 0.1403, "step": 3085 }, { "epoch": 0.96, "grad_norm": 0.8730403780937195, "learning_rate": 1.9439731964178252e-05, "loss": 0.1159, "step": 3090 }, { "epoch": 0.97, "grad_norm": 0.6206468939781189, "learning_rate": 1.9437933942652883e-05, "loss": 0.1126, "step": 3095 }, { "epoch": 0.97, "grad_norm": 0.569102942943573, "learning_rate": 1.9436133124049227e-05, "loss": 0.0954, "step": 3100 }, { "epoch": 0.97, "grad_norm": 0.8233943581581116, "learning_rate": 1.943432950890099e-05, "loss": 0.0972, "step": 3105 }, { "epoch": 0.97, "grad_norm": 0.8483831286430359, "learning_rate": 1.9432523097742693e-05, "loss": 0.1138, "step": 3110 }, { "epoch": 0.97, "grad_norm": 0.47807222604751587, "learning_rate": 1.9430713891109698e-05, "loss": 0.1364, "step": 3115 }, { "epoch": 0.97, "grad_norm": 0.7278242111206055, "learning_rate": 1.942890188953819e-05, "loss": 0.1297, "step": 3120 }, { "epoch": 0.97, "grad_norm": 0.6293314695358276, "learning_rate": 1.942708709356519e-05, "loss": 0.1143, "step": 3125 }, { "epoch": 0.98, "grad_norm": 0.9456065893173218, "learning_rate": 1.9425269503728536e-05, "loss": 0.1052, "step": 3130 }, { "epoch": 0.98, "grad_norm": 0.8846820592880249, "learning_rate": 1.94234491205669e-05, "loss": 0.1414, "step": 3135 }, { "epoch": 0.98, "grad_norm": 0.6343728303909302, "learning_rate": 1.9421625944619778e-05, "loss": 0.0812, "step": 3140 }, { "epoch": 0.98, "grad_norm": 0.5727220177650452, "learning_rate": 1.9419799976427497e-05, "loss": 0.1436, "step": 3145 }, { "epoch": 0.98, "grad_norm": 0.7205094695091248, "learning_rate": 1.9417971216531217e-05, "loss": 0.1175, "step": 3150 }, { "epoch": 0.98, "grad_norm": 0.8147839307785034, "learning_rate": 1.941613966547291e-05, "loss": 0.1143, "step": 3155 }, { "epoch": 0.99, "grad_norm": 1.0907286405563354, "learning_rate": 1.941430532379539e-05, "loss": 0.1352, "step": 3160 }, { "epoch": 0.99, "grad_norm": 0.6234989762306213, "learning_rate": 1.9412468192042298e-05, "loss": 0.0985, "step": 3165 }, { "epoch": 0.99, "grad_norm": 0.9922168850898743, "learning_rate": 1.941062827075809e-05, "loss": 0.1644, "step": 3170 }, { "epoch": 0.99, "grad_norm": 0.5943562984466553, "learning_rate": 1.9408785560488052e-05, "loss": 0.1064, "step": 3175 }, { "epoch": 0.99, "grad_norm": 0.7550861239433289, "learning_rate": 1.9406940061778306e-05, "loss": 0.0945, "step": 3180 }, { "epoch": 0.99, "grad_norm": 0.4942474663257599, "learning_rate": 1.9405091775175792e-05, "loss": 0.1299, "step": 3185 }, { "epoch": 0.99, "grad_norm": 0.6698021292686462, "learning_rate": 1.9403240701228277e-05, "loss": 0.1189, "step": 3190 }, { "epoch": 1.0, "grad_norm": 0.7570865154266357, "learning_rate": 1.940138684048436e-05, "loss": 0.1232, "step": 3195 }, { "epoch": 1.0, "grad_norm": 0.40966135263442993, "learning_rate": 1.939953019349346e-05, "loss": 0.1092, "step": 3200 }, { "epoch": 1.0, "grad_norm": 0.6771470904350281, "learning_rate": 1.9397670760805817e-05, "loss": 0.1223, "step": 3205 }, { "epoch": 1.0, "grad_norm": 0.6577381491661072, "learning_rate": 1.9395808542972507e-05, "loss": 0.076, "step": 3210 }, { "epoch": 1.0, "grad_norm": 0.6543042063713074, "learning_rate": 1.9393943540545434e-05, "loss": 0.0738, "step": 3215 }, { "epoch": 1.0, "grad_norm": 0.8199294209480286, "learning_rate": 1.9392075754077307e-05, "loss": 0.0757, "step": 3220 }, { "epoch": 1.01, "grad_norm": 1.0831599235534668, "learning_rate": 1.9390205184121683e-05, "loss": 0.063, "step": 3225 }, { "epoch": 1.01, "grad_norm": 0.5358847975730896, "learning_rate": 1.9388331831232934e-05, "loss": 0.075, "step": 3230 }, { "epoch": 1.01, "grad_norm": 0.7586596608161926, "learning_rate": 1.9386455695966253e-05, "loss": 0.0643, "step": 3235 }, { "epoch": 1.01, "grad_norm": 0.8688328862190247, "learning_rate": 1.9384576778877663e-05, "loss": 0.0563, "step": 3240 }, { "epoch": 1.01, "grad_norm": 1.305880069732666, "learning_rate": 1.9382695080524013e-05, "loss": 0.0794, "step": 3245 }, { "epoch": 1.01, "grad_norm": 0.7951090931892395, "learning_rate": 1.9380810601462974e-05, "loss": 0.0795, "step": 3250 }, { "epoch": 1.02, "grad_norm": 0.5801917314529419, "learning_rate": 1.9378923342253035e-05, "loss": 0.0567, "step": 3255 }, { "epoch": 1.02, "grad_norm": 0.7360835075378418, "learning_rate": 1.9377033303453522e-05, "loss": 0.0528, "step": 3260 }, { "epoch": 1.02, "grad_norm": 13.658980369567871, "learning_rate": 1.9375140485624568e-05, "loss": 0.066, "step": 3265 }, { "epoch": 1.02, "grad_norm": 0.7239563465118408, "learning_rate": 1.937324488932715e-05, "loss": 0.0806, "step": 3270 }, { "epoch": 1.02, "grad_norm": 0.6245361566543579, "learning_rate": 1.9371346515123047e-05, "loss": 0.0443, "step": 3275 }, { "epoch": 1.02, "grad_norm": 0.6881046295166016, "learning_rate": 1.9369445363574877e-05, "loss": 0.0639, "step": 3280 }, { "epoch": 1.02, "grad_norm": 0.8322771191596985, "learning_rate": 1.9367541435246077e-05, "loss": 0.0843, "step": 3285 }, { "epoch": 1.03, "grad_norm": 0.9991777539253235, "learning_rate": 1.9365634730700903e-05, "loss": 0.0498, "step": 3290 }, { "epoch": 1.03, "grad_norm": 0.6667386293411255, "learning_rate": 1.9363725250504438e-05, "loss": 0.0881, "step": 3295 }, { "epoch": 1.03, "grad_norm": 0.7544474005699158, "learning_rate": 1.9361812995222586e-05, "loss": 0.0717, "step": 3300 }, { "epoch": 1.03, "grad_norm": 0.7047283053398132, "learning_rate": 1.935989796542207e-05, "loss": 0.0863, "step": 3305 }, { "epoch": 1.03, "grad_norm": 0.7891948819160461, "learning_rate": 1.9357980161670443e-05, "loss": 0.0479, "step": 3310 }, { "epoch": 1.03, "grad_norm": 0.8909279704093933, "learning_rate": 1.935605958453608e-05, "loss": 0.0662, "step": 3315 }, { "epoch": 1.04, "grad_norm": 0.6332457065582275, "learning_rate": 1.9354136234588163e-05, "loss": 0.058, "step": 3320 }, { "epoch": 1.04, "grad_norm": 0.6614800691604614, "learning_rate": 1.9352210112396713e-05, "loss": 0.0642, "step": 3325 }, { "epoch": 1.04, "grad_norm": 0.7803705334663391, "learning_rate": 1.935028121853257e-05, "loss": 0.063, "step": 3330 }, { "epoch": 1.04, "grad_norm": 0.6813333034515381, "learning_rate": 1.934834955356738e-05, "loss": 0.0656, "step": 3335 }, { "epoch": 1.04, "grad_norm": 1.1551873683929443, "learning_rate": 1.9346415118073634e-05, "loss": 0.087, "step": 3340 }, { "epoch": 1.04, "grad_norm": 0.6966256499290466, "learning_rate": 1.934447791262463e-05, "loss": 0.0621, "step": 3345 }, { "epoch": 1.04, "grad_norm": 0.8793096542358398, "learning_rate": 1.934253793779448e-05, "loss": 0.0504, "step": 3350 }, { "epoch": 1.05, "grad_norm": 0.9607953429222107, "learning_rate": 1.934059519415813e-05, "loss": 0.0789, "step": 3355 }, { "epoch": 1.05, "grad_norm": 0.8557941317558289, "learning_rate": 1.933864968229135e-05, "loss": 0.106, "step": 3360 }, { "epoch": 1.05, "grad_norm": 0.7457416653633118, "learning_rate": 1.9336701402770716e-05, "loss": 0.0633, "step": 3365 }, { "epoch": 1.05, "grad_norm": 0.5383043885231018, "learning_rate": 1.9334750356173627e-05, "loss": 0.0534, "step": 3370 }, { "epoch": 1.05, "grad_norm": 0.9611634612083435, "learning_rate": 1.9332796543078313e-05, "loss": 0.057, "step": 3375 }, { "epoch": 1.05, "grad_norm": 0.6954233646392822, "learning_rate": 1.9330839964063815e-05, "loss": 0.0562, "step": 3380 }, { "epoch": 1.06, "grad_norm": 0.7774192094802856, "learning_rate": 1.9328880619709995e-05, "loss": 0.068, "step": 3385 }, { "epoch": 1.06, "grad_norm": 0.7439762949943542, "learning_rate": 1.9326918510597534e-05, "loss": 0.0469, "step": 3390 }, { "epoch": 1.06, "grad_norm": 0.5823460221290588, "learning_rate": 1.9324953637307935e-05, "loss": 0.0599, "step": 3395 }, { "epoch": 1.06, "grad_norm": 0.6018666625022888, "learning_rate": 1.932298600042352e-05, "loss": 0.048, "step": 3400 }, { "epoch": 1.06, "grad_norm": 0.6874041557312012, "learning_rate": 1.9321015600527425e-05, "loss": 0.0947, "step": 3405 }, { "epoch": 1.06, "grad_norm": 0.7596636414527893, "learning_rate": 1.931904243820361e-05, "loss": 0.0719, "step": 3410 }, { "epoch": 1.07, "grad_norm": 0.7127168774604797, "learning_rate": 1.931706651403685e-05, "loss": 0.0699, "step": 3415 }, { "epoch": 1.07, "grad_norm": 0.7314068675041199, "learning_rate": 1.9315087828612747e-05, "loss": 0.0626, "step": 3420 }, { "epoch": 1.07, "grad_norm": 1.0029963254928589, "learning_rate": 1.9313106382517714e-05, "loss": 0.0673, "step": 3425 }, { "epoch": 1.07, "grad_norm": 0.8213211297988892, "learning_rate": 1.931112217633898e-05, "loss": 0.0641, "step": 3430 }, { "epoch": 1.07, "grad_norm": 0.6091497540473938, "learning_rate": 1.9309135210664592e-05, "loss": 0.0706, "step": 3435 }, { "epoch": 1.07, "grad_norm": 1.019183874130249, "learning_rate": 1.9307145486083426e-05, "loss": 0.0753, "step": 3440 }, { "epoch": 1.07, "grad_norm": 0.7182345986366272, "learning_rate": 1.9305153003185167e-05, "loss": 0.07, "step": 3445 }, { "epoch": 1.08, "grad_norm": 0.9115707874298096, "learning_rate": 1.930315776256031e-05, "loss": 0.0686, "step": 3450 }, { "epoch": 1.08, "grad_norm": 0.6539191007614136, "learning_rate": 1.9301159764800185e-05, "loss": 0.068, "step": 3455 }, { "epoch": 1.08, "grad_norm": 0.5159173607826233, "learning_rate": 1.9299159010496925e-05, "loss": 0.0555, "step": 3460 }, { "epoch": 1.08, "grad_norm": 0.8276963233947754, "learning_rate": 1.9297155500243485e-05, "loss": 0.0509, "step": 3465 }, { "epoch": 1.08, "grad_norm": 0.9333387613296509, "learning_rate": 1.9295149234633637e-05, "loss": 0.058, "step": 3470 }, { "epoch": 1.08, "grad_norm": 1.083387851715088, "learning_rate": 1.9293140214261967e-05, "loss": 0.0626, "step": 3475 }, { "epoch": 1.09, "grad_norm": 0.7512198090553284, "learning_rate": 1.9291128439723886e-05, "loss": 0.0548, "step": 3480 }, { "epoch": 1.09, "grad_norm": 0.9256929755210876, "learning_rate": 1.9289113911615605e-05, "loss": 0.0795, "step": 3485 }, { "epoch": 1.09, "grad_norm": 0.6555706858634949, "learning_rate": 1.9287096630534168e-05, "loss": 0.0551, "step": 3490 }, { "epoch": 1.09, "grad_norm": 0.8473435044288635, "learning_rate": 1.9285076597077425e-05, "loss": 0.0676, "step": 3495 }, { "epoch": 1.09, "grad_norm": 0.7201313972473145, "learning_rate": 1.928305381184404e-05, "loss": 0.0758, "step": 3500 }, { "epoch": 1.09, "grad_norm": 0.5640273094177246, "learning_rate": 1.9281028275433505e-05, "loss": 0.0706, "step": 3505 }, { "epoch": 1.09, "grad_norm": 0.6947470903396606, "learning_rate": 1.9278999988446118e-05, "loss": 0.0799, "step": 3510 }, { "epoch": 1.1, "grad_norm": 0.9326536059379578, "learning_rate": 1.9276968951482987e-05, "loss": 0.0681, "step": 3515 }, { "epoch": 1.1, "grad_norm": 0.6781297922134399, "learning_rate": 1.9274935165146044e-05, "loss": 0.0565, "step": 3520 }, { "epoch": 1.1, "grad_norm": 0.8024562001228333, "learning_rate": 1.927289863003804e-05, "loss": 0.0762, "step": 3525 }, { "epoch": 1.1, "grad_norm": 0.5281566977500916, "learning_rate": 1.9270859346762522e-05, "loss": 0.0701, "step": 3530 }, { "epoch": 1.1, "grad_norm": 0.5051602721214294, "learning_rate": 1.926881731592387e-05, "loss": 0.0612, "step": 3535 }, { "epoch": 1.1, "grad_norm": 0.5649287104606628, "learning_rate": 1.926677253812727e-05, "loss": 0.076, "step": 3540 }, { "epoch": 1.11, "grad_norm": 0.5975441932678223, "learning_rate": 1.9264725013978727e-05, "loss": 0.0699, "step": 3545 }, { "epoch": 1.11, "grad_norm": 0.5763807892799377, "learning_rate": 1.9262674744085054e-05, "loss": 0.0537, "step": 3550 }, { "epoch": 1.11, "grad_norm": 0.8457044363021851, "learning_rate": 1.926062172905388e-05, "loss": 0.0728, "step": 3555 }, { "epoch": 1.11, "grad_norm": 0.7114633321762085, "learning_rate": 1.9258565969493647e-05, "loss": 0.0566, "step": 3560 }, { "epoch": 1.11, "grad_norm": 0.7445443868637085, "learning_rate": 1.925650746601361e-05, "loss": 0.0562, "step": 3565 }, { "epoch": 1.11, "grad_norm": 0.5508908033370972, "learning_rate": 1.9254446219223845e-05, "loss": 0.0617, "step": 3570 }, { "epoch": 1.12, "grad_norm": 0.8591492176055908, "learning_rate": 1.925238222973523e-05, "loss": 0.071, "step": 3575 }, { "epoch": 1.12, "grad_norm": 0.834518551826477, "learning_rate": 1.925031549815946e-05, "loss": 0.0787, "step": 3580 }, { "epoch": 1.12, "grad_norm": 0.718428909778595, "learning_rate": 1.9248246025109044e-05, "loss": 0.0849, "step": 3585 }, { "epoch": 1.12, "grad_norm": 0.7240105271339417, "learning_rate": 1.9246173811197302e-05, "loss": 0.0561, "step": 3590 }, { "epoch": 1.12, "grad_norm": 0.5630722045898438, "learning_rate": 1.924409885703837e-05, "loss": 0.0515, "step": 3595 }, { "epoch": 1.12, "grad_norm": 0.63336580991745, "learning_rate": 1.9242021163247187e-05, "loss": 0.0576, "step": 3600 }, { "epoch": 1.12, "grad_norm": 0.6805610060691833, "learning_rate": 1.923994073043951e-05, "loss": 0.0727, "step": 3605 }, { "epoch": 1.13, "grad_norm": 1.3520512580871582, "learning_rate": 1.9237857559231914e-05, "loss": 0.0758, "step": 3610 }, { "epoch": 1.13, "grad_norm": 0.8155093789100647, "learning_rate": 1.9235771650241776e-05, "loss": 0.079, "step": 3615 }, { "epoch": 1.13, "grad_norm": 0.9271829128265381, "learning_rate": 1.923368300408729e-05, "loss": 0.0808, "step": 3620 }, { "epoch": 1.13, "grad_norm": 0.8426121473312378, "learning_rate": 1.923159162138745e-05, "loss": 0.0671, "step": 3625 }, { "epoch": 1.13, "grad_norm": 0.6503888368606567, "learning_rate": 1.9229497502762075e-05, "loss": 0.0624, "step": 3630 }, { "epoch": 1.13, "grad_norm": 0.5869102478027344, "learning_rate": 1.922740064883179e-05, "loss": 0.0661, "step": 3635 }, { "epoch": 1.14, "grad_norm": 0.7934098839759827, "learning_rate": 1.9225301060218032e-05, "loss": 0.073, "step": 3640 }, { "epoch": 1.14, "grad_norm": 0.608038067817688, "learning_rate": 1.9223198737543046e-05, "loss": 0.0725, "step": 3645 }, { "epoch": 1.14, "grad_norm": 0.5883069038391113, "learning_rate": 1.9221093681429887e-05, "loss": 0.0647, "step": 3650 }, { "epoch": 1.14, "grad_norm": 0.8434496521949768, "learning_rate": 1.921898589250242e-05, "loss": 0.0582, "step": 3655 }, { "epoch": 1.14, "grad_norm": 0.7228140830993652, "learning_rate": 1.9216875371385323e-05, "loss": 0.0604, "step": 3660 }, { "epoch": 1.14, "grad_norm": 0.7881928086280823, "learning_rate": 1.921476211870408e-05, "loss": 0.0674, "step": 3665 }, { "epoch": 1.14, "grad_norm": 0.7950953841209412, "learning_rate": 1.9212646135084986e-05, "loss": 0.0647, "step": 3670 }, { "epoch": 1.15, "grad_norm": 1.4857760667800903, "learning_rate": 1.9210527421155145e-05, "loss": 0.0701, "step": 3675 }, { "epoch": 1.15, "grad_norm": 0.7841891050338745, "learning_rate": 1.9208405977542474e-05, "loss": 0.0633, "step": 3680 }, { "epoch": 1.15, "grad_norm": 0.9583932757377625, "learning_rate": 1.9206281804875695e-05, "loss": 0.0905, "step": 3685 }, { "epoch": 1.15, "grad_norm": 0.8628225922584534, "learning_rate": 1.9204154903784337e-05, "loss": 0.0739, "step": 3690 }, { "epoch": 1.15, "grad_norm": 0.9285526275634766, "learning_rate": 1.9202025274898744e-05, "loss": 0.0659, "step": 3695 }, { "epoch": 1.15, "grad_norm": 1.1224682331085205, "learning_rate": 1.919989291885006e-05, "loss": 0.062, "step": 3700 }, { "epoch": 1.16, "grad_norm": 0.8893318772315979, "learning_rate": 1.9197757836270245e-05, "loss": 0.0844, "step": 3705 }, { "epoch": 1.16, "grad_norm": 0.6806356906890869, "learning_rate": 1.9195620027792063e-05, "loss": 0.0459, "step": 3710 }, { "epoch": 1.16, "grad_norm": 0.7901091575622559, "learning_rate": 1.9193479494049088e-05, "loss": 0.0829, "step": 3715 }, { "epoch": 1.16, "grad_norm": 1.6605781316757202, "learning_rate": 1.9191336235675698e-05, "loss": 0.0566, "step": 3720 }, { "epoch": 1.16, "grad_norm": 0.8274343609809875, "learning_rate": 1.9189190253307082e-05, "loss": 0.0723, "step": 3725 }, { "epoch": 1.16, "grad_norm": 0.6665601134300232, "learning_rate": 1.9187041547579234e-05, "loss": 0.0623, "step": 3730 }, { "epoch": 1.16, "grad_norm": 0.6877213716506958, "learning_rate": 1.9184890119128963e-05, "loss": 0.0582, "step": 3735 }, { "epoch": 1.17, "grad_norm": 0.7861253619194031, "learning_rate": 1.9182735968593865e-05, "loss": 0.0684, "step": 3740 }, { "epoch": 1.17, "grad_norm": 0.7444209456443787, "learning_rate": 1.918057909661237e-05, "loss": 0.0677, "step": 3745 }, { "epoch": 1.17, "grad_norm": 0.5835312604904175, "learning_rate": 1.9178419503823692e-05, "loss": 0.0677, "step": 3750 }, { "epoch": 1.17, "grad_norm": 0.8612051606178284, "learning_rate": 1.9176257190867864e-05, "loss": 0.0868, "step": 3755 }, { "epoch": 1.17, "grad_norm": 0.8530820608139038, "learning_rate": 1.9174092158385716e-05, "loss": 0.0629, "step": 3760 }, { "epoch": 1.17, "grad_norm": 0.6976132392883301, "learning_rate": 1.9171924407018895e-05, "loss": 0.0742, "step": 3765 }, { "epoch": 1.18, "grad_norm": 0.8631473183631897, "learning_rate": 1.9169753937409844e-05, "loss": 0.0783, "step": 3770 }, { "epoch": 1.18, "grad_norm": 0.6785290241241455, "learning_rate": 1.9167580750201816e-05, "loss": 0.062, "step": 3775 }, { "epoch": 1.18, "grad_norm": 0.9032337665557861, "learning_rate": 1.9165404846038868e-05, "loss": 0.071, "step": 3780 }, { "epoch": 1.18, "grad_norm": 1.2370872497558594, "learning_rate": 1.9163226225565867e-05, "loss": 0.0724, "step": 3785 }, { "epoch": 1.18, "grad_norm": 0.7235238552093506, "learning_rate": 1.9161044889428476e-05, "loss": 0.0482, "step": 3790 }, { "epoch": 1.18, "grad_norm": 0.558603048324585, "learning_rate": 1.915886083827317e-05, "loss": 0.0522, "step": 3795 }, { "epoch": 1.19, "grad_norm": 0.9623020887374878, "learning_rate": 1.915667407274723e-05, "loss": 0.0667, "step": 3800 }, { "epoch": 1.19, "grad_norm": 0.6551677584648132, "learning_rate": 1.915448459349873e-05, "loss": 0.0478, "step": 3805 }, { "epoch": 1.19, "grad_norm": 0.7484034895896912, "learning_rate": 1.9152292401176563e-05, "loss": 0.0731, "step": 3810 }, { "epoch": 1.19, "grad_norm": 0.8798559308052063, "learning_rate": 1.9150097496430415e-05, "loss": 0.0743, "step": 3815 }, { "epoch": 1.19, "grad_norm": 1.0757192373275757, "learning_rate": 1.9147899879910783e-05, "loss": 0.0715, "step": 3820 }, { "epoch": 1.19, "grad_norm": 0.9112750887870789, "learning_rate": 1.914569955226896e-05, "loss": 0.0818, "step": 3825 }, { "epoch": 1.19, "grad_norm": 0.5911403298377991, "learning_rate": 1.9143496514157056e-05, "loss": 0.0709, "step": 3830 }, { "epoch": 1.2, "grad_norm": 0.9122008085250854, "learning_rate": 1.914129076622797e-05, "loss": 0.0885, "step": 3835 }, { "epoch": 1.2, "grad_norm": 0.8840839862823486, "learning_rate": 1.913908230913541e-05, "loss": 0.0642, "step": 3840 }, { "epoch": 1.2, "grad_norm": 0.6602410078048706, "learning_rate": 1.9136871143533884e-05, "loss": 0.0495, "step": 3845 }, { "epoch": 1.2, "grad_norm": 0.5362725853919983, "learning_rate": 1.913465727007871e-05, "loss": 0.0692, "step": 3850 }, { "epoch": 1.2, "grad_norm": 0.8106977343559265, "learning_rate": 1.9132440689426e-05, "loss": 0.0856, "step": 3855 }, { "epoch": 1.2, "grad_norm": 0.6339511871337891, "learning_rate": 1.9130221402232676e-05, "loss": 0.0601, "step": 3860 }, { "epoch": 1.21, "grad_norm": 0.7605702877044678, "learning_rate": 1.9127999409156454e-05, "loss": 0.0627, "step": 3865 }, { "epoch": 1.21, "grad_norm": 0.622286856174469, "learning_rate": 1.912577471085586e-05, "loss": 0.0624, "step": 3870 }, { "epoch": 1.21, "grad_norm": 0.5903645157814026, "learning_rate": 1.9123547307990215e-05, "loss": 0.048, "step": 3875 }, { "epoch": 1.21, "grad_norm": 0.6491131782531738, "learning_rate": 1.9121317201219645e-05, "loss": 0.0706, "step": 3880 }, { "epoch": 1.21, "grad_norm": 0.7359797358512878, "learning_rate": 1.911908439120508e-05, "loss": 0.0626, "step": 3885 }, { "epoch": 1.21, "grad_norm": 0.683800458908081, "learning_rate": 1.9116848878608243e-05, "loss": 0.0506, "step": 3890 }, { "epoch": 1.21, "grad_norm": 0.6683903336524963, "learning_rate": 1.9114610664091665e-05, "loss": 0.064, "step": 3895 }, { "epoch": 1.22, "grad_norm": 0.6654495596885681, "learning_rate": 1.911236974831868e-05, "loss": 0.0674, "step": 3900 }, { "epoch": 1.22, "grad_norm": 0.8582611680030823, "learning_rate": 1.9110126131953408e-05, "loss": 0.0738, "step": 3905 }, { "epoch": 1.22, "grad_norm": 0.9522505402565002, "learning_rate": 1.9107879815660788e-05, "loss": 0.0609, "step": 3910 }, { "epoch": 1.22, "grad_norm": 0.6455296277999878, "learning_rate": 1.910563080010655e-05, "loss": 0.0682, "step": 3915 }, { "epoch": 1.22, "grad_norm": 0.6582437753677368, "learning_rate": 1.910337908595722e-05, "loss": 0.0585, "step": 3920 }, { "epoch": 1.22, "grad_norm": 0.7527994513511658, "learning_rate": 1.9101124673880132e-05, "loss": 0.0541, "step": 3925 }, { "epoch": 1.23, "grad_norm": 0.9205194115638733, "learning_rate": 1.909886756454341e-05, "loss": 0.0913, "step": 3930 }, { "epoch": 1.23, "grad_norm": 0.8828095197677612, "learning_rate": 1.9096607758615998e-05, "loss": 0.0674, "step": 3935 }, { "epoch": 1.23, "grad_norm": 0.6131769418716431, "learning_rate": 1.9094345256767614e-05, "loss": 0.0587, "step": 3940 }, { "epoch": 1.23, "grad_norm": 1.0256649255752563, "learning_rate": 1.9092080059668784e-05, "loss": 0.0627, "step": 3945 }, { "epoch": 1.23, "grad_norm": 0.8035500645637512, "learning_rate": 1.9089812167990836e-05, "loss": 0.0805, "step": 3950 }, { "epoch": 1.23, "grad_norm": 0.6150037050247192, "learning_rate": 1.9087541582405897e-05, "loss": 0.055, "step": 3955 }, { "epoch": 1.24, "grad_norm": 0.6392942667007446, "learning_rate": 1.9085268303586892e-05, "loss": 0.0633, "step": 3960 }, { "epoch": 1.24, "grad_norm": 0.6936980485916138, "learning_rate": 1.908299233220754e-05, "loss": 0.0857, "step": 3965 }, { "epoch": 1.24, "grad_norm": 0.823484480381012, "learning_rate": 1.9080713668942356e-05, "loss": 0.0834, "step": 3970 }, { "epoch": 1.24, "grad_norm": 0.6093165278434753, "learning_rate": 1.9078432314466665e-05, "loss": 0.0637, "step": 3975 }, { "epoch": 1.24, "grad_norm": 0.6953601241111755, "learning_rate": 1.9076148269456576e-05, "loss": 0.0637, "step": 3980 }, { "epoch": 1.24, "grad_norm": 0.8668028116226196, "learning_rate": 1.9073861534589006e-05, "loss": 0.0522, "step": 3985 }, { "epoch": 1.24, "grad_norm": 0.9046851396560669, "learning_rate": 1.907157211054166e-05, "loss": 0.0541, "step": 3990 }, { "epoch": 1.25, "grad_norm": 0.8799270391464233, "learning_rate": 1.906927999799305e-05, "loss": 0.0907, "step": 3995 }, { "epoch": 1.25, "grad_norm": 0.908135175704956, "learning_rate": 1.906698519762247e-05, "loss": 0.0756, "step": 4000 }, { "epoch": 1.25, "grad_norm": 0.7531886100769043, "learning_rate": 1.906468771011003e-05, "loss": 0.0706, "step": 4005 }, { "epoch": 1.25, "grad_norm": 0.8982405662536621, "learning_rate": 1.9062387536136623e-05, "loss": 0.0897, "step": 4010 }, { "epoch": 1.25, "grad_norm": 0.8098554611206055, "learning_rate": 1.9060084676383934e-05, "loss": 0.0654, "step": 4015 }, { "epoch": 1.25, "grad_norm": 0.5767571926116943, "learning_rate": 1.905777913153446e-05, "loss": 0.0658, "step": 4020 }, { "epoch": 1.26, "grad_norm": 0.9261816143989563, "learning_rate": 1.905547090227148e-05, "loss": 0.0552, "step": 4025 }, { "epoch": 1.26, "grad_norm": 0.8063369989395142, "learning_rate": 1.905315998927908e-05, "loss": 0.0567, "step": 4030 }, { "epoch": 1.26, "grad_norm": 0.8452303409576416, "learning_rate": 1.9050846393242127e-05, "loss": 0.0626, "step": 4035 }, { "epoch": 1.26, "grad_norm": 0.695500373840332, "learning_rate": 1.9048530114846295e-05, "loss": 0.0756, "step": 4040 }, { "epoch": 1.26, "grad_norm": 0.6767045855522156, "learning_rate": 1.904621115477805e-05, "loss": 0.0648, "step": 4045 }, { "epoch": 1.26, "grad_norm": 0.8446992635726929, "learning_rate": 1.9043889513724648e-05, "loss": 0.0806, "step": 4050 }, { "epoch": 1.26, "grad_norm": 0.8408441543579102, "learning_rate": 1.904156519237415e-05, "loss": 0.0677, "step": 4055 }, { "epoch": 1.27, "grad_norm": 0.7592993974685669, "learning_rate": 1.9039238191415395e-05, "loss": 0.0756, "step": 4060 }, { "epoch": 1.27, "grad_norm": 0.6402657628059387, "learning_rate": 1.9036908511538036e-05, "loss": 0.0531, "step": 4065 }, { "epoch": 1.27, "grad_norm": 0.7251437902450562, "learning_rate": 1.9034576153432504e-05, "loss": 0.079, "step": 4070 }, { "epoch": 1.27, "grad_norm": 0.8086432814598083, "learning_rate": 1.903224111779003e-05, "loss": 0.0588, "step": 4075 }, { "epoch": 1.27, "grad_norm": 0.5242979526519775, "learning_rate": 1.902990340530264e-05, "loss": 0.0706, "step": 4080 }, { "epoch": 1.27, "grad_norm": 1.258223295211792, "learning_rate": 1.902756301666315e-05, "loss": 0.0768, "step": 4085 }, { "epoch": 1.28, "grad_norm": 0.7568871378898621, "learning_rate": 1.902521995256517e-05, "loss": 0.0829, "step": 4090 }, { "epoch": 1.28, "grad_norm": 0.726394534111023, "learning_rate": 1.90228742137031e-05, "loss": 0.0832, "step": 4095 }, { "epoch": 1.28, "grad_norm": 0.7445221543312073, "learning_rate": 1.9020525800772148e-05, "loss": 0.0553, "step": 4100 }, { "epoch": 1.28, "grad_norm": 0.864100992679596, "learning_rate": 1.9018174714468292e-05, "loss": 0.0836, "step": 4105 }, { "epoch": 1.28, "grad_norm": 0.6084670424461365, "learning_rate": 1.9015820955488316e-05, "loss": 0.0665, "step": 4110 }, { "epoch": 1.28, "grad_norm": 0.6526731252670288, "learning_rate": 1.9013464524529793e-05, "loss": 0.0595, "step": 4115 }, { "epoch": 1.29, "grad_norm": 0.8703616261482239, "learning_rate": 1.9011105422291087e-05, "loss": 0.095, "step": 4120 }, { "epoch": 1.29, "grad_norm": 1.6150163412094116, "learning_rate": 1.9008743649471356e-05, "loss": 0.0676, "step": 4125 }, { "epoch": 1.29, "grad_norm": 0.7024520635604858, "learning_rate": 1.900637920677055e-05, "loss": 0.0615, "step": 4130 }, { "epoch": 1.29, "grad_norm": 3.2384016513824463, "learning_rate": 1.90040120948894e-05, "loss": 0.0737, "step": 4135 }, { "epoch": 1.29, "grad_norm": 0.7834177017211914, "learning_rate": 1.900164231452945e-05, "loss": 0.0814, "step": 4140 }, { "epoch": 1.29, "grad_norm": 0.411509245634079, "learning_rate": 1.8999269866393005e-05, "loss": 0.0487, "step": 4145 }, { "epoch": 1.29, "grad_norm": 0.9382750988006592, "learning_rate": 1.899689475118319e-05, "loss": 0.0619, "step": 4150 }, { "epoch": 1.3, "grad_norm": 0.5898329615592957, "learning_rate": 1.8994516969603903e-05, "loss": 0.0496, "step": 4155 }, { "epoch": 1.3, "grad_norm": 0.8763757348060608, "learning_rate": 1.8992136522359837e-05, "loss": 0.0761, "step": 4160 }, { "epoch": 1.3, "grad_norm": 0.7113845348358154, "learning_rate": 1.898975341015647e-05, "loss": 0.0684, "step": 4165 }, { "epoch": 1.3, "grad_norm": 0.5834293365478516, "learning_rate": 1.8987367633700076e-05, "loss": 0.0569, "step": 4170 }, { "epoch": 1.3, "grad_norm": 0.7183889150619507, "learning_rate": 1.8984979193697727e-05, "loss": 0.059, "step": 4175 }, { "epoch": 1.3, "grad_norm": 0.8346315622329712, "learning_rate": 1.8982588090857257e-05, "loss": 0.0603, "step": 4180 }, { "epoch": 1.31, "grad_norm": 0.5111814141273499, "learning_rate": 1.8980194325887323e-05, "loss": 0.0601, "step": 4185 }, { "epoch": 1.31, "grad_norm": 1.2758036851882935, "learning_rate": 1.8977797899497343e-05, "loss": 0.0487, "step": 4190 }, { "epoch": 1.31, "grad_norm": 0.698638916015625, "learning_rate": 1.897539881239754e-05, "loss": 0.064, "step": 4195 }, { "epoch": 1.31, "grad_norm": 0.6919531226158142, "learning_rate": 1.897299706529892e-05, "loss": 0.0556, "step": 4200 }, { "epoch": 1.31, "grad_norm": 1.2566674947738647, "learning_rate": 1.8970592658913278e-05, "loss": 0.0821, "step": 4205 }, { "epoch": 1.31, "grad_norm": 0.5100722312927246, "learning_rate": 1.8968185593953195e-05, "loss": 0.0447, "step": 4210 }, { "epoch": 1.31, "grad_norm": 0.5391504168510437, "learning_rate": 1.8965775871132047e-05, "loss": 0.0802, "step": 4215 }, { "epoch": 1.32, "grad_norm": 0.5908461213111877, "learning_rate": 1.8963363491163985e-05, "loss": 0.0927, "step": 4220 }, { "epoch": 1.32, "grad_norm": 0.5374833345413208, "learning_rate": 1.8960948454763964e-05, "loss": 0.0561, "step": 4225 }, { "epoch": 1.32, "grad_norm": 0.8269191384315491, "learning_rate": 1.8958530762647714e-05, "loss": 0.0835, "step": 4230 }, { "epoch": 1.32, "grad_norm": 0.6975058317184448, "learning_rate": 1.8956110415531755e-05, "loss": 0.0584, "step": 4235 }, { "epoch": 1.32, "grad_norm": 1.0302953720092773, "learning_rate": 1.8953687414133394e-05, "loss": 0.0698, "step": 4240 }, { "epoch": 1.32, "grad_norm": 0.6055700778961182, "learning_rate": 1.8951261759170724e-05, "loss": 0.0631, "step": 4245 }, { "epoch": 1.33, "grad_norm": 0.6735210418701172, "learning_rate": 1.8948833451362632e-05, "loss": 0.0633, "step": 4250 }, { "epoch": 1.33, "grad_norm": 0.7031700611114502, "learning_rate": 1.8946402491428778e-05, "loss": 0.0619, "step": 4255 }, { "epoch": 1.33, "grad_norm": 0.5998573899269104, "learning_rate": 1.8943968880089617e-05, "loss": 0.0671, "step": 4260 }, { "epoch": 1.33, "grad_norm": 0.5515586137771606, "learning_rate": 1.8941532618066385e-05, "loss": 0.0569, "step": 4265 }, { "epoch": 1.33, "grad_norm": 0.8202019333839417, "learning_rate": 1.893909370608111e-05, "loss": 0.0752, "step": 4270 }, { "epoch": 1.33, "grad_norm": 0.8019958734512329, "learning_rate": 1.89366521448566e-05, "loss": 0.0595, "step": 4275 }, { "epoch": 1.33, "grad_norm": 0.5395796298980713, "learning_rate": 1.8934207935116447e-05, "loss": 0.0788, "step": 4280 }, { "epoch": 1.34, "grad_norm": 0.8099183440208435, "learning_rate": 1.8931761077585037e-05, "loss": 0.0634, "step": 4285 }, { "epoch": 1.34, "grad_norm": 0.6752781867980957, "learning_rate": 1.8929311572987528e-05, "loss": 0.0872, "step": 4290 }, { "epoch": 1.34, "grad_norm": 0.7430433034896851, "learning_rate": 1.892685942204987e-05, "loss": 0.0779, "step": 4295 }, { "epoch": 1.34, "grad_norm": 0.45912253856658936, "learning_rate": 1.89244046254988e-05, "loss": 0.0703, "step": 4300 }, { "epoch": 1.34, "grad_norm": 0.6058170199394226, "learning_rate": 1.892194718406183e-05, "loss": 0.0602, "step": 4305 }, { "epoch": 1.34, "grad_norm": 0.5597983002662659, "learning_rate": 1.8919487098467263e-05, "loss": 0.0879, "step": 4310 }, { "epoch": 1.35, "grad_norm": 0.7414512038230896, "learning_rate": 1.891702436944418e-05, "loss": 0.0754, "step": 4315 }, { "epoch": 1.35, "grad_norm": 0.7263529300689697, "learning_rate": 1.891455899772246e-05, "loss": 0.0581, "step": 4320 }, { "epoch": 1.35, "grad_norm": 0.49107396602630615, "learning_rate": 1.891209098403274e-05, "loss": 0.0576, "step": 4325 }, { "epoch": 1.35, "grad_norm": 0.8759298324584961, "learning_rate": 1.8909620329106463e-05, "loss": 0.0685, "step": 4330 }, { "epoch": 1.35, "grad_norm": 0.5367811918258667, "learning_rate": 1.8907147033675847e-05, "loss": 0.0616, "step": 4335 }, { "epoch": 1.35, "grad_norm": 0.6136344075202942, "learning_rate": 1.8904671098473885e-05, "loss": 0.059, "step": 4340 }, { "epoch": 1.36, "grad_norm": 0.7203885912895203, "learning_rate": 1.890219252423436e-05, "loss": 0.0824, "step": 4345 }, { "epoch": 1.36, "grad_norm": 0.7837348580360413, "learning_rate": 1.8899711311691843e-05, "loss": 0.0721, "step": 4350 }, { "epoch": 1.36, "grad_norm": 0.9603199362754822, "learning_rate": 1.8897227461581673e-05, "loss": 0.0773, "step": 4355 }, { "epoch": 1.36, "grad_norm": 0.6584271788597107, "learning_rate": 1.889474097463998e-05, "loss": 0.0582, "step": 4360 }, { "epoch": 1.36, "grad_norm": 0.7230022549629211, "learning_rate": 1.889225185160367e-05, "loss": 0.0792, "step": 4365 }, { "epoch": 1.36, "grad_norm": 0.5705015063285828, "learning_rate": 1.888976009321044e-05, "loss": 0.0603, "step": 4370 }, { "epoch": 1.36, "grad_norm": 0.7777138352394104, "learning_rate": 1.8887265700198755e-05, "loss": 0.0595, "step": 4375 }, { "epoch": 1.37, "grad_norm": 0.6518700122833252, "learning_rate": 1.8884768673307867e-05, "loss": 0.0489, "step": 4380 }, { "epoch": 1.37, "grad_norm": 0.7976190447807312, "learning_rate": 1.8882269013277813e-05, "loss": 0.0698, "step": 4385 }, { "epoch": 1.37, "grad_norm": 0.5520892143249512, "learning_rate": 1.8879766720849404e-05, "loss": 0.0872, "step": 4390 }, { "epoch": 1.37, "grad_norm": 0.8835591673851013, "learning_rate": 1.8877261796764235e-05, "loss": 0.0721, "step": 4395 }, { "epoch": 1.37, "grad_norm": 0.8427669405937195, "learning_rate": 1.8874754241764676e-05, "loss": 0.0889, "step": 4400 }, { "epoch": 1.37, "grad_norm": 0.7144712805747986, "learning_rate": 1.887224405659388e-05, "loss": 0.0631, "step": 4405 }, { "epoch": 1.38, "grad_norm": 0.4469461739063263, "learning_rate": 1.886973124199578e-05, "loss": 0.0715, "step": 4410 }, { "epoch": 1.38, "grad_norm": 0.7447325587272644, "learning_rate": 1.886721579871509e-05, "loss": 0.0872, "step": 4415 }, { "epoch": 1.38, "grad_norm": 0.6704967021942139, "learning_rate": 1.8864697727497297e-05, "loss": 0.0783, "step": 4420 }, { "epoch": 1.38, "grad_norm": 0.5041611194610596, "learning_rate": 1.8862177029088674e-05, "loss": 0.0629, "step": 4425 }, { "epoch": 1.38, "grad_norm": 0.5847105383872986, "learning_rate": 1.885965370423627e-05, "loss": 0.0529, "step": 4430 }, { "epoch": 1.38, "grad_norm": 0.5569111108779907, "learning_rate": 1.8857127753687904e-05, "loss": 0.0537, "step": 4435 }, { "epoch": 1.38, "grad_norm": 0.5260555744171143, "learning_rate": 1.885459917819219e-05, "loss": 0.0511, "step": 4440 }, { "epoch": 1.39, "grad_norm": 0.490197092294693, "learning_rate": 1.8852067978498503e-05, "loss": 0.069, "step": 4445 }, { "epoch": 1.39, "grad_norm": 0.7321019172668457, "learning_rate": 1.884953415535701e-05, "loss": 0.0814, "step": 4450 }, { "epoch": 1.39, "grad_norm": 0.7469559907913208, "learning_rate": 1.8846997709518642e-05, "loss": 0.078, "step": 4455 }, { "epoch": 1.39, "grad_norm": 0.76277095079422, "learning_rate": 1.884445864173512e-05, "loss": 0.0596, "step": 4460 }, { "epoch": 1.39, "grad_norm": 0.583543062210083, "learning_rate": 1.8841916952758933e-05, "loss": 0.0636, "step": 4465 }, { "epoch": 1.39, "grad_norm": 0.7911220788955688, "learning_rate": 1.883937264334335e-05, "loss": 0.0634, "step": 4470 }, { "epoch": 1.4, "grad_norm": 0.6241563558578491, "learning_rate": 1.8836825714242417e-05, "loss": 0.0612, "step": 4475 }, { "epoch": 1.4, "grad_norm": 1.1762306690216064, "learning_rate": 1.883427616621096e-05, "loss": 0.0703, "step": 4480 }, { "epoch": 1.4, "grad_norm": 1.086243987083435, "learning_rate": 1.883172400000457e-05, "loss": 0.0843, "step": 4485 }, { "epoch": 1.4, "grad_norm": 0.8462454676628113, "learning_rate": 1.8829169216379623e-05, "loss": 0.0776, "step": 4490 }, { "epoch": 1.4, "grad_norm": 0.5264838933944702, "learning_rate": 1.8826611816093274e-05, "loss": 0.0611, "step": 4495 }, { "epoch": 1.4, "grad_norm": 0.6226249933242798, "learning_rate": 1.8824051799903442e-05, "loss": 0.0637, "step": 4500 }, { "epoch": 1.41, "grad_norm": 0.830549418926239, "learning_rate": 1.8821489168568832e-05, "loss": 0.0577, "step": 4505 }, { "epoch": 1.41, "grad_norm": 0.9017519354820251, "learning_rate": 1.8818923922848918e-05, "loss": 0.0698, "step": 4510 }, { "epoch": 1.41, "grad_norm": 0.8163900971412659, "learning_rate": 1.881635606350395e-05, "loss": 0.0724, "step": 4515 }, { "epoch": 1.41, "grad_norm": 0.715849757194519, "learning_rate": 1.8813785591294952e-05, "loss": 0.056, "step": 4520 }, { "epoch": 1.41, "grad_norm": 0.6897075176239014, "learning_rate": 1.8811212506983724e-05, "loss": 0.0697, "step": 4525 }, { "epoch": 1.41, "grad_norm": 0.8066615462303162, "learning_rate": 1.8808636811332843e-05, "loss": 0.076, "step": 4530 }, { "epoch": 1.41, "grad_norm": 0.8620597720146179, "learning_rate": 1.880605850510565e-05, "loss": 0.0802, "step": 4535 }, { "epoch": 1.42, "grad_norm": 0.921575665473938, "learning_rate": 1.8803477589066272e-05, "loss": 0.0776, "step": 4540 }, { "epoch": 1.42, "grad_norm": 0.7957607507705688, "learning_rate": 1.8800894063979602e-05, "loss": 0.0997, "step": 4545 }, { "epoch": 1.42, "grad_norm": 0.5893496870994568, "learning_rate": 1.8798307930611303e-05, "loss": 0.0613, "step": 4550 }, { "epoch": 1.42, "grad_norm": 0.9337874054908752, "learning_rate": 1.8795719189727824e-05, "loss": 0.07, "step": 4555 }, { "epoch": 1.42, "grad_norm": 0.7042413353919983, "learning_rate": 1.8793127842096373e-05, "loss": 0.0813, "step": 4560 }, { "epoch": 1.42, "grad_norm": 0.7177154421806335, "learning_rate": 1.8790533888484937e-05, "loss": 0.0447, "step": 4565 }, { "epoch": 1.43, "grad_norm": 0.9277735948562622, "learning_rate": 1.8787937329662273e-05, "loss": 0.0787, "step": 4570 }, { "epoch": 1.43, "grad_norm": 0.850438117980957, "learning_rate": 1.8785338166397917e-05, "loss": 0.0487, "step": 4575 }, { "epoch": 1.43, "grad_norm": 0.6746247410774231, "learning_rate": 1.878273639946216e-05, "loss": 0.0717, "step": 4580 }, { "epoch": 1.43, "grad_norm": 0.7712878584861755, "learning_rate": 1.8780132029626093e-05, "loss": 0.0586, "step": 4585 }, { "epoch": 1.43, "grad_norm": 0.8856165409088135, "learning_rate": 1.8777525057661547e-05, "loss": 0.0745, "step": 4590 }, { "epoch": 1.43, "grad_norm": 0.8087002038955688, "learning_rate": 1.8774915484341147e-05, "loss": 0.082, "step": 4595 }, { "epoch": 1.43, "grad_norm": 0.7779855728149414, "learning_rate": 1.8772303310438275e-05, "loss": 0.0775, "step": 4600 }, { "epoch": 1.44, "grad_norm": 0.7559125423431396, "learning_rate": 1.8769688536727094e-05, "loss": 0.0714, "step": 4605 }, { "epoch": 1.44, "grad_norm": 0.6574954986572266, "learning_rate": 1.8767071163982525e-05, "loss": 0.0695, "step": 4610 }, { "epoch": 1.44, "grad_norm": 0.8335013389587402, "learning_rate": 1.8764451192980278e-05, "loss": 0.0564, "step": 4615 }, { "epoch": 1.44, "grad_norm": 0.6353960633277893, "learning_rate": 1.8761828624496816e-05, "loss": 0.0765, "step": 4620 }, { "epoch": 1.44, "grad_norm": 0.7106569409370422, "learning_rate": 1.8759203459309375e-05, "loss": 0.0714, "step": 4625 }, { "epoch": 1.44, "grad_norm": 0.7167492508888245, "learning_rate": 1.8756575698195968e-05, "loss": 0.0664, "step": 4630 }, { "epoch": 1.45, "grad_norm": 0.6576385498046875, "learning_rate": 1.8753945341935376e-05, "loss": 0.0796, "step": 4635 }, { "epoch": 1.45, "grad_norm": 0.501663088798523, "learning_rate": 1.875131239130714e-05, "loss": 0.0755, "step": 4640 }, { "epoch": 1.45, "grad_norm": 0.831375777721405, "learning_rate": 1.8748676847091575e-05, "loss": 0.0801, "step": 4645 }, { "epoch": 1.45, "grad_norm": 0.9495288133621216, "learning_rate": 1.874603871006977e-05, "loss": 0.0841, "step": 4650 }, { "epoch": 1.45, "grad_norm": 0.5677212476730347, "learning_rate": 1.8743397981023574e-05, "loss": 0.0757, "step": 4655 }, { "epoch": 1.45, "grad_norm": 0.6868023872375488, "learning_rate": 1.8740754660735612e-05, "loss": 0.0695, "step": 4660 }, { "epoch": 1.45, "grad_norm": 0.635217010974884, "learning_rate": 1.873810874998927e-05, "loss": 0.0661, "step": 4665 }, { "epoch": 1.46, "grad_norm": 0.582550048828125, "learning_rate": 1.8735460249568708e-05, "loss": 0.047, "step": 4670 }, { "epoch": 1.46, "grad_norm": 0.8971719741821289, "learning_rate": 1.8732809160258846e-05, "loss": 0.0812, "step": 4675 }, { "epoch": 1.46, "grad_norm": 0.7387098073959351, "learning_rate": 1.8730155482845374e-05, "loss": 0.0844, "step": 4680 }, { "epoch": 1.46, "grad_norm": 0.7902190685272217, "learning_rate": 1.872749921811476e-05, "loss": 0.0813, "step": 4685 }, { "epoch": 1.46, "grad_norm": 0.6167246103286743, "learning_rate": 1.8724840366854218e-05, "loss": 0.0868, "step": 4690 }, { "epoch": 1.46, "grad_norm": 0.6898187398910522, "learning_rate": 1.8722178929851748e-05, "loss": 0.0692, "step": 4695 }, { "epoch": 1.47, "grad_norm": 0.6871895790100098, "learning_rate": 1.8719514907896104e-05, "loss": 0.0718, "step": 4700 }, { "epoch": 1.47, "grad_norm": 0.7227907776832581, "learning_rate": 1.871684830177681e-05, "loss": 0.0521, "step": 4705 }, { "epoch": 1.47, "grad_norm": 1.124375820159912, "learning_rate": 1.8714179112284162e-05, "loss": 0.091, "step": 4710 }, { "epoch": 1.47, "grad_norm": 0.9651860594749451, "learning_rate": 1.871150734020921e-05, "loss": 0.0891, "step": 4715 }, { "epoch": 1.47, "grad_norm": 0.6683818101882935, "learning_rate": 1.8708832986343775e-05, "loss": 0.049, "step": 4720 }, { "epoch": 1.47, "grad_norm": 0.743218719959259, "learning_rate": 1.8706156051480447e-05, "loss": 0.0792, "step": 4725 }, { "epoch": 1.48, "grad_norm": 0.8588913083076477, "learning_rate": 1.870347653641257e-05, "loss": 0.0704, "step": 4730 }, { "epoch": 1.48, "grad_norm": 0.6899648308753967, "learning_rate": 1.8700794441934272e-05, "loss": 0.073, "step": 4735 }, { "epoch": 1.48, "grad_norm": 0.5917913913726807, "learning_rate": 1.869810976884042e-05, "loss": 0.0985, "step": 4740 }, { "epoch": 1.48, "grad_norm": 0.697803795337677, "learning_rate": 1.869542251792667e-05, "loss": 0.0535, "step": 4745 }, { "epoch": 1.48, "grad_norm": 0.6701207160949707, "learning_rate": 1.8692732689989423e-05, "loss": 0.0529, "step": 4750 }, { "epoch": 1.48, "grad_norm": 0.58310467004776, "learning_rate": 1.8690040285825852e-05, "loss": 0.0826, "step": 4755 }, { "epoch": 1.48, "grad_norm": 0.6153090000152588, "learning_rate": 1.8687345306233896e-05, "loss": 0.0907, "step": 4760 }, { "epoch": 1.49, "grad_norm": 0.6057466864585876, "learning_rate": 1.8684647752012255e-05, "loss": 0.0619, "step": 4765 }, { "epoch": 1.49, "grad_norm": 0.7822586894035339, "learning_rate": 1.8681947623960387e-05, "loss": 0.0758, "step": 4770 }, { "epoch": 1.49, "grad_norm": 1.0661760568618774, "learning_rate": 1.8679244922878516e-05, "loss": 0.0641, "step": 4775 }, { "epoch": 1.49, "grad_norm": 0.7023093104362488, "learning_rate": 1.8676539649567636e-05, "loss": 0.0697, "step": 4780 }, { "epoch": 1.49, "grad_norm": 0.6697647571563721, "learning_rate": 1.867383180482949e-05, "loss": 0.0672, "step": 4785 }, { "epoch": 1.49, "grad_norm": 0.7119118571281433, "learning_rate": 1.8671121389466592e-05, "loss": 0.0619, "step": 4790 }, { "epoch": 1.5, "grad_norm": 0.7365993857383728, "learning_rate": 1.866840840428222e-05, "loss": 0.0655, "step": 4795 }, { "epoch": 1.5, "grad_norm": 0.8320115208625793, "learning_rate": 1.86656928500804e-05, "loss": 0.0563, "step": 4800 }, { "epoch": 1.5, "grad_norm": 1.1454790830612183, "learning_rate": 1.866297472766594e-05, "loss": 0.0685, "step": 4805 }, { "epoch": 1.5, "grad_norm": 0.6646517515182495, "learning_rate": 1.866025403784439e-05, "loss": 0.0621, "step": 4810 }, { "epoch": 1.5, "grad_norm": 0.848318874835968, "learning_rate": 1.8657530781422067e-05, "loss": 0.0708, "step": 4815 }, { "epoch": 1.5, "grad_norm": 0.9369855523109436, "learning_rate": 1.865480495920606e-05, "loss": 0.1123, "step": 4820 }, { "epoch": 1.5, "grad_norm": 1.175233006477356, "learning_rate": 1.8652076572004197e-05, "loss": 0.0645, "step": 4825 }, { "epoch": 1.51, "grad_norm": 0.8523390293121338, "learning_rate": 1.8649345620625087e-05, "loss": 0.0617, "step": 4830 }, { "epoch": 1.51, "grad_norm": 0.7075837254524231, "learning_rate": 1.8646612105878087e-05, "loss": 0.0706, "step": 4835 }, { "epoch": 1.51, "grad_norm": 0.7256972193717957, "learning_rate": 1.8643876028573312e-05, "loss": 0.0432, "step": 4840 }, { "epoch": 1.51, "grad_norm": 0.8678048253059387, "learning_rate": 1.8641137389521648e-05, "loss": 0.0854, "step": 4845 }, { "epoch": 1.51, "grad_norm": 0.7625493407249451, "learning_rate": 1.8638396189534728e-05, "loss": 0.0645, "step": 4850 }, { "epoch": 1.51, "grad_norm": 1.0163854360580444, "learning_rate": 1.8635652429424956e-05, "loss": 0.0796, "step": 4855 }, { "epoch": 1.52, "grad_norm": 0.7053332328796387, "learning_rate": 1.863290611000548e-05, "loss": 0.0609, "step": 4860 }, { "epoch": 1.52, "grad_norm": 0.7304905652999878, "learning_rate": 1.8630157232090215e-05, "loss": 0.0733, "step": 4865 }, { "epoch": 1.52, "grad_norm": 0.7250766754150391, "learning_rate": 1.8627405796493843e-05, "loss": 0.0482, "step": 4870 }, { "epoch": 1.52, "grad_norm": 0.680020272731781, "learning_rate": 1.8624651804031782e-05, "loss": 0.0683, "step": 4875 }, { "epoch": 1.52, "grad_norm": 0.7620866894721985, "learning_rate": 1.862189525552023e-05, "loss": 0.0884, "step": 4880 }, { "epoch": 1.52, "grad_norm": 0.6260718107223511, "learning_rate": 1.861913615177613e-05, "loss": 0.0577, "step": 4885 }, { "epoch": 1.53, "grad_norm": 0.621261477470398, "learning_rate": 1.8616374493617184e-05, "loss": 0.0598, "step": 4890 }, { "epoch": 1.53, "grad_norm": 0.8694585561752319, "learning_rate": 1.8613610281861853e-05, "loss": 0.0826, "step": 4895 }, { "epoch": 1.53, "grad_norm": 0.922036349773407, "learning_rate": 1.8610843517329353e-05, "loss": 0.0644, "step": 4900 }, { "epoch": 1.53, "grad_norm": 0.8628506064414978, "learning_rate": 1.8608074200839658e-05, "loss": 0.0769, "step": 4905 }, { "epoch": 1.53, "grad_norm": 0.8055910468101501, "learning_rate": 1.8605302333213497e-05, "loss": 0.0643, "step": 4910 }, { "epoch": 1.53, "grad_norm": 0.6417886018753052, "learning_rate": 1.860252791527236e-05, "loss": 0.0435, "step": 4915 }, { "epoch": 1.53, "grad_norm": 0.8504335880279541, "learning_rate": 1.859975094783849e-05, "loss": 0.0696, "step": 4920 }, { "epoch": 1.54, "grad_norm": 1.3809876441955566, "learning_rate": 1.8596971431734878e-05, "loss": 0.0743, "step": 4925 }, { "epoch": 1.54, "grad_norm": 0.6176561713218689, "learning_rate": 1.8594189367785276e-05, "loss": 0.0555, "step": 4930 }, { "epoch": 1.54, "grad_norm": 0.7021178603172302, "learning_rate": 1.85914047568142e-05, "loss": 0.0572, "step": 4935 }, { "epoch": 1.54, "grad_norm": 0.7485069632530212, "learning_rate": 1.8588617599646906e-05, "loss": 0.0726, "step": 4940 }, { "epoch": 1.54, "grad_norm": 0.8081046342849731, "learning_rate": 1.8585827897109415e-05, "loss": 0.0476, "step": 4945 }, { "epoch": 1.54, "grad_norm": 0.890154242515564, "learning_rate": 1.8583035650028497e-05, "loss": 0.0693, "step": 4950 }, { "epoch": 1.55, "grad_norm": 0.524368166923523, "learning_rate": 1.8580240859231677e-05, "loss": 0.0601, "step": 4955 }, { "epoch": 1.55, "grad_norm": 0.6123557686805725, "learning_rate": 1.8577443525547237e-05, "loss": 0.0771, "step": 4960 }, { "epoch": 1.55, "grad_norm": 0.5457882285118103, "learning_rate": 1.857464364980421e-05, "loss": 0.0533, "step": 4965 }, { "epoch": 1.55, "grad_norm": 0.6672859787940979, "learning_rate": 1.857184123283238e-05, "loss": 0.0605, "step": 4970 }, { "epoch": 1.55, "grad_norm": 0.5480169653892517, "learning_rate": 1.8569036275462287e-05, "loss": 0.0588, "step": 4975 }, { "epoch": 1.55, "grad_norm": 1.2137045860290527, "learning_rate": 1.8566228778525226e-05, "loss": 0.084, "step": 4980 }, { "epoch": 1.55, "grad_norm": 0.751510500907898, "learning_rate": 1.856341874285324e-05, "loss": 0.0779, "step": 4985 }, { "epoch": 1.56, "grad_norm": 0.9870890378952026, "learning_rate": 1.856060616927913e-05, "loss": 0.0772, "step": 4990 }, { "epoch": 1.56, "grad_norm": 0.8300113677978516, "learning_rate": 1.8557791058636444e-05, "loss": 0.0739, "step": 4995 }, { "epoch": 1.56, "grad_norm": 0.845734715461731, "learning_rate": 1.8554973411759484e-05, "loss": 0.0825, "step": 5000 }, { "epoch": 1.56, "grad_norm": 0.8794209361076355, "learning_rate": 1.8552153229483302e-05, "loss": 0.0462, "step": 5005 }, { "epoch": 1.56, "grad_norm": 0.5634300708770752, "learning_rate": 1.8549330512643707e-05, "loss": 0.0727, "step": 5010 }, { "epoch": 1.56, "grad_norm": 0.6634950041770935, "learning_rate": 1.8546505262077245e-05, "loss": 0.0674, "step": 5015 }, { "epoch": 1.57, "grad_norm": 0.9048967957496643, "learning_rate": 1.8543677478621236e-05, "loss": 0.0814, "step": 5020 }, { "epoch": 1.57, "grad_norm": 1.0099159479141235, "learning_rate": 1.854084716311373e-05, "loss": 0.0674, "step": 5025 }, { "epoch": 1.57, "grad_norm": 0.7578509449958801, "learning_rate": 1.8538014316393534e-05, "loss": 0.0722, "step": 5030 }, { "epoch": 1.57, "grad_norm": 0.8153797388076782, "learning_rate": 1.853517893930021e-05, "loss": 0.0811, "step": 5035 }, { "epoch": 1.57, "grad_norm": 0.6564578413963318, "learning_rate": 1.8532341032674063e-05, "loss": 0.0535, "step": 5040 }, { "epoch": 1.57, "grad_norm": 0.7237277030944824, "learning_rate": 1.8529500597356155e-05, "loss": 0.0774, "step": 5045 }, { "epoch": 1.58, "grad_norm": 0.7941708564758301, "learning_rate": 1.8526657634188288e-05, "loss": 0.0844, "step": 5050 }, { "epoch": 1.58, "grad_norm": 1.2459404468536377, "learning_rate": 1.852381214401302e-05, "loss": 0.1087, "step": 5055 }, { "epoch": 1.58, "grad_norm": 1.005991816520691, "learning_rate": 1.8520964127673658e-05, "loss": 0.087, "step": 5060 }, { "epoch": 1.58, "grad_norm": 0.7758017182350159, "learning_rate": 1.8518113586014253e-05, "loss": 0.095, "step": 5065 }, { "epoch": 1.58, "grad_norm": 0.7437559366226196, "learning_rate": 1.851526051987961e-05, "loss": 0.0649, "step": 5070 }, { "epoch": 1.58, "grad_norm": 0.8242641687393188, "learning_rate": 1.851240493011528e-05, "loss": 0.0678, "step": 5075 }, { "epoch": 1.58, "grad_norm": 1.0162962675094604, "learning_rate": 1.8509546817567553e-05, "loss": 0.0714, "step": 5080 }, { "epoch": 1.59, "grad_norm": 0.7507639527320862, "learning_rate": 1.850668618308349e-05, "loss": 0.0783, "step": 5085 }, { "epoch": 1.59, "grad_norm": 1.0634666681289673, "learning_rate": 1.850382302751087e-05, "loss": 0.0815, "step": 5090 }, { "epoch": 1.59, "grad_norm": 0.8045139312744141, "learning_rate": 1.8500957351698244e-05, "loss": 0.0585, "step": 5095 }, { "epoch": 1.59, "grad_norm": 0.7873166799545288, "learning_rate": 1.8498089156494895e-05, "loss": 0.0607, "step": 5100 }, { "epoch": 1.59, "grad_norm": 0.6100918650627136, "learning_rate": 1.849521844275085e-05, "loss": 0.0473, "step": 5105 }, { "epoch": 1.59, "grad_norm": 0.6852316856384277, "learning_rate": 1.8492345211316908e-05, "loss": 0.066, "step": 5110 }, { "epoch": 1.6, "grad_norm": 0.80754554271698, "learning_rate": 1.848946946304458e-05, "loss": 0.0838, "step": 5115 }, { "epoch": 1.6, "grad_norm": 0.8571280241012573, "learning_rate": 1.8486591198786143e-05, "loss": 0.084, "step": 5120 }, { "epoch": 1.6, "grad_norm": 0.5786952972412109, "learning_rate": 1.8483710419394616e-05, "loss": 0.0704, "step": 5125 }, { "epoch": 1.6, "grad_norm": 0.5984448194503784, "learning_rate": 1.8480827125723764e-05, "loss": 0.0946, "step": 5130 }, { "epoch": 1.6, "grad_norm": 0.7940416932106018, "learning_rate": 1.8477941318628093e-05, "loss": 0.0779, "step": 5135 }, { "epoch": 1.6, "grad_norm": 1.4904640913009644, "learning_rate": 1.8475052998962856e-05, "loss": 0.0994, "step": 5140 }, { "epoch": 1.6, "grad_norm": 0.8121379017829895, "learning_rate": 1.8472162167584058e-05, "loss": 0.0734, "step": 5145 }, { "epoch": 1.61, "grad_norm": 0.5815505981445312, "learning_rate": 1.8469268825348435e-05, "loss": 0.0556, "step": 5150 }, { "epoch": 1.61, "grad_norm": 0.8653185963630676, "learning_rate": 1.8466372973113476e-05, "loss": 0.0561, "step": 5155 }, { "epoch": 1.61, "grad_norm": 0.5899277329444885, "learning_rate": 1.8463474611737412e-05, "loss": 0.0752, "step": 5160 }, { "epoch": 1.61, "grad_norm": 0.6971949338912964, "learning_rate": 1.8460573742079216e-05, "loss": 0.061, "step": 5165 }, { "epoch": 1.61, "grad_norm": 0.7547398805618286, "learning_rate": 1.845767036499861e-05, "loss": 0.1052, "step": 5170 }, { "epoch": 1.61, "grad_norm": 0.8991611003875732, "learning_rate": 1.845476448135605e-05, "loss": 0.0631, "step": 5175 }, { "epoch": 1.62, "grad_norm": 1.0087960958480835, "learning_rate": 1.8451856092012743e-05, "loss": 0.0643, "step": 5180 }, { "epoch": 1.62, "grad_norm": 0.7402775287628174, "learning_rate": 1.8448945197830635e-05, "loss": 0.0755, "step": 5185 }, { "epoch": 1.62, "grad_norm": 0.6131215691566467, "learning_rate": 1.8446031799672412e-05, "loss": 0.0821, "step": 5190 }, { "epoch": 1.62, "grad_norm": 0.7232556939125061, "learning_rate": 1.8443115898401505e-05, "loss": 0.0816, "step": 5195 }, { "epoch": 1.62, "grad_norm": 0.876368522644043, "learning_rate": 1.844019749488209e-05, "loss": 0.0711, "step": 5200 }, { "epoch": 1.62, "grad_norm": 0.8438897728919983, "learning_rate": 1.843727658997908e-05, "loss": 0.0626, "step": 5205 }, { "epoch": 1.62, "grad_norm": 0.7738611102104187, "learning_rate": 1.843435318455813e-05, "loss": 0.0632, "step": 5210 }, { "epoch": 1.63, "grad_norm": 0.8397727012634277, "learning_rate": 1.8431427279485638e-05, "loss": 0.07, "step": 5215 }, { "epoch": 1.63, "grad_norm": 0.9025486707687378, "learning_rate": 1.842849887562873e-05, "loss": 0.0765, "step": 5220 }, { "epoch": 1.63, "grad_norm": 0.8906568884849548, "learning_rate": 1.8425567973855306e-05, "loss": 0.083, "step": 5225 }, { "epoch": 1.63, "grad_norm": 0.8898297548294067, "learning_rate": 1.842263457503397e-05, "loss": 0.0582, "step": 5230 }, { "epoch": 1.63, "grad_norm": 0.7294970750808716, "learning_rate": 1.8419698680034076e-05, "loss": 0.0804, "step": 5235 }, { "epoch": 1.63, "grad_norm": 0.635528028011322, "learning_rate": 1.8416760289725736e-05, "loss": 0.0662, "step": 5240 }, { "epoch": 1.64, "grad_norm": 2.0224263668060303, "learning_rate": 1.8413819404979776e-05, "loss": 0.0635, "step": 5245 }, { "epoch": 1.64, "grad_norm": 0.8897799253463745, "learning_rate": 1.841087602666778e-05, "loss": 0.077, "step": 5250 }, { "epoch": 1.64, "grad_norm": 0.9975351095199585, "learning_rate": 1.8407930155662056e-05, "loss": 0.0896, "step": 5255 }, { "epoch": 1.64, "grad_norm": 0.5616067051887512, "learning_rate": 1.8404981792835665e-05, "loss": 0.0556, "step": 5260 }, { "epoch": 1.64, "grad_norm": 1.4890785217285156, "learning_rate": 1.840262130905084e-05, "loss": 0.0705, "step": 5265 }, { "epoch": 1.64, "grad_norm": 0.7033029198646545, "learning_rate": 1.83996684631497e-05, "loss": 0.0816, "step": 5270 }, { "epoch": 1.65, "grad_norm": 0.6858252882957458, "learning_rate": 1.8396713127876373e-05, "loss": 0.0723, "step": 5275 }, { "epoch": 1.65, "grad_norm": 0.7298487424850464, "learning_rate": 1.839375530410672e-05, "loss": 0.0748, "step": 5280 }, { "epoch": 1.65, "grad_norm": 0.9825119376182556, "learning_rate": 1.8390794992717333e-05, "loss": 0.055, "step": 5285 }, { "epoch": 1.65, "grad_norm": 0.856184720993042, "learning_rate": 1.838783219458555e-05, "loss": 0.0636, "step": 5290 }, { "epoch": 1.65, "grad_norm": 0.7891317009925842, "learning_rate": 1.8384866910589442e-05, "loss": 0.0405, "step": 5295 }, { "epoch": 1.65, "grad_norm": 0.9762969017028809, "learning_rate": 1.8381899141607817e-05, "loss": 0.0707, "step": 5300 }, { "epoch": 1.65, "grad_norm": 0.5405159592628479, "learning_rate": 1.8378928888520216e-05, "loss": 0.0639, "step": 5305 }, { "epoch": 1.66, "grad_norm": 0.8812373876571655, "learning_rate": 1.8375956152206926e-05, "loss": 0.0606, "step": 5310 }, { "epoch": 1.66, "grad_norm": 0.6692848205566406, "learning_rate": 1.8372980933548957e-05, "loss": 0.0772, "step": 5315 }, { "epoch": 1.66, "grad_norm": 1.2220298051834106, "learning_rate": 1.8370003233428067e-05, "loss": 0.0653, "step": 5320 }, { "epoch": 1.66, "grad_norm": 0.8167205452919006, "learning_rate": 1.836702305272674e-05, "loss": 0.0746, "step": 5325 }, { "epoch": 1.66, "grad_norm": 0.952875018119812, "learning_rate": 1.8364040392328197e-05, "loss": 0.072, "step": 5330 }, { "epoch": 1.66, "grad_norm": 0.9271166324615479, "learning_rate": 1.83610552531164e-05, "loss": 0.0542, "step": 5335 }, { "epoch": 1.67, "grad_norm": 0.6192010641098022, "learning_rate": 1.835806763597604e-05, "loss": 0.073, "step": 5340 }, { "epoch": 1.67, "grad_norm": 0.7979380488395691, "learning_rate": 1.8355077541792543e-05, "loss": 0.0774, "step": 5345 }, { "epoch": 1.67, "grad_norm": 0.6753906011581421, "learning_rate": 1.835208497145207e-05, "loss": 0.0733, "step": 5350 }, { "epoch": 1.67, "grad_norm": 1.069570779800415, "learning_rate": 1.8349089925841516e-05, "loss": 0.0671, "step": 5355 }, { "epoch": 1.67, "grad_norm": 0.5559250712394714, "learning_rate": 1.8346092405848513e-05, "loss": 0.0641, "step": 5360 }, { "epoch": 1.67, "grad_norm": 1.2465052604675293, "learning_rate": 1.8343092412361414e-05, "loss": 0.0676, "step": 5365 }, { "epoch": 1.67, "grad_norm": 0.8193307518959045, "learning_rate": 1.834008994626932e-05, "loss": 0.0482, "step": 5370 }, { "epoch": 1.68, "grad_norm": 0.9308450818061829, "learning_rate": 1.8337085008462056e-05, "loss": 0.0897, "step": 5375 }, { "epoch": 1.68, "grad_norm": 0.6304412484169006, "learning_rate": 1.8334077599830188e-05, "loss": 0.0767, "step": 5380 }, { "epoch": 1.68, "grad_norm": 0.5548502206802368, "learning_rate": 1.8331067721265e-05, "loss": 0.0664, "step": 5385 }, { "epoch": 1.68, "grad_norm": 0.9302047491073608, "learning_rate": 1.8328055373658517e-05, "loss": 0.0734, "step": 5390 }, { "epoch": 1.68, "grad_norm": 0.5483214259147644, "learning_rate": 1.83250405579035e-05, "loss": 0.1012, "step": 5395 }, { "epoch": 1.68, "grad_norm": 0.8517399430274963, "learning_rate": 1.8322023274893435e-05, "loss": 0.0836, "step": 5400 }, { "epoch": 1.69, "grad_norm": 0.6601105332374573, "learning_rate": 1.8319003525522537e-05, "loss": 0.0459, "step": 5405 }, { "epoch": 1.69, "grad_norm": 0.583982527256012, "learning_rate": 1.8315981310685756e-05, "loss": 0.0694, "step": 5410 }, { "epoch": 1.69, "grad_norm": 1.3079158067703247, "learning_rate": 1.831295663127878e-05, "loss": 0.0832, "step": 5415 }, { "epoch": 1.69, "grad_norm": 0.6640372276306152, "learning_rate": 1.8309929488198012e-05, "loss": 0.0543, "step": 5420 }, { "epoch": 1.69, "grad_norm": 1.2028993368148804, "learning_rate": 1.8306899882340597e-05, "loss": 0.0812, "step": 5425 }, { "epoch": 1.69, "grad_norm": 0.8884703516960144, "learning_rate": 1.8303867814604402e-05, "loss": 0.0768, "step": 5430 }, { "epoch": 1.7, "grad_norm": 0.8457720875740051, "learning_rate": 1.830083328588803e-05, "loss": 0.0662, "step": 5435 }, { "epoch": 1.7, "grad_norm": 0.8695236444473267, "learning_rate": 1.8297796297090808e-05, "loss": 0.0609, "step": 5440 }, { "epoch": 1.7, "grad_norm": 0.763415515422821, "learning_rate": 1.82947568491128e-05, "loss": 0.0654, "step": 5445 }, { "epoch": 1.7, "grad_norm": 1.3075530529022217, "learning_rate": 1.8291714942854787e-05, "loss": 0.0789, "step": 5450 }, { "epoch": 1.7, "grad_norm": 0.7080858945846558, "learning_rate": 1.828867057921829e-05, "loss": 0.0663, "step": 5455 }, { "epoch": 1.7, "grad_norm": 0.6674808263778687, "learning_rate": 1.828562375910555e-05, "loss": 0.1035, "step": 5460 }, { "epoch": 1.7, "grad_norm": 1.0941526889801025, "learning_rate": 1.8282574483419544e-05, "loss": 0.0645, "step": 5465 }, { "epoch": 1.71, "grad_norm": 0.6409357190132141, "learning_rate": 1.8279522753063966e-05, "loss": 0.0806, "step": 5470 }, { "epoch": 1.71, "grad_norm": 0.7128939032554626, "learning_rate": 1.827646856894325e-05, "loss": 0.0675, "step": 5475 }, { "epoch": 1.71, "grad_norm": 0.8401421308517456, "learning_rate": 1.827341193196255e-05, "loss": 0.0792, "step": 5480 }, { "epoch": 1.71, "grad_norm": 1.0783287286758423, "learning_rate": 1.8270352843027744e-05, "loss": 0.0746, "step": 5485 }, { "epoch": 1.71, "grad_norm": 0.5906087160110474, "learning_rate": 1.8267291303045443e-05, "loss": 0.0572, "step": 5490 }, { "epoch": 1.71, "grad_norm": 0.6586350798606873, "learning_rate": 1.826422731292298e-05, "loss": 0.0708, "step": 5495 }, { "epoch": 1.72, "grad_norm": 0.787606418132782, "learning_rate": 1.826116087356842e-05, "loss": 0.0666, "step": 5500 }, { "epoch": 1.72, "grad_norm": 0.7802187204360962, "learning_rate": 1.8258091985890547e-05, "loss": 0.0648, "step": 5505 }, { "epoch": 1.72, "grad_norm": 0.8204883337020874, "learning_rate": 1.825502065079887e-05, "loss": 0.0664, "step": 5510 }, { "epoch": 1.72, "grad_norm": 0.5513288378715515, "learning_rate": 1.8251946869203635e-05, "loss": 0.0661, "step": 5515 }, { "epoch": 1.72, "grad_norm": 1.000407338142395, "learning_rate": 1.82488706420158e-05, "loss": 0.0769, "step": 5520 }, { "epoch": 1.72, "grad_norm": 0.827264666557312, "learning_rate": 1.824579197014705e-05, "loss": 0.0863, "step": 5525 }, { "epoch": 1.72, "grad_norm": 1.0817997455596924, "learning_rate": 1.8242710854509807e-05, "loss": 0.083, "step": 5530 }, { "epoch": 1.73, "grad_norm": 0.7575589418411255, "learning_rate": 1.8239627296017196e-05, "loss": 0.0823, "step": 5535 }, { "epoch": 1.73, "grad_norm": 0.9375240206718445, "learning_rate": 1.8236541295583083e-05, "loss": 0.0782, "step": 5540 }, { "epoch": 1.73, "grad_norm": 0.5978908538818359, "learning_rate": 1.8233452854122054e-05, "loss": 0.0817, "step": 5545 }, { "epoch": 1.73, "grad_norm": 0.9602009654045105, "learning_rate": 1.823036197254941e-05, "loss": 0.0771, "step": 5550 }, { "epoch": 1.73, "grad_norm": 0.5891567468643188, "learning_rate": 1.822726865178119e-05, "loss": 0.0677, "step": 5555 }, { "epoch": 1.73, "grad_norm": 0.8885820508003235, "learning_rate": 1.822417289273414e-05, "loss": 0.0575, "step": 5560 }, { "epoch": 1.74, "grad_norm": 0.7974045276641846, "learning_rate": 1.822107469632574e-05, "loss": 0.0884, "step": 5565 }, { "epoch": 1.74, "grad_norm": 0.5491849184036255, "learning_rate": 1.8217974063474196e-05, "loss": 0.0665, "step": 5570 }, { "epoch": 1.74, "grad_norm": 0.8097928166389465, "learning_rate": 1.821487099509841e-05, "loss": 0.0685, "step": 5575 }, { "epoch": 1.74, "grad_norm": 0.8077875971794128, "learning_rate": 1.8211765492118046e-05, "loss": 0.0668, "step": 5580 }, { "epoch": 1.74, "grad_norm": 0.768814206123352, "learning_rate": 1.820865755545345e-05, "loss": 0.0713, "step": 5585 }, { "epoch": 1.74, "grad_norm": 0.9624421000480652, "learning_rate": 1.8205547186025724e-05, "loss": 0.0679, "step": 5590 }, { "epoch": 1.75, "grad_norm": 0.6452382206916809, "learning_rate": 1.8202434384756656e-05, "loss": 0.0722, "step": 5595 }, { "epoch": 1.75, "grad_norm": 0.7202498912811279, "learning_rate": 1.8199319152568786e-05, "loss": 0.0656, "step": 5600 }, { "epoch": 1.75, "grad_norm": 0.7296916842460632, "learning_rate": 1.819620149038536e-05, "loss": 0.0849, "step": 5605 }, { "epoch": 1.75, "grad_norm": 0.9542629718780518, "learning_rate": 1.819308139913034e-05, "loss": 0.0948, "step": 5610 }, { "epoch": 1.75, "grad_norm": 0.7899646759033203, "learning_rate": 1.8189958879728418e-05, "loss": 0.0643, "step": 5615 }, { "epoch": 1.75, "grad_norm": 0.6902501583099365, "learning_rate": 1.8186833933105e-05, "loss": 0.0612, "step": 5620 }, { "epoch": 1.75, "grad_norm": 0.8489237427711487, "learning_rate": 1.8183706560186212e-05, "loss": 0.0723, "step": 5625 }, { "epoch": 1.76, "grad_norm": 0.46526649594306946, "learning_rate": 1.81805767618989e-05, "loss": 0.0549, "step": 5630 }, { "epoch": 1.76, "grad_norm": 0.7680943608283997, "learning_rate": 1.8177444539170627e-05, "loss": 0.0777, "step": 5635 }, { "epoch": 1.76, "grad_norm": 0.6338277459144592, "learning_rate": 1.817430989292968e-05, "loss": 0.0612, "step": 5640 }, { "epoch": 1.76, "grad_norm": 0.8167020082473755, "learning_rate": 1.8171172824105056e-05, "loss": 0.0579, "step": 5645 }, { "epoch": 1.76, "grad_norm": 0.9066957235336304, "learning_rate": 1.8168033333626474e-05, "loss": 0.0729, "step": 5650 }, { "epoch": 1.76, "grad_norm": 1.146350622177124, "learning_rate": 1.8164891422424372e-05, "loss": 0.1003, "step": 5655 }, { "epoch": 1.77, "grad_norm": 0.7212271690368652, "learning_rate": 1.8161747091429903e-05, "loss": 0.0733, "step": 5660 }, { "epoch": 1.77, "grad_norm": 0.6951694488525391, "learning_rate": 1.8158600341574946e-05, "loss": 0.0605, "step": 5665 }, { "epoch": 1.77, "grad_norm": 0.7327526211738586, "learning_rate": 1.8155451173792077e-05, "loss": 0.0617, "step": 5670 }, { "epoch": 1.77, "grad_norm": 1.4549367427825928, "learning_rate": 1.815229958901461e-05, "loss": 0.0796, "step": 5675 }, { "epoch": 1.77, "grad_norm": 0.9679293036460876, "learning_rate": 1.8149145588176557e-05, "loss": 0.0842, "step": 5680 }, { "epoch": 1.77, "grad_norm": 0.6489354372024536, "learning_rate": 1.8145989172212666e-05, "loss": 0.0769, "step": 5685 }, { "epoch": 1.77, "grad_norm": 0.8843717575073242, "learning_rate": 1.8142830342058383e-05, "loss": 0.0787, "step": 5690 }, { "epoch": 1.78, "grad_norm": 0.8279662132263184, "learning_rate": 1.8139669098649876e-05, "loss": 0.0563, "step": 5695 }, { "epoch": 1.78, "grad_norm": 0.659649670124054, "learning_rate": 1.8136505442924033e-05, "loss": 0.0679, "step": 5700 }, { "epoch": 1.78, "grad_norm": 0.9309002161026001, "learning_rate": 1.813333937581845e-05, "loss": 0.0748, "step": 5705 }, { "epoch": 1.78, "grad_norm": 0.8285350203514099, "learning_rate": 1.813017089827144e-05, "loss": 0.0567, "step": 5710 }, { "epoch": 1.78, "grad_norm": 0.6510990858078003, "learning_rate": 1.812700001122203e-05, "loss": 0.0616, "step": 5715 }, { "epoch": 1.78, "grad_norm": 0.7334113717079163, "learning_rate": 1.8123826715609963e-05, "loss": 0.0577, "step": 5720 }, { "epoch": 1.79, "grad_norm": 0.475523978471756, "learning_rate": 1.8120651012375694e-05, "loss": 0.0654, "step": 5725 }, { "epoch": 1.79, "grad_norm": 0.5141718983650208, "learning_rate": 1.811747290246039e-05, "loss": 0.0633, "step": 5730 }, { "epoch": 1.79, "grad_norm": 0.7106728553771973, "learning_rate": 1.8114292386805935e-05, "loss": 0.0583, "step": 5735 }, { "epoch": 1.79, "grad_norm": 0.6758169531822205, "learning_rate": 1.8111109466354926e-05, "loss": 0.0673, "step": 5740 }, { "epoch": 1.79, "grad_norm": 0.7288673520088196, "learning_rate": 1.810792414205067e-05, "loss": 0.065, "step": 5745 }, { "epoch": 1.79, "grad_norm": 1.0398705005645752, "learning_rate": 1.8104736414837183e-05, "loss": 0.0903, "step": 5750 }, { "epoch": 1.79, "grad_norm": 0.5896938443183899, "learning_rate": 1.81015462856592e-05, "loss": 0.0636, "step": 5755 }, { "epoch": 1.8, "grad_norm": 0.6056891679763794, "learning_rate": 1.809835375546217e-05, "loss": 0.0599, "step": 5760 }, { "epoch": 1.8, "grad_norm": 0.9472219347953796, "learning_rate": 1.8095158825192248e-05, "loss": 0.0695, "step": 5765 }, { "epoch": 1.8, "grad_norm": 0.7288747429847717, "learning_rate": 1.8091961495796294e-05, "loss": 0.0733, "step": 5770 }, { "epoch": 1.8, "grad_norm": 0.5857644081115723, "learning_rate": 1.8088761768221887e-05, "loss": 0.0613, "step": 5775 }, { "epoch": 1.8, "grad_norm": 1.1260770559310913, "learning_rate": 1.8085559643417326e-05, "loss": 0.082, "step": 5780 }, { "epoch": 1.8, "grad_norm": 0.5587033033370972, "learning_rate": 1.80823551223316e-05, "loss": 0.06, "step": 5785 }, { "epoch": 1.81, "grad_norm": 0.8423185348510742, "learning_rate": 1.8079148205914423e-05, "loss": 0.0546, "step": 5790 }, { "epoch": 1.81, "grad_norm": 1.0525543689727783, "learning_rate": 1.807593889511621e-05, "loss": 0.0681, "step": 5795 }, { "epoch": 1.81, "grad_norm": 0.8642443418502808, "learning_rate": 1.8072727190888102e-05, "loss": 0.0574, "step": 5800 }, { "epoch": 1.81, "grad_norm": 0.6415408253669739, "learning_rate": 1.8069513094181925e-05, "loss": 0.0632, "step": 5805 }, { "epoch": 1.81, "grad_norm": 0.572545051574707, "learning_rate": 1.8066296605950232e-05, "loss": 0.0705, "step": 5810 }, { "epoch": 1.81, "grad_norm": 0.7826864123344421, "learning_rate": 1.806307772714628e-05, "loss": 0.0686, "step": 5815 }, { "epoch": 1.82, "grad_norm": 0.5731449723243713, "learning_rate": 1.8059856458724028e-05, "loss": 0.0607, "step": 5820 }, { "epoch": 1.82, "grad_norm": 0.8855318427085876, "learning_rate": 1.8056632801638155e-05, "loss": 0.0749, "step": 5825 }, { "epoch": 1.82, "grad_norm": 1.1506929397583008, "learning_rate": 1.805340675684404e-05, "loss": 0.0887, "step": 5830 }, { "epoch": 1.82, "grad_norm": 0.8381348252296448, "learning_rate": 1.8050178325297767e-05, "loss": 0.0708, "step": 5835 }, { "epoch": 1.82, "grad_norm": 0.8033125996589661, "learning_rate": 1.8046947507956138e-05, "loss": 0.0649, "step": 5840 }, { "epoch": 1.82, "grad_norm": 0.6807813048362732, "learning_rate": 1.8043714305776657e-05, "loss": 0.0773, "step": 5845 }, { "epoch": 1.82, "grad_norm": 1.0260368585586548, "learning_rate": 1.804047871971753e-05, "loss": 0.0921, "step": 5850 }, { "epoch": 1.83, "grad_norm": 0.8171194195747375, "learning_rate": 1.803724075073767e-05, "loss": 0.0797, "step": 5855 }, { "epoch": 1.83, "grad_norm": 0.7568709254264832, "learning_rate": 1.8034000399796706e-05, "loss": 0.0578, "step": 5860 }, { "epoch": 1.83, "grad_norm": 0.6419884562492371, "learning_rate": 1.803075766785496e-05, "loss": 0.0658, "step": 5865 }, { "epoch": 1.83, "grad_norm": 0.9156310558319092, "learning_rate": 1.8027512555873472e-05, "loss": 0.0793, "step": 5870 }, { "epoch": 1.83, "grad_norm": 0.7987212538719177, "learning_rate": 1.802426506481398e-05, "loss": 0.0695, "step": 5875 }, { "epoch": 1.83, "grad_norm": 0.49292904138565063, "learning_rate": 1.8021015195638926e-05, "loss": 0.0472, "step": 5880 }, { "epoch": 1.84, "grad_norm": 0.8221922516822815, "learning_rate": 1.8017762949311456e-05, "loss": 0.0679, "step": 5885 }, { "epoch": 1.84, "grad_norm": 0.5154405832290649, "learning_rate": 1.8014508326795435e-05, "loss": 0.0732, "step": 5890 }, { "epoch": 1.84, "grad_norm": 0.601098895072937, "learning_rate": 1.801125132905541e-05, "loss": 0.0595, "step": 5895 }, { "epoch": 1.84, "grad_norm": 0.9453170895576477, "learning_rate": 1.800799195705665e-05, "loss": 0.0777, "step": 5900 }, { "epoch": 1.84, "grad_norm": 0.7187973260879517, "learning_rate": 1.800473021176511e-05, "loss": 0.0679, "step": 5905 }, { "epoch": 1.84, "grad_norm": 0.686946451663971, "learning_rate": 1.8001466094147472e-05, "loss": 0.058, "step": 5910 }, { "epoch": 1.84, "grad_norm": 0.6171877384185791, "learning_rate": 1.7998199605171102e-05, "loss": 0.0728, "step": 5915 }, { "epoch": 1.85, "grad_norm": 0.7061054110527039, "learning_rate": 1.7994930745804072e-05, "loss": 0.0567, "step": 5920 }, { "epoch": 1.85, "grad_norm": 0.7402878999710083, "learning_rate": 1.7991659517015166e-05, "loss": 0.0578, "step": 5925 }, { "epoch": 1.85, "grad_norm": 0.6647749543190002, "learning_rate": 1.7988385919773858e-05, "loss": 0.077, "step": 5930 }, { "epoch": 1.85, "grad_norm": 0.6972184181213379, "learning_rate": 1.7985109955050332e-05, "loss": 0.0601, "step": 5935 }, { "epoch": 1.85, "grad_norm": 1.0716334581375122, "learning_rate": 1.7981831623815465e-05, "loss": 0.0675, "step": 5940 }, { "epoch": 1.85, "grad_norm": 0.6735240817070007, "learning_rate": 1.7978550927040848e-05, "loss": 0.071, "step": 5945 }, { "epoch": 1.86, "grad_norm": 0.908909022808075, "learning_rate": 1.7975267865698765e-05, "loss": 0.073, "step": 5950 }, { "epoch": 1.86, "grad_norm": 0.6877546906471252, "learning_rate": 1.7971982440762204e-05, "loss": 0.0844, "step": 5955 }, { "epoch": 1.86, "grad_norm": 1.1860584020614624, "learning_rate": 1.796869465320485e-05, "loss": 0.0761, "step": 5960 }, { "epoch": 1.86, "grad_norm": 0.8039729595184326, "learning_rate": 1.7965404504001087e-05, "loss": 0.0551, "step": 5965 }, { "epoch": 1.86, "grad_norm": 0.7674239873886108, "learning_rate": 1.7962111994126004e-05, "loss": 0.0795, "step": 5970 }, { "epoch": 1.86, "grad_norm": 0.7315002679824829, "learning_rate": 1.795881712455539e-05, "loss": 0.0534, "step": 5975 }, { "epoch": 1.87, "grad_norm": 0.46990859508514404, "learning_rate": 1.7955519896265727e-05, "loss": 0.0599, "step": 5980 }, { "epoch": 1.87, "grad_norm": 0.890020489692688, "learning_rate": 1.7952220310234205e-05, "loss": 0.0724, "step": 5985 }, { "epoch": 1.87, "grad_norm": 0.7626070976257324, "learning_rate": 1.7948918367438697e-05, "loss": 0.0459, "step": 5990 }, { "epoch": 1.87, "grad_norm": 0.978101909160614, "learning_rate": 1.7945614068857797e-05, "loss": 0.0572, "step": 5995 }, { "epoch": 1.87, "grad_norm": 0.6801090836524963, "learning_rate": 1.7942307415470777e-05, "loss": 0.076, "step": 6000 }, { "epoch": 1.87, "grad_norm": 0.9605746269226074, "learning_rate": 1.7938998408257622e-05, "loss": 0.053, "step": 6005 }, { "epoch": 1.87, "grad_norm": 0.6000919938087463, "learning_rate": 1.7935687048199e-05, "loss": 0.0595, "step": 6010 }, { "epoch": 1.88, "grad_norm": 0.5320618152618408, "learning_rate": 1.7932373336276292e-05, "loss": 0.058, "step": 6015 }, { "epoch": 1.88, "grad_norm": 0.695774495601654, "learning_rate": 1.792905727347156e-05, "loss": 0.0906, "step": 6020 }, { "epoch": 1.88, "grad_norm": 0.6470620632171631, "learning_rate": 1.7925738860767574e-05, "loss": 0.0557, "step": 6025 }, { "epoch": 1.88, "grad_norm": 1.1034142971038818, "learning_rate": 1.79224180991478e-05, "loss": 0.0903, "step": 6030 }, { "epoch": 1.88, "grad_norm": 0.8370234966278076, "learning_rate": 1.7919094989596395e-05, "loss": 0.0655, "step": 6035 }, { "epoch": 1.88, "grad_norm": 0.8852375149726868, "learning_rate": 1.7915769533098214e-05, "loss": 0.0971, "step": 6040 }, { "epoch": 1.89, "grad_norm": 0.7123667597770691, "learning_rate": 1.7912441730638806e-05, "loss": 0.0835, "step": 6045 }, { "epoch": 1.89, "grad_norm": 0.6630333065986633, "learning_rate": 1.790911158320442e-05, "loss": 0.0695, "step": 6050 }, { "epoch": 1.89, "grad_norm": 0.7879079580307007, "learning_rate": 1.7905779091781996e-05, "loss": 0.0747, "step": 6055 }, { "epoch": 1.89, "grad_norm": 0.8282826542854309, "learning_rate": 1.7902444257359167e-05, "loss": 0.044, "step": 6060 }, { "epoch": 1.89, "grad_norm": 1.0165438652038574, "learning_rate": 1.789910708092427e-05, "loss": 0.0762, "step": 6065 }, { "epoch": 1.89, "grad_norm": 0.9525620937347412, "learning_rate": 1.789576756346632e-05, "loss": 0.0752, "step": 6070 }, { "epoch": 1.89, "grad_norm": 0.36143264174461365, "learning_rate": 1.789242570597504e-05, "loss": 0.051, "step": 6075 }, { "epoch": 1.9, "grad_norm": 0.7918498516082764, "learning_rate": 1.7889081509440843e-05, "loss": 0.0705, "step": 6080 }, { "epoch": 1.9, "grad_norm": 0.9866046905517578, "learning_rate": 1.7885734974854824e-05, "loss": 0.0664, "step": 6085 }, { "epoch": 1.9, "grad_norm": 0.7423363327980042, "learning_rate": 1.7882386103208795e-05, "loss": 0.0749, "step": 6090 }, { "epoch": 1.9, "grad_norm": 0.7900855541229248, "learning_rate": 1.787903489549523e-05, "loss": 0.1023, "step": 6095 }, { "epoch": 1.9, "grad_norm": 0.7296101450920105, "learning_rate": 1.787568135270733e-05, "loss": 0.0583, "step": 6100 }, { "epoch": 1.9, "grad_norm": 0.6891100406646729, "learning_rate": 1.7872325475838956e-05, "loss": 0.068, "step": 6105 }, { "epoch": 1.91, "grad_norm": 1.1211246252059937, "learning_rate": 1.7868967265884673e-05, "loss": 0.1065, "step": 6110 }, { "epoch": 1.91, "grad_norm": 0.8416553735733032, "learning_rate": 1.786560672383975e-05, "loss": 0.0785, "step": 6115 }, { "epoch": 1.91, "grad_norm": 0.5475817918777466, "learning_rate": 1.7862243850700126e-05, "loss": 0.0632, "step": 6120 }, { "epoch": 1.91, "grad_norm": 0.7562156319618225, "learning_rate": 1.7858878647462443e-05, "loss": 0.0705, "step": 6125 }, { "epoch": 1.91, "grad_norm": 0.5753487348556519, "learning_rate": 1.7855511115124035e-05, "loss": 0.0759, "step": 6130 }, { "epoch": 1.91, "grad_norm": 0.6446362137794495, "learning_rate": 1.785214125468292e-05, "loss": 0.0651, "step": 6135 }, { "epoch": 1.92, "grad_norm": 0.6154198050498962, "learning_rate": 1.7848769067137803e-05, "loss": 0.0901, "step": 6140 }, { "epoch": 1.92, "grad_norm": 0.6295109987258911, "learning_rate": 1.7845394553488093e-05, "loss": 0.0578, "step": 6145 }, { "epoch": 1.92, "grad_norm": 0.8939473032951355, "learning_rate": 1.7842017714733875e-05, "loss": 0.0557, "step": 6150 }, { "epoch": 1.92, "grad_norm": 0.9612230658531189, "learning_rate": 1.7838638551875927e-05, "loss": 0.071, "step": 6155 }, { "epoch": 1.92, "grad_norm": 0.8438981771469116, "learning_rate": 1.7835257065915722e-05, "loss": 0.0795, "step": 6160 }, { "epoch": 1.92, "grad_norm": 0.8041678071022034, "learning_rate": 1.7831873257855407e-05, "loss": 0.0652, "step": 6165 }, { "epoch": 1.92, "grad_norm": 0.49747854471206665, "learning_rate": 1.7828487128697832e-05, "loss": 0.0744, "step": 6170 }, { "epoch": 1.93, "grad_norm": 0.5716861486434937, "learning_rate": 1.7825098679446528e-05, "loss": 0.0697, "step": 6175 }, { "epoch": 1.93, "grad_norm": 0.8185078501701355, "learning_rate": 1.7821707911105716e-05, "loss": 0.0753, "step": 6180 }, { "epoch": 1.93, "grad_norm": 1.1662544012069702, "learning_rate": 1.78183148246803e-05, "loss": 0.0953, "step": 6185 }, { "epoch": 1.93, "grad_norm": 0.6486769914627075, "learning_rate": 1.7814919421175877e-05, "loss": 0.0829, "step": 6190 }, { "epoch": 1.93, "grad_norm": 1.0896178483963013, "learning_rate": 1.7811521701598723e-05, "loss": 0.0706, "step": 6195 }, { "epoch": 1.93, "grad_norm": 0.5178818106651306, "learning_rate": 1.7808121666955807e-05, "loss": 0.0658, "step": 6200 }, { "epoch": 1.94, "grad_norm": 0.69623202085495, "learning_rate": 1.7804719318254782e-05, "loss": 0.0651, "step": 6205 }, { "epoch": 1.94, "grad_norm": 0.9311226010322571, "learning_rate": 1.780131465650399e-05, "loss": 0.0608, "step": 6210 }, { "epoch": 1.94, "grad_norm": 0.5501450896263123, "learning_rate": 1.779790768271245e-05, "loss": 0.0662, "step": 6215 }, { "epoch": 1.94, "grad_norm": 0.9106300473213196, "learning_rate": 1.7794498397889873e-05, "loss": 0.0617, "step": 6220 }, { "epoch": 1.94, "grad_norm": 0.7548519372940063, "learning_rate": 1.779108680304666e-05, "loss": 0.0817, "step": 6225 }, { "epoch": 1.94, "grad_norm": 0.5478365421295166, "learning_rate": 1.7787672899193882e-05, "loss": 0.0485, "step": 6230 }, { "epoch": 1.94, "grad_norm": 0.9088108539581299, "learning_rate": 1.7784256687343308e-05, "loss": 0.0629, "step": 6235 }, { "epoch": 1.95, "grad_norm": 0.8210974931716919, "learning_rate": 1.778083816850738e-05, "loss": 0.0484, "step": 6240 }, { "epoch": 1.95, "grad_norm": 0.8573375940322876, "learning_rate": 1.777741734369923e-05, "loss": 0.0619, "step": 6245 }, { "epoch": 1.95, "grad_norm": 0.6718164682388306, "learning_rate": 1.777399421393268e-05, "loss": 0.0698, "step": 6250 }, { "epoch": 1.95, "grad_norm": 0.9652662873268127, "learning_rate": 1.777056878022222e-05, "loss": 0.0901, "step": 6255 }, { "epoch": 1.95, "grad_norm": 0.97198885679245, "learning_rate": 1.776714104358303e-05, "loss": 0.0844, "step": 6260 }, { "epoch": 1.95, "grad_norm": 0.513791561126709, "learning_rate": 1.7763711005030977e-05, "loss": 0.0638, "step": 6265 }, { "epoch": 1.96, "grad_norm": 0.7584071755409241, "learning_rate": 1.7760278665582606e-05, "loss": 0.0612, "step": 6270 }, { "epoch": 1.96, "grad_norm": 0.8927552103996277, "learning_rate": 1.7756844026255144e-05, "loss": 0.0713, "step": 6275 }, { "epoch": 1.96, "grad_norm": 0.7739115953445435, "learning_rate": 1.7753407088066494e-05, "loss": 0.072, "step": 6280 }, { "epoch": 1.96, "grad_norm": 1.0160481929779053, "learning_rate": 1.7749967852035257e-05, "loss": 0.1012, "step": 6285 }, { "epoch": 1.96, "grad_norm": 0.8235144019126892, "learning_rate": 1.774652631918069e-05, "loss": 0.0784, "step": 6290 }, { "epoch": 1.96, "grad_norm": 0.7568656802177429, "learning_rate": 1.774308249052276e-05, "loss": 0.0891, "step": 6295 }, { "epoch": 1.96, "grad_norm": 0.6839597821235657, "learning_rate": 1.7739636367082087e-05, "loss": 0.0881, "step": 6300 }, { "epoch": 1.97, "grad_norm": 0.680732786655426, "learning_rate": 1.7736187949879992e-05, "loss": 0.0871, "step": 6305 }, { "epoch": 1.97, "grad_norm": 0.8405312895774841, "learning_rate": 1.7732737239938455e-05, "loss": 0.0988, "step": 6310 }, { "epoch": 1.97, "grad_norm": 0.7258217334747314, "learning_rate": 1.7729284238280163e-05, "loss": 0.0622, "step": 6315 }, { "epoch": 1.97, "grad_norm": 0.7236860394477844, "learning_rate": 1.7725828945928457e-05, "loss": 0.0849, "step": 6320 }, { "epoch": 1.97, "grad_norm": 0.48804813623428345, "learning_rate": 1.7722371363907366e-05, "loss": 0.0567, "step": 6325 }, { "epoch": 1.97, "grad_norm": 0.7919111847877502, "learning_rate": 1.77189114932416e-05, "loss": 0.044, "step": 6330 }, { "epoch": 1.98, "grad_norm": 0.82615727186203, "learning_rate": 1.7715449334956547e-05, "loss": 0.0798, "step": 6335 }, { "epoch": 1.98, "grad_norm": 0.9560028314590454, "learning_rate": 1.771198489007827e-05, "loss": 0.0804, "step": 6340 }, { "epoch": 1.98, "grad_norm": 1.0600428581237793, "learning_rate": 1.770851815963351e-05, "loss": 0.0926, "step": 6345 }, { "epoch": 1.98, "grad_norm": 2.0330913066864014, "learning_rate": 1.770504914464969e-05, "loss": 0.0753, "step": 6350 }, { "epoch": 1.98, "grad_norm": 0.5192542672157288, "learning_rate": 1.7701577846154903e-05, "loss": 0.0773, "step": 6355 }, { "epoch": 1.98, "grad_norm": 0.9962630867958069, "learning_rate": 1.7698104265177925e-05, "loss": 0.1456, "step": 6360 }, { "epoch": 1.99, "grad_norm": 0.7099242210388184, "learning_rate": 1.7694628402748203e-05, "loss": 0.0738, "step": 6365 }, { "epoch": 1.99, "grad_norm": 1.23063325881958, "learning_rate": 1.7691150259895865e-05, "loss": 0.0753, "step": 6370 }, { "epoch": 1.99, "grad_norm": 0.5459660887718201, "learning_rate": 1.7687669837651707e-05, "loss": 0.0606, "step": 6375 }, { "epoch": 1.99, "grad_norm": 0.8204454183578491, "learning_rate": 1.7684187137047214e-05, "loss": 0.0663, "step": 6380 }, { "epoch": 1.99, "grad_norm": 1.3700021505355835, "learning_rate": 1.7680702159114537e-05, "loss": 0.0893, "step": 6385 }, { "epoch": 1.99, "grad_norm": 0.9551741480827332, "learning_rate": 1.7677214904886496e-05, "loss": 0.068, "step": 6390 }, { "epoch": 1.99, "grad_norm": 1.195141077041626, "learning_rate": 1.76737253753966e-05, "loss": 0.0781, "step": 6395 }, { "epoch": 2.0, "grad_norm": 1.2067859172821045, "learning_rate": 1.7670233571679025e-05, "loss": 0.097, "step": 6400 }, { "epoch": 2.0, "grad_norm": 0.8323922753334045, "learning_rate": 1.7666739494768622e-05, "loss": 0.0658, "step": 6405 }, { "epoch": 2.0, "grad_norm": 0.7578905820846558, "learning_rate": 1.766324314570091e-05, "loss": 0.0901, "step": 6410 }, { "epoch": 2.0, "grad_norm": 0.9802036285400391, "learning_rate": 1.765974452551209e-05, "loss": 0.0394, "step": 6415 }, { "epoch": 2.0, "grad_norm": 0.6872645616531372, "learning_rate": 1.765624363523903e-05, "loss": 0.0387, "step": 6420 }, { "epoch": 2.0, "grad_norm": 0.9163408875465393, "learning_rate": 1.7652740475919275e-05, "loss": 0.0289, "step": 6425 }, { "epoch": 2.01, "grad_norm": 0.6171188950538635, "learning_rate": 1.7649235048591044e-05, "loss": 0.0283, "step": 6430 }, { "epoch": 2.01, "grad_norm": 0.5377746820449829, "learning_rate": 1.7645727354293218e-05, "loss": 0.0304, "step": 6435 }, { "epoch": 2.01, "grad_norm": 0.9587904214859009, "learning_rate": 1.764221739406536e-05, "loss": 0.0331, "step": 6440 }, { "epoch": 2.01, "grad_norm": 1.0330588817596436, "learning_rate": 1.7638705168947702e-05, "loss": 0.0486, "step": 6445 }, { "epoch": 2.01, "grad_norm": 1.749311089515686, "learning_rate": 1.7635190679981146e-05, "loss": 0.035, "step": 6450 }, { "epoch": 2.01, "grad_norm": 0.9288537502288818, "learning_rate": 1.763167392820726e-05, "loss": 0.0366, "step": 6455 }, { "epoch": 2.01, "grad_norm": 1.7338812351226807, "learning_rate": 1.7628154914668297e-05, "loss": 0.0421, "step": 6460 }, { "epoch": 2.02, "grad_norm": 0.4167785048484802, "learning_rate": 1.762463364040717e-05, "loss": 0.026, "step": 6465 }, { "epoch": 2.02, "grad_norm": 0.8623115420341492, "learning_rate": 1.7621110106467458e-05, "loss": 0.039, "step": 6470 }, { "epoch": 2.02, "grad_norm": 0.7953075766563416, "learning_rate": 1.7617584313893417e-05, "loss": 0.0418, "step": 6475 }, { "epoch": 2.02, "grad_norm": 0.6112560629844666, "learning_rate": 1.761405626372997e-05, "loss": 0.0305, "step": 6480 }, { "epoch": 2.02, "grad_norm": 0.6794976592063904, "learning_rate": 1.7610525957022712e-05, "loss": 0.0351, "step": 6485 }, { "epoch": 2.02, "grad_norm": 0.6737608313560486, "learning_rate": 1.7606993394817903e-05, "loss": 0.0337, "step": 6490 }, { "epoch": 2.03, "grad_norm": 1.1872364282608032, "learning_rate": 1.760345857816247e-05, "loss": 0.0461, "step": 6495 }, { "epoch": 2.03, "grad_norm": 0.8426690697669983, "learning_rate": 1.7599921508104018e-05, "loss": 0.0306, "step": 6500 }, { "epoch": 2.03, "grad_norm": 0.825894832611084, "learning_rate": 1.7596382185690802e-05, "loss": 0.0419, "step": 6505 }, { "epoch": 2.03, "grad_norm": 0.8509790897369385, "learning_rate": 1.7592840611971765e-05, "loss": 0.0366, "step": 6510 }, { "epoch": 2.03, "grad_norm": 0.7118868231773376, "learning_rate": 1.7589296787996503e-05, "loss": 0.0491, "step": 6515 }, { "epoch": 2.03, "grad_norm": 0.9368664026260376, "learning_rate": 1.758575071481528e-05, "loss": 0.0305, "step": 6520 }, { "epoch": 2.04, "grad_norm": 0.8669412732124329, "learning_rate": 1.7582202393479037e-05, "loss": 0.0427, "step": 6525 }, { "epoch": 2.04, "grad_norm": 0.7687955498695374, "learning_rate": 1.757865182503937e-05, "loss": 0.0378, "step": 6530 }, { "epoch": 2.04, "grad_norm": 0.9307061433792114, "learning_rate": 1.7575099010548543e-05, "loss": 0.0396, "step": 6535 }, { "epoch": 2.04, "grad_norm": 0.670947790145874, "learning_rate": 1.7571543951059493e-05, "loss": 0.0451, "step": 6540 }, { "epoch": 2.04, "grad_norm": 0.6400846838951111, "learning_rate": 1.7567986647625813e-05, "loss": 0.0339, "step": 6545 }, { "epoch": 2.04, "grad_norm": 1.0342981815338135, "learning_rate": 1.7564427101301767e-05, "loss": 0.0347, "step": 6550 }, { "epoch": 2.04, "grad_norm": 1.1251486539840698, "learning_rate": 1.756086531314228e-05, "loss": 0.028, "step": 6555 }, { "epoch": 2.05, "grad_norm": 1.1043672561645508, "learning_rate": 1.7557301284202947e-05, "loss": 0.0357, "step": 6560 }, { "epoch": 2.05, "grad_norm": 0.6798744797706604, "learning_rate": 1.7553735015540022e-05, "loss": 0.0299, "step": 6565 }, { "epoch": 2.05, "grad_norm": 0.8796838521957397, "learning_rate": 1.755016650821042e-05, "loss": 0.0404, "step": 6570 }, { "epoch": 2.05, "grad_norm": 0.696263313293457, "learning_rate": 1.754659576327173e-05, "loss": 0.0434, "step": 6575 }, { "epoch": 2.05, "grad_norm": 0.7451015114784241, "learning_rate": 1.754302278178219e-05, "loss": 0.0362, "step": 6580 }, { "epoch": 2.05, "grad_norm": 0.7645043730735779, "learning_rate": 1.7539447564800714e-05, "loss": 0.037, "step": 6585 }, { "epoch": 2.06, "grad_norm": 0.9263461828231812, "learning_rate": 1.7535870113386873e-05, "loss": 0.028, "step": 6590 }, { "epoch": 2.06, "grad_norm": 0.7381824254989624, "learning_rate": 1.75322904286009e-05, "loss": 0.0425, "step": 6595 }, { "epoch": 2.06, "grad_norm": 0.8345758318901062, "learning_rate": 1.752870851150369e-05, "loss": 0.0413, "step": 6600 }, { "epoch": 2.06, "grad_norm": 0.4364377558231354, "learning_rate": 1.7525124363156796e-05, "loss": 0.0384, "step": 6605 }, { "epoch": 2.06, "grad_norm": 0.7056708931922913, "learning_rate": 1.7521537984622444e-05, "loss": 0.0344, "step": 6610 }, { "epoch": 2.06, "grad_norm": 0.7152959704399109, "learning_rate": 1.7517949376963503e-05, "loss": 0.0348, "step": 6615 }, { "epoch": 2.06, "grad_norm": 0.824553370475769, "learning_rate": 1.7514358541243518e-05, "loss": 0.0351, "step": 6620 }, { "epoch": 2.07, "grad_norm": 0.7878523468971252, "learning_rate": 1.751076547852669e-05, "loss": 0.0337, "step": 6625 }, { "epoch": 2.07, "grad_norm": 0.6791169047355652, "learning_rate": 1.750717018987788e-05, "loss": 0.03, "step": 6630 }, { "epoch": 2.07, "grad_norm": 0.6202698945999146, "learning_rate": 1.75035726763626e-05, "loss": 0.0367, "step": 6635 }, { "epoch": 2.07, "grad_norm": 0.7299143075942993, "learning_rate": 1.749997293904704e-05, "loss": 0.0423, "step": 6640 }, { "epoch": 2.07, "grad_norm": 0.9602674841880798, "learning_rate": 1.749637097899803e-05, "loss": 0.0367, "step": 6645 }, { "epoch": 2.07, "grad_norm": 1.9754753112792969, "learning_rate": 1.749276679728307e-05, "loss": 0.0367, "step": 6650 }, { "epoch": 2.08, "grad_norm": 0.645286500453949, "learning_rate": 1.7489160394970314e-05, "loss": 0.0391, "step": 6655 }, { "epoch": 2.08, "grad_norm": 0.5990299582481384, "learning_rate": 1.7485551773128574e-05, "loss": 0.0327, "step": 6660 }, { "epoch": 2.08, "grad_norm": 1.062504529953003, "learning_rate": 1.7481940932827324e-05, "loss": 0.0433, "step": 6665 }, { "epoch": 2.08, "grad_norm": 0.6965630650520325, "learning_rate": 1.747832787513669e-05, "loss": 0.0411, "step": 6670 }, { "epoch": 2.08, "grad_norm": 0.509995698928833, "learning_rate": 1.747471260112746e-05, "loss": 0.0338, "step": 6675 }, { "epoch": 2.08, "grad_norm": 0.990881621837616, "learning_rate": 1.7471095111871076e-05, "loss": 0.0375, "step": 6680 }, { "epoch": 2.09, "grad_norm": 0.8508270382881165, "learning_rate": 1.7467475408439636e-05, "loss": 0.0384, "step": 6685 }, { "epoch": 2.09, "grad_norm": 0.7476538419723511, "learning_rate": 1.74638534919059e-05, "loss": 0.0408, "step": 6690 }, { "epoch": 2.09, "grad_norm": 0.6179473400115967, "learning_rate": 1.746022936334327e-05, "loss": 0.0395, "step": 6695 }, { "epoch": 2.09, "grad_norm": 0.7255807518959045, "learning_rate": 1.745660302382582e-05, "loss": 0.033, "step": 6700 }, { "epoch": 2.09, "grad_norm": 0.7685034871101379, "learning_rate": 1.745297447442827e-05, "loss": 0.0269, "step": 6705 }, { "epoch": 2.09, "grad_norm": 0.8111138343811035, "learning_rate": 1.7449343716225998e-05, "loss": 0.0358, "step": 6710 }, { "epoch": 2.09, "grad_norm": 0.9519994854927063, "learning_rate": 1.7445710750295034e-05, "loss": 0.0482, "step": 6715 }, { "epoch": 2.1, "grad_norm": 0.8502340316772461, "learning_rate": 1.7442075577712064e-05, "loss": 0.0441, "step": 6720 }, { "epoch": 2.1, "grad_norm": 0.5589675903320312, "learning_rate": 1.743843819955443e-05, "loss": 0.0267, "step": 6725 }, { "epoch": 2.1, "grad_norm": 0.6813309788703918, "learning_rate": 1.7434798616900122e-05, "loss": 0.0298, "step": 6730 }, { "epoch": 2.1, "grad_norm": 0.6360641717910767, "learning_rate": 1.743115683082779e-05, "loss": 0.0391, "step": 6735 }, { "epoch": 2.1, "grad_norm": 0.5595855712890625, "learning_rate": 1.742751284241673e-05, "loss": 0.0285, "step": 6740 }, { "epoch": 2.1, "grad_norm": Infinity, "learning_rate": 1.7424596066729708e-05, "loss": 0.1078, "step": 6745 }, { "epoch": 2.11, "grad_norm": 0.6349712014198303, "learning_rate": 1.7420948116830864e-05, "loss": 0.0482, "step": 6750 }, { "epoch": 2.11, "grad_norm": 0.8908722400665283, "learning_rate": 1.7417297967618806e-05, "loss": 0.0535, "step": 6755 }, { "epoch": 2.11, "grad_norm": 1.2324929237365723, "learning_rate": 1.7413645620175313e-05, "loss": 0.0334, "step": 6760 }, { "epoch": 2.11, "grad_norm": 0.7902947664260864, "learning_rate": 1.740999107558281e-05, "loss": 0.0339, "step": 6765 }, { "epoch": 2.11, "grad_norm": 0.8780156970024109, "learning_rate": 1.7406334334924386e-05, "loss": 0.0568, "step": 6770 }, { "epoch": 2.11, "grad_norm": 0.7588701248168945, "learning_rate": 1.7402675399283762e-05, "loss": 0.0384, "step": 6775 }, { "epoch": 2.11, "grad_norm": 0.9184271693229675, "learning_rate": 1.7399014269745332e-05, "loss": 0.0348, "step": 6780 }, { "epoch": 2.12, "grad_norm": 0.895255982875824, "learning_rate": 1.7395350947394127e-05, "loss": 0.0226, "step": 6785 }, { "epoch": 2.12, "grad_norm": 0.495691180229187, "learning_rate": 1.739168543331582e-05, "loss": 0.0316, "step": 6790 }, { "epoch": 2.12, "grad_norm": 0.8194505572319031, "learning_rate": 1.7388017728596752e-05, "loss": 0.0396, "step": 6795 }, { "epoch": 2.12, "grad_norm": 0.6622453927993774, "learning_rate": 1.73843478343239e-05, "loss": 0.0344, "step": 6800 }, { "epoch": 2.12, "grad_norm": 0.5545173287391663, "learning_rate": 1.73806757515849e-05, "loss": 0.0454, "step": 6805 }, { "epoch": 2.12, "grad_norm": 0.9523460268974304, "learning_rate": 1.7377001481468017e-05, "loss": 0.0372, "step": 6810 }, { "epoch": 2.13, "grad_norm": 0.9859544634819031, "learning_rate": 1.7373325025062197e-05, "loss": 0.0368, "step": 6815 }, { "epoch": 2.13, "grad_norm": 0.64875727891922, "learning_rate": 1.7369646383457e-05, "loss": 0.0375, "step": 6820 }, { "epoch": 2.13, "grad_norm": 0.5710052847862244, "learning_rate": 1.7365965557742658e-05, "loss": 0.0306, "step": 6825 }, { "epoch": 2.13, "grad_norm": 0.690978467464447, "learning_rate": 1.736228254901003e-05, "loss": 0.0387, "step": 6830 }, { "epoch": 2.13, "grad_norm": 0.9192633628845215, "learning_rate": 1.7358597358350646e-05, "loss": 0.035, "step": 6835 }, { "epoch": 2.13, "grad_norm": 15.089350700378418, "learning_rate": 1.7354909986856664e-05, "loss": 0.059, "step": 6840 }, { "epoch": 2.13, "grad_norm": 1.888706088066101, "learning_rate": 1.735122043562089e-05, "loss": 0.0421, "step": 6845 }, { "epoch": 2.14, "grad_norm": 1.481156587600708, "learning_rate": 1.734752870573678e-05, "loss": 0.0639, "step": 6850 }, { "epoch": 2.14, "grad_norm": 1.3059687614440918, "learning_rate": 1.734383479829844e-05, "loss": 0.0459, "step": 6855 }, { "epoch": 2.14, "grad_norm": 2.741889238357544, "learning_rate": 1.7340138714400616e-05, "loss": 0.0677, "step": 6860 }, { "epoch": 2.14, "grad_norm": 0.8907132744789124, "learning_rate": 1.7336440455138695e-05, "loss": 0.0521, "step": 6865 }, { "epoch": 2.14, "grad_norm": 1.2927290201187134, "learning_rate": 1.7332740021608722e-05, "loss": 0.0434, "step": 6870 }, { "epoch": 2.14, "grad_norm": 0.7092378735542297, "learning_rate": 1.7329037414907374e-05, "loss": 0.0414, "step": 6875 }, { "epoch": 2.15, "grad_norm": 1.9778251647949219, "learning_rate": 1.732533263613197e-05, "loss": 0.0402, "step": 6880 }, { "epoch": 2.15, "grad_norm": 1.6234639883041382, "learning_rate": 1.732162568638049e-05, "loss": 0.0401, "step": 6885 }, { "epoch": 2.15, "grad_norm": 1.4143224954605103, "learning_rate": 1.7317916566751537e-05, "loss": 0.0458, "step": 6890 }, { "epoch": 2.15, "grad_norm": 0.670375645160675, "learning_rate": 1.7314205278344372e-05, "loss": 0.0365, "step": 6895 }, { "epoch": 2.15, "grad_norm": 1.0456308126449585, "learning_rate": 1.731049182225889e-05, "loss": 0.0475, "step": 6900 }, { "epoch": 2.15, "grad_norm": 0.8300198316574097, "learning_rate": 1.730677619959563e-05, "loss": 0.0385, "step": 6905 }, { "epoch": 2.16, "grad_norm": 0.7571542263031006, "learning_rate": 1.7303058411455774e-05, "loss": 0.0291, "step": 6910 }, { "epoch": 2.16, "grad_norm": 1.3860149383544922, "learning_rate": 1.729933845894115e-05, "loss": 0.0425, "step": 6915 }, { "epoch": 2.16, "grad_norm": 1.4162564277648926, "learning_rate": 1.7295616343154224e-05, "loss": 0.0488, "step": 6920 }, { "epoch": 2.16, "grad_norm": 0.6601019501686096, "learning_rate": 1.72918920651981e-05, "loss": 0.0461, "step": 6925 }, { "epoch": 2.16, "grad_norm": 0.9042936563491821, "learning_rate": 1.728816562617653e-05, "loss": 0.0331, "step": 6930 }, { "epoch": 2.16, "grad_norm": 0.8178083896636963, "learning_rate": 1.728443702719389e-05, "loss": 0.0459, "step": 6935 }, { "epoch": 2.16, "grad_norm": 1.077895164489746, "learning_rate": 1.7280706269355222e-05, "loss": 0.0413, "step": 6940 }, { "epoch": 2.17, "grad_norm": 0.7887989282608032, "learning_rate": 1.7276973353766187e-05, "loss": 0.0478, "step": 6945 }, { "epoch": 2.17, "grad_norm": 1.1501586437225342, "learning_rate": 1.7273238281533097e-05, "loss": 0.0378, "step": 6950 }, { "epoch": 2.17, "grad_norm": 1.1501015424728394, "learning_rate": 1.7269501053762896e-05, "loss": 0.0338, "step": 6955 }, { "epoch": 2.17, "grad_norm": 1.111640453338623, "learning_rate": 1.7265761671563167e-05, "loss": 0.039, "step": 6960 }, { "epoch": 2.17, "grad_norm": 0.6609378457069397, "learning_rate": 1.7262020136042138e-05, "loss": 0.0403, "step": 6965 }, { "epoch": 2.17, "grad_norm": 0.8092169165611267, "learning_rate": 1.7258276448308672e-05, "loss": 0.0411, "step": 6970 }, { "epoch": 2.18, "grad_norm": 0.6540818810462952, "learning_rate": 1.725453060947226e-05, "loss": 0.0359, "step": 6975 }, { "epoch": 2.18, "grad_norm": 1.087636947631836, "learning_rate": 1.7250782620643055e-05, "loss": 0.0426, "step": 6980 }, { "epoch": 2.18, "grad_norm": 0.8799102902412415, "learning_rate": 1.7247032482931817e-05, "loss": 0.0377, "step": 6985 }, { "epoch": 2.18, "grad_norm": 0.72962486743927, "learning_rate": 1.7243280197449963e-05, "loss": 0.041, "step": 6990 }, { "epoch": 2.18, "grad_norm": 0.7479212284088135, "learning_rate": 1.7239525765309543e-05, "loss": 0.0276, "step": 6995 }, { "epoch": 2.18, "grad_norm": 0.8470936417579651, "learning_rate": 1.723576918762324e-05, "loss": 0.0428, "step": 7000 }, { "epoch": 2.18, "grad_norm": 0.8011202216148376, "learning_rate": 1.7232010465504376e-05, "loss": 0.0392, "step": 7005 }, { "epoch": 2.19, "grad_norm": 1.085710048675537, "learning_rate": 1.72282496000669e-05, "loss": 0.0585, "step": 7010 }, { "epoch": 2.19, "grad_norm": 0.6330513954162598, "learning_rate": 1.7224486592425407e-05, "loss": 0.033, "step": 7015 }, { "epoch": 2.19, "grad_norm": 0.7976099252700806, "learning_rate": 1.7220721443695126e-05, "loss": 0.0331, "step": 7020 }, { "epoch": 2.19, "grad_norm": 1.0006788969039917, "learning_rate": 1.7216954154991917e-05, "loss": 0.0399, "step": 7025 }, { "epoch": 2.19, "grad_norm": 0.693641185760498, "learning_rate": 1.7213184727432267e-05, "loss": 0.0336, "step": 7030 }, { "epoch": 2.19, "grad_norm": 0.958171010017395, "learning_rate": 1.720941316213331e-05, "loss": 0.0476, "step": 7035 }, { "epoch": 2.2, "grad_norm": 0.7659122347831726, "learning_rate": 1.7205639460212807e-05, "loss": 0.05, "step": 7040 }, { "epoch": 2.2, "grad_norm": 0.8464353680610657, "learning_rate": 1.7201863622789158e-05, "loss": 0.047, "step": 7045 }, { "epoch": 2.2, "grad_norm": 0.8198861479759216, "learning_rate": 1.719808565098138e-05, "loss": 0.041, "step": 7050 }, { "epoch": 2.2, "grad_norm": 0.4542665183544159, "learning_rate": 1.7194305545909146e-05, "loss": 0.0388, "step": 7055 }, { "epoch": 2.2, "grad_norm": 0.7713673710823059, "learning_rate": 1.719052330869274e-05, "loss": 0.0338, "step": 7060 }, { "epoch": 2.2, "grad_norm": 0.8043854236602783, "learning_rate": 1.7186738940453092e-05, "loss": 0.026, "step": 7065 }, { "epoch": 2.21, "grad_norm": 0.8994637131690979, "learning_rate": 1.7182952442311752e-05, "loss": 0.0435, "step": 7070 }, { "epoch": 2.21, "grad_norm": 0.6519581079483032, "learning_rate": 1.7179163815390912e-05, "loss": 0.031, "step": 7075 }, { "epoch": 2.21, "grad_norm": 1.0966442823410034, "learning_rate": 1.7175373060813396e-05, "loss": 0.0342, "step": 7080 }, { "epoch": 2.21, "grad_norm": 6.207887172698975, "learning_rate": 1.717158017970264e-05, "loss": 0.0642, "step": 7085 }, { "epoch": 2.21, "grad_norm": 0.5999218821525574, "learning_rate": 1.7167785173182734e-05, "loss": 0.0343, "step": 7090 }, { "epoch": 2.21, "grad_norm": 0.7367837429046631, "learning_rate": 1.716398804237838e-05, "loss": 0.0468, "step": 7095 }, { "epoch": 2.21, "grad_norm": 1.121308445930481, "learning_rate": 1.7160188788414923e-05, "loss": 0.0411, "step": 7100 }, { "epoch": 2.22, "grad_norm": 0.8983088135719299, "learning_rate": 1.7156387412418328e-05, "loss": 0.0345, "step": 7105 }, { "epoch": 2.22, "grad_norm": 0.9664926528930664, "learning_rate": 1.7152583915515188e-05, "loss": 0.0378, "step": 7110 }, { "epoch": 2.22, "grad_norm": 0.8459317088127136, "learning_rate": 1.7148778298832737e-05, "loss": 0.0403, "step": 7115 }, { "epoch": 2.22, "grad_norm": 1.0068672895431519, "learning_rate": 1.714497056349882e-05, "loss": 0.0497, "step": 7120 }, { "epoch": 2.22, "grad_norm": 1.114620327949524, "learning_rate": 1.714116071064192e-05, "loss": 0.042, "step": 7125 }, { "epoch": 2.22, "grad_norm": 0.9400085806846619, "learning_rate": 1.7137348741391155e-05, "loss": 0.0432, "step": 7130 }, { "epoch": 2.23, "grad_norm": 7.140718936920166, "learning_rate": 1.713353465687625e-05, "loss": 0.0664, "step": 7135 }, { "epoch": 2.23, "grad_norm": 2.289583206176758, "learning_rate": 1.7129718458227574e-05, "loss": 0.0388, "step": 7140 }, { "epoch": 2.23, "grad_norm": 25.73358917236328, "learning_rate": 1.7125900146576112e-05, "loss": 0.0408, "step": 7145 }, { "epoch": 2.23, "grad_norm": 0.9193993210792542, "learning_rate": 1.7122079723053486e-05, "loss": 0.0376, "step": 7150 }, { "epoch": 2.23, "grad_norm": 3.834202289581299, "learning_rate": 1.711825718879194e-05, "loss": 0.0444, "step": 7155 }, { "epoch": 2.23, "grad_norm": 1.4898897409439087, "learning_rate": 1.711443254492433e-05, "loss": 0.0416, "step": 7160 }, { "epoch": 2.23, "grad_norm": 1.7508093118667603, "learning_rate": 1.7110605792584157e-05, "loss": 0.055, "step": 7165 }, { "epoch": 2.24, "grad_norm": 0.8517021536827087, "learning_rate": 1.7106776932905535e-05, "loss": 0.0457, "step": 7170 }, { "epoch": 2.24, "grad_norm": 0.6700179576873779, "learning_rate": 1.710294596702321e-05, "loss": 0.034, "step": 7175 }, { "epoch": 2.24, "grad_norm": 0.8382918238639832, "learning_rate": 1.7099112896072546e-05, "loss": 0.0483, "step": 7180 }, { "epoch": 2.24, "grad_norm": 0.9156063795089722, "learning_rate": 1.709527772118953e-05, "loss": 0.0423, "step": 7185 }, { "epoch": 2.24, "grad_norm": 0.8718563318252563, "learning_rate": 1.7091440443510784e-05, "loss": 0.0397, "step": 7190 }, { "epoch": 2.24, "grad_norm": 0.9186751246452332, "learning_rate": 1.7087601064173535e-05, "loss": 0.0402, "step": 7195 }, { "epoch": 2.25, "grad_norm": 0.8474892377853394, "learning_rate": 1.7083759584315648e-05, "loss": 0.0404, "step": 7200 }, { "epoch": 2.25, "grad_norm": 6.393630027770996, "learning_rate": 1.7079916005075604e-05, "loss": 0.0432, "step": 7205 }, { "epoch": 2.25, "grad_norm": 0.7541725635528564, "learning_rate": 1.707607032759251e-05, "loss": 0.0417, "step": 7210 }, { "epoch": 2.25, "grad_norm": 0.794830858707428, "learning_rate": 1.7072222553006086e-05, "loss": 0.0446, "step": 7215 }, { "epoch": 2.25, "grad_norm": 0.8188607692718506, "learning_rate": 1.7068372682456686e-05, "loss": 0.0395, "step": 7220 }, { "epoch": 2.25, "grad_norm": 0.7906588912010193, "learning_rate": 1.7064520717085276e-05, "loss": 0.0305, "step": 7225 }, { "epoch": 2.26, "grad_norm": 0.7647044062614441, "learning_rate": 1.706066665803345e-05, "loss": 0.0422, "step": 7230 }, { "epoch": 2.26, "grad_norm": 0.5535922050476074, "learning_rate": 1.705681050644341e-05, "loss": 0.0426, "step": 7235 }, { "epoch": 2.26, "grad_norm": 0.8452866077423096, "learning_rate": 1.7052952263457993e-05, "loss": 0.0404, "step": 7240 }, { "epoch": 2.26, "grad_norm": 0.6865875720977783, "learning_rate": 1.7049091930220647e-05, "loss": 0.0505, "step": 7245 }, { "epoch": 2.26, "grad_norm": 0.904468834400177, "learning_rate": 1.7045229507875443e-05, "loss": 0.0345, "step": 7250 }, { "epoch": 2.26, "grad_norm": 0.7027440071105957, "learning_rate": 1.7041364997567064e-05, "loss": 0.0424, "step": 7255 }, { "epoch": 2.26, "grad_norm": 0.8030436038970947, "learning_rate": 1.703749840044083e-05, "loss": 0.0385, "step": 7260 }, { "epoch": 2.27, "grad_norm": 0.8274464011192322, "learning_rate": 1.7033629717642656e-05, "loss": 0.0431, "step": 7265 }, { "epoch": 2.27, "grad_norm": 0.9646808505058289, "learning_rate": 1.702975895031909e-05, "loss": 0.0376, "step": 7270 }, { "epoch": 2.27, "grad_norm": 0.9568483233451843, "learning_rate": 1.7025886099617293e-05, "loss": 0.0328, "step": 7275 }, { "epoch": 2.27, "grad_norm": 0.6486111879348755, "learning_rate": 1.7022011166685046e-05, "loss": 0.035, "step": 7280 }, { "epoch": 2.27, "grad_norm": 5.381010055541992, "learning_rate": 1.7018134152670745e-05, "loss": 0.0321, "step": 7285 }, { "epoch": 2.27, "grad_norm": 1.302991509437561, "learning_rate": 1.7014255058723403e-05, "loss": 0.0423, "step": 7290 }, { "epoch": 2.28, "grad_norm": 0.7025021910667419, "learning_rate": 1.701037388599265e-05, "loss": 0.0482, "step": 7295 }, { "epoch": 2.28, "grad_norm": 0.8713043928146362, "learning_rate": 1.7006490635628733e-05, "loss": 0.0457, "step": 7300 }, { "epoch": 2.28, "grad_norm": 21.033382415771484, "learning_rate": 1.700260530878251e-05, "loss": 0.0937, "step": 7305 }, { "epoch": 2.28, "grad_norm": 0.8523858189582825, "learning_rate": 1.699871790660546e-05, "loss": 0.0375, "step": 7310 }, { "epoch": 2.28, "grad_norm": 1.4806874990463257, "learning_rate": 1.6994828430249676e-05, "loss": 0.0477, "step": 7315 }, { "epoch": 2.28, "grad_norm": 0.6420323252677917, "learning_rate": 1.6990936880867863e-05, "loss": 0.0303, "step": 7320 }, { "epoch": 2.28, "grad_norm": 0.7890008687973022, "learning_rate": 1.698704325961334e-05, "loss": 0.0437, "step": 7325 }, { "epoch": 2.29, "grad_norm": 1.2356423139572144, "learning_rate": 1.698314756764005e-05, "loss": 0.0398, "step": 7330 }, { "epoch": 2.29, "grad_norm": 0.7098721265792847, "learning_rate": 1.6979249806102535e-05, "loss": 0.0411, "step": 7335 }, { "epoch": 2.29, "grad_norm": 0.9438479542732239, "learning_rate": 1.697534997615596e-05, "loss": 0.0373, "step": 7340 }, { "epoch": 2.29, "grad_norm": 0.758943498134613, "learning_rate": 1.6971448078956098e-05, "loss": 0.0446, "step": 7345 }, { "epoch": 2.29, "grad_norm": 0.7948986291885376, "learning_rate": 1.6967544115659344e-05, "loss": 0.0359, "step": 7350 }, { "epoch": 2.29, "grad_norm": 1.1407995223999023, "learning_rate": 1.6963638087422686e-05, "loss": 0.0414, "step": 7355 }, { "epoch": 2.3, "grad_norm": 0.8077024221420288, "learning_rate": 1.6959729995403743e-05, "loss": 0.0407, "step": 7360 }, { "epoch": 2.3, "grad_norm": 1.3187479972839355, "learning_rate": 1.6955819840760736e-05, "loss": 0.0369, "step": 7365 }, { "epoch": 2.3, "grad_norm": 0.6189730763435364, "learning_rate": 1.6951907624652506e-05, "loss": 0.0364, "step": 7370 }, { "epoch": 2.3, "grad_norm": 0.8181635141372681, "learning_rate": 1.6947993348238492e-05, "loss": 0.038, "step": 7375 }, { "epoch": 2.3, "grad_norm": 1.5456126928329468, "learning_rate": 1.6944077012678758e-05, "loss": 0.0479, "step": 7380 }, { "epoch": 2.3, "grad_norm": 2.996551036834717, "learning_rate": 1.6940158619133965e-05, "loss": 0.0402, "step": 7385 }, { "epoch": 2.3, "grad_norm": 0.6797996759414673, "learning_rate": 1.693623816876539e-05, "loss": 0.0552, "step": 7390 }, { "epoch": 2.31, "grad_norm": 0.6198251843452454, "learning_rate": 1.693231566273492e-05, "loss": 0.0368, "step": 7395 }, { "epoch": 2.31, "grad_norm": 0.764711320400238, "learning_rate": 1.692839110220505e-05, "loss": 0.0427, "step": 7400 }, { "epoch": 2.31, "grad_norm": 0.841666042804718, "learning_rate": 1.692446448833889e-05, "loss": 0.0401, "step": 7405 }, { "epoch": 2.31, "grad_norm": 0.7802782654762268, "learning_rate": 1.6920535822300145e-05, "loss": 0.0464, "step": 7410 }, { "epoch": 2.31, "grad_norm": 0.9961202144622803, "learning_rate": 1.6916605105253144e-05, "loss": 0.0443, "step": 7415 }, { "epoch": 2.31, "grad_norm": 0.8209372758865356, "learning_rate": 1.691267233836281e-05, "loss": 0.0442, "step": 7420 }, { "epoch": 2.32, "grad_norm": 0.8145427703857422, "learning_rate": 1.6908737522794682e-05, "loss": 0.037, "step": 7425 }, { "epoch": 2.32, "grad_norm": 0.7817115187644958, "learning_rate": 1.6904800659714902e-05, "loss": 0.0333, "step": 7430 }, { "epoch": 2.32, "grad_norm": 0.8446786403656006, "learning_rate": 1.690086175029022e-05, "loss": 0.0337, "step": 7435 }, { "epoch": 2.32, "grad_norm": 1.0260515213012695, "learning_rate": 1.6896920795687998e-05, "loss": 0.0423, "step": 7440 }, { "epoch": 2.32, "grad_norm": 1.1411818265914917, "learning_rate": 1.6892977797076197e-05, "loss": 0.0281, "step": 7445 }, { "epoch": 2.32, "grad_norm": 0.8356356024742126, "learning_rate": 1.6889032755623382e-05, "loss": 0.0392, "step": 7450 }, { "epoch": 2.33, "grad_norm": 0.90280681848526, "learning_rate": 1.6885085672498726e-05, "loss": 0.0372, "step": 7455 }, { "epoch": 2.33, "grad_norm": 0.8793005347251892, "learning_rate": 1.688113654887202e-05, "loss": 0.0345, "step": 7460 }, { "epoch": 2.33, "grad_norm": 1.5199700593948364, "learning_rate": 1.6877185385913636e-05, "loss": 0.0291, "step": 7465 }, { "epoch": 2.33, "grad_norm": 0.762761652469635, "learning_rate": 1.6873232184794566e-05, "loss": 0.0432, "step": 7470 }, { "epoch": 2.33, "grad_norm": 1.3521060943603516, "learning_rate": 1.686927694668641e-05, "loss": 0.0395, "step": 7475 }, { "epoch": 2.33, "grad_norm": 0.9418859481811523, "learning_rate": 1.686531967276135e-05, "loss": 0.0475, "step": 7480 }, { "epoch": 2.33, "grad_norm": 0.8807092905044556, "learning_rate": 1.6861360364192195e-05, "loss": 0.0473, "step": 7485 }, { "epoch": 2.34, "grad_norm": 0.8254885673522949, "learning_rate": 1.685739902215235e-05, "loss": 0.0348, "step": 7490 }, { "epoch": 2.34, "grad_norm": 1.0653108358383179, "learning_rate": 1.6853435647815812e-05, "loss": 0.0506, "step": 7495 }, { "epoch": 2.34, "grad_norm": 0.9384851455688477, "learning_rate": 1.6849470242357197e-05, "loss": 0.0391, "step": 7500 }, { "epoch": 2.34, "grad_norm": 0.7083196640014648, "learning_rate": 1.6845502806951706e-05, "loss": 0.0252, "step": 7505 }, { "epoch": 2.34, "grad_norm": 0.5872582793235779, "learning_rate": 1.6841533342775153e-05, "loss": 0.0424, "step": 7510 }, { "epoch": 2.34, "grad_norm": 0.6886513829231262, "learning_rate": 1.6837561851003956e-05, "loss": 0.0551, "step": 7515 }, { "epoch": 2.35, "grad_norm": 0.934777557849884, "learning_rate": 1.683358833281512e-05, "loss": 0.0367, "step": 7520 }, { "epoch": 2.35, "grad_norm": 1.0215473175048828, "learning_rate": 1.6829612789386265e-05, "loss": 0.0424, "step": 7525 }, { "epoch": 2.35, "grad_norm": 0.6930414438247681, "learning_rate": 1.68256352218956e-05, "loss": 0.0299, "step": 7530 }, { "epoch": 2.35, "grad_norm": 1.0204532146453857, "learning_rate": 1.6821655631521947e-05, "loss": 0.0628, "step": 7535 }, { "epoch": 2.35, "grad_norm": 1.1268620491027832, "learning_rate": 1.681767401944471e-05, "loss": 0.0364, "step": 7540 }, { "epoch": 2.35, "grad_norm": 0.9842634201049805, "learning_rate": 1.6813690386843905e-05, "loss": 0.0375, "step": 7545 }, { "epoch": 2.35, "grad_norm": 0.8676565885543823, "learning_rate": 1.6809704734900148e-05, "loss": 0.0338, "step": 7550 }, { "epoch": 2.36, "grad_norm": 0.8421193361282349, "learning_rate": 1.680571706479464e-05, "loss": 0.0419, "step": 7555 }, { "epoch": 2.36, "grad_norm": 1.695347547531128, "learning_rate": 1.6801727377709195e-05, "loss": 0.0341, "step": 7560 }, { "epoch": 2.36, "grad_norm": 0.8723491430282593, "learning_rate": 1.679773567482622e-05, "loss": 0.0373, "step": 7565 }, { "epoch": 2.36, "grad_norm": 0.7483975291252136, "learning_rate": 1.6793741957328713e-05, "loss": 0.0489, "step": 7570 }, { "epoch": 2.36, "grad_norm": 1.1286433935165405, "learning_rate": 1.6789746226400277e-05, "loss": 0.0392, "step": 7575 }, { "epoch": 2.36, "grad_norm": 0.7481610774993896, "learning_rate": 1.678574848322511e-05, "loss": 0.0394, "step": 7580 }, { "epoch": 2.37, "grad_norm": 0.645449697971344, "learning_rate": 1.6781748728988003e-05, "loss": 0.0386, "step": 7585 }, { "epoch": 2.37, "grad_norm": 0.9019328951835632, "learning_rate": 1.6777746964874346e-05, "loss": 0.0327, "step": 7590 }, { "epoch": 2.37, "grad_norm": 0.5962875485420227, "learning_rate": 1.6773743192070124e-05, "loss": 0.0369, "step": 7595 }, { "epoch": 2.37, "grad_norm": 0.7119027376174927, "learning_rate": 1.6769737411761917e-05, "loss": 0.0344, "step": 7600 }, { "epoch": 2.37, "grad_norm": 0.7260536551475525, "learning_rate": 1.6765729625136906e-05, "loss": 0.0356, "step": 7605 }, { "epoch": 2.37, "grad_norm": 0.9319799542427063, "learning_rate": 1.6761719833382846e-05, "loss": 0.0405, "step": 7610 }, { "epoch": 2.38, "grad_norm": 0.6937612891197205, "learning_rate": 1.6757708037688117e-05, "loss": 0.0273, "step": 7615 }, { "epoch": 2.38, "grad_norm": 0.7246429324150085, "learning_rate": 1.675369423924167e-05, "loss": 0.0343, "step": 7620 }, { "epoch": 2.38, "grad_norm": 0.7110769748687744, "learning_rate": 1.6749678439233058e-05, "loss": 0.0354, "step": 7625 }, { "epoch": 2.38, "grad_norm": 0.9897186160087585, "learning_rate": 1.674566063885242e-05, "loss": 0.0541, "step": 7630 }, { "epoch": 2.38, "grad_norm": 0.7339702248573303, "learning_rate": 1.67416408392905e-05, "loss": 0.0364, "step": 7635 }, { "epoch": 2.38, "grad_norm": 1.0443443059921265, "learning_rate": 1.673761904173863e-05, "loss": 0.0384, "step": 7640 }, { "epoch": 2.38, "grad_norm": 0.766381561756134, "learning_rate": 1.6733595247388723e-05, "loss": 0.0261, "step": 7645 }, { "epoch": 2.39, "grad_norm": 0.9241385459899902, "learning_rate": 1.67295694574333e-05, "loss": 0.0387, "step": 7650 }, { "epoch": 2.39, "grad_norm": 0.869219958782196, "learning_rate": 1.6725541673065465e-05, "loss": 0.0335, "step": 7655 }, { "epoch": 2.39, "grad_norm": 1.7011229991912842, "learning_rate": 1.6721511895478915e-05, "loss": 0.0434, "step": 7660 }, { "epoch": 2.39, "grad_norm": 1.0889408588409424, "learning_rate": 1.6717480125867936e-05, "loss": 0.0436, "step": 7665 }, { "epoch": 2.39, "grad_norm": 0.7818471789360046, "learning_rate": 1.6713446365427402e-05, "loss": 0.0361, "step": 7670 }, { "epoch": 2.39, "grad_norm": 0.7618464231491089, "learning_rate": 1.6709410615352794e-05, "loss": 0.0352, "step": 7675 }, { "epoch": 2.4, "grad_norm": 1.0346121788024902, "learning_rate": 1.670537287684015e-05, "loss": 0.0339, "step": 7680 }, { "epoch": 2.4, "grad_norm": 0.872925877571106, "learning_rate": 1.670133315108613e-05, "loss": 0.039, "step": 7685 }, { "epoch": 2.4, "grad_norm": 0.7551829814910889, "learning_rate": 1.6697291439287965e-05, "loss": 0.0359, "step": 7690 }, { "epoch": 2.4, "grad_norm": 1.0588226318359375, "learning_rate": 1.669324774264348e-05, "loss": 0.0284, "step": 7695 }, { "epoch": 2.4, "grad_norm": 0.8457269668579102, "learning_rate": 1.6689202062351086e-05, "loss": 0.0409, "step": 7700 }, { "epoch": 2.4, "grad_norm": 0.787465512752533, "learning_rate": 1.6685154399609787e-05, "loss": 0.0293, "step": 7705 }, { "epoch": 2.4, "grad_norm": 0.9415322542190552, "learning_rate": 1.6681104755619166e-05, "loss": 0.0391, "step": 7710 }, { "epoch": 2.41, "grad_norm": 0.9193680286407471, "learning_rate": 1.6677053131579397e-05, "loss": 0.0399, "step": 7715 }, { "epoch": 2.41, "grad_norm": 1.6151878833770752, "learning_rate": 1.6672999528691245e-05, "loss": 0.0372, "step": 7720 }, { "epoch": 2.41, "grad_norm": 0.5558480620384216, "learning_rate": 1.666894394815606e-05, "loss": 0.0279, "step": 7725 }, { "epoch": 2.41, "grad_norm": 0.7361923456192017, "learning_rate": 1.6664886391175775e-05, "loss": 0.0357, "step": 7730 }, { "epoch": 2.41, "grad_norm": 0.6120460033416748, "learning_rate": 1.6660826858952902e-05, "loss": 0.0471, "step": 7735 }, { "epoch": 2.41, "grad_norm": 0.5619087815284729, "learning_rate": 1.665676535269056e-05, "loss": 0.0265, "step": 7740 }, { "epoch": 2.42, "grad_norm": 1.0988489389419556, "learning_rate": 1.665270187359243e-05, "loss": 0.0401, "step": 7745 }, { "epoch": 2.42, "grad_norm": 0.712315559387207, "learning_rate": 1.664863642286279e-05, "loss": 0.0414, "step": 7750 }, { "epoch": 2.42, "grad_norm": 1.0724822282791138, "learning_rate": 1.6644569001706502e-05, "loss": 0.0332, "step": 7755 }, { "epoch": 2.42, "grad_norm": 0.9803611040115356, "learning_rate": 1.6640499611329003e-05, "loss": 0.0334, "step": 7760 }, { "epoch": 2.42, "grad_norm": 1.6442325115203857, "learning_rate": 1.6636428252936326e-05, "loss": 0.0294, "step": 7765 }, { "epoch": 2.42, "grad_norm": 1.1340879201889038, "learning_rate": 1.6632354927735076e-05, "loss": 0.0272, "step": 7770 }, { "epoch": 2.42, "grad_norm": 1.017544150352478, "learning_rate": 1.6628279636932453e-05, "loss": 0.0426, "step": 7775 }, { "epoch": 2.43, "grad_norm": 1.0248128175735474, "learning_rate": 1.6624202381736226e-05, "loss": 0.0414, "step": 7780 }, { "epoch": 2.43, "grad_norm": 0.7086641788482666, "learning_rate": 1.662012316335476e-05, "loss": 0.0396, "step": 7785 }, { "epoch": 2.43, "grad_norm": 0.7434113621711731, "learning_rate": 1.6616041982996986e-05, "loss": 0.0437, "step": 7790 }, { "epoch": 2.43, "grad_norm": 0.7166509628295898, "learning_rate": 1.6611958841872427e-05, "loss": 0.0305, "step": 7795 }, { "epoch": 2.43, "grad_norm": 0.7706720232963562, "learning_rate": 1.6607873741191197e-05, "loss": 0.0342, "step": 7800 }, { "epoch": 2.43, "grad_norm": 0.6073039174079895, "learning_rate": 1.6603786682163964e-05, "loss": 0.0418, "step": 7805 }, { "epoch": 2.44, "grad_norm": 0.849389910697937, "learning_rate": 1.6599697666001998e-05, "loss": 0.0389, "step": 7810 }, { "epoch": 2.44, "grad_norm": 0.6125224232673645, "learning_rate": 1.659560669391714e-05, "loss": 0.0318, "step": 7815 }, { "epoch": 2.44, "grad_norm": 0.7484531402587891, "learning_rate": 1.659151376712182e-05, "loss": 0.0497, "step": 7820 }, { "epoch": 2.44, "grad_norm": 0.7708289623260498, "learning_rate": 1.6587418886829034e-05, "loss": 0.0401, "step": 7825 }, { "epoch": 2.44, "grad_norm": 0.958131730556488, "learning_rate": 1.6583322054252363e-05, "loss": 0.0336, "step": 7830 }, { "epoch": 2.44, "grad_norm": 0.7831741571426392, "learning_rate": 1.6579223270605968e-05, "loss": 0.0298, "step": 7835 }, { "epoch": 2.45, "grad_norm": 1.0524176359176636, "learning_rate": 1.657512253710459e-05, "loss": 0.0572, "step": 7840 }, { "epoch": 2.45, "grad_norm": 1.0298731327056885, "learning_rate": 1.657101985496354e-05, "loss": 0.0452, "step": 7845 }, { "epoch": 2.45, "grad_norm": 0.9892203211784363, "learning_rate": 1.656691522539872e-05, "loss": 0.0332, "step": 7850 }, { "epoch": 2.45, "grad_norm": 0.8172314763069153, "learning_rate": 1.656280864962659e-05, "loss": 0.0485, "step": 7855 }, { "epoch": 2.45, "grad_norm": 0.6813853979110718, "learning_rate": 1.6558700128864206e-05, "loss": 0.0351, "step": 7860 }, { "epoch": 2.45, "grad_norm": 0.6233561635017395, "learning_rate": 1.6554589664329186e-05, "loss": 0.0336, "step": 7865 }, { "epoch": 2.45, "grad_norm": 1.1553077697753906, "learning_rate": 1.6550477257239733e-05, "loss": 0.034, "step": 7870 }, { "epoch": 2.46, "grad_norm": 0.5354089140892029, "learning_rate": 1.6546362908814623e-05, "loss": 0.0292, "step": 7875 }, { "epoch": 2.46, "grad_norm": 0.7513230443000793, "learning_rate": 1.654224662027321e-05, "loss": 0.0286, "step": 7880 }, { "epoch": 2.46, "grad_norm": 0.7038884162902832, "learning_rate": 1.653812839283541e-05, "loss": 0.0404, "step": 7885 }, { "epoch": 2.46, "grad_norm": 0.9296578764915466, "learning_rate": 1.653400822772173e-05, "loss": 0.0377, "step": 7890 }, { "epoch": 2.46, "grad_norm": 0.7138383388519287, "learning_rate": 1.652988612615325e-05, "loss": 0.0371, "step": 7895 }, { "epoch": 2.46, "grad_norm": 0.8308192491531372, "learning_rate": 1.652576208935161e-05, "loss": 0.0422, "step": 7900 }, { "epoch": 2.47, "grad_norm": 0.6145361661911011, "learning_rate": 1.652163611853904e-05, "loss": 0.0247, "step": 7905 }, { "epoch": 2.47, "grad_norm": 0.931335985660553, "learning_rate": 1.6517508214938328e-05, "loss": 0.0348, "step": 7910 }, { "epoch": 2.47, "grad_norm": 0.8879193067550659, "learning_rate": 1.651337837977285e-05, "loss": 0.0389, "step": 7915 }, { "epoch": 2.47, "grad_norm": 0.9264506101608276, "learning_rate": 1.6509246614266535e-05, "loss": 0.0344, "step": 7920 }, { "epoch": 2.47, "grad_norm": 0.5814990997314453, "learning_rate": 1.650511291964391e-05, "loss": 0.0334, "step": 7925 }, { "epoch": 2.47, "grad_norm": 0.6779055595397949, "learning_rate": 1.6500977297130054e-05, "loss": 0.0335, "step": 7930 }, { "epoch": 2.47, "grad_norm": 0.857691764831543, "learning_rate": 1.649683974795062e-05, "loss": 0.0377, "step": 7935 }, { "epoch": 2.48, "grad_norm": 0.7544268369674683, "learning_rate": 1.6492700273331835e-05, "loss": 0.036, "step": 7940 }, { "epoch": 2.48, "grad_norm": 0.754554808139801, "learning_rate": 1.6488558874500503e-05, "loss": 0.0423, "step": 7945 }, { "epoch": 2.48, "grad_norm": 0.6650204658508301, "learning_rate": 1.6484415552683986e-05, "loss": 0.0466, "step": 7950 }, { "epoch": 2.48, "grad_norm": 0.9802659153938293, "learning_rate": 1.6480270309110224e-05, "loss": 0.0414, "step": 7955 }, { "epoch": 2.48, "grad_norm": 0.8128763437271118, "learning_rate": 1.647612314500772e-05, "loss": 0.0415, "step": 7960 }, { "epoch": 2.48, "grad_norm": 0.7706061005592346, "learning_rate": 1.6471974061605556e-05, "loss": 0.0298, "step": 7965 }, { "epoch": 2.49, "grad_norm": 0.544342577457428, "learning_rate": 1.6467823060133378e-05, "loss": 0.0357, "step": 7970 }, { "epoch": 2.49, "grad_norm": 0.7460711598396301, "learning_rate": 1.6463670141821394e-05, "loss": 0.0255, "step": 7975 }, { "epoch": 2.49, "grad_norm": 0.8606274724006653, "learning_rate": 1.645951530790039e-05, "loss": 0.0526, "step": 7980 }, { "epoch": 2.49, "grad_norm": 1.019179105758667, "learning_rate": 1.6455358559601715e-05, "loss": 0.0355, "step": 7985 }, { "epoch": 2.49, "grad_norm": 0.9450494050979614, "learning_rate": 1.6451199898157288e-05, "loss": 0.0398, "step": 7990 }, { "epoch": 2.49, "grad_norm": 0.6935722231864929, "learning_rate": 1.6447039324799585e-05, "loss": 0.0446, "step": 7995 }, { "epoch": 2.5, "grad_norm": 0.8222050666809082, "learning_rate": 1.6442876840761667e-05, "loss": 0.0281, "step": 8000 }, { "epoch": 2.5, "grad_norm": 0.6318336129188538, "learning_rate": 1.6438712447277142e-05, "loss": 0.0219, "step": 8005 }, { "epoch": 2.5, "grad_norm": 0.866169810295105, "learning_rate": 1.64345461455802e-05, "loss": 0.0468, "step": 8010 }, { "epoch": 2.5, "grad_norm": 0.8264434337615967, "learning_rate": 1.643037793690558e-05, "loss": 0.0434, "step": 8015 }, { "epoch": 2.5, "grad_norm": 1.024613857269287, "learning_rate": 1.6426207822488603e-05, "loss": 0.0386, "step": 8020 }, { "epoch": 2.5, "grad_norm": 0.7298322319984436, "learning_rate": 1.642203580356515e-05, "loss": 0.0321, "step": 8025 }, { "epoch": 2.5, "grad_norm": 1.083125352859497, "learning_rate": 1.6417861881371654e-05, "loss": 0.0382, "step": 8030 }, { "epoch": 2.51, "grad_norm": 0.6637572050094604, "learning_rate": 1.641368605714513e-05, "loss": 0.0504, "step": 8035 }, { "epoch": 2.51, "grad_norm": 0.6951263546943665, "learning_rate": 1.6409508332123138e-05, "loss": 0.0429, "step": 8040 }, { "epoch": 2.51, "grad_norm": 0.8588437438011169, "learning_rate": 1.6405328707543823e-05, "loss": 0.0212, "step": 8045 }, { "epoch": 2.51, "grad_norm": 0.7385882139205933, "learning_rate": 1.6401147184645877e-05, "loss": 0.0315, "step": 8050 }, { "epoch": 2.51, "grad_norm": 0.523358941078186, "learning_rate": 1.639696376466856e-05, "loss": 0.0245, "step": 8055 }, { "epoch": 2.51, "grad_norm": 0.707894504070282, "learning_rate": 1.639277844885169e-05, "loss": 0.0351, "step": 8060 }, { "epoch": 2.52, "grad_norm": 0.8531205654144287, "learning_rate": 1.6388591238435654e-05, "loss": 0.0412, "step": 8065 }, { "epoch": 2.52, "grad_norm": 0.8357705473899841, "learning_rate": 1.6384402134661397e-05, "loss": 0.0429, "step": 8070 }, { "epoch": 2.52, "grad_norm": 0.7158061861991882, "learning_rate": 1.6380211138770424e-05, "loss": 0.0326, "step": 8075 }, { "epoch": 2.52, "grad_norm": 1.0381453037261963, "learning_rate": 1.63760182520048e-05, "loss": 0.031, "step": 8080 }, { "epoch": 2.52, "grad_norm": 0.9476194381713867, "learning_rate": 1.6371823475607154e-05, "loss": 0.0345, "step": 8085 }, { "epoch": 2.52, "grad_norm": 0.6989636421203613, "learning_rate": 1.6367626810820676e-05, "loss": 0.0266, "step": 8090 }, { "epoch": 2.52, "grad_norm": 0.7869669795036316, "learning_rate": 1.6363428258889108e-05, "loss": 0.04, "step": 8095 }, { "epoch": 2.53, "grad_norm": 1.0312131643295288, "learning_rate": 1.6359227821056752e-05, "loss": 0.0448, "step": 8100 }, { "epoch": 2.53, "grad_norm": 0.9612884521484375, "learning_rate": 1.6355025498568485e-05, "loss": 0.0416, "step": 8105 }, { "epoch": 2.53, "grad_norm": 0.8276956677436829, "learning_rate": 1.6350821292669724e-05, "loss": 0.0361, "step": 8110 }, { "epoch": 2.53, "grad_norm": 0.6667450666427612, "learning_rate": 1.6346615204606448e-05, "loss": 0.0316, "step": 8115 }, { "epoch": 2.53, "grad_norm": 0.8874323964118958, "learning_rate": 1.6342407235625203e-05, "loss": 0.0581, "step": 8120 }, { "epoch": 2.53, "grad_norm": 0.7738590240478516, "learning_rate": 1.633819738697308e-05, "loss": 0.0431, "step": 8125 }, { "epoch": 2.54, "grad_norm": 0.710171639919281, "learning_rate": 1.6333985659897737e-05, "loss": 0.0315, "step": 8130 }, { "epoch": 2.54, "grad_norm": 0.6763156056404114, "learning_rate": 1.632977205564738e-05, "loss": 0.0395, "step": 8135 }, { "epoch": 2.54, "grad_norm": 0.6096729636192322, "learning_rate": 1.632555657547078e-05, "loss": 0.0346, "step": 8140 }, { "epoch": 2.54, "grad_norm": 1.097630500793457, "learning_rate": 1.6321339220617262e-05, "loss": 0.0467, "step": 8145 }, { "epoch": 2.54, "grad_norm": 0.7507611513137817, "learning_rate": 1.6317119992336698e-05, "loss": 0.031, "step": 8150 }, { "epoch": 2.54, "grad_norm": 0.8424447178840637, "learning_rate": 1.631289889187953e-05, "loss": 0.0444, "step": 8155 }, { "epoch": 2.55, "grad_norm": 1.0239499807357788, "learning_rate": 1.630867592049674e-05, "loss": 0.031, "step": 8160 }, { "epoch": 2.55, "grad_norm": 0.6097200512886047, "learning_rate": 1.630445107943987e-05, "loss": 0.0436, "step": 8165 }, { "epoch": 2.55, "grad_norm": 0.662420928478241, "learning_rate": 1.6300224369961022e-05, "loss": 0.0239, "step": 8170 }, { "epoch": 2.55, "grad_norm": 0.47109031677246094, "learning_rate": 1.629599579331285e-05, "loss": 0.0251, "step": 8175 }, { "epoch": 2.55, "grad_norm": 0.7203395366668701, "learning_rate": 1.6291765350748554e-05, "loss": 0.0403, "step": 8180 }, { "epoch": 2.55, "grad_norm": 0.7731189727783203, "learning_rate": 1.6287533043521888e-05, "loss": 0.046, "step": 8185 }, { "epoch": 2.55, "grad_norm": 0.7585968971252441, "learning_rate": 1.6283298872887166e-05, "loss": 0.0396, "step": 8190 }, { "epoch": 2.56, "grad_norm": 0.827131450176239, "learning_rate": 1.6279062840099248e-05, "loss": 0.0289, "step": 8195 }, { "epoch": 2.56, "grad_norm": 0.6474844813346863, "learning_rate": 1.6274824946413552e-05, "loss": 0.0239, "step": 8200 }, { "epoch": 2.56, "grad_norm": 0.7048923969268799, "learning_rate": 1.627058519308604e-05, "loss": 0.0396, "step": 8205 }, { "epoch": 2.56, "grad_norm": 0.568031370639801, "learning_rate": 1.626634358137323e-05, "loss": 0.0452, "step": 8210 }, { "epoch": 2.56, "grad_norm": 0.8100217580795288, "learning_rate": 1.6262100112532194e-05, "loss": 0.039, "step": 8215 }, { "epoch": 2.56, "grad_norm": 0.9056006669998169, "learning_rate": 1.625785478782054e-05, "loss": 0.038, "step": 8220 }, { "epoch": 2.57, "grad_norm": 0.7772448658943176, "learning_rate": 1.6253607608496444e-05, "loss": 0.0369, "step": 8225 }, { "epoch": 2.57, "grad_norm": 0.9542365670204163, "learning_rate": 1.6249358575818618e-05, "loss": 0.0297, "step": 8230 }, { "epoch": 2.57, "grad_norm": 0.48814138770103455, "learning_rate": 1.6245107691046335e-05, "loss": 0.0372, "step": 8235 }, { "epoch": 2.57, "grad_norm": 1.1438723802566528, "learning_rate": 1.6240854955439402e-05, "loss": 0.0459, "step": 8240 }, { "epoch": 2.57, "grad_norm": 0.5761513113975525, "learning_rate": 1.6236600370258193e-05, "loss": 0.0291, "step": 8245 }, { "epoch": 2.57, "grad_norm": 0.7332009077072144, "learning_rate": 1.6232343936763614e-05, "loss": 0.0346, "step": 8250 }, { "epoch": 2.57, "grad_norm": 0.6568591594696045, "learning_rate": 1.6228085656217133e-05, "loss": 0.036, "step": 8255 }, { "epoch": 2.58, "grad_norm": 0.6697298884391785, "learning_rate": 1.6223825529880742e-05, "loss": 0.0268, "step": 8260 }, { "epoch": 2.58, "grad_norm": 0.789516031742096, "learning_rate": 1.621956355901701e-05, "loss": 0.0442, "step": 8265 }, { "epoch": 2.58, "grad_norm": 0.6526065468788147, "learning_rate": 1.621529974488904e-05, "loss": 0.0284, "step": 8270 }, { "epoch": 2.58, "grad_norm": 2.229318857192993, "learning_rate": 1.621103408876046e-05, "loss": 0.0346, "step": 8275 }, { "epoch": 2.58, "grad_norm": 0.6409262418746948, "learning_rate": 1.6206766591895486e-05, "loss": 0.0274, "step": 8280 }, { "epoch": 2.58, "grad_norm": 0.7699871063232422, "learning_rate": 1.6202497255558844e-05, "loss": 0.023, "step": 8285 }, { "epoch": 2.59, "grad_norm": 0.9219619631767273, "learning_rate": 1.6198226081015823e-05, "loss": 0.0511, "step": 8290 }, { "epoch": 2.59, "grad_norm": 0.7852801084518433, "learning_rate": 1.619395306953225e-05, "loss": 0.0361, "step": 8295 }, { "epoch": 2.59, "grad_norm": 0.8048036098480225, "learning_rate": 1.61896782223745e-05, "loss": 0.0308, "step": 8300 }, { "epoch": 2.59, "grad_norm": 0.6462746262550354, "learning_rate": 1.618540154080949e-05, "loss": 0.0314, "step": 8305 }, { "epoch": 2.59, "grad_norm": 0.5217623114585876, "learning_rate": 1.6181123026104674e-05, "loss": 0.0378, "step": 8310 }, { "epoch": 2.59, "grad_norm": 1.0315415859222412, "learning_rate": 1.6176842679528068e-05, "loss": 0.0479, "step": 8315 }, { "epoch": 2.59, "grad_norm": 0.7805516719818115, "learning_rate": 1.6172560502348213e-05, "loss": 0.0318, "step": 8320 }, { "epoch": 2.6, "grad_norm": 0.886910617351532, "learning_rate": 1.61682764958342e-05, "loss": 0.0359, "step": 8325 }, { "epoch": 2.6, "grad_norm": 0.6613612771034241, "learning_rate": 1.616399066125566e-05, "loss": 0.0381, "step": 8330 }, { "epoch": 2.6, "grad_norm": 0.9460324645042419, "learning_rate": 1.6159702999882763e-05, "loss": 0.036, "step": 8335 }, { "epoch": 2.6, "grad_norm": 0.9295498132705688, "learning_rate": 1.615541351298623e-05, "loss": 0.0335, "step": 8340 }, { "epoch": 2.6, "grad_norm": 0.7159280776977539, "learning_rate": 1.6151122201837318e-05, "loss": 0.0375, "step": 8345 }, { "epoch": 2.6, "grad_norm": 0.9947106242179871, "learning_rate": 1.6146829067707818e-05, "loss": 0.0509, "step": 8350 }, { "epoch": 2.61, "grad_norm": 0.8763145208358765, "learning_rate": 1.6142534111870067e-05, "loss": 0.0387, "step": 8355 }, { "epoch": 2.61, "grad_norm": 0.8634383678436279, "learning_rate": 1.613823733559695e-05, "loss": 0.0398, "step": 8360 }, { "epoch": 2.61, "grad_norm": 11.380885124206543, "learning_rate": 1.613393874016187e-05, "loss": 0.035, "step": 8365 }, { "epoch": 2.61, "grad_norm": 0.8703393936157227, "learning_rate": 1.6129638326838796e-05, "loss": 0.038, "step": 8370 }, { "epoch": 2.61, "grad_norm": 1.8195749521255493, "learning_rate": 1.6125336096902214e-05, "loss": 0.0459, "step": 8375 }, { "epoch": 2.61, "grad_norm": 1.143809199333191, "learning_rate": 1.6121032051627164e-05, "loss": 0.0419, "step": 8380 }, { "epoch": 2.62, "grad_norm": 0.9704199433326721, "learning_rate": 1.6116726192289207e-05, "loss": 0.0438, "step": 8385 }, { "epoch": 2.62, "grad_norm": 0.7254340648651123, "learning_rate": 1.611241852016446e-05, "loss": 0.0351, "step": 8390 }, { "epoch": 2.62, "grad_norm": 0.8501814603805542, "learning_rate": 1.6108109036529563e-05, "loss": 0.0456, "step": 8395 }, { "epoch": 2.62, "grad_norm": 3.679652452468872, "learning_rate": 1.6103797742661698e-05, "loss": 0.0321, "step": 8400 }, { "epoch": 2.62, "grad_norm": 1.1169507503509521, "learning_rate": 1.609948463983859e-05, "loss": 0.041, "step": 8405 }, { "epoch": 2.62, "grad_norm": 0.7015743851661682, "learning_rate": 1.6095169729338485e-05, "loss": 0.0355, "step": 8410 }, { "epoch": 2.62, "grad_norm": 0.7521771788597107, "learning_rate": 1.6090853012440183e-05, "loss": 0.032, "step": 8415 }, { "epoch": 2.63, "grad_norm": 0.8876381516456604, "learning_rate": 1.6086534490423002e-05, "loss": 0.0406, "step": 8420 }, { "epoch": 2.63, "grad_norm": 1.3464659452438354, "learning_rate": 1.6082214164566807e-05, "loss": 0.0404, "step": 8425 }, { "epoch": 2.63, "grad_norm": 1.1670219898223877, "learning_rate": 1.6077892036151995e-05, "loss": 0.039, "step": 8430 }, { "epoch": 2.63, "grad_norm": 2.5168001651763916, "learning_rate": 1.60735681064595e-05, "loss": 0.0525, "step": 8435 }, { "epoch": 2.63, "grad_norm": 1.0316020250320435, "learning_rate": 1.6069242376770772e-05, "loss": 0.0376, "step": 8440 }, { "epoch": 2.63, "grad_norm": 0.7442283034324646, "learning_rate": 1.6064914848367818e-05, "loss": 0.0313, "step": 8445 }, { "epoch": 2.64, "grad_norm": 0.9239617586135864, "learning_rate": 1.606058552253317e-05, "loss": 0.0366, "step": 8450 }, { "epoch": 2.64, "grad_norm": 0.3136714994907379, "learning_rate": 1.6056254400549885e-05, "loss": 0.0343, "step": 8455 }, { "epoch": 2.64, "grad_norm": 0.7645418047904968, "learning_rate": 1.605192148370156e-05, "loss": 0.0302, "step": 8460 }, { "epoch": 2.64, "grad_norm": 0.49683353304862976, "learning_rate": 1.6047586773272322e-05, "loss": 0.0404, "step": 8465 }, { "epoch": 2.64, "grad_norm": 0.589810311794281, "learning_rate": 1.6043250270546832e-05, "loss": 0.0346, "step": 8470 }, { "epoch": 2.64, "grad_norm": 0.5914163589477539, "learning_rate": 1.603891197681028e-05, "loss": 0.023, "step": 8475 }, { "epoch": 2.64, "grad_norm": 16.47276496887207, "learning_rate": 1.603457189334838e-05, "loss": 0.0321, "step": 8480 }, { "epoch": 2.65, "grad_norm": 0.7461623549461365, "learning_rate": 1.6030230021447396e-05, "loss": 0.0394, "step": 8485 }, { "epoch": 2.65, "grad_norm": 0.5548524856567383, "learning_rate": 1.60258863623941e-05, "loss": 0.0351, "step": 8490 }, { "epoch": 2.65, "grad_norm": 0.5185383558273315, "learning_rate": 1.60215409174758e-05, "loss": 0.0349, "step": 8495 }, { "epoch": 2.65, "grad_norm": 0.8972347378730774, "learning_rate": 1.601719368798034e-05, "loss": 0.0356, "step": 8500 }, { "epoch": 2.65, "grad_norm": 0.979608952999115, "learning_rate": 1.60128446751961e-05, "loss": 0.0493, "step": 8505 }, { "epoch": 2.65, "grad_norm": 0.9665581583976746, "learning_rate": 1.6008493880411956e-05, "loss": 0.0342, "step": 8510 }, { "epoch": 2.66, "grad_norm": 0.772299587726593, "learning_rate": 1.6004141304917347e-05, "loss": 0.041, "step": 8515 }, { "epoch": 2.66, "grad_norm": 0.6392544507980347, "learning_rate": 1.5999786950002227e-05, "loss": 0.039, "step": 8520 }, { "epoch": 2.66, "grad_norm": 0.9064784049987793, "learning_rate": 1.599543081695707e-05, "loss": 0.0574, "step": 8525 }, { "epoch": 2.66, "grad_norm": 0.8364819288253784, "learning_rate": 1.5991072907072886e-05, "loss": 0.0358, "step": 8530 }, { "epoch": 2.66, "grad_norm": 0.5332923531532288, "learning_rate": 1.5986713221641213e-05, "loss": 0.0312, "step": 8535 }, { "epoch": 2.66, "grad_norm": 0.8230621218681335, "learning_rate": 1.5982351761954113e-05, "loss": 0.0318, "step": 8540 }, { "epoch": 2.67, "grad_norm": 0.9251091480255127, "learning_rate": 1.5977988529304162e-05, "loss": 0.0429, "step": 8545 }, { "epoch": 2.67, "grad_norm": 0.81324702501297, "learning_rate": 1.5973623524984477e-05, "loss": 0.0442, "step": 8550 }, { "epoch": 2.67, "grad_norm": 0.5638898611068726, "learning_rate": 1.5969256750288702e-05, "loss": 0.0434, "step": 8555 }, { "epoch": 2.67, "grad_norm": 2.012404441833496, "learning_rate": 1.596488820651099e-05, "loss": 0.0361, "step": 8560 }, { "epoch": 2.67, "grad_norm": 0.6309984922409058, "learning_rate": 1.5960517894946027e-05, "loss": 0.0313, "step": 8565 }, { "epoch": 2.67, "grad_norm": 1.140384554862976, "learning_rate": 1.5956145816889028e-05, "loss": 0.0411, "step": 8570 }, { "epoch": 2.67, "grad_norm": 1.0145649909973145, "learning_rate": 1.5951771973635722e-05, "loss": 0.0314, "step": 8575 }, { "epoch": 2.68, "grad_norm": 0.9595271944999695, "learning_rate": 1.5947396366482365e-05, "loss": 0.0427, "step": 8580 }, { "epoch": 2.68, "grad_norm": 0.4232027530670166, "learning_rate": 1.5943018996725738e-05, "loss": 0.0325, "step": 8585 }, { "epoch": 2.68, "grad_norm": 0.8087257146835327, "learning_rate": 1.5938639865663142e-05, "loss": 0.0344, "step": 8590 }, { "epoch": 2.68, "grad_norm": 0.615680456161499, "learning_rate": 1.5934258974592402e-05, "loss": 0.0327, "step": 8595 }, { "epoch": 2.68, "grad_norm": 0.7677449584007263, "learning_rate": 1.5929876324811862e-05, "loss": 0.0409, "step": 8600 }, { "epoch": 2.68, "grad_norm": 0.9178354144096375, "learning_rate": 1.5925491917620384e-05, "loss": 0.0303, "step": 8605 }, { "epoch": 2.69, "grad_norm": 0.8850105404853821, "learning_rate": 1.5921105754317364e-05, "loss": 0.0299, "step": 8610 }, { "epoch": 2.69, "grad_norm": 3.832343101501465, "learning_rate": 1.5916717836202705e-05, "loss": 0.053, "step": 8615 }, { "epoch": 2.69, "grad_norm": 0.7948322892189026, "learning_rate": 1.591232816457683e-05, "loss": 0.0353, "step": 8620 }, { "epoch": 2.69, "grad_norm": 0.8507640957832336, "learning_rate": 1.5907936740740693e-05, "loss": 0.041, "step": 8625 }, { "epoch": 2.69, "grad_norm": 0.45911988615989685, "learning_rate": 1.590354356599576e-05, "loss": 0.043, "step": 8630 }, { "epoch": 2.69, "grad_norm": 0.5260618925094604, "learning_rate": 1.5899148641644015e-05, "loss": 0.0317, "step": 8635 }, { "epoch": 2.69, "grad_norm": 0.6386368274688721, "learning_rate": 1.5894751968987956e-05, "loss": 0.0435, "step": 8640 }, { "epoch": 2.7, "grad_norm": 0.847206711769104, "learning_rate": 1.589035354933062e-05, "loss": 0.0453, "step": 8645 }, { "epoch": 2.7, "grad_norm": 0.6548765897750854, "learning_rate": 1.588595338397553e-05, "loss": 0.0366, "step": 8650 }, { "epoch": 2.7, "grad_norm": 0.4999925196170807, "learning_rate": 1.5881551474226754e-05, "loss": 0.0373, "step": 8655 }, { "epoch": 2.7, "grad_norm": 0.6970962882041931, "learning_rate": 1.587714782138887e-05, "loss": 0.0367, "step": 8660 }, { "epoch": 2.7, "grad_norm": 1.4393469095230103, "learning_rate": 1.5872742426766955e-05, "loss": 0.0408, "step": 8665 }, { "epoch": 2.7, "grad_norm": 0.5475620627403259, "learning_rate": 1.5868335291666628e-05, "loss": 0.0215, "step": 8670 }, { "epoch": 2.71, "grad_norm": 1.145965814590454, "learning_rate": 1.5863926417394004e-05, "loss": 0.0453, "step": 8675 }, { "epoch": 2.71, "grad_norm": 0.7778739929199219, "learning_rate": 1.5859515805255728e-05, "loss": 0.0309, "step": 8680 }, { "epoch": 2.71, "grad_norm": 0.993541955947876, "learning_rate": 1.5855103456558953e-05, "loss": 0.0418, "step": 8685 }, { "epoch": 2.71, "grad_norm": 0.7978174686431885, "learning_rate": 1.5850689372611342e-05, "loss": 0.046, "step": 8690 }, { "epoch": 2.71, "grad_norm": 0.6382090449333191, "learning_rate": 1.5846273554721078e-05, "loss": 0.037, "step": 8695 }, { "epoch": 2.71, "grad_norm": 0.6067830324172974, "learning_rate": 1.5841856004196863e-05, "loss": 0.0505, "step": 8700 }, { "epoch": 2.72, "grad_norm": 0.8111315965652466, "learning_rate": 1.5837436722347902e-05, "loss": 0.0422, "step": 8705 }, { "epoch": 2.72, "grad_norm": 0.6436445116996765, "learning_rate": 1.5833015710483918e-05, "loss": 0.0327, "step": 8710 }, { "epoch": 2.72, "grad_norm": 0.6014158725738525, "learning_rate": 1.5828592969915145e-05, "loss": 0.0358, "step": 8715 }, { "epoch": 2.72, "grad_norm": 0.9966750741004944, "learning_rate": 1.5824168501952337e-05, "loss": 0.0407, "step": 8720 }, { "epoch": 2.72, "grad_norm": 1.0554747581481934, "learning_rate": 1.5819742307906744e-05, "loss": 0.0501, "step": 8725 }, { "epoch": 2.72, "grad_norm": 1.0621410608291626, "learning_rate": 1.5815314389090147e-05, "loss": 0.0267, "step": 8730 }, { "epoch": 2.72, "grad_norm": 0.9338129758834839, "learning_rate": 1.5810884746814825e-05, "loss": 0.0459, "step": 8735 }, { "epoch": 2.73, "grad_norm": 0.8834397196769714, "learning_rate": 1.5806453382393573e-05, "loss": 0.0505, "step": 8740 }, { "epoch": 2.73, "grad_norm": 0.958518922328949, "learning_rate": 1.580202029713969e-05, "loss": 0.0535, "step": 8745 }, { "epoch": 2.73, "grad_norm": 0.6857681274414062, "learning_rate": 1.5797585492366993e-05, "loss": 0.0501, "step": 8750 }, { "epoch": 2.73, "grad_norm": 0.6894921660423279, "learning_rate": 1.5793148969389802e-05, "loss": 0.0288, "step": 8755 }, { "epoch": 2.73, "grad_norm": 0.6848697662353516, "learning_rate": 1.5788710729522953e-05, "loss": 0.0331, "step": 8760 }, { "epoch": 2.73, "grad_norm": 0.7238781452178955, "learning_rate": 1.578427077408179e-05, "loss": 0.0358, "step": 8765 }, { "epoch": 2.74, "grad_norm": 0.8410157561302185, "learning_rate": 1.5779829104382154e-05, "loss": 0.0368, "step": 8770 }, { "epoch": 2.74, "grad_norm": 0.7383372187614441, "learning_rate": 1.5775385721740413e-05, "loss": 0.0316, "step": 8775 }, { "epoch": 2.74, "grad_norm": 0.5548520088195801, "learning_rate": 1.5770940627473423e-05, "loss": 0.0395, "step": 8780 }, { "epoch": 2.74, "grad_norm": 0.9778690934181213, "learning_rate": 1.576649382289856e-05, "loss": 0.0483, "step": 8785 }, { "epoch": 2.74, "grad_norm": 0.892069399356842, "learning_rate": 1.5762045309333707e-05, "loss": 0.0348, "step": 8790 }, { "epoch": 2.74, "grad_norm": 0.5767723321914673, "learning_rate": 1.575759508809725e-05, "loss": 0.0346, "step": 8795 }, { "epoch": 2.74, "grad_norm": 0.6027498245239258, "learning_rate": 1.5753143160508076e-05, "loss": 0.0408, "step": 8800 }, { "epoch": 2.75, "grad_norm": 0.7998393177986145, "learning_rate": 1.5748689527885587e-05, "loss": 0.0283, "step": 8805 }, { "epoch": 2.75, "grad_norm": 0.7058614492416382, "learning_rate": 1.5744234191549685e-05, "loss": 0.0349, "step": 8810 }, { "epoch": 2.75, "grad_norm": 0.7166592478752136, "learning_rate": 1.5739777152820775e-05, "loss": 0.0539, "step": 8815 }, { "epoch": 2.75, "grad_norm": 0.6599798202514648, "learning_rate": 1.5735318413019775e-05, "loss": 0.0337, "step": 8820 }, { "epoch": 2.75, "grad_norm": 0.5327255129814148, "learning_rate": 1.57308579734681e-05, "loss": 0.03, "step": 8825 }, { "epoch": 2.75, "grad_norm": 0.7015475630760193, "learning_rate": 1.572639583548767e-05, "loss": 0.0438, "step": 8830 }, { "epoch": 2.76, "grad_norm": 1.1382325887680054, "learning_rate": 1.5721932000400908e-05, "loss": 0.0299, "step": 8835 }, { "epoch": 2.76, "grad_norm": 0.5569730401039124, "learning_rate": 1.571746646953074e-05, "loss": 0.0309, "step": 8840 }, { "epoch": 2.76, "grad_norm": 0.7418985962867737, "learning_rate": 1.57129992442006e-05, "loss": 0.0415, "step": 8845 }, { "epoch": 2.76, "grad_norm": 0.6939743161201477, "learning_rate": 1.5708530325734413e-05, "loss": 0.0315, "step": 8850 }, { "epoch": 2.76, "grad_norm": 0.8243491053581238, "learning_rate": 1.570405971545662e-05, "loss": 0.0397, "step": 8855 }, { "epoch": 2.76, "grad_norm": 0.8243434429168701, "learning_rate": 1.569958741469215e-05, "loss": 0.0512, "step": 8860 }, { "epoch": 2.76, "grad_norm": 0.8044533729553223, "learning_rate": 1.569511342476644e-05, "loss": 0.0292, "step": 8865 }, { "epoch": 2.77, "grad_norm": 0.7083048224449158, "learning_rate": 1.5690637747005427e-05, "loss": 0.0424, "step": 8870 }, { "epoch": 2.77, "grad_norm": 0.6713657379150391, "learning_rate": 1.568616038273555e-05, "loss": 0.0304, "step": 8875 }, { "epoch": 2.77, "grad_norm": 0.6863850355148315, "learning_rate": 1.5681681333283742e-05, "loss": 0.0329, "step": 8880 }, { "epoch": 2.77, "grad_norm": 0.7204381227493286, "learning_rate": 1.567720059997744e-05, "loss": 0.0344, "step": 8885 }, { "epoch": 2.77, "grad_norm": 1.0264719724655151, "learning_rate": 1.567271818414458e-05, "loss": 0.045, "step": 8890 }, { "epoch": 2.77, "grad_norm": 0.983029305934906, "learning_rate": 1.566823408711359e-05, "loss": 0.0392, "step": 8895 }, { "epoch": 2.78, "grad_norm": 0.7892402410507202, "learning_rate": 1.566374831021341e-05, "loss": 0.0343, "step": 8900 }, { "epoch": 2.78, "grad_norm": 0.6004326343536377, "learning_rate": 1.5659260854773465e-05, "loss": 0.0382, "step": 8905 }, { "epoch": 2.78, "grad_norm": 0.7688624262809753, "learning_rate": 1.5654771722123684e-05, "loss": 0.0387, "step": 8910 }, { "epoch": 2.78, "grad_norm": 0.6235817074775696, "learning_rate": 1.565028091359449e-05, "loss": 0.0385, "step": 8915 }, { "epoch": 2.78, "grad_norm": 0.8388769030570984, "learning_rate": 1.56457884305168e-05, "loss": 0.0333, "step": 8920 }, { "epoch": 2.78, "grad_norm": 0.8253457546234131, "learning_rate": 1.564129427422204e-05, "loss": 0.031, "step": 8925 }, { "epoch": 2.79, "grad_norm": 0.765497088432312, "learning_rate": 1.5636798446042116e-05, "loss": 0.0305, "step": 8930 }, { "epoch": 2.79, "grad_norm": 1.0711227655410767, "learning_rate": 1.5632300947309437e-05, "loss": 0.0349, "step": 8935 }, { "epoch": 2.79, "grad_norm": 0.7266090512275696, "learning_rate": 1.562780177935691e-05, "loss": 0.033, "step": 8940 }, { "epoch": 2.79, "grad_norm": 0.9013699293136597, "learning_rate": 1.562330094351793e-05, "loss": 0.0403, "step": 8945 }, { "epoch": 2.79, "grad_norm": 0.77614825963974, "learning_rate": 1.5618798441126392e-05, "loss": 0.031, "step": 8950 }, { "epoch": 2.79, "grad_norm": 0.7244963049888611, "learning_rate": 1.5614294273516684e-05, "loss": 0.0459, "step": 8955 }, { "epoch": 2.79, "grad_norm": 0.6892554759979248, "learning_rate": 1.560978844202368e-05, "loss": 0.0402, "step": 8960 }, { "epoch": 2.8, "grad_norm": 0.7315613627433777, "learning_rate": 1.560528094798276e-05, "loss": 0.0422, "step": 8965 }, { "epoch": 2.8, "grad_norm": 1.277847170829773, "learning_rate": 1.5600771792729782e-05, "loss": 0.045, "step": 8970 }, { "epoch": 2.8, "grad_norm": 0.834357500076294, "learning_rate": 1.5596260977601113e-05, "loss": 0.0397, "step": 8975 }, { "epoch": 2.8, "grad_norm": 0.7074677348136902, "learning_rate": 1.55917485039336e-05, "loss": 0.0314, "step": 8980 }, { "epoch": 2.8, "grad_norm": 0.9965816736221313, "learning_rate": 1.5587234373064577e-05, "loss": 0.0384, "step": 8985 }, { "epoch": 2.8, "grad_norm": 0.8888028264045715, "learning_rate": 1.5582718586331888e-05, "loss": 0.0366, "step": 8990 }, { "epoch": 2.81, "grad_norm": 0.8134787082672119, "learning_rate": 1.557820114507385e-05, "loss": 0.0442, "step": 8995 }, { "epoch": 2.81, "grad_norm": 0.8666115403175354, "learning_rate": 1.557368205062928e-05, "loss": 0.0277, "step": 9000 }, { "epoch": 2.81, "grad_norm": 0.7883780598640442, "learning_rate": 1.556916130433748e-05, "loss": 0.0452, "step": 9005 }, { "epoch": 2.81, "grad_norm": 1.1791362762451172, "learning_rate": 1.5564638907538246e-05, "loss": 0.0391, "step": 9010 }, { "epoch": 2.81, "grad_norm": 0.7088319063186646, "learning_rate": 1.5560114861571853e-05, "loss": 0.0289, "step": 9015 }, { "epoch": 2.81, "grad_norm": 0.5044339895248413, "learning_rate": 1.555558916777908e-05, "loss": 0.0339, "step": 9020 }, { "epoch": 2.81, "grad_norm": 0.5306674838066101, "learning_rate": 1.5551061827501185e-05, "loss": 0.0318, "step": 9025 }, { "epoch": 2.82, "grad_norm": 0.4773883819580078, "learning_rate": 1.5546532842079917e-05, "loss": 0.0394, "step": 9030 }, { "epoch": 2.82, "grad_norm": 1.0266894102096558, "learning_rate": 1.5542002212857506e-05, "loss": 0.0397, "step": 9035 }, { "epoch": 2.82, "grad_norm": 0.6027147173881531, "learning_rate": 1.553746994117668e-05, "loss": 0.0353, "step": 9040 }, { "epoch": 2.82, "grad_norm": 0.8161146640777588, "learning_rate": 1.5532936028380642e-05, "loss": 0.0288, "step": 9045 }, { "epoch": 2.82, "grad_norm": 0.6301555633544922, "learning_rate": 1.5528400475813093e-05, "loss": 0.039, "step": 9050 }, { "epoch": 2.82, "grad_norm": 0.7754611968994141, "learning_rate": 1.552386328481821e-05, "loss": 0.0396, "step": 9055 }, { "epoch": 2.83, "grad_norm": 0.8263349533081055, "learning_rate": 1.5519324456740665e-05, "loss": 0.036, "step": 9060 }, { "epoch": 2.83, "grad_norm": 0.5884903073310852, "learning_rate": 1.5514783992925607e-05, "loss": 0.0289, "step": 9065 }, { "epoch": 2.83, "grad_norm": 0.5677090883255005, "learning_rate": 1.551024189471867e-05, "loss": 0.0342, "step": 9070 }, { "epoch": 2.83, "grad_norm": 0.6854045391082764, "learning_rate": 1.5505698163465986e-05, "loss": 0.0365, "step": 9075 }, { "epoch": 2.83, "grad_norm": 0.9792048931121826, "learning_rate": 1.5501152800514147e-05, "loss": 0.033, "step": 9080 }, { "epoch": 2.83, "grad_norm": 0.9109053015708923, "learning_rate": 1.5496605807210253e-05, "loss": 0.0429, "step": 9085 }, { "epoch": 2.84, "grad_norm": 0.7893013954162598, "learning_rate": 1.5492057184901867e-05, "loss": 0.0392, "step": 9090 }, { "epoch": 2.84, "grad_norm": 0.5471526384353638, "learning_rate": 1.5487506934937047e-05, "loss": 0.0398, "step": 9095 }, { "epoch": 2.84, "grad_norm": 1.0387884378433228, "learning_rate": 1.5482955058664337e-05, "loss": 0.0432, "step": 9100 }, { "epoch": 2.84, "grad_norm": 0.7424880266189575, "learning_rate": 1.547840155743275e-05, "loss": 0.023, "step": 9105 }, { "epoch": 2.84, "grad_norm": 0.5438539981842041, "learning_rate": 1.547384643259178e-05, "loss": 0.0294, "step": 9110 }, { "epoch": 2.84, "grad_norm": 0.5898480415344238, "learning_rate": 1.5469289685491422e-05, "loss": 0.0261, "step": 9115 }, { "epoch": 2.84, "grad_norm": 0.6443936228752136, "learning_rate": 1.546473131748213e-05, "loss": 0.0413, "step": 9120 }, { "epoch": 2.85, "grad_norm": 0.6669382452964783, "learning_rate": 1.5460171329914856e-05, "loss": 0.049, "step": 9125 }, { "epoch": 2.85, "grad_norm": 0.5739983320236206, "learning_rate": 1.545560972414101e-05, "loss": 0.0321, "step": 9130 }, { "epoch": 2.85, "grad_norm": 0.9604806303977966, "learning_rate": 1.5451046501512505e-05, "loss": 0.0473, "step": 9135 }, { "epoch": 2.85, "grad_norm": 0.5344032049179077, "learning_rate": 1.544648166338172e-05, "loss": 0.0317, "step": 9140 }, { "epoch": 2.85, "grad_norm": 0.9302431344985962, "learning_rate": 1.5441915211101518e-05, "loss": 0.039, "step": 9145 }, { "epoch": 2.85, "grad_norm": 0.7158989906311035, "learning_rate": 1.543734714602523e-05, "loss": 0.031, "step": 9150 }, { "epoch": 2.86, "grad_norm": 0.6064074635505676, "learning_rate": 1.5432777469506683e-05, "loss": 0.0263, "step": 9155 }, { "epoch": 2.86, "grad_norm": 0.8086766004562378, "learning_rate": 1.5428206182900165e-05, "loss": 0.0398, "step": 9160 }, { "epoch": 2.86, "grad_norm": 0.8926895260810852, "learning_rate": 1.542363328756045e-05, "loss": 0.0323, "step": 9165 }, { "epoch": 2.86, "grad_norm": 0.9440407752990723, "learning_rate": 1.541905878484279e-05, "loss": 0.0348, "step": 9170 }, { "epoch": 2.86, "grad_norm": 0.7965365648269653, "learning_rate": 1.5414482676102905e-05, "loss": 0.0382, "step": 9175 }, { "epoch": 2.86, "grad_norm": 0.6618263125419617, "learning_rate": 1.5409904962696994e-05, "loss": 0.0309, "step": 9180 }, { "epoch": 2.86, "grad_norm": 0.8952043652534485, "learning_rate": 1.540532564598174e-05, "loss": 0.0383, "step": 9185 }, { "epoch": 2.87, "grad_norm": 2.7503764629364014, "learning_rate": 1.540074472731429e-05, "loss": 0.0354, "step": 9190 }, { "epoch": 2.87, "grad_norm": 0.796486496925354, "learning_rate": 1.539616220805227e-05, "loss": 0.0404, "step": 9195 }, { "epoch": 2.87, "grad_norm": 0.6163582801818848, "learning_rate": 1.5391578089553785e-05, "loss": 0.0379, "step": 9200 }, { "epoch": 2.87, "grad_norm": 0.635602593421936, "learning_rate": 1.5386992373177406e-05, "loss": 0.0462, "step": 9205 }, { "epoch": 2.87, "grad_norm": 0.812752366065979, "learning_rate": 1.5382405060282187e-05, "loss": 0.0385, "step": 9210 }, { "epoch": 2.87, "grad_norm": 0.7380738854408264, "learning_rate": 1.537781615222764e-05, "loss": 0.0322, "step": 9215 }, { "epoch": 2.88, "grad_norm": 0.7037160992622375, "learning_rate": 1.5373225650373763e-05, "loss": 0.0338, "step": 9220 }, { "epoch": 2.88, "grad_norm": 0.8600029349327087, "learning_rate": 1.536863355608103e-05, "loss": 0.0361, "step": 9225 }, { "epoch": 2.88, "grad_norm": 0.7166641354560852, "learning_rate": 1.5364039870710368e-05, "loss": 0.0361, "step": 9230 }, { "epoch": 2.88, "grad_norm": 1.3040114641189575, "learning_rate": 1.5359444595623192e-05, "loss": 0.0556, "step": 9235 }, { "epoch": 2.88, "grad_norm": 0.7275735139846802, "learning_rate": 1.5354847732181384e-05, "loss": 0.0467, "step": 9240 }, { "epoch": 2.88, "grad_norm": 0.890828549861908, "learning_rate": 1.5350249281747297e-05, "loss": 0.05, "step": 9245 }, { "epoch": 2.89, "grad_norm": 0.6557976007461548, "learning_rate": 1.5345649245683748e-05, "loss": 0.0439, "step": 9250 }, { "epoch": 2.89, "grad_norm": 0.8745369911193848, "learning_rate": 1.5341047625354033e-05, "loss": 0.0373, "step": 9255 }, { "epoch": 2.89, "grad_norm": 0.7743590474128723, "learning_rate": 1.5336444422121913e-05, "loss": 0.0293, "step": 9260 }, { "epoch": 2.89, "grad_norm": 1.1002082824707031, "learning_rate": 1.5331839637351623e-05, "loss": 0.0362, "step": 9265 }, { "epoch": 2.89, "grad_norm": 0.8608964681625366, "learning_rate": 1.532723327240785e-05, "loss": 0.0396, "step": 9270 }, { "epoch": 2.89, "grad_norm": 0.35328683257102966, "learning_rate": 1.5322625328655776e-05, "loss": 0.0317, "step": 9275 }, { "epoch": 2.89, "grad_norm": 0.7710239887237549, "learning_rate": 1.531801580746103e-05, "loss": 0.0373, "step": 9280 }, { "epoch": 2.9, "grad_norm": 0.8235945105552673, "learning_rate": 1.5313404710189715e-05, "loss": 0.0372, "step": 9285 }, { "epoch": 2.9, "grad_norm": 0.777231752872467, "learning_rate": 1.5308792038208406e-05, "loss": 0.0353, "step": 9290 }, { "epoch": 2.9, "grad_norm": 1.1153934001922607, "learning_rate": 1.530417779288413e-05, "loss": 0.0429, "step": 9295 }, { "epoch": 2.9, "grad_norm": 1.0209358930587769, "learning_rate": 1.52995619755844e-05, "loss": 0.0405, "step": 9300 }, { "epoch": 2.9, "grad_norm": 1.0184146165847778, "learning_rate": 1.529494458767718e-05, "loss": 0.0435, "step": 9305 }, { "epoch": 2.9, "grad_norm": 0.7970442771911621, "learning_rate": 1.5290325630530906e-05, "loss": 0.0537, "step": 9310 }, { "epoch": 2.91, "grad_norm": 0.7759125828742981, "learning_rate": 1.528570510551448e-05, "loss": 0.0372, "step": 9315 }, { "epoch": 2.91, "grad_norm": 0.9187195301055908, "learning_rate": 1.5281083013997262e-05, "loss": 0.0328, "step": 9320 }, { "epoch": 2.91, "grad_norm": 1.1653680801391602, "learning_rate": 1.5276459357349082e-05, "loss": 0.0424, "step": 9325 }, { "epoch": 2.91, "grad_norm": 1.6130503416061401, "learning_rate": 1.5271834136940235e-05, "loss": 0.036, "step": 9330 }, { "epoch": 2.91, "grad_norm": 0.6654280424118042, "learning_rate": 1.5267207354141474e-05, "loss": 0.0251, "step": 9335 }, { "epoch": 2.91, "grad_norm": 0.9446947574615479, "learning_rate": 1.5262579010324025e-05, "loss": 0.0378, "step": 9340 }, { "epoch": 2.91, "grad_norm": 0.7537127733230591, "learning_rate": 1.5257949106859555e-05, "loss": 0.0372, "step": 9345 }, { "epoch": 2.92, "grad_norm": 0.6928290128707886, "learning_rate": 1.5253317645120218e-05, "loss": 0.0382, "step": 9350 }, { "epoch": 2.92, "grad_norm": 1.0862854719161987, "learning_rate": 1.5248684626478622e-05, "loss": 0.045, "step": 9355 }, { "epoch": 2.92, "grad_norm": 1.0388673543930054, "learning_rate": 1.5244050052307829e-05, "loss": 0.0293, "step": 9360 }, { "epoch": 2.92, "grad_norm": 0.8689137101173401, "learning_rate": 1.5239413923981365e-05, "loss": 0.0383, "step": 9365 }, { "epoch": 2.92, "grad_norm": 1.059084177017212, "learning_rate": 1.5234776242873226e-05, "loss": 0.045, "step": 9370 }, { "epoch": 2.92, "grad_norm": 0.9204691052436829, "learning_rate": 1.5230137010357852e-05, "loss": 0.0314, "step": 9375 }, { "epoch": 2.93, "grad_norm": 0.969298779964447, "learning_rate": 1.522549622781016e-05, "loss": 0.0368, "step": 9380 }, { "epoch": 2.93, "grad_norm": 1.2767212390899658, "learning_rate": 1.5220853896605508e-05, "loss": 0.0552, "step": 9385 }, { "epoch": 2.93, "grad_norm": 0.731755256652832, "learning_rate": 1.5216210018119735e-05, "loss": 0.0419, "step": 9390 }, { "epoch": 2.93, "grad_norm": 0.8874984383583069, "learning_rate": 1.5211564593729118e-05, "loss": 0.0392, "step": 9395 }, { "epoch": 2.93, "grad_norm": 1.087302803993225, "learning_rate": 1.5206917624810404e-05, "loss": 0.0398, "step": 9400 }, { "epoch": 2.93, "grad_norm": 0.6841315031051636, "learning_rate": 1.5202269112740795e-05, "loss": 0.0283, "step": 9405 }, { "epoch": 2.93, "grad_norm": 0.8801833987236023, "learning_rate": 1.5197619058897945e-05, "loss": 0.041, "step": 9410 }, { "epoch": 2.94, "grad_norm": 1.006748080253601, "learning_rate": 1.5192967464659976e-05, "loss": 0.048, "step": 9415 }, { "epoch": 2.94, "grad_norm": 0.7825761437416077, "learning_rate": 1.5188314331405454e-05, "loss": 0.0514, "step": 9420 }, { "epoch": 2.94, "grad_norm": 0.8253602981567383, "learning_rate": 1.5183659660513415e-05, "loss": 0.0369, "step": 9425 }, { "epoch": 2.94, "grad_norm": 0.8670963048934937, "learning_rate": 1.5179003453363334e-05, "loss": 0.0303, "step": 9430 }, { "epoch": 2.94, "grad_norm": 0.697066605091095, "learning_rate": 1.5174345711335157e-05, "loss": 0.0349, "step": 9435 }, { "epoch": 2.94, "grad_norm": 0.9415462613105774, "learning_rate": 1.5169686435809275e-05, "loss": 0.039, "step": 9440 }, { "epoch": 2.95, "grad_norm": 1.189955472946167, "learning_rate": 1.5165025628166538e-05, "loss": 0.0465, "step": 9445 }, { "epoch": 2.95, "grad_norm": 0.801723837852478, "learning_rate": 1.5160363289788249e-05, "loss": 0.0294, "step": 9450 }, { "epoch": 2.95, "grad_norm": 0.7355833649635315, "learning_rate": 1.515569942205616e-05, "loss": 0.0416, "step": 9455 }, { "epoch": 2.95, "grad_norm": 0.7931221723556519, "learning_rate": 1.5151034026352492e-05, "loss": 0.0344, "step": 9460 }, { "epoch": 2.95, "grad_norm": 0.8118154406547546, "learning_rate": 1.514636710405989e-05, "loss": 0.0364, "step": 9465 }, { "epoch": 2.95, "grad_norm": 0.6183436512947083, "learning_rate": 1.5141698656561486e-05, "loss": 0.0297, "step": 9470 }, { "epoch": 2.96, "grad_norm": 1.0786751508712769, "learning_rate": 1.5137028685240838e-05, "loss": 0.0464, "step": 9475 }, { "epoch": 2.96, "grad_norm": 1.1322855949401855, "learning_rate": 1.5132357191481966e-05, "loss": 0.0668, "step": 9480 }, { "epoch": 2.96, "grad_norm": 0.7116437554359436, "learning_rate": 1.512768417666934e-05, "loss": 0.0533, "step": 9485 }, { "epoch": 2.96, "grad_norm": 0.8672776818275452, "learning_rate": 1.5123009642187879e-05, "loss": 0.0419, "step": 9490 }, { "epoch": 2.96, "grad_norm": 0.6347663402557373, "learning_rate": 1.5118333589422953e-05, "loss": 0.0448, "step": 9495 }, { "epoch": 2.96, "grad_norm": 0.9350044131278992, "learning_rate": 1.511365601976039e-05, "loss": 0.0416, "step": 9500 }, { "epoch": 2.96, "grad_norm": 0.8266491889953613, "learning_rate": 1.5108976934586452e-05, "loss": 0.0377, "step": 9505 }, { "epoch": 2.97, "grad_norm": 0.4777359962463379, "learning_rate": 1.5104296335287863e-05, "loss": 0.0273, "step": 9510 }, { "epoch": 2.97, "grad_norm": 0.7290952801704407, "learning_rate": 1.5099614223251793e-05, "loss": 0.0391, "step": 9515 }, { "epoch": 2.97, "grad_norm": 0.606194019317627, "learning_rate": 1.5094930599865852e-05, "loss": 0.0306, "step": 9520 }, { "epoch": 2.97, "grad_norm": 0.8185514211654663, "learning_rate": 1.5090245466518112e-05, "loss": 0.0402, "step": 9525 }, { "epoch": 2.97, "grad_norm": 1.31226646900177, "learning_rate": 1.5085558824597076e-05, "loss": 0.0432, "step": 9530 }, { "epoch": 2.97, "grad_norm": 0.9394533634185791, "learning_rate": 1.5080870675491714e-05, "loss": 0.0419, "step": 9535 }, { "epoch": 2.98, "grad_norm": 0.8299448490142822, "learning_rate": 1.5076181020591425e-05, "loss": 0.0301, "step": 9540 }, { "epoch": 2.98, "grad_norm": 0.7116577625274658, "learning_rate": 1.5071489861286062e-05, "loss": 0.0246, "step": 9545 }, { "epoch": 2.98, "grad_norm": 0.9290896058082581, "learning_rate": 1.5066797198965923e-05, "loss": 0.0386, "step": 9550 }, { "epoch": 2.98, "grad_norm": 0.9579975008964539, "learning_rate": 1.5062103035021754e-05, "loss": 0.0434, "step": 9555 }, { "epoch": 2.98, "grad_norm": 0.7418085336685181, "learning_rate": 1.5057407370844736e-05, "loss": 0.0359, "step": 9560 }, { "epoch": 2.98, "grad_norm": 0.9520347118377686, "learning_rate": 1.5052710207826511e-05, "loss": 0.0497, "step": 9565 }, { "epoch": 2.98, "grad_norm": 0.5138440132141113, "learning_rate": 1.504801154735915e-05, "loss": 0.027, "step": 9570 }, { "epoch": 2.99, "grad_norm": 0.5888009071350098, "learning_rate": 1.5043311390835178e-05, "loss": 0.0337, "step": 9575 }, { "epoch": 2.99, "grad_norm": 0.5701594352722168, "learning_rate": 1.5038609739647559e-05, "loss": 0.0351, "step": 9580 }, { "epoch": 2.99, "grad_norm": 1.0097880363464355, "learning_rate": 1.5033906595189698e-05, "loss": 0.0332, "step": 9585 }, { "epoch": 2.99, "grad_norm": 1.3016626834869385, "learning_rate": 1.5029201958855445e-05, "loss": 0.0408, "step": 9590 }, { "epoch": 2.99, "grad_norm": 0.9115636348724365, "learning_rate": 1.5024495832039091e-05, "loss": 0.0517, "step": 9595 }, { "epoch": 2.99, "grad_norm": 0.7627093195915222, "learning_rate": 1.5019788216135376e-05, "loss": 0.0424, "step": 9600 }, { "epoch": 3.0, "grad_norm": 0.6963844895362854, "learning_rate": 1.5015079112539468e-05, "loss": 0.0339, "step": 9605 }, { "epoch": 3.0, "grad_norm": 0.7055118083953857, "learning_rate": 1.5010368522646986e-05, "loss": 0.0383, "step": 9610 }, { "epoch": 3.0, "grad_norm": 0.6209856271743774, "learning_rate": 1.5005656447853984e-05, "loss": 0.0436, "step": 9615 }, { "epoch": 3.0, "grad_norm": 0.5300926566123962, "learning_rate": 1.5000942889556965e-05, "loss": 0.0292, "step": 9620 }, { "epoch": 3.0, "grad_norm": 0.6936347484588623, "learning_rate": 1.499622784915286e-05, "loss": 0.0238, "step": 9625 }, { "epoch": 3.0, "grad_norm": 0.4455910325050354, "learning_rate": 1.4991511328039046e-05, "loss": 0.0218, "step": 9630 }, { "epoch": 3.01, "grad_norm": 0.6577441096305847, "learning_rate": 1.4986793327613334e-05, "loss": 0.0235, "step": 9635 }, { "epoch": 3.01, "grad_norm": 0.698058009147644, "learning_rate": 1.498207384927398e-05, "loss": 0.0157, "step": 9640 }, { "epoch": 3.01, "grad_norm": 0.3731990158557892, "learning_rate": 1.4977352894419674e-05, "loss": 0.0199, "step": 9645 }, { "epoch": 3.01, "grad_norm": 0.8323873281478882, "learning_rate": 1.4972630464449547e-05, "loss": 0.0175, "step": 9650 }, { "epoch": 3.01, "grad_norm": 0.7002973556518555, "learning_rate": 1.4967906560763158e-05, "loss": 0.024, "step": 9655 }, { "epoch": 3.01, "grad_norm": 0.751404345035553, "learning_rate": 1.4963181184760516e-05, "loss": 0.025, "step": 9660 }, { "epoch": 3.01, "grad_norm": 0.7667729258537292, "learning_rate": 1.4958454337842054e-05, "loss": 0.0234, "step": 9665 }, { "epoch": 3.02, "grad_norm": 0.39397770166397095, "learning_rate": 1.495372602140865e-05, "loss": 0.0135, "step": 9670 }, { "epoch": 3.02, "grad_norm": 1.5528745651245117, "learning_rate": 1.4948996236861613e-05, "loss": 0.0168, "step": 9675 }, { "epoch": 3.02, "grad_norm": 0.8898313045501709, "learning_rate": 1.494426498560269e-05, "loss": 0.019, "step": 9680 }, { "epoch": 3.02, "grad_norm": 0.6876255869865417, "learning_rate": 1.4939532269034058e-05, "loss": 0.025, "step": 9685 }, { "epoch": 3.02, "grad_norm": 0.4512489438056946, "learning_rate": 1.493479808855833e-05, "loss": 0.0208, "step": 9690 }, { "epoch": 3.02, "grad_norm": 0.8061213493347168, "learning_rate": 1.4930062445578557e-05, "loss": 0.0211, "step": 9695 }, { "epoch": 3.03, "grad_norm": 0.3963974118232727, "learning_rate": 1.492532534149822e-05, "loss": 0.0144, "step": 9700 }, { "epoch": 3.03, "grad_norm": 0.4634009301662445, "learning_rate": 1.4920586777721231e-05, "loss": 0.0178, "step": 9705 }, { "epoch": 3.03, "grad_norm": 0.7113550901412964, "learning_rate": 1.491584675565194e-05, "loss": 0.0161, "step": 9710 }, { "epoch": 3.03, "grad_norm": 0.6108189821243286, "learning_rate": 1.4911105276695126e-05, "loss": 0.0185, "step": 9715 }, { "epoch": 3.03, "grad_norm": 0.8891080021858215, "learning_rate": 1.4906362342255996e-05, "loss": 0.0229, "step": 9720 }, { "epoch": 3.03, "grad_norm": 0.5516975522041321, "learning_rate": 1.4901617953740196e-05, "loss": 0.0235, "step": 9725 }, { "epoch": 3.03, "grad_norm": 0.7460342049598694, "learning_rate": 1.48968721125538e-05, "loss": 0.0264, "step": 9730 }, { "epoch": 3.04, "grad_norm": 0.5245157480239868, "learning_rate": 1.4892124820103305e-05, "loss": 0.0217, "step": 9735 }, { "epoch": 3.04, "grad_norm": 0.48179706931114197, "learning_rate": 1.4887376077795651e-05, "loss": 0.0146, "step": 9740 }, { "epoch": 3.04, "grad_norm": 1.0343332290649414, "learning_rate": 1.4882625887038202e-05, "loss": 0.0228, "step": 9745 }, { "epoch": 3.04, "grad_norm": 0.7103240489959717, "learning_rate": 1.4877874249238751e-05, "loss": 0.0198, "step": 9750 }, { "epoch": 3.04, "grad_norm": 0.5932292938232422, "learning_rate": 1.4873121165805516e-05, "loss": 0.023, "step": 9755 }, { "epoch": 3.04, "grad_norm": 0.48035189509391785, "learning_rate": 1.4868366638147145e-05, "loss": 0.0192, "step": 9760 }, { "epoch": 3.05, "grad_norm": 0.2648720145225525, "learning_rate": 1.4863610667672723e-05, "loss": 0.0146, "step": 9765 }, { "epoch": 3.05, "grad_norm": 0.3342938721179962, "learning_rate": 1.4858853255791754e-05, "loss": 0.0151, "step": 9770 }, { "epoch": 3.05, "grad_norm": 0.8612216711044312, "learning_rate": 1.4854094403914167e-05, "loss": 0.0222, "step": 9775 }, { "epoch": 3.05, "grad_norm": 0.5093660950660706, "learning_rate": 1.4849334113450322e-05, "loss": 0.0125, "step": 9780 }, { "epoch": 3.05, "grad_norm": 0.5830073952674866, "learning_rate": 1.484457238581101e-05, "loss": 0.0205, "step": 9785 }, { "epoch": 3.05, "grad_norm": 0.3554871678352356, "learning_rate": 1.4839809222407443e-05, "loss": 0.0193, "step": 9790 }, { "epoch": 3.06, "grad_norm": 0.6692444086074829, "learning_rate": 1.4835044624651252e-05, "loss": 0.0181, "step": 9795 }, { "epoch": 3.06, "grad_norm": 0.7528447508811951, "learning_rate": 1.4830278593954501e-05, "loss": 0.0182, "step": 9800 }, { "epoch": 3.06, "grad_norm": 0.5975103974342346, "learning_rate": 1.4825511131729684e-05, "loss": 0.0216, "step": 9805 }, { "epoch": 3.06, "grad_norm": 0.7612311840057373, "learning_rate": 1.4820742239389705e-05, "loss": 0.0207, "step": 9810 }, { "epoch": 3.06, "grad_norm": 0.3679894208908081, "learning_rate": 1.48159719183479e-05, "loss": 0.016, "step": 9815 }, { "epoch": 3.06, "grad_norm": 0.4748607575893402, "learning_rate": 1.4811200170018032e-05, "loss": 0.0219, "step": 9820 }, { "epoch": 3.06, "grad_norm": 0.6697176098823547, "learning_rate": 1.480642699581428e-05, "loss": 0.0271, "step": 9825 }, { "epoch": 3.07, "grad_norm": 0.5894784331321716, "learning_rate": 1.480165239715125e-05, "loss": 0.0217, "step": 9830 }, { "epoch": 3.07, "grad_norm": 0.6295979619026184, "learning_rate": 1.4796876375443966e-05, "loss": 0.0236, "step": 9835 }, { "epoch": 3.07, "grad_norm": 0.8625422716140747, "learning_rate": 1.4792098932107877e-05, "loss": 0.0214, "step": 9840 }, { "epoch": 3.07, "grad_norm": 0.5024229288101196, "learning_rate": 1.4787320068558853e-05, "loss": 0.0191, "step": 9845 }, { "epoch": 3.07, "grad_norm": 0.44266295433044434, "learning_rate": 1.4782539786213184e-05, "loss": 0.0175, "step": 9850 }, { "epoch": 3.07, "grad_norm": 0.623786985874176, "learning_rate": 1.4777758086487581e-05, "loss": 0.0204, "step": 9855 }, { "epoch": 3.08, "grad_norm": 0.5398198366165161, "learning_rate": 1.4772974970799177e-05, "loss": 0.0212, "step": 9860 }, { "epoch": 3.08, "grad_norm": 0.4434782564640045, "learning_rate": 1.4768190440565517e-05, "loss": 0.0144, "step": 9865 }, { "epoch": 3.08, "grad_norm": 0.8131916522979736, "learning_rate": 1.4763404497204578e-05, "loss": 0.0177, "step": 9870 }, { "epoch": 3.08, "grad_norm": 0.6429072618484497, "learning_rate": 1.4758617142134749e-05, "loss": 0.022, "step": 9875 }, { "epoch": 3.08, "grad_norm": 0.4914596974849701, "learning_rate": 1.4753828376774823e-05, "loss": 0.0151, "step": 9880 }, { "epoch": 3.08, "grad_norm": 0.4990901052951813, "learning_rate": 1.4749038202544044e-05, "loss": 0.0187, "step": 9885 }, { "epoch": 3.08, "grad_norm": 0.7438767552375793, "learning_rate": 1.4744246620862041e-05, "loss": 0.0179, "step": 9890 }, { "epoch": 3.09, "grad_norm": 0.7084353566169739, "learning_rate": 1.4739453633148881e-05, "loss": 0.0201, "step": 9895 }, { "epoch": 3.09, "grad_norm": 0.22215887904167175, "learning_rate": 1.4734659240825037e-05, "loss": 0.0173, "step": 9900 }, { "epoch": 3.09, "grad_norm": 0.5357792377471924, "learning_rate": 1.47298634453114e-05, "loss": 0.0163, "step": 9905 }, { "epoch": 3.09, "grad_norm": 0.48687347769737244, "learning_rate": 1.4725066248029283e-05, "loss": 0.0225, "step": 9910 }, { "epoch": 3.09, "grad_norm": 0.421177476644516, "learning_rate": 1.472026765040041e-05, "loss": 0.0136, "step": 9915 }, { "epoch": 3.09, "grad_norm": 0.3397599160671234, "learning_rate": 1.4715467653846917e-05, "loss": 0.019, "step": 9920 }, { "epoch": 3.1, "grad_norm": 0.616377592086792, "learning_rate": 1.4710666259791353e-05, "loss": 0.0237, "step": 9925 }, { "epoch": 3.1, "grad_norm": 0.5014125108718872, "learning_rate": 1.4705863469656697e-05, "loss": 0.0187, "step": 9930 }, { "epoch": 3.1, "grad_norm": 0.8148841261863708, "learning_rate": 1.4701059284866323e-05, "loss": 0.0195, "step": 9935 }, { "epoch": 3.1, "grad_norm": 0.9623835682868958, "learning_rate": 1.4696253706844026e-05, "loss": 0.0259, "step": 9940 }, { "epoch": 3.1, "grad_norm": 0.41041064262390137, "learning_rate": 1.4691446737014012e-05, "loss": 0.0216, "step": 9945 }, { "epoch": 3.1, "grad_norm": 0.5692448019981384, "learning_rate": 1.4686638376800907e-05, "loss": 0.0166, "step": 9950 }, { "epoch": 3.1, "grad_norm": 0.5151691436767578, "learning_rate": 1.4681828627629742e-05, "loss": 0.0151, "step": 9955 }, { "epoch": 3.11, "grad_norm": 0.6894605755805969, "learning_rate": 1.4677017490925956e-05, "loss": 0.0165, "step": 9960 }, { "epoch": 3.11, "grad_norm": 0.5428750514984131, "learning_rate": 1.4672204968115407e-05, "loss": 0.0209, "step": 9965 }, { "epoch": 3.11, "grad_norm": 0.6880739331245422, "learning_rate": 1.4667391060624364e-05, "loss": 0.0249, "step": 9970 }, { "epoch": 3.11, "grad_norm": 0.49203822016716003, "learning_rate": 1.4662575769879497e-05, "loss": 0.0216, "step": 9975 }, { "epoch": 3.11, "grad_norm": 0.5015491247177124, "learning_rate": 1.4657759097307895e-05, "loss": 0.015, "step": 9980 }, { "epoch": 3.11, "grad_norm": 0.303508996963501, "learning_rate": 1.4652941044337057e-05, "loss": 0.0198, "step": 9985 }, { "epoch": 3.12, "grad_norm": 0.5856837034225464, "learning_rate": 1.4648121612394884e-05, "loss": 0.0166, "step": 9990 }, { "epoch": 3.12, "grad_norm": 0.7183772921562195, "learning_rate": 1.4643300802909687e-05, "loss": 0.0258, "step": 9995 }, { "epoch": 3.12, "grad_norm": 0.8566582202911377, "learning_rate": 1.4638478617310194e-05, "loss": 0.027, "step": 10000 }, { "epoch": 3.12, "grad_norm": 0.7371096014976501, "learning_rate": 1.4633655057025533e-05, "loss": 0.0208, "step": 10005 }, { "epoch": 3.12, "grad_norm": 0.5916391015052795, "learning_rate": 1.4628830123485239e-05, "loss": 0.0172, "step": 10010 }, { "epoch": 3.12, "grad_norm": 0.5476992726325989, "learning_rate": 1.4624003818119255e-05, "loss": 0.0142, "step": 10015 }, { "epoch": 3.13, "grad_norm": 0.7170031070709229, "learning_rate": 1.4619176142357936e-05, "loss": 0.0203, "step": 10020 }, { "epoch": 3.13, "grad_norm": 0.463932603597641, "learning_rate": 1.4614347097632031e-05, "loss": 0.0157, "step": 10025 }, { "epoch": 3.13, "grad_norm": 1.2391690015792847, "learning_rate": 1.4609516685372711e-05, "loss": 0.025, "step": 10030 }, { "epoch": 3.13, "grad_norm": 0.7538567781448364, "learning_rate": 1.4604684907011539e-05, "loss": 0.0164, "step": 10035 }, { "epoch": 3.13, "grad_norm": 0.7815136909484863, "learning_rate": 1.4599851763980493e-05, "loss": 0.0223, "step": 10040 }, { "epoch": 3.13, "grad_norm": 0.5280672907829285, "learning_rate": 1.459501725771194e-05, "loss": 0.0199, "step": 10045 }, { "epoch": 3.13, "grad_norm": 0.6533296704292297, "learning_rate": 1.4590181389638667e-05, "loss": 0.0218, "step": 10050 }, { "epoch": 3.14, "grad_norm": 0.8878480792045593, "learning_rate": 1.4585344161193862e-05, "loss": 0.0255, "step": 10055 }, { "epoch": 3.14, "grad_norm": 0.6210035085678101, "learning_rate": 1.4580505573811105e-05, "loss": 0.0202, "step": 10060 }, { "epoch": 3.14, "grad_norm": 0.6659073233604431, "learning_rate": 1.4575665628924392e-05, "loss": 0.0173, "step": 10065 }, { "epoch": 3.14, "grad_norm": 0.6288996338844299, "learning_rate": 1.4570824327968111e-05, "loss": 0.0178, "step": 10070 }, { "epoch": 3.14, "grad_norm": 0.5575762987136841, "learning_rate": 1.4565981672377065e-05, "loss": 0.0207, "step": 10075 }, { "epoch": 3.14, "grad_norm": 0.6056348085403442, "learning_rate": 1.4561137663586444e-05, "loss": 0.0223, "step": 10080 }, { "epoch": 3.15, "grad_norm": 0.4476945102214813, "learning_rate": 1.4556292303031847e-05, "loss": 0.0164, "step": 10085 }, { "epoch": 3.15, "grad_norm": 0.841842532157898, "learning_rate": 1.4551445592149267e-05, "loss": 0.0203, "step": 10090 }, { "epoch": 3.15, "grad_norm": 0.7189118266105652, "learning_rate": 1.454659753237511e-05, "loss": 0.0236, "step": 10095 }, { "epoch": 3.15, "grad_norm": 0.5335758924484253, "learning_rate": 1.4541748125146172e-05, "loss": 0.0185, "step": 10100 }, { "epoch": 3.15, "grad_norm": 0.668975293636322, "learning_rate": 1.4536897371899643e-05, "loss": 0.0192, "step": 10105 }, { "epoch": 3.15, "grad_norm": 0.6786960363388062, "learning_rate": 1.4532045274073125e-05, "loss": 0.0167, "step": 10110 }, { "epoch": 3.15, "grad_norm": 0.5628133416175842, "learning_rate": 1.4527191833104614e-05, "loss": 0.0158, "step": 10115 }, { "epoch": 3.16, "grad_norm": 0.44337987899780273, "learning_rate": 1.4522337050432502e-05, "loss": 0.0126, "step": 10120 }, { "epoch": 3.16, "grad_norm": 0.6237801313400269, "learning_rate": 1.4517480927495575e-05, "loss": 0.0164, "step": 10125 }, { "epoch": 3.16, "grad_norm": 0.5822309255599976, "learning_rate": 1.4512623465733025e-05, "loss": 0.016, "step": 10130 }, { "epoch": 3.16, "grad_norm": 0.6812189817428589, "learning_rate": 1.4507764666584431e-05, "loss": 0.0272, "step": 10135 }, { "epoch": 3.16, "grad_norm": 0.6809319257736206, "learning_rate": 1.4502904531489778e-05, "loss": 0.018, "step": 10140 }, { "epoch": 3.16, "grad_norm": 0.6676437854766846, "learning_rate": 1.4498043061889445e-05, "loss": 0.0127, "step": 10145 }, { "epoch": 3.17, "grad_norm": 0.7798243761062622, "learning_rate": 1.4493180259224195e-05, "loss": 0.0167, "step": 10150 }, { "epoch": 3.17, "grad_norm": 0.7573854327201843, "learning_rate": 1.4488316124935205e-05, "loss": 0.0202, "step": 10155 }, { "epoch": 3.17, "grad_norm": 0.5917245745658875, "learning_rate": 1.448345066046403e-05, "loss": 0.0192, "step": 10160 }, { "epoch": 3.17, "grad_norm": 0.6839116215705872, "learning_rate": 1.4478583867252629e-05, "loss": 0.0223, "step": 10165 }, { "epoch": 3.17, "grad_norm": 0.5750868916511536, "learning_rate": 1.447371574674335e-05, "loss": 0.0231, "step": 10170 }, { "epoch": 3.17, "grad_norm": 0.6704385876655579, "learning_rate": 1.4468846300378936e-05, "loss": 0.0201, "step": 10175 }, { "epoch": 3.18, "grad_norm": 0.6824901103973389, "learning_rate": 1.4463975529602525e-05, "loss": 0.0225, "step": 10180 }, { "epoch": 3.18, "grad_norm": 0.6210573315620422, "learning_rate": 1.4459103435857649e-05, "loss": 0.0223, "step": 10185 }, { "epoch": 3.18, "grad_norm": 0.6658128499984741, "learning_rate": 1.445423002058822e-05, "loss": 0.0224, "step": 10190 }, { "epoch": 3.18, "grad_norm": 0.5486401319503784, "learning_rate": 1.4449355285238553e-05, "loss": 0.0221, "step": 10195 }, { "epoch": 3.18, "grad_norm": 0.792415201663971, "learning_rate": 1.4444479231253357e-05, "loss": 0.0218, "step": 10200 }, { "epoch": 3.18, "grad_norm": 0.5030585527420044, "learning_rate": 1.4439601860077721e-05, "loss": 0.026, "step": 10205 }, { "epoch": 3.18, "grad_norm": 0.9226459860801697, "learning_rate": 1.443472317315713e-05, "loss": 0.02, "step": 10210 }, { "epoch": 3.19, "grad_norm": 0.5767743587493896, "learning_rate": 1.4429843171937459e-05, "loss": 0.0206, "step": 10215 }, { "epoch": 3.19, "grad_norm": 0.5681598782539368, "learning_rate": 1.4424961857864976e-05, "loss": 0.0191, "step": 10220 }, { "epoch": 3.19, "grad_norm": 0.6394040584564209, "learning_rate": 1.4420079232386329e-05, "loss": 0.0214, "step": 10225 }, { "epoch": 3.19, "grad_norm": 0.8907806277275085, "learning_rate": 1.441519529694856e-05, "loss": 0.0291, "step": 10230 }, { "epoch": 3.19, "grad_norm": 0.43610310554504395, "learning_rate": 1.4410310052999103e-05, "loss": 0.018, "step": 10235 }, { "epoch": 3.19, "grad_norm": 0.4930350184440613, "learning_rate": 1.4405423501985776e-05, "loss": 0.0232, "step": 10240 }, { "epoch": 3.2, "grad_norm": 0.5421867370605469, "learning_rate": 1.440053564535678e-05, "loss": 0.0184, "step": 10245 }, { "epoch": 3.2, "grad_norm": 0.38000333309173584, "learning_rate": 1.4395646484560712e-05, "loss": 0.0177, "step": 10250 }, { "epoch": 3.2, "grad_norm": 0.7972351312637329, "learning_rate": 1.4390756021046545e-05, "loss": 0.0219, "step": 10255 }, { "epoch": 3.2, "grad_norm": 0.5814357995986938, "learning_rate": 1.4385864256263649e-05, "loss": 0.0309, "step": 10260 }, { "epoch": 3.2, "grad_norm": 1.068788766860962, "learning_rate": 1.4380971191661773e-05, "loss": 0.0264, "step": 10265 }, { "epoch": 3.2, "grad_norm": 0.6000214219093323, "learning_rate": 1.437607682869105e-05, "loss": 0.0211, "step": 10270 }, { "epoch": 3.2, "grad_norm": 0.7504013180732727, "learning_rate": 1.4371181168802005e-05, "loss": 0.02, "step": 10275 }, { "epoch": 3.21, "grad_norm": 0.37451502680778503, "learning_rate": 1.4366284213445538e-05, "loss": 0.0161, "step": 10280 }, { "epoch": 3.21, "grad_norm": 0.5724549293518066, "learning_rate": 1.4361385964072943e-05, "loss": 0.0224, "step": 10285 }, { "epoch": 3.21, "grad_norm": 0.44703173637390137, "learning_rate": 1.4356486422135888e-05, "loss": 0.0191, "step": 10290 }, { "epoch": 3.21, "grad_norm": 0.3945390284061432, "learning_rate": 1.435158558908643e-05, "loss": 0.0219, "step": 10295 }, { "epoch": 3.21, "grad_norm": 0.42554935812950134, "learning_rate": 1.4346683466377004e-05, "loss": 0.0225, "step": 10300 }, { "epoch": 3.21, "grad_norm": 0.40748491883277893, "learning_rate": 1.4341780055460437e-05, "loss": 0.0205, "step": 10305 }, { "epoch": 3.22, "grad_norm": 0.3901636004447937, "learning_rate": 1.4336875357789927e-05, "loss": 0.0184, "step": 10310 }, { "epoch": 3.22, "grad_norm": 0.4704354405403137, "learning_rate": 1.4331969374819057e-05, "loss": 0.0153, "step": 10315 }, { "epoch": 3.22, "grad_norm": 0.8930145502090454, "learning_rate": 1.4327062108001788e-05, "loss": 0.0204, "step": 10320 }, { "epoch": 3.22, "grad_norm": 0.7317019701004028, "learning_rate": 1.4322153558792468e-05, "loss": 0.0142, "step": 10325 }, { "epoch": 3.22, "grad_norm": 0.9982587695121765, "learning_rate": 1.4317243728645829e-05, "loss": 0.019, "step": 10330 }, { "epoch": 3.22, "grad_norm": 0.6566906571388245, "learning_rate": 1.4312332619016964e-05, "loss": 0.0222, "step": 10335 }, { "epoch": 3.23, "grad_norm": 0.5765242576599121, "learning_rate": 1.4307420231361357e-05, "loss": 0.0181, "step": 10340 }, { "epoch": 3.23, "grad_norm": 0.8253374695777893, "learning_rate": 1.4302506567134881e-05, "loss": 0.0237, "step": 10345 }, { "epoch": 3.23, "grad_norm": 0.43849310278892517, "learning_rate": 1.429759162779377e-05, "loss": 0.0182, "step": 10350 }, { "epoch": 3.23, "grad_norm": 0.5556694269180298, "learning_rate": 1.429267541479464e-05, "loss": 0.0256, "step": 10355 }, { "epoch": 3.23, "grad_norm": 0.6590076088905334, "learning_rate": 1.4287757929594488e-05, "loss": 0.018, "step": 10360 }, { "epoch": 3.23, "grad_norm": 0.4504948854446411, "learning_rate": 1.4282839173650694e-05, "loss": 0.0216, "step": 10365 }, { "epoch": 3.23, "grad_norm": 0.5615300536155701, "learning_rate": 1.4277919148421e-05, "loss": 0.0257, "step": 10370 }, { "epoch": 3.24, "grad_norm": 0.49208933115005493, "learning_rate": 1.4272997855363532e-05, "loss": 0.0184, "step": 10375 }, { "epoch": 3.24, "grad_norm": 0.47556278109550476, "learning_rate": 1.4268075295936796e-05, "loss": 0.0207, "step": 10380 }, { "epoch": 3.24, "grad_norm": 1.0073806047439575, "learning_rate": 1.4263151471599668e-05, "loss": 0.0224, "step": 10385 }, { "epoch": 3.24, "grad_norm": 0.6340844035148621, "learning_rate": 1.4258226383811397e-05, "loss": 0.0251, "step": 10390 }, { "epoch": 3.24, "grad_norm": 0.6471311450004578, "learning_rate": 1.4253300034031613e-05, "loss": 0.0195, "step": 10395 }, { "epoch": 3.24, "grad_norm": 0.81852787733078, "learning_rate": 1.4248372423720313e-05, "loss": 0.0253, "step": 10400 }, { "epoch": 3.25, "grad_norm": 1.1031111478805542, "learning_rate": 1.4243443554337874e-05, "loss": 0.0176, "step": 10405 }, { "epoch": 3.25, "grad_norm": 0.5694208145141602, "learning_rate": 1.423851342734504e-05, "loss": 0.0178, "step": 10410 }, { "epoch": 3.25, "grad_norm": 0.4479517936706543, "learning_rate": 1.4233582044202932e-05, "loss": 0.0224, "step": 10415 }, { "epoch": 3.25, "grad_norm": 0.5453998446464539, "learning_rate": 1.422864940637304e-05, "loss": 0.0133, "step": 10420 }, { "epoch": 3.25, "grad_norm": 0.6944233179092407, "learning_rate": 1.4223715515317232e-05, "loss": 0.0171, "step": 10425 }, { "epoch": 3.25, "grad_norm": 0.7661752104759216, "learning_rate": 1.421878037249774e-05, "loss": 0.0246, "step": 10430 }, { "epoch": 3.25, "grad_norm": 1.1437700986862183, "learning_rate": 1.4213843979377173e-05, "loss": 0.0221, "step": 10435 }, { "epoch": 3.26, "grad_norm": 0.5264135003089905, "learning_rate": 1.4208906337418502e-05, "loss": 0.0238, "step": 10440 }, { "epoch": 3.26, "grad_norm": 0.4517305791378021, "learning_rate": 1.420396744808508e-05, "loss": 0.0239, "step": 10445 }, { "epoch": 3.26, "grad_norm": 0.5580541491508484, "learning_rate": 1.419902731284062e-05, "loss": 0.0234, "step": 10450 }, { "epoch": 3.26, "grad_norm": 0.43453770875930786, "learning_rate": 1.4194085933149212e-05, "loss": 0.0176, "step": 10455 }, { "epoch": 3.26, "grad_norm": 0.7332136034965515, "learning_rate": 1.4189143310475306e-05, "loss": 0.0222, "step": 10460 }, { "epoch": 3.26, "grad_norm": 0.6036754846572876, "learning_rate": 1.4184199446283726e-05, "loss": 0.0195, "step": 10465 }, { "epoch": 3.27, "grad_norm": 0.4978627860546112, "learning_rate": 1.4179254342039665e-05, "loss": 0.0211, "step": 10470 }, { "epoch": 3.27, "grad_norm": 0.6187092661857605, "learning_rate": 1.4174307999208679e-05, "loss": 0.022, "step": 10475 }, { "epoch": 3.27, "grad_norm": 0.9858737587928772, "learning_rate": 1.4169360419256695e-05, "loss": 0.0197, "step": 10480 }, { "epoch": 3.27, "grad_norm": 0.7682695984840393, "learning_rate": 1.4164411603650001e-05, "loss": 0.0292, "step": 10485 }, { "epoch": 3.27, "grad_norm": 0.8246850371360779, "learning_rate": 1.4159461553855265e-05, "loss": 0.023, "step": 10490 }, { "epoch": 3.27, "grad_norm": 0.6413214802742004, "learning_rate": 1.4154510271339504e-05, "loss": 0.0181, "step": 10495 }, { "epoch": 3.27, "grad_norm": 0.546469509601593, "learning_rate": 1.4149557757570105e-05, "loss": 0.0214, "step": 10500 }, { "epoch": 3.28, "grad_norm": 0.6325284242630005, "learning_rate": 1.4144604014014827e-05, "loss": 0.0296, "step": 10505 }, { "epoch": 3.28, "grad_norm": 0.5793845057487488, "learning_rate": 1.413964904214179e-05, "loss": 0.022, "step": 10510 }, { "epoch": 3.28, "grad_norm": 0.5916551351547241, "learning_rate": 1.4134692843419473e-05, "loss": 0.0218, "step": 10515 }, { "epoch": 3.28, "grad_norm": 0.645812451839447, "learning_rate": 1.4129735419316725e-05, "loss": 0.0206, "step": 10520 }, { "epoch": 3.28, "grad_norm": 0.526564359664917, "learning_rate": 1.4124776771302755e-05, "loss": 0.0277, "step": 10525 }, { "epoch": 3.28, "grad_norm": 0.6979032158851624, "learning_rate": 1.4119816900847134e-05, "loss": 0.0227, "step": 10530 }, { "epoch": 3.29, "grad_norm": 0.588740348815918, "learning_rate": 1.4114855809419798e-05, "loss": 0.0251, "step": 10535 }, { "epoch": 3.29, "grad_norm": 0.74185711145401, "learning_rate": 1.4109893498491044e-05, "loss": 0.021, "step": 10540 }, { "epoch": 3.29, "grad_norm": 0.7735092639923096, "learning_rate": 1.4104929969531527e-05, "loss": 0.0233, "step": 10545 }, { "epoch": 3.29, "grad_norm": 0.47949472069740295, "learning_rate": 1.4099965224012269e-05, "loss": 0.0193, "step": 10550 }, { "epoch": 3.29, "grad_norm": 0.35745835304260254, "learning_rate": 1.409499926340465e-05, "loss": 0.0228, "step": 10555 }, { "epoch": 3.29, "grad_norm": 0.6792040467262268, "learning_rate": 1.4090032089180407e-05, "loss": 0.0196, "step": 10560 }, { "epoch": 3.3, "grad_norm": 0.35544055700302124, "learning_rate": 1.408506370281164e-05, "loss": 0.0208, "step": 10565 }, { "epoch": 3.3, "grad_norm": 0.7614099383354187, "learning_rate": 1.4080094105770808e-05, "loss": 0.0234, "step": 10570 }, { "epoch": 3.3, "grad_norm": 0.5480696558952332, "learning_rate": 1.4075123299530726e-05, "loss": 0.026, "step": 10575 }, { "epoch": 3.3, "grad_norm": 0.4874107241630554, "learning_rate": 1.4070151285564575e-05, "loss": 0.0192, "step": 10580 }, { "epoch": 3.3, "grad_norm": 0.5505118370056152, "learning_rate": 1.4065178065345882e-05, "loss": 0.0185, "step": 10585 }, { "epoch": 3.3, "grad_norm": 0.7177337408065796, "learning_rate": 1.4060203640348542e-05, "loss": 0.0243, "step": 10590 }, { "epoch": 3.3, "grad_norm": 0.6744440197944641, "learning_rate": 1.40552280120468e-05, "loss": 0.0229, "step": 10595 }, { "epoch": 3.31, "grad_norm": 0.6274732351303101, "learning_rate": 1.4050251181915268e-05, "loss": 0.0161, "step": 10600 }, { "epoch": 3.31, "grad_norm": 0.5714767575263977, "learning_rate": 1.40452731514289e-05, "loss": 0.0198, "step": 10605 }, { "epoch": 3.31, "grad_norm": 0.6027191281318665, "learning_rate": 1.4040293922063011e-05, "loss": 0.0272, "step": 10610 }, { "epoch": 3.31, "grad_norm": 0.6635540127754211, "learning_rate": 1.4035313495293281e-05, "loss": 0.0274, "step": 10615 }, { "epoch": 3.31, "grad_norm": 0.6043592691421509, "learning_rate": 1.403033187259573e-05, "loss": 0.0175, "step": 10620 }, { "epoch": 3.31, "grad_norm": 0.4285273551940918, "learning_rate": 1.4025349055446741e-05, "loss": 0.0166, "step": 10625 }, { "epoch": 3.32, "grad_norm": 0.7969894409179688, "learning_rate": 1.4020365045323047e-05, "loss": 0.0254, "step": 10630 }, { "epoch": 3.32, "grad_norm": 0.6607228517532349, "learning_rate": 1.4015379843701747e-05, "loss": 0.0252, "step": 10635 }, { "epoch": 3.32, "grad_norm": 0.6720756888389587, "learning_rate": 1.401039345206027e-05, "loss": 0.0248, "step": 10640 }, { "epoch": 3.32, "grad_norm": 0.39906564354896545, "learning_rate": 1.4005405871876419e-05, "loss": 0.0195, "step": 10645 }, { "epoch": 3.32, "grad_norm": 0.6192402839660645, "learning_rate": 1.4000417104628332e-05, "loss": 0.0281, "step": 10650 }, { "epoch": 3.32, "grad_norm": 0.416329562664032, "learning_rate": 1.399542715179452e-05, "loss": 0.0259, "step": 10655 }, { "epoch": 3.32, "grad_norm": 0.7425665855407715, "learning_rate": 1.3990436014853823e-05, "loss": 0.0219, "step": 10660 }, { "epoch": 3.33, "grad_norm": 0.3826334774494171, "learning_rate": 1.3985443695285447e-05, "loss": 0.0241, "step": 10665 }, { "epoch": 3.33, "grad_norm": 0.5703093409538269, "learning_rate": 1.3980450194568942e-05, "loss": 0.0154, "step": 10670 }, { "epoch": 3.33, "grad_norm": 0.6470087170600891, "learning_rate": 1.3975455514184209e-05, "loss": 0.0162, "step": 10675 }, { "epoch": 3.33, "grad_norm": 0.671140730381012, "learning_rate": 1.39704596556115e-05, "loss": 0.0222, "step": 10680 }, { "epoch": 3.33, "grad_norm": 0.618836522102356, "learning_rate": 1.3965462620331414e-05, "loss": 0.0189, "step": 10685 }, { "epoch": 3.33, "grad_norm": 0.49931779503822327, "learning_rate": 1.3960464409824899e-05, "loss": 0.0192, "step": 10690 }, { "epoch": 3.34, "grad_norm": 0.7505419850349426, "learning_rate": 1.3955465025573256e-05, "loss": 0.0179, "step": 10695 }, { "epoch": 3.34, "grad_norm": 1.0673131942749023, "learning_rate": 1.3950464469058126e-05, "loss": 0.0206, "step": 10700 }, { "epoch": 3.34, "grad_norm": 0.4440382719039917, "learning_rate": 1.3945462741761509e-05, "loss": 0.0117, "step": 10705 }, { "epoch": 3.34, "grad_norm": 0.48062238097190857, "learning_rate": 1.3940459845165735e-05, "loss": 0.0193, "step": 10710 }, { "epoch": 3.34, "grad_norm": 0.620808482170105, "learning_rate": 1.3935455780753496e-05, "loss": 0.0225, "step": 10715 }, { "epoch": 3.34, "grad_norm": 0.6680368185043335, "learning_rate": 1.3930450550007818e-05, "loss": 0.0148, "step": 10720 }, { "epoch": 3.35, "grad_norm": 0.43228110671043396, "learning_rate": 1.3925444154412091e-05, "loss": 0.0226, "step": 10725 }, { "epoch": 3.35, "grad_norm": 0.604020893573761, "learning_rate": 1.3920436595450025e-05, "loss": 0.0212, "step": 10730 }, { "epoch": 3.35, "grad_norm": 0.8461281061172485, "learning_rate": 1.3915427874605695e-05, "loss": 0.018, "step": 10735 }, { "epoch": 3.35, "grad_norm": 0.6334606409072876, "learning_rate": 1.3910417993363511e-05, "loss": 0.0216, "step": 10740 }, { "epoch": 3.35, "grad_norm": 0.6531664133071899, "learning_rate": 1.3905406953208233e-05, "loss": 0.0271, "step": 10745 }, { "epoch": 3.35, "grad_norm": 0.703217625617981, "learning_rate": 1.3900394755624955e-05, "loss": 0.0295, "step": 10750 }, { "epoch": 3.35, "grad_norm": 0.6552525162696838, "learning_rate": 1.3895381402099119e-05, "loss": 0.0252, "step": 10755 }, { "epoch": 3.36, "grad_norm": 0.9942788481712341, "learning_rate": 1.3890366894116517e-05, "loss": 0.0257, "step": 10760 }, { "epoch": 3.36, "grad_norm": 0.4330316185951233, "learning_rate": 1.3885351233163271e-05, "loss": 0.0206, "step": 10765 }, { "epoch": 3.36, "grad_norm": 0.6584552526473999, "learning_rate": 1.3880334420725854e-05, "loss": 0.0221, "step": 10770 }, { "epoch": 3.36, "grad_norm": 0.38093966245651245, "learning_rate": 1.3875316458291069e-05, "loss": 0.0251, "step": 10775 }, { "epoch": 3.36, "grad_norm": 0.5486773252487183, "learning_rate": 1.3870297347346075e-05, "loss": 0.0199, "step": 10780 }, { "epoch": 3.36, "grad_norm": 0.45985671877861023, "learning_rate": 1.3865277089378357e-05, "loss": 0.0175, "step": 10785 }, { "epoch": 3.37, "grad_norm": 0.54729825258255, "learning_rate": 1.3860255685875754e-05, "loss": 0.0193, "step": 10790 }, { "epoch": 3.37, "grad_norm": 1.0338304042816162, "learning_rate": 1.385523313832643e-05, "loss": 0.0292, "step": 10795 }, { "epoch": 3.37, "grad_norm": 0.6128621697425842, "learning_rate": 1.3850209448218897e-05, "loss": 0.0222, "step": 10800 }, { "epoch": 3.37, "grad_norm": 0.6406121253967285, "learning_rate": 1.3845184617042005e-05, "loss": 0.0213, "step": 10805 }, { "epoch": 3.37, "grad_norm": 0.40638235211372375, "learning_rate": 1.3840158646284938e-05, "loss": 0.0195, "step": 10810 }, { "epoch": 3.37, "grad_norm": 0.47747060656547546, "learning_rate": 1.3835131537437223e-05, "loss": 0.0239, "step": 10815 }, { "epoch": 3.37, "grad_norm": 0.5485616326332092, "learning_rate": 1.3830103291988723e-05, "loss": 0.0263, "step": 10820 }, { "epoch": 3.38, "grad_norm": 0.7525022625923157, "learning_rate": 1.3825073911429634e-05, "loss": 0.0344, "step": 10825 }, { "epoch": 3.38, "grad_norm": 0.6187776923179626, "learning_rate": 1.382004339725049e-05, "loss": 0.0183, "step": 10830 }, { "epoch": 3.38, "grad_norm": 0.6603714823722839, "learning_rate": 1.3815011750942164e-05, "loss": 0.0219, "step": 10835 }, { "epoch": 3.38, "grad_norm": 0.7578282356262207, "learning_rate": 1.3809978973995864e-05, "loss": 0.0167, "step": 10840 }, { "epoch": 3.38, "grad_norm": 0.7237257361412048, "learning_rate": 1.3804945067903128e-05, "loss": 0.0253, "step": 10845 }, { "epoch": 3.38, "grad_norm": 0.5169981122016907, "learning_rate": 1.3799910034155837e-05, "loss": 0.0209, "step": 10850 }, { "epoch": 3.39, "grad_norm": 0.5577664971351624, "learning_rate": 1.3794873874246193e-05, "loss": 0.031, "step": 10855 }, { "epoch": 3.39, "grad_norm": 0.9182189702987671, "learning_rate": 1.378983658966675e-05, "loss": 0.0224, "step": 10860 }, { "epoch": 3.39, "grad_norm": 0.5072724223136902, "learning_rate": 1.3784798181910381e-05, "loss": 0.0176, "step": 10865 }, { "epoch": 3.39, "grad_norm": 0.4871101975440979, "learning_rate": 1.3779758652470296e-05, "loss": 0.0249, "step": 10870 }, { "epoch": 3.39, "grad_norm": 0.5638007521629333, "learning_rate": 1.3774718002840037e-05, "loss": 0.0246, "step": 10875 }, { "epoch": 3.39, "grad_norm": 0.759212076663971, "learning_rate": 1.3769676234513481e-05, "loss": 0.0288, "step": 10880 }, { "epoch": 3.39, "grad_norm": 0.6488536596298218, "learning_rate": 1.3764633348984837e-05, "loss": 0.0195, "step": 10885 }, { "epoch": 3.4, "grad_norm": 0.522614061832428, "learning_rate": 1.3759589347748636e-05, "loss": 0.0252, "step": 10890 }, { "epoch": 3.4, "grad_norm": 0.8051714301109314, "learning_rate": 1.3754544232299749e-05, "loss": 0.0217, "step": 10895 }, { "epoch": 3.4, "grad_norm": 0.7797815799713135, "learning_rate": 1.374949800413337e-05, "loss": 0.023, "step": 10900 }, { "epoch": 3.4, "grad_norm": 0.8559707999229431, "learning_rate": 1.374445066474504e-05, "loss": 0.0263, "step": 10905 }, { "epoch": 3.4, "grad_norm": 0.8661820292472839, "learning_rate": 1.3739402215630602e-05, "loss": 0.0232, "step": 10910 }, { "epoch": 3.4, "grad_norm": 0.6741891503334045, "learning_rate": 1.3734352658286252e-05, "loss": 0.0159, "step": 10915 }, { "epoch": 3.41, "grad_norm": 0.5936999320983887, "learning_rate": 1.3729301994208495e-05, "loss": 0.026, "step": 10920 }, { "epoch": 3.41, "grad_norm": 1.0814099311828613, "learning_rate": 1.3724250224894186e-05, "loss": 0.0246, "step": 10925 }, { "epoch": 3.41, "grad_norm": 0.8289362192153931, "learning_rate": 1.3719197351840485e-05, "loss": 0.0342, "step": 10930 }, { "epoch": 3.41, "grad_norm": 0.6794367432594299, "learning_rate": 1.3714143376544896e-05, "loss": 0.0182, "step": 10935 }, { "epoch": 3.41, "grad_norm": 0.724094569683075, "learning_rate": 1.3709088300505237e-05, "loss": 0.0205, "step": 10940 }, { "epoch": 3.41, "grad_norm": 0.6084389090538025, "learning_rate": 1.3704032125219664e-05, "loss": 0.021, "step": 10945 }, { "epoch": 3.42, "grad_norm": 0.587928056716919, "learning_rate": 1.369897485218665e-05, "loss": 0.0161, "step": 10950 }, { "epoch": 3.42, "grad_norm": 0.5043631792068481, "learning_rate": 1.3693916482905e-05, "loss": 0.0175, "step": 10955 }, { "epoch": 3.42, "grad_norm": 0.6000282764434814, "learning_rate": 1.3688857018873831e-05, "loss": 0.0156, "step": 10960 }, { "epoch": 3.42, "grad_norm": 0.5633286833763123, "learning_rate": 1.3683796461592604e-05, "loss": 0.0237, "step": 10965 }, { "epoch": 3.42, "grad_norm": 0.647972583770752, "learning_rate": 1.367873481256109e-05, "loss": 0.0233, "step": 10970 }, { "epoch": 3.42, "grad_norm": 0.755373477935791, "learning_rate": 1.367367207327939e-05, "loss": 0.0273, "step": 10975 }, { "epoch": 3.42, "grad_norm": 0.5081478357315063, "learning_rate": 1.3668608245247916e-05, "loss": 0.0167, "step": 10980 }, { "epoch": 3.43, "grad_norm": 0.5032436847686768, "learning_rate": 1.366354332996742e-05, "loss": 0.0195, "step": 10985 }, { "epoch": 3.43, "grad_norm": 0.699909508228302, "learning_rate": 1.365847732893897e-05, "loss": 0.0277, "step": 10990 }, { "epoch": 3.43, "grad_norm": 0.7266407012939453, "learning_rate": 1.3653410243663953e-05, "loss": 0.0218, "step": 10995 }, { "epoch": 3.43, "grad_norm": 0.7415618896484375, "learning_rate": 1.3648342075644072e-05, "loss": 0.0206, "step": 11000 }, { "epoch": 3.43, "grad_norm": 0.4602750539779663, "learning_rate": 1.3643272826381361e-05, "loss": 0.0212, "step": 11005 }, { "epoch": 3.43, "grad_norm": 0.7643329501152039, "learning_rate": 1.3638202497378175e-05, "loss": 0.0271, "step": 11010 }, { "epoch": 3.44, "grad_norm": 0.6913391947746277, "learning_rate": 1.3633131090137182e-05, "loss": 0.0175, "step": 11015 }, { "epoch": 3.44, "grad_norm": 0.28211793303489685, "learning_rate": 1.3628058606161369e-05, "loss": 0.0187, "step": 11020 }, { "epoch": 3.44, "grad_norm": 0.5162671208381653, "learning_rate": 1.3622985046954045e-05, "loss": 0.0208, "step": 11025 }, { "epoch": 3.44, "grad_norm": 0.5287748575210571, "learning_rate": 1.3617910414018847e-05, "loss": 0.0233, "step": 11030 }, { "epoch": 3.44, "grad_norm": 0.7692965865135193, "learning_rate": 1.3612834708859711e-05, "loss": 0.0233, "step": 11035 }, { "epoch": 3.44, "grad_norm": 0.4626641273498535, "learning_rate": 1.3607757932980907e-05, "loss": 0.0157, "step": 11040 }, { "epoch": 3.44, "grad_norm": 0.5515432357788086, "learning_rate": 1.360268008788701e-05, "loss": 0.0205, "step": 11045 }, { "epoch": 3.45, "grad_norm": 0.39440417289733887, "learning_rate": 1.3597601175082931e-05, "loss": 0.0223, "step": 11050 }, { "epoch": 3.45, "grad_norm": 0.8312334418296814, "learning_rate": 1.3592521196073872e-05, "loss": 0.0184, "step": 11055 }, { "epoch": 3.45, "grad_norm": 0.3813522756099701, "learning_rate": 1.3587440152365368e-05, "loss": 0.0276, "step": 11060 }, { "epoch": 3.45, "grad_norm": 0.4386773705482483, "learning_rate": 1.3582358045463264e-05, "loss": 0.0151, "step": 11065 }, { "epoch": 3.45, "grad_norm": 0.6900048851966858, "learning_rate": 1.3577274876873727e-05, "loss": 0.0282, "step": 11070 }, { "epoch": 3.45, "grad_norm": 0.7808547019958496, "learning_rate": 1.3572190648103227e-05, "loss": 0.0252, "step": 11075 }, { "epoch": 3.46, "grad_norm": 0.9441418051719666, "learning_rate": 1.3567105360658555e-05, "loss": 0.0261, "step": 11080 }, { "epoch": 3.46, "grad_norm": 0.8128235340118408, "learning_rate": 1.3562019016046814e-05, "loss": 0.0251, "step": 11085 }, { "epoch": 3.46, "grad_norm": 1.0327625274658203, "learning_rate": 1.3556931615775426e-05, "loss": 0.0309, "step": 11090 }, { "epoch": 3.46, "grad_norm": 0.5404015183448792, "learning_rate": 1.3551843161352116e-05, "loss": 0.02, "step": 11095 }, { "epoch": 3.46, "grad_norm": 0.812623143196106, "learning_rate": 1.354675365428493e-05, "loss": 0.0291, "step": 11100 }, { "epoch": 3.46, "grad_norm": 0.6458951830863953, "learning_rate": 1.354166309608222e-05, "loss": 0.0182, "step": 11105 }, { "epoch": 3.47, "grad_norm": 0.45248669385910034, "learning_rate": 1.3536571488252655e-05, "loss": 0.029, "step": 11110 }, { "epoch": 3.47, "grad_norm": 0.6244685649871826, "learning_rate": 1.3531478832305207e-05, "loss": 0.0227, "step": 11115 }, { "epoch": 3.47, "grad_norm": 0.36896705627441406, "learning_rate": 1.3526385129749171e-05, "loss": 0.0174, "step": 11120 }, { "epoch": 3.47, "grad_norm": 0.5725215077400208, "learning_rate": 1.3521290382094135e-05, "loss": 0.0224, "step": 11125 }, { "epoch": 3.47, "grad_norm": 0.524311363697052, "learning_rate": 1.3516194590850014e-05, "loss": 0.0214, "step": 11130 }, { "epoch": 3.47, "grad_norm": 0.6719222664833069, "learning_rate": 1.3511097757527023e-05, "loss": 0.0258, "step": 11135 }, { "epoch": 3.47, "grad_norm": 0.7116886377334595, "learning_rate": 1.350599988363569e-05, "loss": 0.0254, "step": 11140 }, { "epoch": 3.48, "grad_norm": 0.6066772937774658, "learning_rate": 1.3500900970686842e-05, "loss": 0.0203, "step": 11145 }, { "epoch": 3.48, "grad_norm": 0.8623872995376587, "learning_rate": 1.349580102019163e-05, "loss": 0.0305, "step": 11150 }, { "epoch": 3.48, "grad_norm": 0.6644154787063599, "learning_rate": 1.3490700033661497e-05, "loss": 0.0183, "step": 11155 }, { "epoch": 3.48, "grad_norm": 0.7615149021148682, "learning_rate": 1.3485598012608204e-05, "loss": 0.021, "step": 11160 }, { "epoch": 3.48, "grad_norm": 0.641488790512085, "learning_rate": 1.3480494958543811e-05, "loss": 0.0143, "step": 11165 }, { "epoch": 3.48, "grad_norm": 0.6052054762840271, "learning_rate": 1.3475390872980688e-05, "loss": 0.0269, "step": 11170 }, { "epoch": 3.49, "grad_norm": 0.8062578439712524, "learning_rate": 1.347028575743151e-05, "loss": 0.0278, "step": 11175 }, { "epoch": 3.49, "grad_norm": 0.5274462103843689, "learning_rate": 1.3465179613409258e-05, "loss": 0.0219, "step": 11180 }, { "epoch": 3.49, "grad_norm": 0.3634663224220276, "learning_rate": 1.3460072442427216e-05, "loss": 0.0174, "step": 11185 }, { "epoch": 3.49, "grad_norm": 0.7998952269554138, "learning_rate": 1.3454964245998973e-05, "loss": 0.0263, "step": 11190 }, { "epoch": 3.49, "grad_norm": 0.7923819422721863, "learning_rate": 1.344985502563843e-05, "loss": 0.0238, "step": 11195 }, { "epoch": 3.49, "grad_norm": 0.5489522814750671, "learning_rate": 1.3444744782859772e-05, "loss": 0.017, "step": 11200 }, { "epoch": 3.49, "grad_norm": 1.0061874389648438, "learning_rate": 1.3439633519177504e-05, "loss": 0.0281, "step": 11205 }, { "epoch": 3.5, "grad_norm": 0.8746218681335449, "learning_rate": 1.343452123610643e-05, "loss": 0.0235, "step": 11210 }, { "epoch": 3.5, "grad_norm": 0.9540253281593323, "learning_rate": 1.3429407935161651e-05, "loss": 0.0254, "step": 11215 }, { "epoch": 3.5, "grad_norm": 0.6697771549224854, "learning_rate": 1.3424293617858577e-05, "loss": 0.0195, "step": 11220 }, { "epoch": 3.5, "grad_norm": 0.3734327554702759, "learning_rate": 1.3419178285712912e-05, "loss": 0.0232, "step": 11225 }, { "epoch": 3.5, "grad_norm": 0.6305715441703796, "learning_rate": 1.3414061940240669e-05, "loss": 0.0248, "step": 11230 }, { "epoch": 3.5, "grad_norm": 0.7490862011909485, "learning_rate": 1.3408944582958147e-05, "loss": 0.0155, "step": 11235 }, { "epoch": 3.51, "grad_norm": 0.45252835750579834, "learning_rate": 1.3403826215381965e-05, "loss": 0.0145, "step": 11240 }, { "epoch": 3.51, "grad_norm": 0.6468937993049622, "learning_rate": 1.3398706839029024e-05, "loss": 0.0205, "step": 11245 }, { "epoch": 3.51, "grad_norm": 0.5805210471153259, "learning_rate": 1.3393586455416534e-05, "loss": 0.023, "step": 11250 }, { "epoch": 3.51, "grad_norm": 0.74920654296875, "learning_rate": 1.3388465066061997e-05, "loss": 0.0269, "step": 11255 }, { "epoch": 3.51, "grad_norm": 0.5356035828590393, "learning_rate": 1.3383342672483219e-05, "loss": 0.0199, "step": 11260 }, { "epoch": 3.51, "grad_norm": 0.4009453058242798, "learning_rate": 1.3378219276198303e-05, "loss": 0.0176, "step": 11265 }, { "epoch": 3.52, "grad_norm": 0.7870226502418518, "learning_rate": 1.3373094878725638e-05, "loss": 0.0199, "step": 11270 }, { "epoch": 3.52, "grad_norm": 0.760246217250824, "learning_rate": 1.336796948158393e-05, "loss": 0.024, "step": 11275 }, { "epoch": 3.52, "grad_norm": 0.4638262689113617, "learning_rate": 1.336284308629216e-05, "loss": 0.0279, "step": 11280 }, { "epoch": 3.52, "grad_norm": 0.5972824692726135, "learning_rate": 1.3357715694369626e-05, "loss": 0.0196, "step": 11285 }, { "epoch": 3.52, "grad_norm": 0.6205165386199951, "learning_rate": 1.33525873073359e-05, "loss": 0.018, "step": 11290 }, { "epoch": 3.52, "grad_norm": 0.7853833436965942, "learning_rate": 1.3347457926710863e-05, "loss": 0.0129, "step": 11295 }, { "epoch": 3.52, "grad_norm": 0.7756471037864685, "learning_rate": 1.334232755401469e-05, "loss": 0.0228, "step": 11300 }, { "epoch": 3.53, "grad_norm": 0.516747772693634, "learning_rate": 1.3337196190767841e-05, "loss": 0.0215, "step": 11305 }, { "epoch": 3.53, "grad_norm": 0.693603515625, "learning_rate": 1.3332063838491077e-05, "loss": 0.0294, "step": 11310 }, { "epoch": 3.53, "grad_norm": 1.0093333721160889, "learning_rate": 1.3326930498705449e-05, "loss": 0.0257, "step": 11315 }, { "epoch": 3.53, "grad_norm": 0.6264593005180359, "learning_rate": 1.3321796172932306e-05, "loss": 0.0263, "step": 11320 }, { "epoch": 3.53, "grad_norm": 0.7026219367980957, "learning_rate": 1.3316660862693285e-05, "loss": 0.0198, "step": 11325 }, { "epoch": 3.53, "grad_norm": 0.6751310229301453, "learning_rate": 1.3311524569510309e-05, "loss": 0.0261, "step": 11330 }, { "epoch": 3.54, "grad_norm": 0.5312600135803223, "learning_rate": 1.3306387294905601e-05, "loss": 0.0269, "step": 11335 }, { "epoch": 3.54, "grad_norm": 0.8188959956169128, "learning_rate": 1.3301249040401677e-05, "loss": 0.0333, "step": 11340 }, { "epoch": 3.54, "grad_norm": 1.0527750253677368, "learning_rate": 1.3296109807521336e-05, "loss": 0.0257, "step": 11345 }, { "epoch": 3.54, "grad_norm": 0.3274320960044861, "learning_rate": 1.3290969597787664e-05, "loss": 0.024, "step": 11350 }, { "epoch": 3.54, "grad_norm": 0.42005568742752075, "learning_rate": 1.3285828412724046e-05, "loss": 0.0259, "step": 11355 }, { "epoch": 3.54, "grad_norm": 0.5386154055595398, "learning_rate": 1.3280686253854156e-05, "loss": 0.0206, "step": 11360 }, { "epoch": 3.54, "grad_norm": 0.5777267813682556, "learning_rate": 1.3275543122701946e-05, "loss": 0.0247, "step": 11365 }, { "epoch": 3.55, "grad_norm": 0.5420946478843689, "learning_rate": 1.3270399020791667e-05, "loss": 0.022, "step": 11370 }, { "epoch": 3.55, "grad_norm": 0.4722997844219208, "learning_rate": 1.3265253949647851e-05, "loss": 0.0253, "step": 11375 }, { "epoch": 3.55, "grad_norm": 0.6599250435829163, "learning_rate": 1.3260107910795324e-05, "loss": 0.021, "step": 11380 }, { "epoch": 3.55, "grad_norm": 0.6490116715431213, "learning_rate": 1.3254960905759186e-05, "loss": 0.0289, "step": 11385 }, { "epoch": 3.55, "grad_norm": 0.6252589225769043, "learning_rate": 1.3249812936064844e-05, "loss": 0.0246, "step": 11390 }, { "epoch": 3.55, "grad_norm": 0.7048884034156799, "learning_rate": 1.3244664003237967e-05, "loss": 0.0204, "step": 11395 }, { "epoch": 3.56, "grad_norm": 0.6130014657974243, "learning_rate": 1.3239514108804525e-05, "loss": 0.0217, "step": 11400 }, { "epoch": 3.56, "grad_norm": 0.7588436007499695, "learning_rate": 1.3234363254290772e-05, "loss": 0.027, "step": 11405 }, { "epoch": 3.56, "grad_norm": 0.7372493743896484, "learning_rate": 1.3229211441223244e-05, "loss": 0.0277, "step": 11410 }, { "epoch": 3.56, "grad_norm": 0.7405737638473511, "learning_rate": 1.3224058671128757e-05, "loss": 0.0206, "step": 11415 }, { "epoch": 3.56, "grad_norm": 0.7434006929397583, "learning_rate": 1.3218904945534416e-05, "loss": 0.0238, "step": 11420 }, { "epoch": 3.56, "grad_norm": 0.5010198354721069, "learning_rate": 1.3213750265967609e-05, "loss": 0.0275, "step": 11425 }, { "epoch": 3.56, "grad_norm": 0.5099480152130127, "learning_rate": 1.3208594633956007e-05, "loss": 0.0273, "step": 11430 }, { "epoch": 3.57, "grad_norm": 0.738741934299469, "learning_rate": 1.3203438051027552e-05, "loss": 0.0281, "step": 11435 }, { "epoch": 3.57, "grad_norm": 0.6779162883758545, "learning_rate": 1.3198280518710489e-05, "loss": 0.0223, "step": 11440 }, { "epoch": 3.57, "grad_norm": 0.335631787776947, "learning_rate": 1.319312203853333e-05, "loss": 0.0188, "step": 11445 }, { "epoch": 3.57, "grad_norm": 0.3947046995162964, "learning_rate": 1.3187962612024866e-05, "loss": 0.0217, "step": 11450 }, { "epoch": 3.57, "grad_norm": 0.5501173138618469, "learning_rate": 1.3182802240714176e-05, "loss": 0.0268, "step": 11455 }, { "epoch": 3.57, "grad_norm": 0.38181018829345703, "learning_rate": 1.3177640926130615e-05, "loss": 0.0246, "step": 11460 }, { "epoch": 3.58, "grad_norm": 0.8614093065261841, "learning_rate": 1.3172478669803823e-05, "loss": 0.0203, "step": 11465 }, { "epoch": 3.58, "grad_norm": 0.6074875593185425, "learning_rate": 1.3167315473263709e-05, "loss": 0.0197, "step": 11470 }, { "epoch": 3.58, "grad_norm": 0.606174886226654, "learning_rate": 1.3162151338040471e-05, "loss": 0.0206, "step": 11475 }, { "epoch": 3.58, "grad_norm": 0.5583638548851013, "learning_rate": 1.3156986265664575e-05, "loss": 0.0267, "step": 11480 }, { "epoch": 3.58, "grad_norm": 0.4318353235721588, "learning_rate": 1.315182025766678e-05, "loss": 0.0256, "step": 11485 }, { "epoch": 3.58, "grad_norm": 0.6225631237030029, "learning_rate": 1.3146653315578103e-05, "loss": 0.0246, "step": 11490 }, { "epoch": 3.59, "grad_norm": 0.5729676485061646, "learning_rate": 1.3141485440929853e-05, "loss": 0.0265, "step": 11495 }, { "epoch": 3.59, "grad_norm": 0.40861061215400696, "learning_rate": 1.313631663525361e-05, "loss": 0.0272, "step": 11500 }, { "epoch": 3.59, "grad_norm": 0.6979518532752991, "learning_rate": 1.3131146900081226e-05, "loss": 0.0152, "step": 11505 }, { "epoch": 3.59, "grad_norm": 0.5879753828048706, "learning_rate": 1.3125976236944835e-05, "loss": 0.0196, "step": 11510 }, { "epoch": 3.59, "grad_norm": 0.6799219250679016, "learning_rate": 1.3120804647376844e-05, "loss": 0.0233, "step": 11515 }, { "epoch": 3.59, "grad_norm": 0.5397365093231201, "learning_rate": 1.311563213290993e-05, "loss": 0.0217, "step": 11520 }, { "epoch": 3.59, "grad_norm": 0.48731374740600586, "learning_rate": 1.3110458695077054e-05, "loss": 0.0202, "step": 11525 }, { "epoch": 3.6, "grad_norm": 0.9724308252334595, "learning_rate": 1.3105284335411436e-05, "loss": 0.0248, "step": 11530 }, { "epoch": 3.6, "grad_norm": 0.7429783940315247, "learning_rate": 1.310010905544659e-05, "loss": 0.0217, "step": 11535 }, { "epoch": 3.6, "grad_norm": 0.6432991623878479, "learning_rate": 1.3094932856716279e-05, "loss": 0.0301, "step": 11540 }, { "epoch": 3.6, "grad_norm": 0.8815121650695801, "learning_rate": 1.3089755740754554e-05, "loss": 0.0181, "step": 11545 }, { "epoch": 3.6, "grad_norm": 0.39547908306121826, "learning_rate": 1.3084577709095738e-05, "loss": 0.0183, "step": 11550 }, { "epoch": 3.6, "grad_norm": 0.6698602437973022, "learning_rate": 1.3079398763274418e-05, "loss": 0.0302, "step": 11555 }, { "epoch": 3.61, "grad_norm": 0.48663145303726196, "learning_rate": 1.3074218904825451e-05, "loss": 0.0168, "step": 11560 }, { "epoch": 3.61, "grad_norm": 0.5853646397590637, "learning_rate": 1.3069038135283974e-05, "loss": 0.0218, "step": 11565 }, { "epoch": 3.61, "grad_norm": 0.675857424736023, "learning_rate": 1.3063856456185384e-05, "loss": 0.0193, "step": 11570 }, { "epoch": 3.61, "grad_norm": 0.6646441221237183, "learning_rate": 1.3058673869065359e-05, "loss": 0.0239, "step": 11575 }, { "epoch": 3.61, "grad_norm": 0.5286111831665039, "learning_rate": 1.3053490375459831e-05, "loss": 0.0188, "step": 11580 }, { "epoch": 3.61, "grad_norm": 0.5661985874176025, "learning_rate": 1.3048305976905011e-05, "loss": 0.0175, "step": 11585 }, { "epoch": 3.61, "grad_norm": 0.5855173468589783, "learning_rate": 1.3043120674937384e-05, "loss": 0.0195, "step": 11590 }, { "epoch": 3.62, "grad_norm": 0.9668726325035095, "learning_rate": 1.3037934471093683e-05, "loss": 0.0266, "step": 11595 }, { "epoch": 3.62, "grad_norm": 0.8840664625167847, "learning_rate": 1.3032747366910927e-05, "loss": 0.0219, "step": 11600 }, { "epoch": 3.62, "grad_norm": 0.40016576647758484, "learning_rate": 1.302755936392639e-05, "loss": 0.0203, "step": 11605 }, { "epoch": 3.62, "grad_norm": 0.678231418132782, "learning_rate": 1.3022370463677627e-05, "loss": 0.0301, "step": 11610 }, { "epoch": 3.62, "grad_norm": 0.6121237874031067, "learning_rate": 1.3017180667702438e-05, "loss": 0.0251, "step": 11615 }, { "epoch": 3.62, "grad_norm": 0.9894143342971802, "learning_rate": 1.3011989977538907e-05, "loss": 0.0212, "step": 11620 }, { "epoch": 3.63, "grad_norm": 0.5731955170631409, "learning_rate": 1.300679839472537e-05, "loss": 0.0144, "step": 11625 }, { "epoch": 3.63, "grad_norm": 0.540483295917511, "learning_rate": 1.3001605920800438e-05, "loss": 0.0267, "step": 11630 }, { "epoch": 3.63, "grad_norm": 0.6893200874328613, "learning_rate": 1.2996412557302977e-05, "loss": 0.0182, "step": 11635 }, { "epoch": 3.63, "grad_norm": 0.7962963581085205, "learning_rate": 1.2991218305772126e-05, "loss": 0.0231, "step": 11640 }, { "epoch": 3.63, "grad_norm": 0.860883355140686, "learning_rate": 1.2986023167747278e-05, "loss": 0.0264, "step": 11645 }, { "epoch": 3.63, "grad_norm": 0.4079477787017822, "learning_rate": 1.2980827144768093e-05, "loss": 0.0306, "step": 11650 }, { "epoch": 3.64, "grad_norm": 0.602584183216095, "learning_rate": 1.2975630238374498e-05, "loss": 0.0236, "step": 11655 }, { "epoch": 3.64, "grad_norm": 0.80098956823349, "learning_rate": 1.2970432450106669e-05, "loss": 0.0284, "step": 11660 }, { "epoch": 3.64, "grad_norm": 0.7290306091308594, "learning_rate": 1.2965233781505058e-05, "loss": 0.0208, "step": 11665 }, { "epoch": 3.64, "grad_norm": 0.8001556396484375, "learning_rate": 1.2960034234110368e-05, "loss": 0.0281, "step": 11670 }, { "epoch": 3.64, "grad_norm": 0.8522571921348572, "learning_rate": 1.2954833809463567e-05, "loss": 0.0239, "step": 11675 }, { "epoch": 3.64, "grad_norm": 0.5874483585357666, "learning_rate": 1.2949632509105883e-05, "loss": 0.0288, "step": 11680 }, { "epoch": 3.64, "grad_norm": 0.6494091749191284, "learning_rate": 1.2944430334578795e-05, "loss": 0.0289, "step": 11685 }, { "epoch": 3.65, "grad_norm": 0.5915369987487793, "learning_rate": 1.2939227287424056e-05, "loss": 0.0224, "step": 11690 }, { "epoch": 3.65, "grad_norm": 0.832787275314331, "learning_rate": 1.2934023369183669e-05, "loss": 0.0267, "step": 11695 }, { "epoch": 3.65, "grad_norm": 0.6795758605003357, "learning_rate": 1.2928818581399893e-05, "loss": 0.0281, "step": 11700 }, { "epoch": 3.65, "grad_norm": 0.8091376423835754, "learning_rate": 1.2923612925615245e-05, "loss": 0.0228, "step": 11705 }, { "epoch": 3.65, "grad_norm": 0.6956303715705872, "learning_rate": 1.2918406403372508e-05, "loss": 0.0222, "step": 11710 }, { "epoch": 3.65, "grad_norm": 0.7285389304161072, "learning_rate": 1.2913199016214715e-05, "loss": 0.0288, "step": 11715 }, { "epoch": 3.66, "grad_norm": 0.6634201407432556, "learning_rate": 1.2907990765685155e-05, "loss": 0.0229, "step": 11720 }, { "epoch": 3.66, "grad_norm": 0.8717896342277527, "learning_rate": 1.290278165332737e-05, "loss": 0.0217, "step": 11725 }, { "epoch": 3.66, "grad_norm": 2.627668857574463, "learning_rate": 1.2897571680685164e-05, "loss": 0.0248, "step": 11730 }, { "epoch": 3.66, "grad_norm": 0.5117378830909729, "learning_rate": 1.2892360849302598e-05, "loss": 0.022, "step": 11735 }, { "epoch": 3.66, "grad_norm": 0.5402873754501343, "learning_rate": 1.2887149160723978e-05, "loss": 0.0245, "step": 11740 }, { "epoch": 3.66, "grad_norm": 0.8802453875541687, "learning_rate": 1.2881936616493867e-05, "loss": 0.0259, "step": 11745 }, { "epoch": 3.66, "grad_norm": 0.6248879432678223, "learning_rate": 1.2876723218157086e-05, "loss": 0.0229, "step": 11750 }, { "epoch": 3.67, "grad_norm": 0.6852085590362549, "learning_rate": 1.287150896725871e-05, "loss": 0.0322, "step": 11755 }, { "epoch": 3.67, "grad_norm": 0.573294460773468, "learning_rate": 1.2866293865344058e-05, "loss": 0.0221, "step": 11760 }, { "epoch": 3.67, "grad_norm": 0.847869873046875, "learning_rate": 1.2861077913958708e-05, "loss": 0.024, "step": 11765 }, { "epoch": 3.67, "grad_norm": 0.7071881294250488, "learning_rate": 1.285586111464849e-05, "loss": 0.0202, "step": 11770 }, { "epoch": 3.67, "grad_norm": 0.7383145093917847, "learning_rate": 1.2850643468959479e-05, "loss": 0.0211, "step": 11775 }, { "epoch": 3.67, "grad_norm": 0.8499407768249512, "learning_rate": 1.284542497843801e-05, "loss": 0.0278, "step": 11780 }, { "epoch": 3.68, "grad_norm": 0.7504806518554688, "learning_rate": 1.2840205644630662e-05, "loss": 0.0174, "step": 11785 }, { "epoch": 3.68, "grad_norm": 1.0881952047348022, "learning_rate": 1.2834985469084265e-05, "loss": 0.0276, "step": 11790 }, { "epoch": 3.68, "grad_norm": 0.6055206656455994, "learning_rate": 1.28297644533459e-05, "loss": 0.0235, "step": 11795 }, { "epoch": 3.68, "grad_norm": 0.7683250308036804, "learning_rate": 1.2824542598962895e-05, "loss": 0.025, "step": 11800 }, { "epoch": 3.68, "grad_norm": 1.2123945951461792, "learning_rate": 1.2819319907482832e-05, "loss": 0.0241, "step": 11805 }, { "epoch": 3.68, "grad_norm": 0.546367347240448, "learning_rate": 1.281409638045353e-05, "loss": 0.019, "step": 11810 }, { "epoch": 3.69, "grad_norm": 0.5594978332519531, "learning_rate": 1.2808872019423064e-05, "loss": 0.0212, "step": 11815 }, { "epoch": 3.69, "grad_norm": 0.5757129192352295, "learning_rate": 1.2803646825939758e-05, "loss": 0.0239, "step": 11820 }, { "epoch": 3.69, "grad_norm": 1.0514737367630005, "learning_rate": 1.2798420801552181e-05, "loss": 0.0242, "step": 11825 }, { "epoch": 3.69, "grad_norm": 0.6186000108718872, "learning_rate": 1.2793193947809135e-05, "loss": 0.0234, "step": 11830 }, { "epoch": 3.69, "grad_norm": 0.3759470283985138, "learning_rate": 1.2787966266259693e-05, "loss": 0.0176, "step": 11835 }, { "epoch": 3.69, "grad_norm": 0.6968141198158264, "learning_rate": 1.278273775845315e-05, "loss": 0.0255, "step": 11840 }, { "epoch": 3.69, "grad_norm": 0.48914167284965515, "learning_rate": 1.2777508425939062e-05, "loss": 0.0258, "step": 11845 }, { "epoch": 3.7, "grad_norm": 0.7071937918663025, "learning_rate": 1.2772278270267218e-05, "loss": 0.0297, "step": 11850 }, { "epoch": 3.7, "grad_norm": 0.3311907649040222, "learning_rate": 1.2767047292987658e-05, "loss": 0.0299, "step": 11855 }, { "epoch": 3.7, "grad_norm": 0.714726984500885, "learning_rate": 1.2761815495650666e-05, "loss": 0.0228, "step": 11860 }, { "epoch": 3.7, "grad_norm": 0.4907616674900055, "learning_rate": 1.2756582879806761e-05, "loss": 0.0243, "step": 11865 }, { "epoch": 3.7, "grad_norm": 0.594176709651947, "learning_rate": 1.2751349447006713e-05, "loss": 0.0275, "step": 11870 }, { "epoch": 3.7, "grad_norm": 0.5221083760261536, "learning_rate": 1.2746115198801527e-05, "loss": 0.018, "step": 11875 }, { "epoch": 3.71, "grad_norm": 0.5824384689331055, "learning_rate": 1.2740880136742464e-05, "loss": 0.0207, "step": 11880 }, { "epoch": 3.71, "grad_norm": 0.6119704246520996, "learning_rate": 1.2735644262381006e-05, "loss": 0.0223, "step": 11885 }, { "epoch": 3.71, "grad_norm": 0.7727152109146118, "learning_rate": 1.2730407577268888e-05, "loss": 0.0163, "step": 11890 }, { "epoch": 3.71, "grad_norm": 0.5603602528572083, "learning_rate": 1.2725170082958088e-05, "loss": 0.0286, "step": 11895 }, { "epoch": 3.71, "grad_norm": 0.6091859340667725, "learning_rate": 1.271993178100081e-05, "loss": 0.0294, "step": 11900 }, { "epoch": 3.71, "grad_norm": 0.9491273164749146, "learning_rate": 1.2714692672949513e-05, "loss": 0.0308, "step": 11905 }, { "epoch": 3.71, "grad_norm": 0.4891149401664734, "learning_rate": 1.2709452760356884e-05, "loss": 0.0243, "step": 11910 }, { "epoch": 3.72, "grad_norm": 0.5642223358154297, "learning_rate": 1.2704212044775856e-05, "loss": 0.0282, "step": 11915 }, { "epoch": 3.72, "grad_norm": 0.42882299423217773, "learning_rate": 1.2698970527759595e-05, "loss": 0.027, "step": 11920 }, { "epoch": 3.72, "grad_norm": 0.4645093083381653, "learning_rate": 1.2693728210861505e-05, "loss": 0.0252, "step": 11925 }, { "epoch": 3.72, "grad_norm": 0.7219882607460022, "learning_rate": 1.2688485095635229e-05, "loss": 0.017, "step": 11930 }, { "epoch": 3.72, "grad_norm": 0.4665991961956024, "learning_rate": 1.2683241183634645e-05, "loss": 0.0254, "step": 11935 }, { "epoch": 3.72, "grad_norm": 0.8865329623222351, "learning_rate": 1.2677996476413868e-05, "loss": 0.0272, "step": 11940 }, { "epoch": 3.73, "grad_norm": 0.7679188847541809, "learning_rate": 1.2672750975527249e-05, "loss": 0.0269, "step": 11945 }, { "epoch": 3.73, "grad_norm": 0.4800529479980469, "learning_rate": 1.2667504682529375e-05, "loss": 0.0183, "step": 11950 }, { "epoch": 3.73, "grad_norm": 0.8061618804931641, "learning_rate": 1.266225759897506e-05, "loss": 0.0288, "step": 11955 }, { "epoch": 3.73, "grad_norm": 0.6751968860626221, "learning_rate": 1.2657009726419368e-05, "loss": 0.0217, "step": 11960 }, { "epoch": 3.73, "grad_norm": 0.6072093844413757, "learning_rate": 1.2651761066417581e-05, "loss": 0.024, "step": 11965 }, { "epoch": 3.73, "grad_norm": 0.9960513114929199, "learning_rate": 1.2646511620525224e-05, "loss": 0.025, "step": 11970 }, { "epoch": 3.73, "grad_norm": 0.3995935022830963, "learning_rate": 1.2641261390298048e-05, "loss": 0.0228, "step": 11975 }, { "epoch": 3.74, "grad_norm": 0.6832537055015564, "learning_rate": 1.2636010377292042e-05, "loss": 0.0258, "step": 11980 }, { "epoch": 3.74, "grad_norm": 0.5022556781768799, "learning_rate": 1.2630758583063428e-05, "loss": 0.0293, "step": 11985 }, { "epoch": 3.74, "grad_norm": 0.8268438577651978, "learning_rate": 1.2625506009168658e-05, "loss": 0.0304, "step": 11990 }, { "epoch": 3.74, "grad_norm": 0.5869343876838684, "learning_rate": 1.2620252657164404e-05, "loss": 0.0274, "step": 11995 }, { "epoch": 3.74, "grad_norm": 1.0159419775009155, "learning_rate": 1.2614998528607589e-05, "loss": 0.0223, "step": 12000 }, { "epoch": 3.74, "grad_norm": 0.9052306413650513, "learning_rate": 1.2609743625055353e-05, "loss": 0.0303, "step": 12005 }, { "epoch": 3.75, "grad_norm": 0.7687567472457886, "learning_rate": 1.2604487948065066e-05, "loss": 0.0248, "step": 12010 }, { "epoch": 3.75, "grad_norm": 0.9072149991989136, "learning_rate": 1.2599231499194328e-05, "loss": 0.0217, "step": 12015 }, { "epoch": 3.75, "grad_norm": 0.5874031186103821, "learning_rate": 1.2593974280000971e-05, "loss": 0.025, "step": 12020 }, { "epoch": 3.75, "grad_norm": 0.3685324490070343, "learning_rate": 1.258871629204306e-05, "loss": 0.019, "step": 12025 }, { "epoch": 3.75, "grad_norm": 0.5925565361976624, "learning_rate": 1.2583457536878872e-05, "loss": 0.0215, "step": 12030 }, { "epoch": 3.75, "grad_norm": 0.7138723731040955, "learning_rate": 1.2578198016066925e-05, "loss": 0.0311, "step": 12035 }, { "epoch": 3.76, "grad_norm": 0.7673307657241821, "learning_rate": 1.2572937731165958e-05, "loss": 0.0156, "step": 12040 }, { "epoch": 3.76, "grad_norm": 0.9401269555091858, "learning_rate": 1.256767668373494e-05, "loss": 0.0338, "step": 12045 }, { "epoch": 3.76, "grad_norm": 0.7034611701965332, "learning_rate": 1.2562414875333062e-05, "loss": 0.0218, "step": 12050 }, { "epoch": 3.76, "grad_norm": 0.6674792766571045, "learning_rate": 1.2557152307519746e-05, "loss": 0.0235, "step": 12055 }, { "epoch": 3.76, "grad_norm": 0.6051164865493774, "learning_rate": 1.2551888981854632e-05, "loss": 0.0246, "step": 12060 }, { "epoch": 3.76, "grad_norm": 0.6730068325996399, "learning_rate": 1.254662489989759e-05, "loss": 0.0204, "step": 12065 }, { "epoch": 3.76, "grad_norm": 0.6211580038070679, "learning_rate": 1.2541360063208714e-05, "loss": 0.0263, "step": 12070 }, { "epoch": 3.77, "grad_norm": 2.0400476455688477, "learning_rate": 1.2536094473348317e-05, "loss": 0.0257, "step": 12075 }, { "epoch": 3.77, "grad_norm": 0.3689804673194885, "learning_rate": 1.253082813187694e-05, "loss": 0.0282, "step": 12080 }, { "epoch": 3.77, "grad_norm": 0.5824944376945496, "learning_rate": 1.2525561040355342e-05, "loss": 0.0259, "step": 12085 }, { "epoch": 3.77, "grad_norm": 1.1334221363067627, "learning_rate": 1.2520293200344516e-05, "loss": 0.0241, "step": 12090 }, { "epoch": 3.77, "grad_norm": 0.6835818886756897, "learning_rate": 1.2515024613405662e-05, "loss": 0.0199, "step": 12095 }, { "epoch": 3.77, "grad_norm": 0.6704448461532593, "learning_rate": 1.2509755281100201e-05, "loss": 0.0245, "step": 12100 }, { "epoch": 3.78, "grad_norm": 0.8395538926124573, "learning_rate": 1.2504485204989794e-05, "loss": 0.0192, "step": 12105 }, { "epoch": 3.78, "grad_norm": 0.6028382182121277, "learning_rate": 1.2499214386636302e-05, "loss": 0.0314, "step": 12110 }, { "epoch": 3.78, "grad_norm": 0.4773300290107727, "learning_rate": 1.2493942827601819e-05, "loss": 0.0155, "step": 12115 }, { "epoch": 3.78, "grad_norm": 0.4813371002674103, "learning_rate": 1.2488670529448648e-05, "loss": 0.0191, "step": 12120 }, { "epoch": 3.78, "grad_norm": 0.48700809478759766, "learning_rate": 1.248339749373932e-05, "loss": 0.032, "step": 12125 }, { "epoch": 3.78, "grad_norm": 0.5969123244285583, "learning_rate": 1.2478123722036581e-05, "loss": 0.0228, "step": 12130 }, { "epoch": 3.78, "grad_norm": 0.587505578994751, "learning_rate": 1.2472849215903393e-05, "loss": 0.0185, "step": 12135 }, { "epoch": 3.79, "grad_norm": 0.7423061728477478, "learning_rate": 1.2467573976902936e-05, "loss": 0.0303, "step": 12140 }, { "epoch": 3.79, "grad_norm": 0.6694118976593018, "learning_rate": 1.2462298006598616e-05, "loss": 0.0214, "step": 12145 }, { "epoch": 3.79, "grad_norm": 0.8038167357444763, "learning_rate": 1.2457021306554045e-05, "loss": 0.0264, "step": 12150 }, { "epoch": 3.79, "grad_norm": 0.6632444858551025, "learning_rate": 1.2451743878333053e-05, "loss": 0.0287, "step": 12155 }, { "epoch": 3.79, "grad_norm": 0.6460713148117065, "learning_rate": 1.244646572349969e-05, "loss": 0.0291, "step": 12160 }, { "epoch": 3.79, "grad_norm": 0.5237529277801514, "learning_rate": 1.2441186843618216e-05, "loss": 0.0225, "step": 12165 }, { "epoch": 3.8, "grad_norm": 0.7596482634544373, "learning_rate": 1.2435907240253116e-05, "loss": 0.0281, "step": 12170 }, { "epoch": 3.8, "grad_norm": 0.7577078938484192, "learning_rate": 1.2430626914969074e-05, "loss": 0.0269, "step": 12175 }, { "epoch": 3.8, "grad_norm": 0.4984164834022522, "learning_rate": 1.2425345869331004e-05, "loss": 0.0251, "step": 12180 }, { "epoch": 3.8, "grad_norm": 0.62168949842453, "learning_rate": 1.2420064104904022e-05, "loss": 0.0178, "step": 12185 }, { "epoch": 3.8, "grad_norm": 0.8107472658157349, "learning_rate": 1.241478162325346e-05, "loss": 0.0236, "step": 12190 }, { "epoch": 3.8, "grad_norm": 0.5770743489265442, "learning_rate": 1.2409498425944864e-05, "loss": 0.0222, "step": 12195 }, { "epoch": 3.81, "grad_norm": 0.6336897015571594, "learning_rate": 1.2404214514543993e-05, "loss": 0.0204, "step": 12200 }, { "epoch": 3.81, "grad_norm": 0.8589664697647095, "learning_rate": 1.2398929890616816e-05, "loss": 0.0263, "step": 12205 }, { "epoch": 3.81, "grad_norm": 0.4037541449069977, "learning_rate": 1.239364455572951e-05, "loss": 0.0233, "step": 12210 }, { "epoch": 3.81, "grad_norm": 0.8225604295730591, "learning_rate": 1.2388358511448475e-05, "loss": 0.0187, "step": 12215 }, { "epoch": 3.81, "grad_norm": 0.7085739970207214, "learning_rate": 1.2383071759340305e-05, "loss": 0.0329, "step": 12220 }, { "epoch": 3.81, "grad_norm": 0.735085666179657, "learning_rate": 1.2377784300971807e-05, "loss": 0.0202, "step": 12225 }, { "epoch": 3.81, "grad_norm": 0.9221487045288086, "learning_rate": 1.2372496137910011e-05, "loss": 0.0263, "step": 12230 }, { "epoch": 3.82, "grad_norm": 0.550031840801239, "learning_rate": 1.2367207271722139e-05, "loss": 0.0212, "step": 12235 }, { "epoch": 3.82, "grad_norm": 0.744546115398407, "learning_rate": 1.2361917703975636e-05, "loss": 0.0256, "step": 12240 }, { "epoch": 3.82, "grad_norm": 0.4377955496311188, "learning_rate": 1.235662743623814e-05, "loss": 0.0167, "step": 12245 }, { "epoch": 3.82, "grad_norm": 0.7871254086494446, "learning_rate": 1.2351336470077507e-05, "loss": 0.0223, "step": 12250 }, { "epoch": 3.82, "grad_norm": 0.8724322319030762, "learning_rate": 1.2346044807061798e-05, "loss": 0.0266, "step": 12255 }, { "epoch": 3.82, "grad_norm": 0.7433586120605469, "learning_rate": 1.2340752448759281e-05, "loss": 0.022, "step": 12260 }, { "epoch": 3.83, "grad_norm": 0.7378526926040649, "learning_rate": 1.233545939673842e-05, "loss": 0.0291, "step": 12265 }, { "epoch": 3.83, "grad_norm": 0.7587401270866394, "learning_rate": 1.2330165652567905e-05, "loss": 0.0273, "step": 12270 }, { "epoch": 3.83, "grad_norm": 1.0976639986038208, "learning_rate": 1.2324871217816615e-05, "loss": 0.0233, "step": 12275 }, { "epoch": 3.83, "grad_norm": 0.8238142728805542, "learning_rate": 1.2319576094053631e-05, "loss": 0.0265, "step": 12280 }, { "epoch": 3.83, "grad_norm": 0.5005680918693542, "learning_rate": 1.2314280282848254e-05, "loss": 0.0308, "step": 12285 }, { "epoch": 3.83, "grad_norm": 0.631725013256073, "learning_rate": 1.2308983785769975e-05, "loss": 0.0207, "step": 12290 }, { "epoch": 3.83, "grad_norm": 0.770847499370575, "learning_rate": 1.2303686604388498e-05, "loss": 0.0311, "step": 12295 }, { "epoch": 3.84, "grad_norm": 0.6609355211257935, "learning_rate": 1.2298388740273719e-05, "loss": 0.0316, "step": 12300 }, { "epoch": 3.84, "grad_norm": 0.7061938643455505, "learning_rate": 1.2293090194995746e-05, "loss": 0.019, "step": 12305 }, { "epoch": 3.84, "grad_norm": 0.6347512602806091, "learning_rate": 1.2287790970124884e-05, "loss": 0.0243, "step": 12310 }, { "epoch": 3.84, "grad_norm": 0.8019315600395203, "learning_rate": 1.2282491067231641e-05, "loss": 0.016, "step": 12315 }, { "epoch": 3.84, "grad_norm": 0.5562499165534973, "learning_rate": 1.2277190487886725e-05, "loss": 0.0199, "step": 12320 }, { "epoch": 3.84, "grad_norm": 0.8764368295669556, "learning_rate": 1.2271889233661045e-05, "loss": 0.03, "step": 12325 }, { "epoch": 3.85, "grad_norm": 0.6134083867073059, "learning_rate": 1.2266587306125713e-05, "loss": 0.0287, "step": 12330 }, { "epoch": 3.85, "grad_norm": 0.7679027318954468, "learning_rate": 1.2261284706852032e-05, "loss": 0.0327, "step": 12335 }, { "epoch": 3.85, "grad_norm": 0.46010440587997437, "learning_rate": 1.2255981437411513e-05, "loss": 0.0202, "step": 12340 }, { "epoch": 3.85, "grad_norm": 1.033030390739441, "learning_rate": 1.2250677499375863e-05, "loss": 0.0283, "step": 12345 }, { "epoch": 3.85, "grad_norm": 0.5734465718269348, "learning_rate": 1.2245372894316984e-05, "loss": 0.0214, "step": 12350 }, { "epoch": 3.85, "grad_norm": 0.6062762141227722, "learning_rate": 1.224006762380698e-05, "loss": 0.03, "step": 12355 }, { "epoch": 3.86, "grad_norm": 0.2957155704498291, "learning_rate": 1.223476168941815e-05, "loss": 0.0162, "step": 12360 }, { "epoch": 3.86, "grad_norm": 0.6891098618507385, "learning_rate": 1.222945509272299e-05, "loss": 0.0276, "step": 12365 }, { "epoch": 3.86, "grad_norm": 0.838630199432373, "learning_rate": 1.222414783529419e-05, "loss": 0.0305, "step": 12370 }, { "epoch": 3.86, "grad_norm": 0.6958526968955994, "learning_rate": 1.221883991870464e-05, "loss": 0.0276, "step": 12375 }, { "epoch": 3.86, "grad_norm": 0.42833009362220764, "learning_rate": 1.2213531344527426e-05, "loss": 0.0253, "step": 12380 }, { "epoch": 3.86, "grad_norm": 0.3405981659889221, "learning_rate": 1.2208222114335826e-05, "loss": 0.0227, "step": 12385 }, { "epoch": 3.86, "grad_norm": 0.5990917682647705, "learning_rate": 1.2202912229703305e-05, "loss": 0.0214, "step": 12390 }, { "epoch": 3.87, "grad_norm": 1.7133599519729614, "learning_rate": 1.219760169220354e-05, "loss": 0.0183, "step": 12395 }, { "epoch": 3.87, "grad_norm": 0.5767427086830139, "learning_rate": 1.2192290503410384e-05, "loss": 0.0208, "step": 12400 }, { "epoch": 3.87, "grad_norm": 0.7734942436218262, "learning_rate": 1.2186978664897899e-05, "loss": 0.0291, "step": 12405 }, { "epoch": 3.87, "grad_norm": 1.1354448795318604, "learning_rate": 1.2181666178240319e-05, "loss": 0.0275, "step": 12410 }, { "epoch": 3.87, "grad_norm": 0.5956282019615173, "learning_rate": 1.2176353045012089e-05, "loss": 0.0251, "step": 12415 }, { "epoch": 3.87, "grad_norm": 0.8632811307907104, "learning_rate": 1.217103926678784e-05, "loss": 0.0277, "step": 12420 }, { "epoch": 3.88, "grad_norm": 0.7880505323410034, "learning_rate": 1.2165724845142388e-05, "loss": 0.0241, "step": 12425 }, { "epoch": 3.88, "grad_norm": 0.6448076367378235, "learning_rate": 1.2160409781650748e-05, "loss": 0.023, "step": 12430 }, { "epoch": 3.88, "grad_norm": 0.4887710511684418, "learning_rate": 1.2155094077888117e-05, "loss": 0.0216, "step": 12435 }, { "epoch": 3.88, "grad_norm": 0.7783735394477844, "learning_rate": 1.2149777735429896e-05, "loss": 0.0285, "step": 12440 }, { "epoch": 3.88, "grad_norm": 1.0767704248428345, "learning_rate": 1.2144460755851652e-05, "loss": 0.0262, "step": 12445 }, { "epoch": 3.88, "grad_norm": 1.1533854007720947, "learning_rate": 1.2139143140729166e-05, "loss": 0.0224, "step": 12450 }, { "epoch": 3.88, "grad_norm": 0.41049784421920776, "learning_rate": 1.2133824891638391e-05, "loss": 0.025, "step": 12455 }, { "epoch": 3.89, "grad_norm": 0.6735257506370544, "learning_rate": 1.2128506010155472e-05, "loss": 0.0275, "step": 12460 }, { "epoch": 3.89, "grad_norm": 0.9369131326675415, "learning_rate": 1.2123186497856743e-05, "loss": 0.0196, "step": 12465 }, { "epoch": 3.89, "grad_norm": 0.5147635340690613, "learning_rate": 1.2117866356318724e-05, "loss": 0.0249, "step": 12470 }, { "epoch": 3.89, "grad_norm": 0.5361283421516418, "learning_rate": 1.2112545587118124e-05, "loss": 0.0247, "step": 12475 }, { "epoch": 3.89, "grad_norm": 0.621346652507782, "learning_rate": 1.210722419183183e-05, "loss": 0.0259, "step": 12480 }, { "epoch": 3.89, "grad_norm": 0.4300675094127655, "learning_rate": 1.2101902172036927e-05, "loss": 0.0241, "step": 12485 }, { "epoch": 3.9, "grad_norm": 0.5782265663146973, "learning_rate": 1.2096579529310673e-05, "loss": 0.0226, "step": 12490 }, { "epoch": 3.9, "grad_norm": 0.7544804215431213, "learning_rate": 1.2091256265230517e-05, "loss": 0.0299, "step": 12495 }, { "epoch": 3.9, "grad_norm": 0.9750189185142517, "learning_rate": 1.2085932381374094e-05, "loss": 0.0206, "step": 12500 }, { "epoch": 3.9, "grad_norm": 0.5985661745071411, "learning_rate": 1.2080607879319217e-05, "loss": 0.0394, "step": 12505 }, { "epoch": 3.9, "grad_norm": 0.9388284683227539, "learning_rate": 1.2075282760643888e-05, "loss": 0.0183, "step": 12510 }, { "epoch": 3.9, "grad_norm": 0.8321487903594971, "learning_rate": 1.2069957026926278e-05, "loss": 0.0328, "step": 12515 }, { "epoch": 3.9, "grad_norm": 0.9432512521743774, "learning_rate": 1.2064630679744762e-05, "loss": 0.0238, "step": 12520 }, { "epoch": 3.91, "grad_norm": 0.7519678473472595, "learning_rate": 1.2059303720677885e-05, "loss": 0.0197, "step": 12525 }, { "epoch": 3.91, "grad_norm": 0.7325347065925598, "learning_rate": 1.2053976151304372e-05, "loss": 0.0239, "step": 12530 }, { "epoch": 3.91, "grad_norm": 0.36889663338661194, "learning_rate": 1.2048647973203125e-05, "loss": 0.023, "step": 12535 }, { "epoch": 3.91, "grad_norm": 0.5902948379516602, "learning_rate": 1.2043319187953242e-05, "loss": 0.0196, "step": 12540 }, { "epoch": 3.91, "grad_norm": 0.5314132571220398, "learning_rate": 1.2037989797133986e-05, "loss": 0.0241, "step": 12545 }, { "epoch": 3.91, "grad_norm": 0.5154591202735901, "learning_rate": 1.2032659802324807e-05, "loss": 0.0218, "step": 12550 }, { "epoch": 3.92, "grad_norm": 0.6044285893440247, "learning_rate": 1.2027329205105325e-05, "loss": 0.0205, "step": 12555 }, { "epoch": 3.92, "grad_norm": 0.6315939426422119, "learning_rate": 1.2021998007055353e-05, "loss": 0.025, "step": 12560 }, { "epoch": 3.92, "grad_norm": 0.7652075886726379, "learning_rate": 1.2016666209754877e-05, "loss": 0.0241, "step": 12565 }, { "epoch": 3.92, "grad_norm": 0.668079674243927, "learning_rate": 1.2011333814784048e-05, "loss": 0.03, "step": 12570 }, { "epoch": 3.92, "grad_norm": 0.8443313241004944, "learning_rate": 1.200600082372321e-05, "loss": 0.0348, "step": 12575 }, { "epoch": 3.92, "grad_norm": 0.4662708640098572, "learning_rate": 1.2000667238152874e-05, "loss": 0.0171, "step": 12580 }, { "epoch": 3.93, "grad_norm": 0.922974705696106, "learning_rate": 1.1995333059653739e-05, "loss": 0.0302, "step": 12585 }, { "epoch": 3.93, "grad_norm": 0.6022462248802185, "learning_rate": 1.1989998289806663e-05, "loss": 0.0268, "step": 12590 }, { "epoch": 3.93, "grad_norm": 0.7470941543579102, "learning_rate": 1.1984662930192692e-05, "loss": 0.0258, "step": 12595 }, { "epoch": 3.93, "grad_norm": 1.0102777481079102, "learning_rate": 1.197932698239304e-05, "loss": 0.032, "step": 12600 }, { "epoch": 3.93, "grad_norm": 0.827872097492218, "learning_rate": 1.19739904479891e-05, "loss": 0.033, "step": 12605 }, { "epoch": 3.93, "grad_norm": 0.8313772082328796, "learning_rate": 1.1968653328562438e-05, "loss": 0.0294, "step": 12610 }, { "epoch": 3.93, "grad_norm": 0.434220552444458, "learning_rate": 1.1963315625694792e-05, "loss": 0.0183, "step": 12615 }, { "epoch": 3.94, "grad_norm": 0.640626847743988, "learning_rate": 1.195797734096807e-05, "loss": 0.0281, "step": 12620 }, { "epoch": 3.94, "grad_norm": 0.6942445039749146, "learning_rate": 1.1952638475964358e-05, "loss": 0.036, "step": 12625 }, { "epoch": 3.94, "grad_norm": 0.4551185369491577, "learning_rate": 1.1947299032265911e-05, "loss": 0.0214, "step": 12630 }, { "epoch": 3.94, "grad_norm": 0.7700601816177368, "learning_rate": 1.1941959011455158e-05, "loss": 0.0243, "step": 12635 }, { "epoch": 3.94, "grad_norm": 0.6045627593994141, "learning_rate": 1.193661841511469e-05, "loss": 0.0211, "step": 12640 }, { "epoch": 3.94, "grad_norm": 0.8159734606742859, "learning_rate": 1.1931277244827287e-05, "loss": 0.0223, "step": 12645 }, { "epoch": 3.95, "grad_norm": 0.6006589531898499, "learning_rate": 1.1925935502175878e-05, "loss": 0.0209, "step": 12650 }, { "epoch": 3.95, "grad_norm": 0.5091835260391235, "learning_rate": 1.1920593188743579e-05, "loss": 0.0276, "step": 12655 }, { "epoch": 3.95, "grad_norm": 0.5942313075065613, "learning_rate": 1.1915250306113657e-05, "loss": 0.0269, "step": 12660 }, { "epoch": 3.95, "grad_norm": 0.7808033227920532, "learning_rate": 1.1909906855869568e-05, "loss": 0.0321, "step": 12665 }, { "epoch": 3.95, "grad_norm": 1.2433404922485352, "learning_rate": 1.1904562839594924e-05, "loss": 0.0388, "step": 12670 }, { "epoch": 3.95, "grad_norm": 0.7753468751907349, "learning_rate": 1.1899218258873508e-05, "loss": 0.0189, "step": 12675 }, { "epoch": 3.95, "grad_norm": 0.551559329032898, "learning_rate": 1.1893873115289266e-05, "loss": 0.0162, "step": 12680 }, { "epoch": 3.96, "grad_norm": 0.5967293977737427, "learning_rate": 1.1888527410426317e-05, "loss": 0.0273, "step": 12685 }, { "epoch": 3.96, "grad_norm": 0.51964271068573, "learning_rate": 1.1883181145868943e-05, "loss": 0.0327, "step": 12690 }, { "epoch": 3.96, "grad_norm": 0.27552059292793274, "learning_rate": 1.187783432320159e-05, "loss": 0.0268, "step": 12695 }, { "epoch": 3.96, "grad_norm": 0.7243056297302246, "learning_rate": 1.1872486944008876e-05, "loss": 0.025, "step": 12700 }, { "epoch": 3.96, "grad_norm": 0.8522168397903442, "learning_rate": 1.186713900987558e-05, "loss": 0.0219, "step": 12705 }, { "epoch": 3.96, "grad_norm": 0.6728376746177673, "learning_rate": 1.1861790522386643e-05, "loss": 0.025, "step": 12710 }, { "epoch": 3.97, "grad_norm": 0.602962851524353, "learning_rate": 1.1856441483127172e-05, "loss": 0.0228, "step": 12715 }, { "epoch": 3.97, "grad_norm": 0.5500351786613464, "learning_rate": 1.1851091893682438e-05, "loss": 0.028, "step": 12720 }, { "epoch": 3.97, "grad_norm": 0.6493998765945435, "learning_rate": 1.1845741755637877e-05, "loss": 0.0242, "step": 12725 }, { "epoch": 3.97, "grad_norm": 0.914799153804779, "learning_rate": 1.1840391070579083e-05, "loss": 0.0203, "step": 12730 }, { "epoch": 3.97, "grad_norm": 0.7835777401924133, "learning_rate": 1.1835039840091818e-05, "loss": 0.0227, "step": 12735 }, { "epoch": 3.97, "grad_norm": 0.7413834929466248, "learning_rate": 1.1829688065761997e-05, "loss": 0.031, "step": 12740 }, { "epoch": 3.98, "grad_norm": 0.7562592625617981, "learning_rate": 1.1824335749175706e-05, "loss": 0.0322, "step": 12745 }, { "epoch": 3.98, "grad_norm": 0.4942041039466858, "learning_rate": 1.1818982891919184e-05, "loss": 0.0238, "step": 12750 }, { "epoch": 3.98, "grad_norm": 0.5303024649620056, "learning_rate": 1.1813629495578838e-05, "loss": 0.0252, "step": 12755 }, { "epoch": 3.98, "grad_norm": 0.6929956078529358, "learning_rate": 1.1808275561741224e-05, "loss": 0.0239, "step": 12760 }, { "epoch": 3.98, "grad_norm": 0.7698571085929871, "learning_rate": 1.180292109199307e-05, "loss": 0.0299, "step": 12765 }, { "epoch": 3.98, "grad_norm": 0.7231923937797546, "learning_rate": 1.179756608792125e-05, "loss": 0.0234, "step": 12770 }, { "epoch": 3.98, "grad_norm": 0.5542680621147156, "learning_rate": 1.1792210551112806e-05, "loss": 0.0201, "step": 12775 }, { "epoch": 3.99, "grad_norm": 0.4305645823478699, "learning_rate": 1.1786854483154935e-05, "loss": 0.0255, "step": 12780 }, { "epoch": 3.99, "grad_norm": 0.5332358479499817, "learning_rate": 1.1781497885634987e-05, "loss": 0.019, "step": 12785 }, { "epoch": 3.99, "grad_norm": 0.7843573093414307, "learning_rate": 1.1776140760140477e-05, "loss": 0.0304, "step": 12790 }, { "epoch": 3.99, "grad_norm": 0.668351411819458, "learning_rate": 1.1770783108259072e-05, "loss": 0.0269, "step": 12795 }, { "epoch": 3.99, "grad_norm": 0.7774969339370728, "learning_rate": 1.1765424931578597e-05, "loss": 0.0209, "step": 12800 }, { "epoch": 3.99, "grad_norm": 0.6049669981002808, "learning_rate": 1.1760066231687021e-05, "loss": 0.0188, "step": 12805 }, { "epoch": 4.0, "grad_norm": 0.4718351662158966, "learning_rate": 1.175470701017249e-05, "loss": 0.0156, "step": 12810 }, { "epoch": 4.0, "grad_norm": 0.7984088063240051, "learning_rate": 1.1749347268623289e-05, "loss": 0.0323, "step": 12815 }, { "epoch": 4.0, "grad_norm": 0.7956653237342834, "learning_rate": 1.1743987008627858e-05, "loss": 0.0322, "step": 12820 }, { "epoch": 4.0, "grad_norm": 0.5670791268348694, "learning_rate": 1.1738626231774792e-05, "loss": 0.0219, "step": 12825 }, { "epoch": 4.0, "grad_norm": 0.44218578934669495, "learning_rate": 1.1733264939652844e-05, "loss": 0.0141, "step": 12830 }, { "epoch": 4.0, "grad_norm": 0.4226168692111969, "learning_rate": 1.172790313385092e-05, "loss": 0.0118, "step": 12835 }, { "epoch": 4.0, "grad_norm": 0.47754889726638794, "learning_rate": 1.1722540815958066e-05, "loss": 0.0159, "step": 12840 }, { "epoch": 4.01, "grad_norm": 0.3430834710597992, "learning_rate": 1.1717177987563488e-05, "loss": 0.0118, "step": 12845 }, { "epoch": 4.01, "grad_norm": 0.49128466844558716, "learning_rate": 1.171181465025655e-05, "loss": 0.0178, "step": 12850 }, { "epoch": 4.01, "grad_norm": 0.5054926872253418, "learning_rate": 1.1706450805626762e-05, "loss": 0.0131, "step": 12855 }, { "epoch": 4.01, "grad_norm": 0.4532546401023865, "learning_rate": 1.1701086455263776e-05, "loss": 0.0143, "step": 12860 }, { "epoch": 4.01, "grad_norm": 0.4453551173210144, "learning_rate": 1.16957216007574e-05, "loss": 0.0139, "step": 12865 }, { "epoch": 4.01, "grad_norm": 0.7018569707870483, "learning_rate": 1.1690356243697597e-05, "loss": 0.0168, "step": 12870 }, { "epoch": 4.02, "grad_norm": 0.4247054159641266, "learning_rate": 1.1684990385674469e-05, "loss": 0.0123, "step": 12875 }, { "epoch": 4.02, "grad_norm": 0.3397986590862274, "learning_rate": 1.1679624028278276e-05, "loss": 0.0136, "step": 12880 }, { "epoch": 4.02, "grad_norm": 0.2713475227355957, "learning_rate": 1.167425717309942e-05, "loss": 0.0184, "step": 12885 }, { "epoch": 4.02, "grad_norm": 0.6043540239334106, "learning_rate": 1.166888982172845e-05, "loss": 0.0131, "step": 12890 }, { "epoch": 4.02, "grad_norm": 0.438080757856369, "learning_rate": 1.1663521975756065e-05, "loss": 0.0135, "step": 12895 }, { "epoch": 4.02, "grad_norm": 0.40145036578178406, "learning_rate": 1.1658153636773108e-05, "loss": 0.0142, "step": 12900 }, { "epoch": 4.03, "grad_norm": 0.7116342782974243, "learning_rate": 1.1652784806370576e-05, "loss": 0.0129, "step": 12905 }, { "epoch": 4.03, "grad_norm": 0.5809972286224365, "learning_rate": 1.1647415486139597e-05, "loss": 0.022, "step": 12910 }, { "epoch": 4.03, "grad_norm": 0.7480118870735168, "learning_rate": 1.1642045677671457e-05, "loss": 0.0122, "step": 12915 }, { "epoch": 4.03, "grad_norm": 0.6057562828063965, "learning_rate": 1.163667538255758e-05, "loss": 0.0132, "step": 12920 }, { "epoch": 4.03, "grad_norm": 0.5527793169021606, "learning_rate": 1.163130460238954e-05, "loss": 0.0098, "step": 12925 }, { "epoch": 4.03, "grad_norm": 0.6269908547401428, "learning_rate": 1.1625933338759045e-05, "loss": 0.013, "step": 12930 }, { "epoch": 4.03, "grad_norm": 0.8735302090644836, "learning_rate": 1.1620561593257958e-05, "loss": 0.0156, "step": 12935 }, { "epoch": 4.04, "grad_norm": 0.58750981092453, "learning_rate": 1.1615189367478275e-05, "loss": 0.0126, "step": 12940 }, { "epoch": 4.04, "grad_norm": 0.6866432428359985, "learning_rate": 1.1609816663012145e-05, "loss": 0.0158, "step": 12945 }, { "epoch": 4.04, "grad_norm": 0.6975976228713989, "learning_rate": 1.1604443481451843e-05, "loss": 0.0131, "step": 12950 }, { "epoch": 4.04, "grad_norm": 3.5784661769866943, "learning_rate": 1.1599069824389804e-05, "loss": 0.0129, "step": 12955 }, { "epoch": 4.04, "grad_norm": 0.5637338757514954, "learning_rate": 1.159369569341859e-05, "loss": 0.0125, "step": 12960 }, { "epoch": 4.04, "grad_norm": 0.9260528683662415, "learning_rate": 1.1588321090130907e-05, "loss": 0.0169, "step": 12965 }, { "epoch": 4.05, "grad_norm": 0.36479848623275757, "learning_rate": 1.1582946016119602e-05, "loss": 0.0127, "step": 12970 }, { "epoch": 4.05, "grad_norm": 0.7243846654891968, "learning_rate": 1.1577570472977665e-05, "loss": 0.0159, "step": 12975 }, { "epoch": 4.05, "grad_norm": 0.4703471064567566, "learning_rate": 1.1572194462298225e-05, "loss": 0.011, "step": 12980 }, { "epoch": 4.05, "grad_norm": 0.31925779581069946, "learning_rate": 1.1566817985674537e-05, "loss": 0.0099, "step": 12985 }, { "epoch": 4.05, "grad_norm": 0.4469374418258667, "learning_rate": 1.1561441044700009e-05, "loss": 0.0135, "step": 12990 }, { "epoch": 4.05, "grad_norm": 0.6372437477111816, "learning_rate": 1.1556063640968176e-05, "loss": 0.0186, "step": 12995 }, { "epoch": 4.05, "grad_norm": 0.533165693283081, "learning_rate": 1.1550685776072723e-05, "loss": 0.0098, "step": 13000 }, { "epoch": 4.06, "grad_norm": 0.24450081586837769, "learning_rate": 1.154530745160746e-05, "loss": 0.0091, "step": 13005 }, { "epoch": 4.06, "grad_norm": 0.4475289583206177, "learning_rate": 1.1539928669166334e-05, "loss": 0.015, "step": 13010 }, { "epoch": 4.06, "grad_norm": 0.44072267413139343, "learning_rate": 1.1534549430343436e-05, "loss": 0.0106, "step": 13015 }, { "epoch": 4.06, "grad_norm": 0.7471389770507812, "learning_rate": 1.152916973673298e-05, "loss": 0.0176, "step": 13020 }, { "epoch": 4.06, "grad_norm": 0.45366644859313965, "learning_rate": 1.1523789589929331e-05, "loss": 0.0118, "step": 13025 }, { "epoch": 4.06, "grad_norm": 0.5312186479568481, "learning_rate": 1.151840899152697e-05, "loss": 0.0132, "step": 13030 }, { "epoch": 4.07, "grad_norm": 0.39465898275375366, "learning_rate": 1.1513027943120528e-05, "loss": 0.0103, "step": 13035 }, { "epoch": 4.07, "grad_norm": 0.4964543581008911, "learning_rate": 1.1507646446304759e-05, "loss": 0.0183, "step": 13040 }, { "epoch": 4.07, "grad_norm": 0.36636462807655334, "learning_rate": 1.1502264502674552e-05, "loss": 0.0137, "step": 13045 }, { "epoch": 4.07, "grad_norm": 0.5650526881217957, "learning_rate": 1.149688211382493e-05, "loss": 0.0116, "step": 13050 }, { "epoch": 4.07, "grad_norm": 0.7195597290992737, "learning_rate": 1.149149928135105e-05, "loss": 0.0153, "step": 13055 }, { "epoch": 4.07, "grad_norm": 0.6196027398109436, "learning_rate": 1.1486116006848196e-05, "loss": 0.0142, "step": 13060 }, { "epoch": 4.07, "grad_norm": 0.45367714762687683, "learning_rate": 1.1480732291911786e-05, "loss": 0.012, "step": 13065 }, { "epoch": 4.08, "grad_norm": 0.5096328854560852, "learning_rate": 1.1475348138137368e-05, "loss": 0.0142, "step": 13070 }, { "epoch": 4.08, "grad_norm": 0.3048243522644043, "learning_rate": 1.1469963547120614e-05, "loss": 0.0138, "step": 13075 }, { "epoch": 4.08, "grad_norm": 0.5990984439849854, "learning_rate": 1.1464578520457341e-05, "loss": 0.0145, "step": 13080 }, { "epoch": 4.08, "grad_norm": 0.5580481290817261, "learning_rate": 1.1459193059743476e-05, "loss": 0.0139, "step": 13085 }, { "epoch": 4.08, "grad_norm": 0.6747692823410034, "learning_rate": 1.1453807166575093e-05, "loss": 0.0134, "step": 13090 }, { "epoch": 4.08, "grad_norm": 0.6296996474266052, "learning_rate": 1.1448420842548375e-05, "loss": 0.0131, "step": 13095 }, { "epoch": 4.09, "grad_norm": 0.5080475211143494, "learning_rate": 1.1443034089259651e-05, "loss": 0.0142, "step": 13100 }, { "epoch": 4.09, "grad_norm": 0.6452639698982239, "learning_rate": 1.1437646908305364e-05, "loss": 0.0197, "step": 13105 }, { "epoch": 4.09, "grad_norm": 0.7623153328895569, "learning_rate": 1.1432259301282091e-05, "loss": 0.0137, "step": 13110 }, { "epoch": 4.09, "grad_norm": 0.539873480796814, "learning_rate": 1.142687126978653e-05, "loss": 0.018, "step": 13115 }, { "epoch": 4.09, "grad_norm": 0.4487948417663574, "learning_rate": 1.1421482815415513e-05, "loss": 0.0114, "step": 13120 }, { "epoch": 4.09, "grad_norm": 0.3985424041748047, "learning_rate": 1.141609393976599e-05, "loss": 0.0127, "step": 13125 }, { "epoch": 4.1, "grad_norm": 0.5115072131156921, "learning_rate": 1.1410704644435038e-05, "loss": 0.0096, "step": 13130 }, { "epoch": 4.1, "grad_norm": 0.7904471158981323, "learning_rate": 1.1405314931019854e-05, "loss": 0.0156, "step": 13135 }, { "epoch": 4.1, "grad_norm": 0.512873649597168, "learning_rate": 1.139992480111777e-05, "loss": 0.0128, "step": 13140 }, { "epoch": 4.1, "grad_norm": 0.7409130930900574, "learning_rate": 1.1394534256326231e-05, "loss": 0.0126, "step": 13145 }, { "epoch": 4.1, "grad_norm": 0.7204809188842773, "learning_rate": 1.1389143298242811e-05, "loss": 0.0148, "step": 13150 }, { "epoch": 4.1, "grad_norm": 0.37489232420921326, "learning_rate": 1.13837519284652e-05, "loss": 0.0104, "step": 13155 }, { "epoch": 4.1, "grad_norm": 0.38769617676734924, "learning_rate": 1.1378360148591217e-05, "loss": 0.0151, "step": 13160 }, { "epoch": 4.11, "grad_norm": 0.6176652312278748, "learning_rate": 1.13729679602188e-05, "loss": 0.0134, "step": 13165 }, { "epoch": 4.11, "grad_norm": 0.4696696400642395, "learning_rate": 1.1367575364946006e-05, "loss": 0.0129, "step": 13170 }, { "epoch": 4.11, "grad_norm": 0.39097660779953003, "learning_rate": 1.1362182364371018e-05, "loss": 0.0196, "step": 13175 }, { "epoch": 4.11, "grad_norm": 0.7201048135757446, "learning_rate": 1.1356788960092132e-05, "loss": 0.0129, "step": 13180 }, { "epoch": 4.11, "grad_norm": 0.581547737121582, "learning_rate": 1.1351395153707765e-05, "loss": 0.0151, "step": 13185 }, { "epoch": 4.11, "grad_norm": 0.35174039006233215, "learning_rate": 1.1346000946816462e-05, "loss": 0.0108, "step": 13190 }, { "epoch": 4.12, "grad_norm": 0.48897290229797363, "learning_rate": 1.1340606341016875e-05, "loss": 0.018, "step": 13195 }, { "epoch": 4.12, "grad_norm": 0.6607118844985962, "learning_rate": 1.1335211337907782e-05, "loss": 0.014, "step": 13200 }, { "epoch": 4.12, "grad_norm": 0.3014785051345825, "learning_rate": 1.1329815939088072e-05, "loss": 0.0099, "step": 13205 }, { "epoch": 4.12, "grad_norm": 0.34429848194122314, "learning_rate": 1.1324420146156757e-05, "loss": 0.0102, "step": 13210 }, { "epoch": 4.12, "grad_norm": 0.3376854956150055, "learning_rate": 1.1319023960712972e-05, "loss": 0.0165, "step": 13215 }, { "epoch": 4.12, "grad_norm": 0.4123086631298065, "learning_rate": 1.1313627384355943e-05, "loss": 0.0094, "step": 13220 }, { "epoch": 4.12, "grad_norm": 0.47521141171455383, "learning_rate": 1.1308230418685045e-05, "loss": 0.0166, "step": 13225 }, { "epoch": 4.13, "grad_norm": 0.629194438457489, "learning_rate": 1.1302833065299745e-05, "loss": 0.0131, "step": 13230 }, { "epoch": 4.13, "grad_norm": 0.9634842872619629, "learning_rate": 1.129743532579964e-05, "loss": 0.0122, "step": 13235 }, { "epoch": 4.13, "grad_norm": 0.7814756631851196, "learning_rate": 1.129203720178442e-05, "loss": 0.019, "step": 13240 }, { "epoch": 4.13, "grad_norm": 0.4899265170097351, "learning_rate": 1.1286638694853916e-05, "loss": 0.0153, "step": 13245 }, { "epoch": 4.13, "grad_norm": 0.5780858993530273, "learning_rate": 1.1281239806608056e-05, "loss": 0.0157, "step": 13250 }, { "epoch": 4.13, "grad_norm": 0.7560383081436157, "learning_rate": 1.1275840538646878e-05, "loss": 0.021, "step": 13255 }, { "epoch": 4.14, "grad_norm": 0.6205662488937378, "learning_rate": 1.1270440892570544e-05, "loss": 0.0188, "step": 13260 }, { "epoch": 4.14, "grad_norm": 0.3816826641559601, "learning_rate": 1.1265040869979325e-05, "loss": 0.0187, "step": 13265 }, { "epoch": 4.14, "grad_norm": 0.6150768995285034, "learning_rate": 1.1259640472473602e-05, "loss": 0.0193, "step": 13270 }, { "epoch": 4.14, "grad_norm": 0.7604739665985107, "learning_rate": 1.1254239701653862e-05, "loss": 0.0159, "step": 13275 }, { "epoch": 4.14, "grad_norm": 0.6447388529777527, "learning_rate": 1.1248838559120708e-05, "loss": 0.0128, "step": 13280 }, { "epoch": 4.14, "grad_norm": 0.6438114643096924, "learning_rate": 1.1243437046474854e-05, "loss": 0.0149, "step": 13285 }, { "epoch": 4.15, "grad_norm": 0.7647647857666016, "learning_rate": 1.1238035165317123e-05, "loss": 0.016, "step": 13290 }, { "epoch": 4.15, "grad_norm": 0.5507582426071167, "learning_rate": 1.1232632917248442e-05, "loss": 0.0195, "step": 13295 }, { "epoch": 4.15, "grad_norm": 0.5412368178367615, "learning_rate": 1.1227230303869858e-05, "loss": 0.016, "step": 13300 }, { "epoch": 4.15, "grad_norm": 0.6669492125511169, "learning_rate": 1.1221827326782515e-05, "loss": 0.0126, "step": 13305 }, { "epoch": 4.15, "grad_norm": 0.48139962553977966, "learning_rate": 1.121642398758767e-05, "loss": 0.0124, "step": 13310 }, { "epoch": 4.15, "grad_norm": 0.6500893235206604, "learning_rate": 1.1211020287886685e-05, "loss": 0.0135, "step": 13315 }, { "epoch": 4.15, "grad_norm": 0.6527283787727356, "learning_rate": 1.1205616229281033e-05, "loss": 0.0133, "step": 13320 }, { "epoch": 4.16, "grad_norm": 0.4571009576320648, "learning_rate": 1.1200211813372288e-05, "loss": 0.0107, "step": 13325 }, { "epoch": 4.16, "grad_norm": 0.3937748372554779, "learning_rate": 1.1194807041762137e-05, "loss": 0.0148, "step": 13330 }, { "epoch": 4.16, "grad_norm": 0.4135548770427704, "learning_rate": 1.1189401916052362e-05, "loss": 0.0205, "step": 13335 }, { "epoch": 4.16, "grad_norm": 0.6549053192138672, "learning_rate": 1.118399643784486e-05, "loss": 0.0181, "step": 13340 }, { "epoch": 4.16, "grad_norm": 0.7284772396087646, "learning_rate": 1.1178590608741628e-05, "loss": 0.0113, "step": 13345 }, { "epoch": 4.16, "grad_norm": 0.8119202256202698, "learning_rate": 1.1173184430344765e-05, "loss": 0.0144, "step": 13350 }, { "epoch": 4.17, "grad_norm": 0.8524966239929199, "learning_rate": 1.1167777904256481e-05, "loss": 0.0138, "step": 13355 }, { "epoch": 4.17, "grad_norm": 0.6740317940711975, "learning_rate": 1.116237103207908e-05, "loss": 0.0165, "step": 13360 }, { "epoch": 4.17, "grad_norm": 0.43391916155815125, "learning_rate": 1.115696381541497e-05, "loss": 0.0118, "step": 13365 }, { "epoch": 4.17, "grad_norm": 0.8504462838172913, "learning_rate": 1.1151556255866668e-05, "loss": 0.0148, "step": 13370 }, { "epoch": 4.17, "grad_norm": 0.6841346025466919, "learning_rate": 1.1146148355036788e-05, "loss": 0.0118, "step": 13375 }, { "epoch": 4.17, "grad_norm": 0.5312472581863403, "learning_rate": 1.1140740114528044e-05, "loss": 0.0162, "step": 13380 }, { "epoch": 4.17, "grad_norm": 0.7032490372657776, "learning_rate": 1.113533153594325e-05, "loss": 0.0128, "step": 13385 }, { "epoch": 4.18, "grad_norm": 0.5678761005401611, "learning_rate": 1.1129922620885327e-05, "loss": 0.0132, "step": 13390 }, { "epoch": 4.18, "grad_norm": 0.5700942873954773, "learning_rate": 1.112451337095729e-05, "loss": 0.0122, "step": 13395 }, { "epoch": 4.18, "grad_norm": 0.6323286294937134, "learning_rate": 1.1119103787762254e-05, "loss": 0.0158, "step": 13400 }, { "epoch": 4.18, "grad_norm": 0.38179242610931396, "learning_rate": 1.1113693872903426e-05, "loss": 0.0135, "step": 13405 }, { "epoch": 4.18, "grad_norm": 0.8654710054397583, "learning_rate": 1.1108283627984129e-05, "loss": 0.0148, "step": 13410 }, { "epoch": 4.18, "grad_norm": 0.5700730681419373, "learning_rate": 1.1102873054607768e-05, "loss": 0.0167, "step": 13415 }, { "epoch": 4.19, "grad_norm": 0.5876476168632507, "learning_rate": 1.109746215437785e-05, "loss": 0.0163, "step": 13420 }, { "epoch": 4.19, "grad_norm": 0.3003236651420593, "learning_rate": 1.109205092889798e-05, "loss": 0.0151, "step": 13425 }, { "epoch": 4.19, "grad_norm": 0.5799517035484314, "learning_rate": 1.1086639379771857e-05, "loss": 0.0159, "step": 13430 }, { "epoch": 4.19, "grad_norm": 0.8325760364532471, "learning_rate": 1.1081227508603283e-05, "loss": 0.0162, "step": 13435 }, { "epoch": 4.19, "grad_norm": 0.4690932631492615, "learning_rate": 1.1075815316996144e-05, "loss": 0.0145, "step": 13440 }, { "epoch": 4.19, "grad_norm": 0.8196485638618469, "learning_rate": 1.107040280655443e-05, "loss": 0.0141, "step": 13445 }, { "epoch": 4.2, "grad_norm": 0.7324041724205017, "learning_rate": 1.1064989978882221e-05, "loss": 0.0168, "step": 13450 }, { "epoch": 4.2, "grad_norm": 0.47851309180259705, "learning_rate": 1.1059576835583694e-05, "loss": 0.0168, "step": 13455 }, { "epoch": 4.2, "grad_norm": 0.32001355290412903, "learning_rate": 1.1054163378263119e-05, "loss": 0.0119, "step": 13460 }, { "epoch": 4.2, "grad_norm": 0.42558789253234863, "learning_rate": 1.1048749608524856e-05, "loss": 0.0116, "step": 13465 }, { "epoch": 4.2, "grad_norm": 0.6945168375968933, "learning_rate": 1.104333552797336e-05, "loss": 0.0147, "step": 13470 }, { "epoch": 4.2, "grad_norm": 0.47573333978652954, "learning_rate": 1.1037921138213177e-05, "loss": 0.0148, "step": 13475 }, { "epoch": 4.2, "grad_norm": 0.3896462917327881, "learning_rate": 1.103250644084895e-05, "loss": 0.0114, "step": 13480 }, { "epoch": 4.21, "grad_norm": 0.5291655659675598, "learning_rate": 1.1027091437485404e-05, "loss": 0.0127, "step": 13485 }, { "epoch": 4.21, "grad_norm": 0.4930930733680725, "learning_rate": 1.1021676129727359e-05, "loss": 0.0135, "step": 13490 }, { "epoch": 4.21, "grad_norm": 0.6311109066009521, "learning_rate": 1.101626051917973e-05, "loss": 0.0177, "step": 13495 }, { "epoch": 4.21, "grad_norm": 0.5875706076622009, "learning_rate": 1.101084460744751e-05, "loss": 0.0112, "step": 13500 }, { "epoch": 4.21, "grad_norm": 0.8075762391090393, "learning_rate": 1.10054283961358e-05, "loss": 0.0144, "step": 13505 }, { "epoch": 4.21, "grad_norm": 0.7476543188095093, "learning_rate": 1.1000011886849767e-05, "loss": 0.0133, "step": 13510 }, { "epoch": 4.22, "grad_norm": 0.5966615676879883, "learning_rate": 1.099459508119468e-05, "loss": 0.0139, "step": 13515 }, { "epoch": 4.22, "grad_norm": 0.6924692392349243, "learning_rate": 1.0989177980775902e-05, "loss": 0.0135, "step": 13520 }, { "epoch": 4.22, "grad_norm": 0.5964117050170898, "learning_rate": 1.0983760587198866e-05, "loss": 0.0137, "step": 13525 }, { "epoch": 4.22, "grad_norm": 0.5716801285743713, "learning_rate": 1.0978342902069099e-05, "loss": 0.0155, "step": 13530 }, { "epoch": 4.22, "grad_norm": 0.6075406074523926, "learning_rate": 1.0972924926992224e-05, "loss": 0.0159, "step": 13535 }, { "epoch": 4.22, "grad_norm": 0.31922054290771484, "learning_rate": 1.0967506663573942e-05, "loss": 0.0162, "step": 13540 }, { "epoch": 4.22, "grad_norm": 0.548858642578125, "learning_rate": 1.0962088113420031e-05, "loss": 0.0173, "step": 13545 }, { "epoch": 4.23, "grad_norm": 0.6774040460586548, "learning_rate": 1.0956669278136367e-05, "loss": 0.0124, "step": 13550 }, { "epoch": 4.23, "grad_norm": 0.6590894460678101, "learning_rate": 1.0951250159328908e-05, "loss": 0.0136, "step": 13555 }, { "epoch": 4.23, "grad_norm": 0.5625125169754028, "learning_rate": 1.094583075860369e-05, "loss": 0.016, "step": 13560 }, { "epoch": 4.23, "grad_norm": 0.8643558621406555, "learning_rate": 1.0940411077566842e-05, "loss": 0.013, "step": 13565 }, { "epoch": 4.23, "grad_norm": 1.2786364555358887, "learning_rate": 1.0934991117824565e-05, "loss": 0.0156, "step": 13570 }, { "epoch": 4.23, "grad_norm": 0.21733850240707397, "learning_rate": 1.0929570880983152e-05, "loss": 0.0128, "step": 13575 }, { "epoch": 4.24, "grad_norm": 1.16891348361969, "learning_rate": 1.0924150368648974e-05, "loss": 0.0142, "step": 13580 }, { "epoch": 4.24, "grad_norm": 0.6256799697875977, "learning_rate": 1.091872958242848e-05, "loss": 0.0112, "step": 13585 }, { "epoch": 4.24, "grad_norm": 0.5066383481025696, "learning_rate": 1.091330852392821e-05, "loss": 0.0123, "step": 13590 }, { "epoch": 4.24, "grad_norm": 0.4950759708881378, "learning_rate": 1.0907887194754774e-05, "loss": 0.0151, "step": 13595 }, { "epoch": 4.24, "grad_norm": 0.5494158267974854, "learning_rate": 1.0902465596514871e-05, "loss": 0.0103, "step": 13600 }, { "epoch": 4.24, "grad_norm": 0.5297831296920776, "learning_rate": 1.0897043730815273e-05, "loss": 0.0098, "step": 13605 }, { "epoch": 4.24, "grad_norm": 0.5016347169876099, "learning_rate": 1.0891621599262835e-05, "loss": 0.0134, "step": 13610 }, { "epoch": 4.25, "grad_norm": 0.45652124285697937, "learning_rate": 1.0886199203464492e-05, "loss": 0.0138, "step": 13615 }, { "epoch": 4.25, "grad_norm": 0.4522571265697479, "learning_rate": 1.0880776545027252e-05, "loss": 0.0125, "step": 13620 }, { "epoch": 4.25, "grad_norm": 0.388315349817276, "learning_rate": 1.0875353625558205e-05, "loss": 0.0113, "step": 13625 }, { "epoch": 4.25, "grad_norm": 1.0517215728759766, "learning_rate": 1.086993044666452e-05, "loss": 0.0172, "step": 13630 }, { "epoch": 4.25, "grad_norm": 0.38936805725097656, "learning_rate": 1.0864507009953435e-05, "loss": 0.0146, "step": 13635 }, { "epoch": 4.25, "grad_norm": 0.6670991778373718, "learning_rate": 1.0859083317032273e-05, "loss": 0.0147, "step": 13640 }, { "epoch": 4.26, "grad_norm": 0.5981236696243286, "learning_rate": 1.0853659369508432e-05, "loss": 0.0192, "step": 13645 }, { "epoch": 4.26, "grad_norm": 0.7371012568473816, "learning_rate": 1.0848235168989378e-05, "loss": 0.0134, "step": 13650 }, { "epoch": 4.26, "grad_norm": 0.5573756098747253, "learning_rate": 1.084281071708266e-05, "loss": 0.0146, "step": 13655 }, { "epoch": 4.26, "grad_norm": 0.6925496459007263, "learning_rate": 1.0837386015395897e-05, "loss": 0.0106, "step": 13660 }, { "epoch": 4.26, "grad_norm": 0.5630655288696289, "learning_rate": 1.0831961065536786e-05, "loss": 0.0098, "step": 13665 }, { "epoch": 4.26, "grad_norm": 0.5372442007064819, "learning_rate": 1.0826535869113093e-05, "loss": 0.0172, "step": 13670 }, { "epoch": 4.27, "grad_norm": 0.4406470060348511, "learning_rate": 1.0821110427732658e-05, "loss": 0.0138, "step": 13675 }, { "epoch": 4.27, "grad_norm": 0.8311110138893127, "learning_rate": 1.0815684743003398e-05, "loss": 0.0141, "step": 13680 }, { "epoch": 4.27, "grad_norm": 0.5476308465003967, "learning_rate": 1.08102588165333e-05, "loss": 0.0139, "step": 13685 }, { "epoch": 4.27, "grad_norm": 0.3883194327354431, "learning_rate": 1.0804832649930416e-05, "loss": 0.0092, "step": 13690 }, { "epoch": 4.27, "grad_norm": 0.8831512928009033, "learning_rate": 1.0799406244802881e-05, "loss": 0.0198, "step": 13695 }, { "epoch": 4.27, "grad_norm": 0.6092522740364075, "learning_rate": 1.0793979602758889e-05, "loss": 0.016, "step": 13700 }, { "epoch": 4.27, "grad_norm": 0.6196035146713257, "learning_rate": 1.0788552725406713e-05, "loss": 0.0121, "step": 13705 }, { "epoch": 4.28, "grad_norm": 0.5230036973953247, "learning_rate": 1.0783125614354691e-05, "loss": 0.0124, "step": 13710 }, { "epoch": 4.28, "grad_norm": 0.5164169669151306, "learning_rate": 1.0777698271211232e-05, "loss": 0.0107, "step": 13715 }, { "epoch": 4.28, "grad_norm": 0.43008527159690857, "learning_rate": 1.0772270697584812e-05, "loss": 0.0122, "step": 13720 }, { "epoch": 4.28, "grad_norm": 0.4724281132221222, "learning_rate": 1.076684289508398e-05, "loss": 0.0137, "step": 13725 }, { "epoch": 4.28, "grad_norm": 0.6964073777198792, "learning_rate": 1.0761414865317348e-05, "loss": 0.0128, "step": 13730 }, { "epoch": 4.28, "grad_norm": 0.47468698024749756, "learning_rate": 1.0755986609893597e-05, "loss": 0.0122, "step": 13735 }, { "epoch": 4.29, "grad_norm": 1.2172765731811523, "learning_rate": 1.0750558130421473e-05, "loss": 0.0129, "step": 13740 }, { "epoch": 4.29, "grad_norm": 0.6301424503326416, "learning_rate": 1.0745129428509791e-05, "loss": 0.015, "step": 13745 }, { "epoch": 4.29, "grad_norm": 0.38925108313560486, "learning_rate": 1.0739700505767433e-05, "loss": 0.0127, "step": 13750 }, { "epoch": 4.29, "grad_norm": 1.6824076175689697, "learning_rate": 1.0734271363803343e-05, "loss": 0.0157, "step": 13755 }, { "epoch": 4.29, "grad_norm": 0.594070315361023, "learning_rate": 1.0728842004226531e-05, "loss": 0.0157, "step": 13760 }, { "epoch": 4.29, "grad_norm": 0.8799555897712708, "learning_rate": 1.0723412428646074e-05, "loss": 0.0179, "step": 13765 }, { "epoch": 4.29, "grad_norm": 0.5111485719680786, "learning_rate": 1.071798263867111e-05, "loss": 0.0168, "step": 13770 }, { "epoch": 4.3, "grad_norm": 0.5702099800109863, "learning_rate": 1.0712552635910844e-05, "loss": 0.0144, "step": 13775 }, { "epoch": 4.3, "grad_norm": 0.32097887992858887, "learning_rate": 1.0707122421974535e-05, "loss": 0.0115, "step": 13780 }, { "epoch": 4.3, "grad_norm": 0.6477245092391968, "learning_rate": 1.0701691998471515e-05, "loss": 0.0166, "step": 13785 }, { "epoch": 4.3, "grad_norm": 0.536425769329071, "learning_rate": 1.0696261367011178e-05, "loss": 0.016, "step": 13790 }, { "epoch": 4.3, "grad_norm": 0.668903648853302, "learning_rate": 1.0690830529202971e-05, "loss": 0.0128, "step": 13795 }, { "epoch": 4.3, "grad_norm": 0.4450022578239441, "learning_rate": 1.0685399486656407e-05, "loss": 0.0107, "step": 13800 }, { "epoch": 4.31, "grad_norm": 0.8847901225090027, "learning_rate": 1.0679968240981062e-05, "loss": 0.0127, "step": 13805 }, { "epoch": 4.31, "grad_norm": 0.5488216280937195, "learning_rate": 1.0674536793786572e-05, "loss": 0.0153, "step": 13810 }, { "epoch": 4.31, "grad_norm": 0.4895569086074829, "learning_rate": 1.0669105146682628e-05, "loss": 0.0131, "step": 13815 }, { "epoch": 4.31, "grad_norm": 0.5281688570976257, "learning_rate": 1.0663673301278981e-05, "loss": 0.0117, "step": 13820 }, { "epoch": 4.31, "grad_norm": 0.5016342401504517, "learning_rate": 1.0658241259185444e-05, "loss": 0.0091, "step": 13825 }, { "epoch": 4.31, "grad_norm": 0.6420480608940125, "learning_rate": 1.0652809022011891e-05, "loss": 0.0165, "step": 13830 }, { "epoch": 4.32, "grad_norm": 0.4913181960582733, "learning_rate": 1.0647376591368248e-05, "loss": 0.014, "step": 13835 }, { "epoch": 4.32, "grad_norm": 0.5184372663497925, "learning_rate": 1.0641943968864496e-05, "loss": 0.0116, "step": 13840 }, { "epoch": 4.32, "grad_norm": 0.8069402575492859, "learning_rate": 1.063651115611068e-05, "loss": 0.0176, "step": 13845 }, { "epoch": 4.32, "grad_norm": 0.593238890171051, "learning_rate": 1.06310781547169e-05, "loss": 0.0178, "step": 13850 }, { "epoch": 4.32, "grad_norm": 0.6287248730659485, "learning_rate": 1.062564496629331e-05, "loss": 0.015, "step": 13855 }, { "epoch": 4.32, "grad_norm": 0.38136905431747437, "learning_rate": 1.0620211592450116e-05, "loss": 0.0136, "step": 13860 }, { "epoch": 4.32, "grad_norm": 0.45957428216934204, "learning_rate": 1.0614778034797586e-05, "loss": 0.0154, "step": 13865 }, { "epoch": 4.33, "grad_norm": 0.38002118468284607, "learning_rate": 1.060934429494604e-05, "loss": 0.0133, "step": 13870 }, { "epoch": 4.33, "grad_norm": 0.5099014639854431, "learning_rate": 1.0603910374505847e-05, "loss": 0.014, "step": 13875 }, { "epoch": 4.33, "grad_norm": 0.5811852812767029, "learning_rate": 1.0598476275087437e-05, "loss": 0.0161, "step": 13880 }, { "epoch": 4.33, "grad_norm": 0.4693862795829773, "learning_rate": 1.059304199830129e-05, "loss": 0.0161, "step": 13885 }, { "epoch": 4.33, "grad_norm": 0.44405293464660645, "learning_rate": 1.0587607545757938e-05, "loss": 0.0147, "step": 13890 }, { "epoch": 4.33, "grad_norm": 0.43415597081184387, "learning_rate": 1.0582172919067963e-05, "loss": 0.015, "step": 13895 }, { "epoch": 4.34, "grad_norm": 0.6808017492294312, "learning_rate": 1.0576738119842005e-05, "loss": 0.015, "step": 13900 }, { "epoch": 4.34, "grad_norm": 0.978279709815979, "learning_rate": 1.0571303149690749e-05, "loss": 0.0192, "step": 13905 }, { "epoch": 4.34, "grad_norm": 0.44453221559524536, "learning_rate": 1.0565868010224932e-05, "loss": 0.0147, "step": 13910 }, { "epoch": 4.34, "grad_norm": 0.8853317499160767, "learning_rate": 1.0560432703055341e-05, "loss": 0.0186, "step": 13915 }, { "epoch": 4.34, "grad_norm": 0.7187196612358093, "learning_rate": 1.0556084337655437e-05, "loss": 0.0177, "step": 13920 }, { "epoch": 4.34, "grad_norm": 0.6485170722007751, "learning_rate": 1.0550648732678402e-05, "loss": 0.0173, "step": 13925 }, { "epoch": 4.34, "grad_norm": 0.7718275785446167, "learning_rate": 1.0545212964508063e-05, "loss": 0.0183, "step": 13930 }, { "epoch": 4.35, "grad_norm": 0.6682250499725342, "learning_rate": 1.0539777034755396e-05, "loss": 0.0137, "step": 13935 }, { "epoch": 4.35, "grad_norm": 0.4407562017440796, "learning_rate": 1.0534340945031425e-05, "loss": 0.013, "step": 13940 }, { "epoch": 4.35, "grad_norm": 0.5514277219772339, "learning_rate": 1.0528904696947211e-05, "loss": 0.0171, "step": 13945 }, { "epoch": 4.35, "grad_norm": 0.33027181029319763, "learning_rate": 1.052346829211388e-05, "loss": 0.0109, "step": 13950 }, { "epoch": 4.35, "grad_norm": 0.758332371711731, "learning_rate": 1.0518031732142592e-05, "loss": 0.0168, "step": 13955 }, { "epoch": 4.35, "grad_norm": 0.29376351833343506, "learning_rate": 1.051259501864456e-05, "loss": 0.011, "step": 13960 }, { "epoch": 4.36, "grad_norm": 0.47187724709510803, "learning_rate": 1.0507158153231027e-05, "loss": 0.0143, "step": 13965 }, { "epoch": 4.36, "grad_norm": 0.6413419246673584, "learning_rate": 1.0501721137513302e-05, "loss": 0.019, "step": 13970 }, { "epoch": 4.36, "grad_norm": 0.6035584211349487, "learning_rate": 1.049628397310273e-05, "loss": 0.0158, "step": 13975 }, { "epoch": 4.36, "grad_norm": 0.5032995343208313, "learning_rate": 1.0490846661610695e-05, "loss": 0.0157, "step": 13980 }, { "epoch": 4.36, "grad_norm": 0.43947482109069824, "learning_rate": 1.0485409204648624e-05, "loss": 0.014, "step": 13985 }, { "epoch": 4.36, "grad_norm": 0.5266494750976562, "learning_rate": 1.0479971603828001e-05, "loss": 0.0144, "step": 13990 }, { "epoch": 4.36, "grad_norm": 0.6087942719459534, "learning_rate": 1.047453386076034e-05, "loss": 0.0161, "step": 13995 }, { "epoch": 4.37, "grad_norm": 0.41034409403800964, "learning_rate": 1.04690959770572e-05, "loss": 0.0155, "step": 14000 }, { "epoch": 4.37, "grad_norm": 0.48467421531677246, "learning_rate": 1.0463657954330182e-05, "loss": 0.0137, "step": 14005 }, { "epoch": 4.37, "grad_norm": 0.4704853296279907, "learning_rate": 1.0458219794190927e-05, "loss": 0.0141, "step": 14010 }, { "epoch": 4.37, "grad_norm": 0.41661834716796875, "learning_rate": 1.0452781498251126e-05, "loss": 0.0159, "step": 14015 }, { "epoch": 4.37, "grad_norm": 0.5433653593063354, "learning_rate": 1.0447343068122493e-05, "loss": 0.0133, "step": 14020 }, { "epoch": 4.37, "grad_norm": 0.6942476034164429, "learning_rate": 1.0441904505416796e-05, "loss": 0.019, "step": 14025 }, { "epoch": 4.38, "grad_norm": 0.5101887583732605, "learning_rate": 1.0436465811745834e-05, "loss": 0.0124, "step": 14030 }, { "epoch": 4.38, "grad_norm": 0.659727156162262, "learning_rate": 1.0431026988721452e-05, "loss": 0.0211, "step": 14035 }, { "epoch": 4.38, "grad_norm": 0.4992978274822235, "learning_rate": 1.0425588037955532e-05, "loss": 0.0115, "step": 14040 }, { "epoch": 4.38, "grad_norm": 0.8023609519004822, "learning_rate": 1.0420148961059984e-05, "loss": 0.0143, "step": 14045 }, { "epoch": 4.38, "grad_norm": 0.7880727648735046, "learning_rate": 1.0414709759646769e-05, "loss": 0.0128, "step": 14050 }, { "epoch": 4.38, "grad_norm": 0.6856178641319275, "learning_rate": 1.0409270435327875e-05, "loss": 0.0138, "step": 14055 }, { "epoch": 4.39, "grad_norm": 0.2927437126636505, "learning_rate": 1.0403830989715333e-05, "loss": 0.0117, "step": 14060 }, { "epoch": 4.39, "grad_norm": 0.840043306350708, "learning_rate": 1.0398391424421209e-05, "loss": 0.0112, "step": 14065 }, { "epoch": 4.39, "grad_norm": 0.3406813144683838, "learning_rate": 1.0392951741057594e-05, "loss": 0.0127, "step": 14070 }, { "epoch": 4.39, "grad_norm": 0.5625513195991516, "learning_rate": 1.038751194123663e-05, "loss": 0.0171, "step": 14075 }, { "epoch": 4.39, "grad_norm": 0.6220264434814453, "learning_rate": 1.0382072026570485e-05, "loss": 0.0177, "step": 14080 }, { "epoch": 4.39, "grad_norm": 0.3575296998023987, "learning_rate": 1.0376631998671365e-05, "loss": 0.012, "step": 14085 }, { "epoch": 4.39, "grad_norm": 0.6170724630355835, "learning_rate": 1.0371191859151497e-05, "loss": 0.0193, "step": 14090 }, { "epoch": 4.4, "grad_norm": 0.5731143355369568, "learning_rate": 1.0365751609623162e-05, "loss": 0.0126, "step": 14095 }, { "epoch": 4.4, "grad_norm": 0.6981022357940674, "learning_rate": 1.0360311251698656e-05, "loss": 0.0192, "step": 14100 }, { "epoch": 4.4, "grad_norm": 0.34567970037460327, "learning_rate": 1.0354870786990318e-05, "loss": 0.0139, "step": 14105 }, { "epoch": 4.4, "grad_norm": 0.5584869980812073, "learning_rate": 1.0349430217110508e-05, "loss": 0.0158, "step": 14110 }, { "epoch": 4.4, "grad_norm": 0.4890233874320984, "learning_rate": 1.0343989543671629e-05, "loss": 0.013, "step": 14115 }, { "epoch": 4.4, "grad_norm": 0.40981024503707886, "learning_rate": 1.0338548768286107e-05, "loss": 0.0167, "step": 14120 }, { "epoch": 4.41, "grad_norm": 0.3282528221607208, "learning_rate": 1.0333107892566401e-05, "loss": 0.0157, "step": 14125 }, { "epoch": 4.41, "grad_norm": 0.4963136613368988, "learning_rate": 1.0327666918124995e-05, "loss": 0.0136, "step": 14130 }, { "epoch": 4.41, "grad_norm": 0.6778050661087036, "learning_rate": 1.032222584657441e-05, "loss": 0.0117, "step": 14135 }, { "epoch": 4.41, "grad_norm": 0.9065635800361633, "learning_rate": 1.0316784679527193e-05, "loss": 0.0217, "step": 14140 }, { "epoch": 4.41, "grad_norm": 0.5900851488113403, "learning_rate": 1.0311343418595915e-05, "loss": 0.0126, "step": 14145 }, { "epoch": 4.41, "grad_norm": 0.6750664114952087, "learning_rate": 1.0305902065393183e-05, "loss": 0.0181, "step": 14150 }, { "epoch": 4.41, "grad_norm": 0.5409610271453857, "learning_rate": 1.0300460621531623e-05, "loss": 0.0104, "step": 14155 }, { "epoch": 4.42, "grad_norm": 0.5124744772911072, "learning_rate": 1.029501908862389e-05, "loss": 0.0098, "step": 14160 }, { "epoch": 4.42, "grad_norm": 0.3976758122444153, "learning_rate": 1.028957746828267e-05, "loss": 0.012, "step": 14165 }, { "epoch": 4.42, "grad_norm": 0.5759036540985107, "learning_rate": 1.028413576212067e-05, "loss": 0.0143, "step": 14170 }, { "epoch": 4.42, "grad_norm": 0.7016305923461914, "learning_rate": 1.0278693971750623e-05, "loss": 0.0141, "step": 14175 }, { "epoch": 4.42, "grad_norm": 0.5577017068862915, "learning_rate": 1.0273252098785293e-05, "loss": 0.0163, "step": 14180 }, { "epoch": 4.42, "grad_norm": 0.6542133688926697, "learning_rate": 1.0267810144837458e-05, "loss": 0.0141, "step": 14185 }, { "epoch": 4.43, "grad_norm": 0.45770275592803955, "learning_rate": 1.0262368111519929e-05, "loss": 0.0158, "step": 14190 }, { "epoch": 4.43, "grad_norm": 0.5199791193008423, "learning_rate": 1.0256926000445535e-05, "loss": 0.0108, "step": 14195 }, { "epoch": 4.43, "grad_norm": 0.6465505361557007, "learning_rate": 1.025148381322713e-05, "loss": 0.0103, "step": 14200 }, { "epoch": 4.43, "grad_norm": 0.5390932559967041, "learning_rate": 1.0246041551477592e-05, "loss": 0.016, "step": 14205 }, { "epoch": 4.43, "grad_norm": 0.5630213022232056, "learning_rate": 1.0240599216809823e-05, "loss": 0.0172, "step": 14210 }, { "epoch": 4.43, "grad_norm": 0.7540024518966675, "learning_rate": 1.0235156810836735e-05, "loss": 0.011, "step": 14215 }, { "epoch": 4.44, "grad_norm": 0.7141256332397461, "learning_rate": 1.0229714335171277e-05, "loss": 0.0194, "step": 14220 }, { "epoch": 4.44, "grad_norm": 0.35217756032943726, "learning_rate": 1.0224271791426409e-05, "loss": 0.0086, "step": 14225 }, { "epoch": 4.44, "grad_norm": 0.6676583290100098, "learning_rate": 1.0218829181215114e-05, "loss": 0.012, "step": 14230 }, { "epoch": 4.44, "grad_norm": 0.7634176015853882, "learning_rate": 1.0213386506150387e-05, "loss": 0.0141, "step": 14235 }, { "epoch": 4.44, "grad_norm": 0.49505695700645447, "learning_rate": 1.0207943767845259e-05, "loss": 0.0159, "step": 14240 }, { "epoch": 4.44, "grad_norm": 0.6663855910301208, "learning_rate": 1.0202500967912764e-05, "loss": 0.0117, "step": 14245 }, { "epoch": 4.44, "grad_norm": 1.1110700368881226, "learning_rate": 1.019705810796597e-05, "loss": 0.0156, "step": 14250 }, { "epoch": 4.45, "grad_norm": 0.7089707851409912, "learning_rate": 1.0191615189617938e-05, "loss": 0.0131, "step": 14255 }, { "epoch": 4.45, "grad_norm": 0.6198787689208984, "learning_rate": 1.0186172214481769e-05, "loss": 0.0107, "step": 14260 }, { "epoch": 4.45, "grad_norm": 0.7208309769630432, "learning_rate": 1.0180729184170577e-05, "loss": 0.012, "step": 14265 }, { "epoch": 4.45, "grad_norm": 1.0942414999008179, "learning_rate": 1.0175286100297484e-05, "loss": 0.0121, "step": 14270 }, { "epoch": 4.45, "grad_norm": 0.7647571563720703, "learning_rate": 1.0169842964475631e-05, "loss": 0.0169, "step": 14275 }, { "epoch": 4.45, "grad_norm": 0.3617548942565918, "learning_rate": 1.0164399778318176e-05, "loss": 0.0155, "step": 14280 }, { "epoch": 4.46, "grad_norm": 1.1076143980026245, "learning_rate": 1.01589565434383e-05, "loss": 0.0177, "step": 14285 }, { "epoch": 4.46, "grad_norm": 0.7034644484519958, "learning_rate": 1.0153513261449182e-05, "loss": 0.0169, "step": 14290 }, { "epoch": 4.46, "grad_norm": 0.7932576537132263, "learning_rate": 1.0148069933964024e-05, "loss": 0.0197, "step": 14295 }, { "epoch": 4.46, "grad_norm": 0.36392897367477417, "learning_rate": 1.014262656259604e-05, "loss": 0.0136, "step": 14300 }, { "epoch": 4.46, "grad_norm": 0.516571044921875, "learning_rate": 1.0137183148958462e-05, "loss": 0.0134, "step": 14305 }, { "epoch": 4.46, "grad_norm": 0.7711630463600159, "learning_rate": 1.0131739694664527e-05, "loss": 0.0132, "step": 14310 }, { "epoch": 4.46, "grad_norm": 0.4761156439781189, "learning_rate": 1.0126296201327486e-05, "loss": 0.0192, "step": 14315 }, { "epoch": 4.47, "grad_norm": 0.6719192266464233, "learning_rate": 1.0120852670560606e-05, "loss": 0.0126, "step": 14320 }, { "epoch": 4.47, "grad_norm": 0.38668563961982727, "learning_rate": 1.0115409103977161e-05, "loss": 0.0107, "step": 14325 }, { "epoch": 4.47, "grad_norm": 0.7272295951843262, "learning_rate": 1.0109965503190433e-05, "loss": 0.0144, "step": 14330 }, { "epoch": 4.47, "grad_norm": 0.7221820950508118, "learning_rate": 1.0104521869813724e-05, "loss": 0.0207, "step": 14335 }, { "epoch": 4.47, "grad_norm": 0.5080924034118652, "learning_rate": 1.0099078205460333e-05, "loss": 0.0109, "step": 14340 }, { "epoch": 4.47, "grad_norm": 0.3039231598377228, "learning_rate": 1.0093634511743577e-05, "loss": 0.0097, "step": 14345 }, { "epoch": 4.48, "grad_norm": 0.5701034665107727, "learning_rate": 1.0088190790276782e-05, "loss": 0.0147, "step": 14350 }, { "epoch": 4.48, "grad_norm": 0.5223418474197388, "learning_rate": 1.0082747042673277e-05, "loss": 0.0131, "step": 14355 }, { "epoch": 4.48, "grad_norm": 0.6130552291870117, "learning_rate": 1.0077303270546397e-05, "loss": 0.0182, "step": 14360 }, { "epoch": 4.48, "grad_norm": 0.5565687417984009, "learning_rate": 1.0071859475509496e-05, "loss": 0.02, "step": 14365 }, { "epoch": 4.48, "grad_norm": 0.32723382115364075, "learning_rate": 1.006641565917592e-05, "loss": 0.0187, "step": 14370 }, { "epoch": 4.48, "grad_norm": 0.6693105697631836, "learning_rate": 1.0060971823159039e-05, "loss": 0.0183, "step": 14375 }, { "epoch": 4.49, "grad_norm": 0.3853606581687927, "learning_rate": 1.0055527969072203e-05, "loss": 0.0196, "step": 14380 }, { "epoch": 4.49, "grad_norm": 0.5426690578460693, "learning_rate": 1.0050084098528794e-05, "loss": 0.0124, "step": 14385 }, { "epoch": 4.49, "grad_norm": 0.6572005748748779, "learning_rate": 1.0044640213142187e-05, "loss": 0.0155, "step": 14390 }, { "epoch": 4.49, "grad_norm": 0.6691354513168335, "learning_rate": 1.0039196314525759e-05, "loss": 0.0126, "step": 14395 }, { "epoch": 4.49, "grad_norm": 0.4074169099330902, "learning_rate": 1.0033752404292894e-05, "loss": 0.0144, "step": 14400 }, { "epoch": 4.49, "grad_norm": 0.5884296298027039, "learning_rate": 1.0028308484056977e-05, "loss": 0.0156, "step": 14405 }, { "epoch": 4.49, "grad_norm": 1.086810827255249, "learning_rate": 1.0022864555431405e-05, "loss": 0.0124, "step": 14410 }, { "epoch": 4.5, "grad_norm": 0.624291181564331, "learning_rate": 1.0017420620029566e-05, "loss": 0.0129, "step": 14415 }, { "epoch": 4.5, "grad_norm": 0.6476021409034729, "learning_rate": 1.0011976679464854e-05, "loss": 0.0176, "step": 14420 }, { "epoch": 4.5, "grad_norm": 0.7050710916519165, "learning_rate": 1.0006532735350666e-05, "loss": 0.0139, "step": 14425 }, { "epoch": 4.5, "grad_norm": 0.6201428174972534, "learning_rate": 1.0001088789300403e-05, "loss": 0.0142, "step": 14430 }, { "epoch": 4.5, "grad_norm": 0.6047577857971191, "learning_rate": 9.995644842927465e-06, "loss": 0.0117, "step": 14435 }, { "epoch": 4.5, "grad_norm": 0.6464779376983643, "learning_rate": 9.990200897845237e-06, "loss": 0.0144, "step": 14440 }, { "epoch": 4.51, "grad_norm": 1.0427998304367065, "learning_rate": 9.98475695566713e-06, "loss": 0.012, "step": 14445 }, { "epoch": 4.51, "grad_norm": 0.5257377028465271, "learning_rate": 9.979313018006533e-06, "loss": 0.0124, "step": 14450 }, { "epoch": 4.51, "grad_norm": 0.6395034193992615, "learning_rate": 9.973869086476846e-06, "loss": 0.0113, "step": 14455 }, { "epoch": 4.51, "grad_norm": 0.4360012114048004, "learning_rate": 9.96842516269146e-06, "loss": 0.016, "step": 14460 }, { "epoch": 4.51, "grad_norm": 0.7435383796691895, "learning_rate": 9.962981248263768e-06, "loss": 0.0168, "step": 14465 }, { "epoch": 4.51, "grad_norm": 0.5809354186058044, "learning_rate": 9.957537344807157e-06, "loss": 0.0106, "step": 14470 }, { "epoch": 4.51, "grad_norm": 0.5905982851982117, "learning_rate": 9.952093453935017e-06, "loss": 0.0156, "step": 14475 }, { "epoch": 4.52, "grad_norm": 0.5396180152893066, "learning_rate": 9.946649577260724e-06, "loss": 0.0172, "step": 14480 }, { "epoch": 4.52, "grad_norm": 0.2679448425769806, "learning_rate": 9.941205716397654e-06, "loss": 0.0146, "step": 14485 }, { "epoch": 4.52, "grad_norm": 1.4850140810012817, "learning_rate": 9.935761872959185e-06, "loss": 0.0123, "step": 14490 }, { "epoch": 4.52, "grad_norm": 0.47272995114326477, "learning_rate": 9.930318048558684e-06, "loss": 0.0155, "step": 14495 }, { "epoch": 4.52, "grad_norm": 0.8253952264785767, "learning_rate": 9.924874244809511e-06, "loss": 0.0182, "step": 14500 }, { "epoch": 4.52, "grad_norm": 0.6212565898895264, "learning_rate": 9.91943046332502e-06, "loss": 0.015, "step": 14505 }, { "epoch": 4.53, "grad_norm": 0.7062631249427795, "learning_rate": 9.913986705718567e-06, "loss": 0.0171, "step": 14510 }, { "epoch": 4.53, "grad_norm": 0.49691250920295715, "learning_rate": 9.908542973603485e-06, "loss": 0.0179, "step": 14515 }, { "epoch": 4.53, "grad_norm": 0.7071046233177185, "learning_rate": 9.903099268593115e-06, "loss": 0.0212, "step": 14520 }, { "epoch": 4.53, "grad_norm": 0.8758559823036194, "learning_rate": 9.897655592300779e-06, "loss": 0.015, "step": 14525 }, { "epoch": 4.53, "grad_norm": 0.5860288143157959, "learning_rate": 9.8922119463398e-06, "loss": 0.0157, "step": 14530 }, { "epoch": 4.53, "grad_norm": 0.65015709400177, "learning_rate": 9.886768332323483e-06, "loss": 0.0103, "step": 14535 }, { "epoch": 4.53, "grad_norm": 0.5659227967262268, "learning_rate": 9.88132475186513e-06, "loss": 0.0202, "step": 14540 }, { "epoch": 4.54, "grad_norm": 0.4392716884613037, "learning_rate": 9.875881206578032e-06, "loss": 0.0164, "step": 14545 }, { "epoch": 4.54, "grad_norm": 0.6249778270721436, "learning_rate": 9.870437698075464e-06, "loss": 0.0146, "step": 14550 }, { "epoch": 4.54, "grad_norm": 0.5201079249382019, "learning_rate": 9.864994227970695e-06, "loss": 0.0145, "step": 14555 }, { "epoch": 4.54, "grad_norm": 0.3231590986251831, "learning_rate": 9.859550797876984e-06, "loss": 0.0118, "step": 14560 }, { "epoch": 4.54, "grad_norm": 0.5004443526268005, "learning_rate": 9.854107409407575e-06, "loss": 0.0194, "step": 14565 }, { "epoch": 4.54, "grad_norm": 0.6936674118041992, "learning_rate": 9.848664064175698e-06, "loss": 0.0151, "step": 14570 }, { "epoch": 4.55, "grad_norm": 0.3963569104671478, "learning_rate": 9.843220763794578e-06, "loss": 0.0112, "step": 14575 }, { "epoch": 4.55, "grad_norm": 0.5949503779411316, "learning_rate": 9.837777509877425e-06, "loss": 0.0202, "step": 14580 }, { "epoch": 4.55, "grad_norm": 0.6990585327148438, "learning_rate": 9.832334304037418e-06, "loss": 0.013, "step": 14585 }, { "epoch": 4.55, "grad_norm": 0.7839829325675964, "learning_rate": 9.826891147887746e-06, "loss": 0.0103, "step": 14590 }, { "epoch": 4.55, "grad_norm": 0.536649227142334, "learning_rate": 9.821448043041571e-06, "loss": 0.0096, "step": 14595 }, { "epoch": 4.55, "grad_norm": 0.6189597845077515, "learning_rate": 9.816004991112042e-06, "loss": 0.0148, "step": 14600 }, { "epoch": 4.56, "grad_norm": 0.5167118310928345, "learning_rate": 9.81056199371229e-06, "loss": 0.0158, "step": 14605 }, { "epoch": 4.56, "grad_norm": 1.288644790649414, "learning_rate": 9.805119052455433e-06, "loss": 0.0127, "step": 14610 }, { "epoch": 4.56, "grad_norm": 0.5421487092971802, "learning_rate": 9.799676168954568e-06, "loss": 0.0171, "step": 14615 }, { "epoch": 4.56, "grad_norm": 0.44585683941841125, "learning_rate": 9.79423334482279e-06, "loss": 0.0118, "step": 14620 }, { "epoch": 4.56, "grad_norm": 0.49607759714126587, "learning_rate": 9.78879058167315e-06, "loss": 0.0122, "step": 14625 }, { "epoch": 4.56, "grad_norm": 0.4464818835258484, "learning_rate": 9.783347881118698e-06, "loss": 0.0126, "step": 14630 }, { "epoch": 4.56, "grad_norm": 0.638231098651886, "learning_rate": 9.77790524477247e-06, "loss": 0.0156, "step": 14635 }, { "epoch": 4.57, "grad_norm": 0.5741913914680481, "learning_rate": 9.77246267424747e-06, "loss": 0.0175, "step": 14640 }, { "epoch": 4.57, "grad_norm": 0.6646221876144409, "learning_rate": 9.76702017115669e-06, "loss": 0.0173, "step": 14645 }, { "epoch": 4.57, "grad_norm": 0.5025471448898315, "learning_rate": 9.7615777371131e-06, "loss": 0.0142, "step": 14650 }, { "epoch": 4.57, "grad_norm": 0.6054807901382446, "learning_rate": 9.756135373729654e-06, "loss": 0.0217, "step": 14655 }, { "epoch": 4.57, "grad_norm": 0.6085841655731201, "learning_rate": 9.750693082619274e-06, "loss": 0.0111, "step": 14660 }, { "epoch": 4.57, "grad_norm": 0.6730016469955444, "learning_rate": 9.74525086539487e-06, "loss": 0.0175, "step": 14665 }, { "epoch": 4.58, "grad_norm": 0.7598947286605835, "learning_rate": 9.739808723669327e-06, "loss": 0.0182, "step": 14670 }, { "epoch": 4.58, "grad_norm": 0.47956424951553345, "learning_rate": 9.73436665905551e-06, "loss": 0.0088, "step": 14675 }, { "epoch": 4.58, "grad_norm": 0.5771499872207642, "learning_rate": 9.728924673166258e-06, "loss": 0.0159, "step": 14680 }, { "epoch": 4.58, "grad_norm": 1.1014750003814697, "learning_rate": 9.723482767614391e-06, "loss": 0.0154, "step": 14685 }, { "epoch": 4.58, "grad_norm": 0.6661223769187927, "learning_rate": 9.7180409440127e-06, "loss": 0.017, "step": 14690 }, { "epoch": 4.58, "grad_norm": 0.7477891445159912, "learning_rate": 9.712599203973953e-06, "loss": 0.0209, "step": 14695 }, { "epoch": 4.58, "grad_norm": 0.6131916046142578, "learning_rate": 9.707157549110894e-06, "loss": 0.0165, "step": 14700 }, { "epoch": 4.59, "grad_norm": 0.636452317237854, "learning_rate": 9.701715981036243e-06, "loss": 0.0114, "step": 14705 }, { "epoch": 4.59, "grad_norm": 0.5207667946815491, "learning_rate": 9.696274501362694e-06, "loss": 0.0169, "step": 14710 }, { "epoch": 4.59, "grad_norm": 0.3022514581680298, "learning_rate": 9.69083311170291e-06, "loss": 0.0102, "step": 14715 }, { "epoch": 4.59, "grad_norm": 0.5009319186210632, "learning_rate": 9.685391813669537e-06, "loss": 0.0117, "step": 14720 }, { "epoch": 4.59, "grad_norm": 0.5763520002365112, "learning_rate": 9.67995060887519e-06, "loss": 0.0137, "step": 14725 }, { "epoch": 4.59, "grad_norm": 0.8080492615699768, "learning_rate": 9.674509498932443e-06, "loss": 0.0115, "step": 14730 }, { "epoch": 4.6, "grad_norm": 0.8563107848167419, "learning_rate": 9.669068485453863e-06, "loss": 0.0134, "step": 14735 }, { "epoch": 4.6, "grad_norm": 0.5381313562393188, "learning_rate": 9.663627570051975e-06, "loss": 0.013, "step": 14740 }, { "epoch": 4.6, "grad_norm": 0.7833681106567383, "learning_rate": 9.658186754339283e-06, "loss": 0.0181, "step": 14745 }, { "epoch": 4.6, "grad_norm": 1.1692765951156616, "learning_rate": 9.652746039928253e-06, "loss": 0.0174, "step": 14750 }, { "epoch": 4.6, "grad_norm": 0.5598737001419067, "learning_rate": 9.647305428431327e-06, "loss": 0.0122, "step": 14755 }, { "epoch": 4.6, "grad_norm": 0.7559071183204651, "learning_rate": 9.641864921460916e-06, "loss": 0.018, "step": 14760 }, { "epoch": 4.61, "grad_norm": 0.501556932926178, "learning_rate": 9.636424520629394e-06, "loss": 0.0131, "step": 14765 }, { "epoch": 4.61, "grad_norm": 0.5668889880180359, "learning_rate": 9.630984227549112e-06, "loss": 0.0176, "step": 14770 }, { "epoch": 4.61, "grad_norm": 0.5442070364952087, "learning_rate": 9.625544043832383e-06, "loss": 0.0134, "step": 14775 }, { "epoch": 4.61, "grad_norm": 0.7403197884559631, "learning_rate": 9.620103971091493e-06, "loss": 0.0166, "step": 14780 }, { "epoch": 4.61, "grad_norm": 0.6325731873512268, "learning_rate": 9.61466401093869e-06, "loss": 0.0133, "step": 14785 }, { "epoch": 4.61, "grad_norm": 0.6614811420440674, "learning_rate": 9.609224164986192e-06, "loss": 0.0162, "step": 14790 }, { "epoch": 4.61, "grad_norm": 0.6733828783035278, "learning_rate": 9.603784434846182e-06, "loss": 0.0169, "step": 14795 }, { "epoch": 4.62, "grad_norm": 0.5302515625953674, "learning_rate": 9.598344822130808e-06, "loss": 0.0169, "step": 14800 }, { "epoch": 4.62, "grad_norm": 0.5816189646720886, "learning_rate": 9.592905328452182e-06, "loss": 0.0145, "step": 14805 }, { "epoch": 4.62, "grad_norm": 0.4304620027542114, "learning_rate": 9.587465955422384e-06, "loss": 0.0179, "step": 14810 }, { "epoch": 4.62, "grad_norm": 0.5317957401275635, "learning_rate": 9.582026704653455e-06, "loss": 0.0148, "step": 14815 }, { "epoch": 4.62, "grad_norm": 0.6904965043067932, "learning_rate": 9.576587577757403e-06, "loss": 0.0142, "step": 14820 }, { "epoch": 4.62, "grad_norm": 0.7760544419288635, "learning_rate": 9.571148576346198e-06, "loss": 0.0174, "step": 14825 }, { "epoch": 4.63, "grad_norm": 0.7780206203460693, "learning_rate": 9.565709702031773e-06, "loss": 0.0176, "step": 14830 }, { "epoch": 4.63, "grad_norm": 0.5674579739570618, "learning_rate": 9.560270956426023e-06, "loss": 0.0157, "step": 14835 }, { "epoch": 4.63, "grad_norm": 0.5733282566070557, "learning_rate": 9.554832341140801e-06, "loss": 0.0183, "step": 14840 }, { "epoch": 4.63, "grad_norm": 0.320640504360199, "learning_rate": 9.54939385778793e-06, "loss": 0.0144, "step": 14845 }, { "epoch": 4.63, "grad_norm": 0.533781111240387, "learning_rate": 9.543955507979185e-06, "loss": 0.0188, "step": 14850 }, { "epoch": 4.63, "grad_norm": 0.8794890642166138, "learning_rate": 9.538517293326307e-06, "loss": 0.0153, "step": 14855 }, { "epoch": 4.63, "grad_norm": 0.38584229350090027, "learning_rate": 9.533079215440995e-06, "loss": 0.0128, "step": 14860 }, { "epoch": 4.64, "grad_norm": 0.48441481590270996, "learning_rate": 9.527641275934908e-06, "loss": 0.0142, "step": 14865 }, { "epoch": 4.64, "grad_norm": 0.4725772738456726, "learning_rate": 9.52220347641967e-06, "loss": 0.0124, "step": 14870 }, { "epoch": 4.64, "grad_norm": 0.5872272253036499, "learning_rate": 9.516765818506844e-06, "loss": 0.0118, "step": 14875 }, { "epoch": 4.64, "grad_norm": 0.7887629866600037, "learning_rate": 9.511328303807977e-06, "loss": 0.0136, "step": 14880 }, { "epoch": 4.64, "grad_norm": 0.5740352869033813, "learning_rate": 9.505890933934553e-06, "loss": 0.0118, "step": 14885 }, { "epoch": 4.64, "grad_norm": 0.689967691898346, "learning_rate": 9.500453710498024e-06, "loss": 0.0135, "step": 14890 }, { "epoch": 4.65, "grad_norm": 0.4478163421154022, "learning_rate": 9.495016635109795e-06, "loss": 0.0113, "step": 14895 }, { "epoch": 4.65, "grad_norm": 0.653808057308197, "learning_rate": 9.489579709381229e-06, "loss": 0.0167, "step": 14900 }, { "epoch": 4.65, "grad_norm": 0.3178090453147888, "learning_rate": 9.484142934923645e-06, "loss": 0.0112, "step": 14905 }, { "epoch": 4.65, "grad_norm": 0.8894709348678589, "learning_rate": 9.47870631334831e-06, "loss": 0.0202, "step": 14910 }, { "epoch": 4.65, "grad_norm": 0.4782661497592926, "learning_rate": 9.473269846266453e-06, "loss": 0.0131, "step": 14915 }, { "epoch": 4.65, "grad_norm": 0.8897411227226257, "learning_rate": 9.467833535289257e-06, "loss": 0.0137, "step": 14920 }, { "epoch": 4.66, "grad_norm": 0.7093350291252136, "learning_rate": 9.462397382027857e-06, "loss": 0.0151, "step": 14925 }, { "epoch": 4.66, "grad_norm": 0.6557932496070862, "learning_rate": 9.456961388093343e-06, "loss": 0.0102, "step": 14930 }, { "epoch": 4.66, "grad_norm": 0.48611825704574585, "learning_rate": 9.451525555096753e-06, "loss": 0.0122, "step": 14935 }, { "epoch": 4.66, "grad_norm": 0.5567466616630554, "learning_rate": 9.446089884649085e-06, "loss": 0.0171, "step": 14940 }, { "epoch": 4.66, "grad_norm": 0.3401321470737457, "learning_rate": 9.44065437836128e-06, "loss": 0.0137, "step": 14945 }, { "epoch": 4.66, "grad_norm": 0.40684616565704346, "learning_rate": 9.435219037844234e-06, "loss": 0.0118, "step": 14950 }, { "epoch": 4.66, "grad_norm": 0.8801999688148499, "learning_rate": 9.429783864708798e-06, "loss": 0.018, "step": 14955 }, { "epoch": 4.67, "grad_norm": 0.6474644541740417, "learning_rate": 9.42434886056577e-06, "loss": 0.0166, "step": 14960 }, { "epoch": 4.67, "grad_norm": 0.8113958835601807, "learning_rate": 9.418914027025891e-06, "loss": 0.0158, "step": 14965 }, { "epoch": 4.67, "grad_norm": 0.44780680537223816, "learning_rate": 9.413479365699868e-06, "loss": 0.0163, "step": 14970 }, { "epoch": 4.67, "grad_norm": 0.5623879432678223, "learning_rate": 9.408044878198343e-06, "loss": 0.014, "step": 14975 }, { "epoch": 4.67, "grad_norm": 0.5711343288421631, "learning_rate": 9.402610566131912e-06, "loss": 0.0155, "step": 14980 }, { "epoch": 4.67, "grad_norm": 0.5924063324928284, "learning_rate": 9.397176431111115e-06, "loss": 0.0153, "step": 14985 }, { "epoch": 4.68, "grad_norm": 0.7692981958389282, "learning_rate": 9.391742474746444e-06, "loss": 0.0155, "step": 14990 }, { "epoch": 4.68, "grad_norm": 0.7145710587501526, "learning_rate": 9.386308698648337e-06, "loss": 0.0171, "step": 14995 }, { "epoch": 4.68, "grad_norm": 0.9549365639686584, "learning_rate": 9.380875104427175e-06, "loss": 0.0167, "step": 15000 }, { "epoch": 4.68, "grad_norm": 0.5520309209823608, "learning_rate": 9.375441693693291e-06, "loss": 0.0128, "step": 15005 }, { "epoch": 4.68, "grad_norm": 0.5013812780380249, "learning_rate": 9.37000846805696e-06, "loss": 0.0114, "step": 15010 }, { "epoch": 4.68, "grad_norm": 0.8319398164749146, "learning_rate": 9.364575429128407e-06, "loss": 0.0156, "step": 15015 }, { "epoch": 4.68, "grad_norm": 0.48659223318099976, "learning_rate": 9.359142578517787e-06, "loss": 0.014, "step": 15020 }, { "epoch": 4.69, "grad_norm": 0.5164733529090881, "learning_rate": 9.353709917835218e-06, "loss": 0.0164, "step": 15025 }, { "epoch": 4.69, "grad_norm": 0.4213215112686157, "learning_rate": 9.34827744869075e-06, "loss": 0.0124, "step": 15030 }, { "epoch": 4.69, "grad_norm": 0.3633331060409546, "learning_rate": 9.342845172694383e-06, "loss": 0.012, "step": 15035 }, { "epoch": 4.69, "grad_norm": 0.576378345489502, "learning_rate": 9.337413091456052e-06, "loss": 0.0174, "step": 15040 }, { "epoch": 4.69, "grad_norm": 0.42284902930259705, "learning_rate": 9.331981206585642e-06, "loss": 0.021, "step": 15045 }, { "epoch": 4.69, "grad_norm": 0.773980438709259, "learning_rate": 9.326549519692977e-06, "loss": 0.0167, "step": 15050 }, { "epoch": 4.7, "grad_norm": 0.44876566529273987, "learning_rate": 9.321118032387816e-06, "loss": 0.0152, "step": 15055 }, { "epoch": 4.7, "grad_norm": 0.48520439863204956, "learning_rate": 9.31568674627987e-06, "loss": 0.0151, "step": 15060 }, { "epoch": 4.7, "grad_norm": 0.7521637678146362, "learning_rate": 9.310255662978783e-06, "loss": 0.0148, "step": 15065 }, { "epoch": 4.7, "grad_norm": 0.6264703869819641, "learning_rate": 9.30482478409414e-06, "loss": 0.0158, "step": 15070 }, { "epoch": 4.7, "grad_norm": 0.5164471864700317, "learning_rate": 9.29939411123547e-06, "loss": 0.0123, "step": 15075 }, { "epoch": 4.7, "grad_norm": 0.4082270562648773, "learning_rate": 9.293963646012234e-06, "loss": 0.0101, "step": 15080 }, { "epoch": 4.7, "grad_norm": 1.352462887763977, "learning_rate": 9.288533390033838e-06, "loss": 0.0215, "step": 15085 }, { "epoch": 4.71, "grad_norm": 0.31743505597114563, "learning_rate": 9.283103344909618e-06, "loss": 0.0114, "step": 15090 }, { "epoch": 4.71, "grad_norm": 0.6116018891334534, "learning_rate": 9.277673512248855e-06, "loss": 0.0151, "step": 15095 }, { "epoch": 4.71, "grad_norm": 0.5463698506355286, "learning_rate": 9.272243893660763e-06, "loss": 0.0124, "step": 15100 }, { "epoch": 4.71, "grad_norm": 0.6346951723098755, "learning_rate": 9.266814490754494e-06, "loss": 0.0182, "step": 15105 }, { "epoch": 4.71, "grad_norm": 0.543600857257843, "learning_rate": 9.261385305139135e-06, "loss": 0.0122, "step": 15110 }, { "epoch": 4.71, "grad_norm": 0.7223466634750366, "learning_rate": 9.255956338423714e-06, "loss": 0.0171, "step": 15115 }, { "epoch": 4.72, "grad_norm": 0.7255980372428894, "learning_rate": 9.250527592217183e-06, "loss": 0.0158, "step": 15120 }, { "epoch": 4.72, "grad_norm": 0.43430185317993164, "learning_rate": 9.245099068128443e-06, "loss": 0.0161, "step": 15125 }, { "epoch": 4.72, "grad_norm": 0.42145824432373047, "learning_rate": 9.239670767766315e-06, "loss": 0.0123, "step": 15130 }, { "epoch": 4.72, "grad_norm": 0.3683522939682007, "learning_rate": 9.23424269273956e-06, "loss": 0.0122, "step": 15135 }, { "epoch": 4.72, "grad_norm": 0.5747255682945251, "learning_rate": 9.228814844656878e-06, "loss": 0.0147, "step": 15140 }, { "epoch": 4.72, "grad_norm": 0.7251515984535217, "learning_rate": 9.22338722512689e-06, "loss": 0.0146, "step": 15145 }, { "epoch": 4.73, "grad_norm": 0.39844369888305664, "learning_rate": 9.217959835758156e-06, "loss": 0.0143, "step": 15150 }, { "epoch": 4.73, "grad_norm": 0.5769985318183899, "learning_rate": 9.212532678159173e-06, "loss": 0.0126, "step": 15155 }, { "epoch": 4.73, "grad_norm": 0.45943641662597656, "learning_rate": 9.207105753938362e-06, "loss": 0.0206, "step": 15160 }, { "epoch": 4.73, "grad_norm": 0.6093705296516418, "learning_rate": 9.201679064704067e-06, "loss": 0.0114, "step": 15165 }, { "epoch": 4.73, "grad_norm": 0.801814079284668, "learning_rate": 9.196252612064582e-06, "loss": 0.0133, "step": 15170 }, { "epoch": 4.73, "grad_norm": 0.41251370310783386, "learning_rate": 9.190826397628116e-06, "loss": 0.0133, "step": 15175 }, { "epoch": 4.73, "grad_norm": 0.6013932228088379, "learning_rate": 9.185400423002813e-06, "loss": 0.0183, "step": 15180 }, { "epoch": 4.74, "grad_norm": 0.7565571665763855, "learning_rate": 9.179974689796743e-06, "loss": 0.0185, "step": 15185 }, { "epoch": 4.74, "grad_norm": 0.7211235761642456, "learning_rate": 9.17454919961791e-06, "loss": 0.016, "step": 15190 }, { "epoch": 4.74, "grad_norm": 0.6997960209846497, "learning_rate": 9.169123954074242e-06, "loss": 0.0141, "step": 15195 }, { "epoch": 4.74, "grad_norm": 0.4590340554714203, "learning_rate": 9.16369895477359e-06, "loss": 0.0119, "step": 15200 }, { "epoch": 4.74, "grad_norm": 0.5132920145988464, "learning_rate": 9.158274203323739e-06, "loss": 0.0163, "step": 15205 }, { "epoch": 4.74, "grad_norm": 0.590715765953064, "learning_rate": 9.152849701332398e-06, "loss": 0.0205, "step": 15210 }, { "epoch": 4.75, "grad_norm": 0.45706048607826233, "learning_rate": 9.147425450407205e-06, "loss": 0.0119, "step": 15215 }, { "epoch": 4.75, "grad_norm": 0.4636910557746887, "learning_rate": 9.14200145215572e-06, "loss": 0.0148, "step": 15220 }, { "epoch": 4.75, "grad_norm": 0.9079941511154175, "learning_rate": 9.136577708185428e-06, "loss": 0.014, "step": 15225 }, { "epoch": 4.75, "grad_norm": 0.912060022354126, "learning_rate": 9.131154220103743e-06, "loss": 0.009, "step": 15230 }, { "epoch": 4.75, "grad_norm": 0.35609185695648193, "learning_rate": 9.125730989517994e-06, "loss": 0.0142, "step": 15235 }, { "epoch": 4.75, "grad_norm": 0.6244210600852966, "learning_rate": 9.120308018035444e-06, "loss": 0.0134, "step": 15240 }, { "epoch": 4.75, "grad_norm": 0.602467954158783, "learning_rate": 9.114885307263272e-06, "loss": 0.015, "step": 15245 }, { "epoch": 4.76, "grad_norm": 0.6154845952987671, "learning_rate": 9.109462858808586e-06, "loss": 0.0184, "step": 15250 }, { "epoch": 4.76, "grad_norm": 0.5339671969413757, "learning_rate": 9.104040674278408e-06, "loss": 0.0133, "step": 15255 }, { "epoch": 4.76, "grad_norm": 0.6302767395973206, "learning_rate": 9.098618755279693e-06, "loss": 0.0169, "step": 15260 }, { "epoch": 4.76, "grad_norm": 0.762374758720398, "learning_rate": 9.093197103419309e-06, "loss": 0.0131, "step": 15265 }, { "epoch": 4.76, "grad_norm": 0.5994053483009338, "learning_rate": 9.087775720304041e-06, "loss": 0.0152, "step": 15270 }, { "epoch": 4.76, "grad_norm": 0.46510350704193115, "learning_rate": 9.082354607540606e-06, "loss": 0.0112, "step": 15275 }, { "epoch": 4.77, "grad_norm": 0.32651185989379883, "learning_rate": 9.076933766735631e-06, "loss": 0.0123, "step": 15280 }, { "epoch": 4.77, "grad_norm": 0.8065100908279419, "learning_rate": 9.071513199495671e-06, "loss": 0.0155, "step": 15285 }, { "epoch": 4.77, "grad_norm": 0.653430700302124, "learning_rate": 9.066092907427194e-06, "loss": 0.0199, "step": 15290 }, { "epoch": 4.77, "grad_norm": 0.4591868221759796, "learning_rate": 9.06067289213658e-06, "loss": 0.0131, "step": 15295 }, { "epoch": 4.77, "grad_norm": 0.4707903563976288, "learning_rate": 9.055253155230147e-06, "loss": 0.0156, "step": 15300 }, { "epoch": 4.77, "grad_norm": 0.5545510649681091, "learning_rate": 9.049833698314116e-06, "loss": 0.012, "step": 15305 }, { "epoch": 4.78, "grad_norm": 0.5353935956954956, "learning_rate": 9.044414522994618e-06, "loss": 0.0197, "step": 15310 }, { "epoch": 4.78, "grad_norm": 0.4159187078475952, "learning_rate": 9.038995630877716e-06, "loss": 0.0149, "step": 15315 }, { "epoch": 4.78, "grad_norm": 0.5113319754600525, "learning_rate": 9.033577023569383e-06, "loss": 0.0129, "step": 15320 }, { "epoch": 4.78, "grad_norm": 0.6067572832107544, "learning_rate": 9.028158702675505e-06, "loss": 0.0192, "step": 15325 }, { "epoch": 4.78, "grad_norm": 0.422309011220932, "learning_rate": 9.022740669801888e-06, "loss": 0.0153, "step": 15330 }, { "epoch": 4.78, "grad_norm": 0.5361863970756531, "learning_rate": 9.017322926554251e-06, "loss": 0.015, "step": 15335 }, { "epoch": 4.78, "grad_norm": 0.28277361392974854, "learning_rate": 9.011905474538224e-06, "loss": 0.0131, "step": 15340 }, { "epoch": 4.79, "grad_norm": 0.9252229332923889, "learning_rate": 9.006488315359355e-06, "loss": 0.0146, "step": 15345 }, { "epoch": 4.79, "grad_norm": 0.7406449317932129, "learning_rate": 9.0010714506231e-06, "loss": 0.0134, "step": 15350 }, { "epoch": 4.79, "grad_norm": 0.6453289985656738, "learning_rate": 8.995654881934832e-06, "loss": 0.0132, "step": 15355 }, { "epoch": 4.79, "grad_norm": 0.7026208639144897, "learning_rate": 8.990238610899838e-06, "loss": 0.0181, "step": 15360 }, { "epoch": 4.79, "grad_norm": 0.6112136244773865, "learning_rate": 8.98482263912331e-06, "loss": 0.0123, "step": 15365 }, { "epoch": 4.79, "grad_norm": 0.5226003527641296, "learning_rate": 8.97940696821036e-06, "loss": 0.0135, "step": 15370 }, { "epoch": 4.8, "grad_norm": 0.41499006748199463, "learning_rate": 8.973991599766005e-06, "loss": 0.0174, "step": 15375 }, { "epoch": 4.8, "grad_norm": 0.7187453508377075, "learning_rate": 8.968576535395172e-06, "loss": 0.0132, "step": 15380 }, { "epoch": 4.8, "grad_norm": 0.5334180593490601, "learning_rate": 8.963161776702698e-06, "loss": 0.0163, "step": 15385 }, { "epoch": 4.8, "grad_norm": 0.5039098858833313, "learning_rate": 8.957747325293333e-06, "loss": 0.0167, "step": 15390 }, { "epoch": 4.8, "grad_norm": 0.9802941083908081, "learning_rate": 8.952333182771734e-06, "loss": 0.0129, "step": 15395 }, { "epoch": 4.8, "grad_norm": 1.0111184120178223, "learning_rate": 8.946919350742462e-06, "loss": 0.0111, "step": 15400 }, { "epoch": 4.8, "grad_norm": 0.33556801080703735, "learning_rate": 8.941505830809996e-06, "loss": 0.01, "step": 15405 }, { "epoch": 4.81, "grad_norm": 0.4566008746623993, "learning_rate": 8.93609262457872e-06, "loss": 0.0147, "step": 15410 }, { "epoch": 4.81, "grad_norm": 0.6226485371589661, "learning_rate": 8.930679733652908e-06, "loss": 0.0123, "step": 15415 }, { "epoch": 4.81, "grad_norm": 0.47943395376205444, "learning_rate": 8.925267159636766e-06, "loss": 0.0138, "step": 15420 }, { "epoch": 4.81, "grad_norm": 0.7245271801948547, "learning_rate": 8.91985490413439e-06, "loss": 0.0179, "step": 15425 }, { "epoch": 4.81, "grad_norm": 0.3484051525592804, "learning_rate": 8.914442968749787e-06, "loss": 0.0144, "step": 15430 }, { "epoch": 4.81, "grad_norm": 0.6594104766845703, "learning_rate": 8.909031355086868e-06, "loss": 0.0169, "step": 15435 }, { "epoch": 4.82, "grad_norm": 0.19373483955860138, "learning_rate": 8.903620064749447e-06, "loss": 0.0121, "step": 15440 }, { "epoch": 4.82, "grad_norm": 0.6843541860580444, "learning_rate": 8.898209099341246e-06, "loss": 0.0151, "step": 15445 }, { "epoch": 4.82, "grad_norm": 0.30797871947288513, "learning_rate": 8.892798460465894e-06, "loss": 0.0097, "step": 15450 }, { "epoch": 4.82, "grad_norm": 0.7222657799720764, "learning_rate": 8.887388149726907e-06, "loss": 0.0144, "step": 15455 }, { "epoch": 4.82, "grad_norm": 0.6683160662651062, "learning_rate": 8.881978168727717e-06, "loss": 0.0116, "step": 15460 }, { "epoch": 4.82, "grad_norm": 0.3965126872062683, "learning_rate": 8.876568519071661e-06, "loss": 0.0117, "step": 15465 }, { "epoch": 4.83, "grad_norm": 0.47046273946762085, "learning_rate": 8.87115920236197e-06, "loss": 0.0136, "step": 15470 }, { "epoch": 4.83, "grad_norm": 0.7667253613471985, "learning_rate": 8.86575022020178e-06, "loss": 0.0152, "step": 15475 }, { "epoch": 4.83, "grad_norm": 0.9530908465385437, "learning_rate": 8.860341574194125e-06, "loss": 0.0174, "step": 15480 }, { "epoch": 4.83, "grad_norm": 0.7011138796806335, "learning_rate": 8.854933265941946e-06, "loss": 0.0201, "step": 15485 }, { "epoch": 4.83, "grad_norm": 0.6812859773635864, "learning_rate": 8.849525297048072e-06, "loss": 0.0135, "step": 15490 }, { "epoch": 4.83, "grad_norm": 0.7322203516960144, "learning_rate": 8.844117669115242e-06, "loss": 0.0143, "step": 15495 }, { "epoch": 4.83, "grad_norm": 0.5292491912841797, "learning_rate": 8.83871038374609e-06, "loss": 0.0162, "step": 15500 }, { "epoch": 4.84, "grad_norm": 0.4952675700187683, "learning_rate": 8.833303442543148e-06, "loss": 0.0111, "step": 15505 }, { "epoch": 4.84, "grad_norm": 0.770052969455719, "learning_rate": 8.827896847108851e-06, "loss": 0.0154, "step": 15510 }, { "epoch": 4.84, "grad_norm": 0.6179236769676208, "learning_rate": 8.822490599045524e-06, "loss": 0.013, "step": 15515 }, { "epoch": 4.84, "grad_norm": 0.39276185631752014, "learning_rate": 8.817084699955397e-06, "loss": 0.0131, "step": 15520 }, { "epoch": 4.84, "grad_norm": 0.6833389401435852, "learning_rate": 8.811679151440585e-06, "loss": 0.0162, "step": 15525 }, { "epoch": 4.84, "grad_norm": 0.504504382610321, "learning_rate": 8.806273955103111e-06, "loss": 0.0147, "step": 15530 }, { "epoch": 4.85, "grad_norm": 0.38780394196510315, "learning_rate": 8.800869112544887e-06, "loss": 0.0119, "step": 15535 }, { "epoch": 4.85, "grad_norm": 0.18057525157928467, "learning_rate": 8.795464625367724e-06, "loss": 0.013, "step": 15540 }, { "epoch": 4.85, "grad_norm": 0.5500834584236145, "learning_rate": 8.790060495173321e-06, "loss": 0.0167, "step": 15545 }, { "epoch": 4.85, "grad_norm": 0.708239734172821, "learning_rate": 8.784656723563283e-06, "loss": 0.0158, "step": 15550 }, { "epoch": 4.85, "grad_norm": 0.4792191684246063, "learning_rate": 8.779253312139102e-06, "loss": 0.017, "step": 15555 }, { "epoch": 4.85, "grad_norm": 0.517033576965332, "learning_rate": 8.773850262502152e-06, "loss": 0.0129, "step": 15560 }, { "epoch": 4.85, "grad_norm": 0.6025055646896362, "learning_rate": 8.76844757625372e-06, "loss": 0.0144, "step": 15565 }, { "epoch": 4.86, "grad_norm": 0.868748664855957, "learning_rate": 8.763045254994973e-06, "loss": 0.0184, "step": 15570 }, { "epoch": 4.86, "grad_norm": 0.5168037414550781, "learning_rate": 8.757643300326975e-06, "loss": 0.0128, "step": 15575 }, { "epoch": 4.86, "grad_norm": 0.6352051496505737, "learning_rate": 8.752241713850675e-06, "loss": 0.0196, "step": 15580 }, { "epoch": 4.86, "grad_norm": 0.451431006193161, "learning_rate": 8.74684049716692e-06, "loss": 0.0122, "step": 15585 }, { "epoch": 4.86, "grad_norm": 0.6849236488342285, "learning_rate": 8.741439651876449e-06, "loss": 0.0191, "step": 15590 }, { "epoch": 4.86, "grad_norm": 0.7148830890655518, "learning_rate": 8.736039179579876e-06, "loss": 0.0154, "step": 15595 }, { "epoch": 4.87, "grad_norm": 0.5371099710464478, "learning_rate": 8.730639081877722e-06, "loss": 0.015, "step": 15600 }, { "epoch": 4.87, "grad_norm": 0.5085660219192505, "learning_rate": 8.725239360370385e-06, "loss": 0.0134, "step": 15605 }, { "epoch": 4.87, "grad_norm": 1.143683671951294, "learning_rate": 8.719840016658162e-06, "loss": 0.0181, "step": 15610 }, { "epoch": 4.87, "grad_norm": 0.539717435836792, "learning_rate": 8.71444105234123e-06, "loss": 0.011, "step": 15615 }, { "epoch": 4.87, "grad_norm": 0.49247002601623535, "learning_rate": 8.709042469019655e-06, "loss": 0.014, "step": 15620 }, { "epoch": 4.87, "grad_norm": 0.8581780195236206, "learning_rate": 8.703644268293391e-06, "loss": 0.0174, "step": 15625 }, { "epoch": 4.87, "grad_norm": 0.5282270312309265, "learning_rate": 8.698246451762283e-06, "loss": 0.0144, "step": 15630 }, { "epoch": 4.88, "grad_norm": 0.8629797697067261, "learning_rate": 8.692849021026051e-06, "loss": 0.009, "step": 15635 }, { "epoch": 4.88, "grad_norm": 0.8482663035392761, "learning_rate": 8.68745197768431e-06, "loss": 0.0156, "step": 15640 }, { "epoch": 4.88, "grad_norm": 0.4563334286212921, "learning_rate": 8.682055323336557e-06, "loss": 0.0098, "step": 15645 }, { "epoch": 4.88, "grad_norm": 0.5153648853302002, "learning_rate": 8.676659059582177e-06, "loss": 0.0128, "step": 15650 }, { "epoch": 4.88, "grad_norm": 0.524944543838501, "learning_rate": 8.671263188020433e-06, "loss": 0.0151, "step": 15655 }, { "epoch": 4.88, "grad_norm": 0.4763322174549103, "learning_rate": 8.66586771025048e-06, "loss": 0.0139, "step": 15660 }, { "epoch": 4.89, "grad_norm": 0.3153722882270813, "learning_rate": 8.66047262787135e-06, "loss": 0.0118, "step": 15665 }, { "epoch": 4.89, "grad_norm": 0.5332691073417664, "learning_rate": 8.655077942481956e-06, "loss": 0.0115, "step": 15670 }, { "epoch": 4.89, "grad_norm": 0.7228777408599854, "learning_rate": 8.649683655681098e-06, "loss": 0.0178, "step": 15675 }, { "epoch": 4.89, "grad_norm": 0.6176360249519348, "learning_rate": 8.644289769067459e-06, "loss": 0.0122, "step": 15680 }, { "epoch": 4.89, "grad_norm": 0.5154833197593689, "learning_rate": 8.6388962842396e-06, "loss": 0.0131, "step": 15685 }, { "epoch": 4.89, "grad_norm": 0.4984290301799774, "learning_rate": 8.63350320279596e-06, "loss": 0.0138, "step": 15690 }, { "epoch": 4.9, "grad_norm": 0.4621444046497345, "learning_rate": 8.62811052633487e-06, "loss": 0.0147, "step": 15695 }, { "epoch": 4.9, "grad_norm": 0.476084440946579, "learning_rate": 8.622718256454535e-06, "loss": 0.0144, "step": 15700 }, { "epoch": 4.9, "grad_norm": 0.3840945363044739, "learning_rate": 8.617326394753023e-06, "loss": 0.0155, "step": 15705 }, { "epoch": 4.9, "grad_norm": 0.40964987874031067, "learning_rate": 8.611934942828309e-06, "loss": 0.0167, "step": 15710 }, { "epoch": 4.9, "grad_norm": 0.9059739708900452, "learning_rate": 8.60654390227823e-06, "loss": 0.0193, "step": 15715 }, { "epoch": 4.9, "grad_norm": 0.5482441186904907, "learning_rate": 8.601153274700504e-06, "loss": 0.0132, "step": 15720 }, { "epoch": 4.9, "grad_norm": 0.5010052919387817, "learning_rate": 8.595763061692729e-06, "loss": 0.0167, "step": 15725 }, { "epoch": 4.91, "grad_norm": 0.5105224847793579, "learning_rate": 8.590373264852376e-06, "loss": 0.0143, "step": 15730 }, { "epoch": 4.91, "grad_norm": 0.4789036512374878, "learning_rate": 8.5849838857768e-06, "loss": 0.0167, "step": 15735 }, { "epoch": 4.91, "grad_norm": 0.28978851437568665, "learning_rate": 8.579594926063219e-06, "loss": 0.0093, "step": 15740 }, { "epoch": 4.91, "grad_norm": 0.5114542245864868, "learning_rate": 8.57420638730874e-06, "loss": 0.0107, "step": 15745 }, { "epoch": 4.91, "grad_norm": 0.5879108309745789, "learning_rate": 8.568818271110338e-06, "loss": 0.0133, "step": 15750 }, { "epoch": 4.91, "grad_norm": 0.5466969013214111, "learning_rate": 8.563430579064866e-06, "loss": 0.015, "step": 15755 }, { "epoch": 4.92, "grad_norm": 0.5947604179382324, "learning_rate": 8.558043312769053e-06, "loss": 0.0143, "step": 15760 }, { "epoch": 4.92, "grad_norm": 0.5738160014152527, "learning_rate": 8.552656473819495e-06, "loss": 0.0182, "step": 15765 }, { "epoch": 4.92, "grad_norm": 0.31728726625442505, "learning_rate": 8.547270063812669e-06, "loss": 0.0096, "step": 15770 }, { "epoch": 4.92, "grad_norm": 0.4764464497566223, "learning_rate": 8.541884084344918e-06, "loss": 0.0186, "step": 15775 }, { "epoch": 4.92, "grad_norm": 0.38102155923843384, "learning_rate": 8.536498537012461e-06, "loss": 0.0113, "step": 15780 }, { "epoch": 4.92, "grad_norm": 0.42733028531074524, "learning_rate": 8.531113423411388e-06, "loss": 0.0103, "step": 15785 }, { "epoch": 4.92, "grad_norm": 0.31331437826156616, "learning_rate": 8.525728745137662e-06, "loss": 0.0093, "step": 15790 }, { "epoch": 4.93, "grad_norm": 0.3956007957458496, "learning_rate": 8.520344503787119e-06, "loss": 0.0126, "step": 15795 }, { "epoch": 4.93, "grad_norm": 0.46345624327659607, "learning_rate": 8.514960700955457e-06, "loss": 0.0116, "step": 15800 }, { "epoch": 4.93, "grad_norm": 0.4458674192428589, "learning_rate": 8.509577338238255e-06, "loss": 0.0129, "step": 15805 }, { "epoch": 4.93, "grad_norm": 0.5788841843605042, "learning_rate": 8.504194417230955e-06, "loss": 0.0106, "step": 15810 }, { "epoch": 4.93, "grad_norm": 0.527131974697113, "learning_rate": 8.498811939528866e-06, "loss": 0.0156, "step": 15815 }, { "epoch": 4.93, "grad_norm": 0.8801189064979553, "learning_rate": 8.49342990672717e-06, "loss": 0.0135, "step": 15820 }, { "epoch": 4.94, "grad_norm": 0.5658692121505737, "learning_rate": 8.48804832042092e-06, "loss": 0.0155, "step": 15825 }, { "epoch": 4.94, "grad_norm": 0.5295402407646179, "learning_rate": 8.482667182205024e-06, "loss": 0.0127, "step": 15830 }, { "epoch": 4.94, "grad_norm": 0.5799888968467712, "learning_rate": 8.477286493674272e-06, "loss": 0.0153, "step": 15835 }, { "epoch": 4.94, "grad_norm": 0.4363563060760498, "learning_rate": 8.471906256423315e-06, "loss": 0.0123, "step": 15840 }, { "epoch": 4.94, "grad_norm": 0.8872877359390259, "learning_rate": 8.466526472046675e-06, "loss": 0.0085, "step": 15845 }, { "epoch": 4.94, "grad_norm": 0.5638665556907654, "learning_rate": 8.46114714213872e-06, "loss": 0.0132, "step": 15850 }, { "epoch": 4.95, "grad_norm": 0.42812150716781616, "learning_rate": 8.455768268293709e-06, "loss": 0.0082, "step": 15855 }, { "epoch": 4.95, "grad_norm": 1.0099304914474487, "learning_rate": 8.45038985210575e-06, "loss": 0.024, "step": 15860 }, { "epoch": 4.95, "grad_norm": 0.5413514971733093, "learning_rate": 8.445011895168826e-06, "loss": 0.0207, "step": 15865 }, { "epoch": 4.95, "grad_norm": 0.6191872358322144, "learning_rate": 8.439634399076772e-06, "loss": 0.0148, "step": 15870 }, { "epoch": 4.95, "grad_norm": 0.48538586497306824, "learning_rate": 8.434257365423296e-06, "loss": 0.0151, "step": 15875 }, { "epoch": 4.95, "grad_norm": 0.42932063341140747, "learning_rate": 8.428880795801965e-06, "loss": 0.0157, "step": 15880 }, { "epoch": 4.95, "grad_norm": 0.3916996121406555, "learning_rate": 8.423504691806209e-06, "loss": 0.0109, "step": 15885 }, { "epoch": 4.96, "grad_norm": 0.24138857424259186, "learning_rate": 8.418129055029316e-06, "loss": 0.0124, "step": 15890 }, { "epoch": 4.96, "grad_norm": 0.2233244627714157, "learning_rate": 8.412753887064441e-06, "loss": 0.0084, "step": 15895 }, { "epoch": 4.96, "grad_norm": 0.5459782481193542, "learning_rate": 8.407379189504602e-06, "loss": 0.0167, "step": 15900 }, { "epoch": 4.96, "grad_norm": 0.5292870402336121, "learning_rate": 8.402004963942672e-06, "loss": 0.0161, "step": 15905 }, { "epoch": 4.96, "grad_norm": 0.5536977648735046, "learning_rate": 8.396631211971386e-06, "loss": 0.0138, "step": 15910 }, { "epoch": 4.96, "grad_norm": 0.6610172986984253, "learning_rate": 8.391257935183342e-06, "loss": 0.0172, "step": 15915 }, { "epoch": 4.97, "grad_norm": 0.5862566232681274, "learning_rate": 8.38588513517099e-06, "loss": 0.0158, "step": 15920 }, { "epoch": 4.97, "grad_norm": 1.3602510690689087, "learning_rate": 8.380512813526641e-06, "loss": 0.0151, "step": 15925 }, { "epoch": 4.97, "grad_norm": 0.3232061266899109, "learning_rate": 8.375140971842471e-06, "loss": 0.0097, "step": 15930 }, { "epoch": 4.97, "grad_norm": 0.41499772667884827, "learning_rate": 8.369769611710504e-06, "loss": 0.0144, "step": 15935 }, { "epoch": 4.97, "grad_norm": 0.504601001739502, "learning_rate": 8.36439873472263e-06, "loss": 0.0119, "step": 15940 }, { "epoch": 4.97, "grad_norm": 0.5243173837661743, "learning_rate": 8.35902834247059e-06, "loss": 0.0126, "step": 15945 }, { "epoch": 4.97, "grad_norm": 0.8670143485069275, "learning_rate": 8.353658436545983e-06, "loss": 0.0158, "step": 15950 }, { "epoch": 4.98, "grad_norm": 0.7447572946548462, "learning_rate": 8.348289018540268e-06, "loss": 0.0206, "step": 15955 }, { "epoch": 4.98, "grad_norm": 0.4693433344364166, "learning_rate": 8.342920090044748e-06, "loss": 0.012, "step": 15960 }, { "epoch": 4.98, "grad_norm": 0.7901753783226013, "learning_rate": 8.337551652650592e-06, "loss": 0.0199, "step": 15965 }, { "epoch": 4.98, "grad_norm": 0.5879650712013245, "learning_rate": 8.332183707948822e-06, "loss": 0.0142, "step": 15970 }, { "epoch": 4.98, "grad_norm": 0.5871224403381348, "learning_rate": 8.326816257530308e-06, "loss": 0.0129, "step": 15975 }, { "epoch": 4.98, "grad_norm": 0.6463751792907715, "learning_rate": 8.32144930298578e-06, "loss": 0.0136, "step": 15980 }, { "epoch": 4.99, "grad_norm": 0.4785372316837311, "learning_rate": 8.316082845905815e-06, "loss": 0.0163, "step": 15985 }, { "epoch": 4.99, "grad_norm": 0.4490886330604553, "learning_rate": 8.310716887880856e-06, "loss": 0.0157, "step": 15990 }, { "epoch": 4.99, "grad_norm": 0.9457346796989441, "learning_rate": 8.305351430501171e-06, "loss": 0.0127, "step": 15995 }, { "epoch": 4.99, "grad_norm": 0.34795841574668884, "learning_rate": 8.29998647535691e-06, "loss": 0.0121, "step": 16000 }, { "epoch": 4.99, "grad_norm": 0.3530838191509247, "learning_rate": 8.294622024038055e-06, "loss": 0.0157, "step": 16005 }, { "epoch": 4.99, "grad_norm": 0.6324779391288757, "learning_rate": 8.289258078134446e-06, "loss": 0.014, "step": 16010 }, { "epoch": 5.0, "grad_norm": 0.674531102180481, "learning_rate": 8.283894639235773e-06, "loss": 0.0165, "step": 16015 }, { "epoch": 5.0, "grad_norm": 0.5302462577819824, "learning_rate": 8.278531708931573e-06, "loss": 0.0169, "step": 16020 }, { "epoch": 5.0, "grad_norm": 0.5076550245285034, "learning_rate": 8.273169288811236e-06, "loss": 0.0164, "step": 16025 }, { "epoch": 5.0, "grad_norm": 0.5246521234512329, "learning_rate": 8.267807380463993e-06, "loss": 0.012, "step": 16030 }, { "epoch": 5.0, "grad_norm": 0.37405821681022644, "learning_rate": 8.262445985478933e-06, "loss": 0.0099, "step": 16035 }, { "epoch": 5.0, "grad_norm": 0.4800686836242676, "learning_rate": 8.257085105444985e-06, "loss": 0.0066, "step": 16040 }, { "epoch": 5.0, "grad_norm": 0.4580146372318268, "learning_rate": 8.251724741950934e-06, "loss": 0.0065, "step": 16045 }, { "epoch": 5.01, "grad_norm": 0.4707932770252228, "learning_rate": 8.246364896585404e-06, "loss": 0.0094, "step": 16050 }, { "epoch": 5.01, "grad_norm": 0.23127494752407074, "learning_rate": 8.241005570936871e-06, "loss": 0.0072, "step": 16055 }, { "epoch": 5.01, "grad_norm": 0.7896751761436462, "learning_rate": 8.235646766593654e-06, "loss": 0.0086, "step": 16060 }, { "epoch": 5.01, "grad_norm": 0.3771972358226776, "learning_rate": 8.230288485143913e-06, "loss": 0.0096, "step": 16065 }, { "epoch": 5.01, "grad_norm": 0.31021884083747864, "learning_rate": 8.224930728175663e-06, "loss": 0.0079, "step": 16070 }, { "epoch": 5.01, "grad_norm": 0.5711778998374939, "learning_rate": 8.219573497276756e-06, "loss": 0.0109, "step": 16075 }, { "epoch": 5.02, "grad_norm": 0.3094363510608673, "learning_rate": 8.214216794034889e-06, "loss": 0.0123, "step": 16080 }, { "epoch": 5.02, "grad_norm": 0.5012419819831848, "learning_rate": 8.208860620037608e-06, "loss": 0.0093, "step": 16085 }, { "epoch": 5.02, "grad_norm": 0.26737749576568604, "learning_rate": 8.203504976872298e-06, "loss": 0.0066, "step": 16090 }, { "epoch": 5.02, "grad_norm": 0.5392455458641052, "learning_rate": 8.19814986612619e-06, "loss": 0.0064, "step": 16095 }, { "epoch": 5.02, "grad_norm": 0.4574849009513855, "learning_rate": 8.192795289386343e-06, "loss": 0.0084, "step": 16100 }, { "epoch": 5.02, "grad_norm": 0.614558756351471, "learning_rate": 8.187441248239681e-06, "loss": 0.0085, "step": 16105 }, { "epoch": 5.02, "grad_norm": 0.3227488696575165, "learning_rate": 8.18208774427295e-06, "loss": 0.0082, "step": 16110 }, { "epoch": 5.03, "grad_norm": 0.3623669147491455, "learning_rate": 8.176734779072747e-06, "loss": 0.0071, "step": 16115 }, { "epoch": 5.03, "grad_norm": 0.20663155615329742, "learning_rate": 8.171382354225504e-06, "loss": 0.0074, "step": 16120 }, { "epoch": 5.03, "grad_norm": 0.4030447006225586, "learning_rate": 8.166030471317497e-06, "loss": 0.0109, "step": 16125 }, { "epoch": 5.03, "grad_norm": 0.3899942934513092, "learning_rate": 8.160679131934842e-06, "loss": 0.0081, "step": 16130 }, { "epoch": 5.03, "grad_norm": 0.5883195400238037, "learning_rate": 8.15532833766349e-06, "loss": 0.0062, "step": 16135 }, { "epoch": 5.03, "grad_norm": 0.509239137172699, "learning_rate": 8.149978090089228e-06, "loss": 0.0059, "step": 16140 }, { "epoch": 5.04, "grad_norm": 0.18122568726539612, "learning_rate": 8.144628390797688e-06, "loss": 0.0079, "step": 16145 }, { "epoch": 5.04, "grad_norm": 0.346282422542572, "learning_rate": 8.139279241374338e-06, "loss": 0.0067, "step": 16150 }, { "epoch": 5.04, "grad_norm": 0.5716591477394104, "learning_rate": 8.133930643404479e-06, "loss": 0.0069, "step": 16155 }, { "epoch": 5.04, "grad_norm": 0.3420288860797882, "learning_rate": 8.128582598473254e-06, "loss": 0.0071, "step": 16160 }, { "epoch": 5.04, "grad_norm": 0.4361545145511627, "learning_rate": 8.123235108165636e-06, "loss": 0.009, "step": 16165 }, { "epoch": 5.04, "grad_norm": 0.3243066072463989, "learning_rate": 8.117888174066439e-06, "loss": 0.0087, "step": 16170 }, { "epoch": 5.04, "grad_norm": 0.2942553460597992, "learning_rate": 8.11254179776031e-06, "loss": 0.0061, "step": 16175 }, { "epoch": 5.05, "grad_norm": 0.24585643410682678, "learning_rate": 8.107195980831726e-06, "loss": 0.0094, "step": 16180 }, { "epoch": 5.05, "grad_norm": 0.2915532886981964, "learning_rate": 8.101850724865005e-06, "loss": 0.0049, "step": 16185 }, { "epoch": 5.05, "grad_norm": 0.25451603531837463, "learning_rate": 8.096506031444302e-06, "loss": 0.0079, "step": 16190 }, { "epoch": 5.05, "grad_norm": 0.315114289522171, "learning_rate": 8.091161902153594e-06, "loss": 0.0079, "step": 16195 }, { "epoch": 5.05, "grad_norm": 0.4226455092430115, "learning_rate": 8.085818338576702e-06, "loss": 0.0066, "step": 16200 }, { "epoch": 5.05, "grad_norm": 0.4661683440208435, "learning_rate": 8.08047534229727e-06, "loss": 0.0088, "step": 16205 }, { "epoch": 5.06, "grad_norm": 0.4152611494064331, "learning_rate": 8.075132914898776e-06, "loss": 0.0053, "step": 16210 }, { "epoch": 5.06, "grad_norm": 0.23992493748664856, "learning_rate": 8.069791057964535e-06, "loss": 0.0048, "step": 16215 }, { "epoch": 5.06, "grad_norm": 0.4227517545223236, "learning_rate": 8.06444977307769e-06, "loss": 0.0073, "step": 16220 }, { "epoch": 5.06, "grad_norm": 0.176507830619812, "learning_rate": 8.059109061821208e-06, "loss": 0.0058, "step": 16225 }, { "epoch": 5.06, "grad_norm": 0.4533865749835968, "learning_rate": 8.053768925777897e-06, "loss": 0.0088, "step": 16230 }, { "epoch": 5.06, "grad_norm": 0.4857219159603119, "learning_rate": 8.048429366530387e-06, "loss": 0.0086, "step": 16235 }, { "epoch": 5.07, "grad_norm": 0.4926903247833252, "learning_rate": 8.043090385661147e-06, "loss": 0.0075, "step": 16240 }, { "epoch": 5.07, "grad_norm": 0.26682066917419434, "learning_rate": 8.037751984752453e-06, "loss": 0.0061, "step": 16245 }, { "epoch": 5.07, "grad_norm": 0.44272875785827637, "learning_rate": 8.032414165386433e-06, "loss": 0.0075, "step": 16250 }, { "epoch": 5.07, "grad_norm": 0.3670891523361206, "learning_rate": 8.027076929145028e-06, "loss": 0.0086, "step": 16255 }, { "epoch": 5.07, "grad_norm": 0.3133547902107239, "learning_rate": 8.021740277610015e-06, "loss": 0.0058, "step": 16260 }, { "epoch": 5.07, "grad_norm": 0.43501976132392883, "learning_rate": 8.016404212362989e-06, "loss": 0.0054, "step": 16265 }, { "epoch": 5.07, "grad_norm": 0.41824793815612793, "learning_rate": 8.011068734985377e-06, "loss": 0.0088, "step": 16270 }, { "epoch": 5.08, "grad_norm": 0.5613520741462708, "learning_rate": 8.005733847058438e-06, "loss": 0.0073, "step": 16275 }, { "epoch": 5.08, "grad_norm": 0.4292289912700653, "learning_rate": 8.000399550163237e-06, "loss": 0.0079, "step": 16280 }, { "epoch": 5.08, "grad_norm": 0.5372112989425659, "learning_rate": 7.99506584588068e-06, "loss": 0.0129, "step": 16285 }, { "epoch": 5.08, "grad_norm": 0.37449565529823303, "learning_rate": 7.989732735791495e-06, "loss": 0.0078, "step": 16290 }, { "epoch": 5.08, "grad_norm": 0.26634928584098816, "learning_rate": 7.984400221476233e-06, "loss": 0.0056, "step": 16295 }, { "epoch": 5.08, "grad_norm": 0.3031364977359772, "learning_rate": 7.979068304515265e-06, "loss": 0.0039, "step": 16300 }, { "epoch": 5.09, "grad_norm": 0.6217584609985352, "learning_rate": 7.973736986488789e-06, "loss": 0.0059, "step": 16305 }, { "epoch": 5.09, "grad_norm": 0.40330415964126587, "learning_rate": 7.96840626897682e-06, "loss": 0.007, "step": 16310 }, { "epoch": 5.09, "grad_norm": 0.30831795930862427, "learning_rate": 7.963076153559207e-06, "loss": 0.0043, "step": 16315 }, { "epoch": 5.09, "grad_norm": 0.4585668444633484, "learning_rate": 7.957746641815603e-06, "loss": 0.0068, "step": 16320 }, { "epoch": 5.09, "grad_norm": 0.25789645314216614, "learning_rate": 7.952417735325497e-06, "loss": 0.0073, "step": 16325 }, { "epoch": 5.09, "grad_norm": 0.5011563897132874, "learning_rate": 7.94708943566819e-06, "loss": 0.0117, "step": 16330 }, { "epoch": 5.09, "grad_norm": 0.5686031579971313, "learning_rate": 7.941761744422811e-06, "loss": 0.0107, "step": 16335 }, { "epoch": 5.1, "grad_norm": 0.4478742778301239, "learning_rate": 7.9364346631683e-06, "loss": 0.0066, "step": 16340 }, { "epoch": 5.1, "grad_norm": 0.4688143730163574, "learning_rate": 7.931108193483423e-06, "loss": 0.0128, "step": 16345 }, { "epoch": 5.1, "grad_norm": 0.34905996918678284, "learning_rate": 7.925782336946764e-06, "loss": 0.0089, "step": 16350 }, { "epoch": 5.1, "grad_norm": 0.3972848355770111, "learning_rate": 7.920457095136715e-06, "loss": 0.0091, "step": 16355 }, { "epoch": 5.1, "grad_norm": 0.18540921807289124, "learning_rate": 7.915132469631501e-06, "loss": 0.006, "step": 16360 }, { "epoch": 5.1, "grad_norm": 0.46762675046920776, "learning_rate": 7.909808462009156e-06, "loss": 0.007, "step": 16365 }, { "epoch": 5.11, "grad_norm": 0.1555611789226532, "learning_rate": 7.90448507384753e-06, "loss": 0.005, "step": 16370 }, { "epoch": 5.11, "grad_norm": 0.6360225081443787, "learning_rate": 7.899162306724293e-06, "loss": 0.0076, "step": 16375 }, { "epoch": 5.11, "grad_norm": 0.5464015007019043, "learning_rate": 7.893840162216933e-06, "loss": 0.0081, "step": 16380 }, { "epoch": 5.11, "grad_norm": 0.37012243270874023, "learning_rate": 7.88851864190275e-06, "loss": 0.0061, "step": 16385 }, { "epoch": 5.11, "grad_norm": 0.30513015389442444, "learning_rate": 7.883197747358851e-06, "loss": 0.0057, "step": 16390 }, { "epoch": 5.11, "grad_norm": 0.500765323638916, "learning_rate": 7.877877480162173e-06, "loss": 0.0061, "step": 16395 }, { "epoch": 5.12, "grad_norm": 0.5255210399627686, "learning_rate": 7.872557841889459e-06, "loss": 0.006, "step": 16400 }, { "epoch": 5.12, "grad_norm": 0.43981844186782837, "learning_rate": 7.867238834117266e-06, "loss": 0.0068, "step": 16405 }, { "epoch": 5.12, "grad_norm": 0.4827001392841339, "learning_rate": 7.861920458421963e-06, "loss": 0.0082, "step": 16410 }, { "epoch": 5.12, "grad_norm": 0.5710332989692688, "learning_rate": 7.856602716379734e-06, "loss": 0.0095, "step": 16415 }, { "epoch": 5.12, "grad_norm": 0.3762331008911133, "learning_rate": 7.85128560956658e-06, "loss": 0.0058, "step": 16420 }, { "epoch": 5.12, "grad_norm": 0.7430204749107361, "learning_rate": 7.8459691395583e-06, "loss": 0.009, "step": 16425 }, { "epoch": 5.12, "grad_norm": 0.47367557883262634, "learning_rate": 7.840653307930511e-06, "loss": 0.0082, "step": 16430 }, { "epoch": 5.13, "grad_norm": 0.7471944093704224, "learning_rate": 7.83533811625865e-06, "loss": 0.0095, "step": 16435 }, { "epoch": 5.13, "grad_norm": 0.3423488438129425, "learning_rate": 7.830023566117954e-06, "loss": 0.006, "step": 16440 }, { "epoch": 5.13, "grad_norm": 0.6812673807144165, "learning_rate": 7.824709659083471e-06, "loss": 0.0068, "step": 16445 }, { "epoch": 5.13, "grad_norm": 0.2431548535823822, "learning_rate": 7.819396396730062e-06, "loss": 0.0069, "step": 16450 }, { "epoch": 5.13, "grad_norm": 0.22807416319847107, "learning_rate": 7.814083780632392e-06, "loss": 0.0073, "step": 16455 }, { "epoch": 5.13, "grad_norm": 0.495912104845047, "learning_rate": 7.808771812364942e-06, "loss": 0.0071, "step": 16460 }, { "epoch": 5.14, "grad_norm": 0.37738263607025146, "learning_rate": 7.80346049350199e-06, "loss": 0.0069, "step": 16465 }, { "epoch": 5.14, "grad_norm": 0.275280624628067, "learning_rate": 7.798149825617632e-06, "loss": 0.006, "step": 16470 }, { "epoch": 5.14, "grad_norm": 0.23672664165496826, "learning_rate": 7.792839810285762e-06, "loss": 0.0075, "step": 16475 }, { "epoch": 5.14, "grad_norm": 0.2902403771877289, "learning_rate": 7.787530449080092e-06, "loss": 0.0057, "step": 16480 }, { "epoch": 5.14, "grad_norm": 0.41451671719551086, "learning_rate": 7.782221743574131e-06, "loss": 0.0067, "step": 16485 }, { "epoch": 5.14, "grad_norm": 0.5114469528198242, "learning_rate": 7.776913695341195e-06, "loss": 0.0067, "step": 16490 }, { "epoch": 5.14, "grad_norm": 0.7483134269714355, "learning_rate": 7.771606305954413e-06, "loss": 0.0073, "step": 16495 }, { "epoch": 5.15, "grad_norm": 0.6692242622375488, "learning_rate": 7.766299576986702e-06, "loss": 0.006, "step": 16500 }, { "epoch": 5.15, "grad_norm": 0.3661095201969147, "learning_rate": 7.7609935100108e-06, "loss": 0.0071, "step": 16505 }, { "epoch": 5.15, "grad_norm": 0.6875879764556885, "learning_rate": 7.75568810659924e-06, "loss": 0.0059, "step": 16510 }, { "epoch": 5.15, "grad_norm": 0.6691876649856567, "learning_rate": 7.750383368324362e-06, "loss": 0.0083, "step": 16515 }, { "epoch": 5.15, "grad_norm": 0.4257379472255707, "learning_rate": 7.745079296758305e-06, "loss": 0.0063, "step": 16520 }, { "epoch": 5.15, "grad_norm": 0.509445071220398, "learning_rate": 7.739775893473018e-06, "loss": 0.0089, "step": 16525 }, { "epoch": 5.16, "grad_norm": 0.48248153924942017, "learning_rate": 7.734473160040248e-06, "loss": 0.0086, "step": 16530 }, { "epoch": 5.16, "grad_norm": 0.311449259519577, "learning_rate": 7.729171098031532e-06, "loss": 0.0056, "step": 16535 }, { "epoch": 5.16, "grad_norm": 0.5185677409172058, "learning_rate": 7.723869709018226e-06, "loss": 0.0065, "step": 16540 }, { "epoch": 5.16, "grad_norm": 0.3155025243759155, "learning_rate": 7.718568994571479e-06, "loss": 0.007, "step": 16545 }, { "epoch": 5.16, "grad_norm": 0.6853141784667969, "learning_rate": 7.713268956262237e-06, "loss": 0.0074, "step": 16550 }, { "epoch": 5.16, "grad_norm": 0.40242108702659607, "learning_rate": 7.707969595661253e-06, "loss": 0.0061, "step": 16555 }, { "epoch": 5.17, "grad_norm": 0.35108885169029236, "learning_rate": 7.702670914339067e-06, "loss": 0.0073, "step": 16560 }, { "epoch": 5.17, "grad_norm": 0.5612503290176392, "learning_rate": 7.697372913866041e-06, "loss": 0.0081, "step": 16565 }, { "epoch": 5.17, "grad_norm": 0.19322334229946136, "learning_rate": 7.692075595812303e-06, "loss": 0.0072, "step": 16570 }, { "epoch": 5.17, "grad_norm": 0.4028273820877075, "learning_rate": 7.686778961747799e-06, "loss": 0.0103, "step": 16575 }, { "epoch": 5.17, "grad_norm": 0.47766733169555664, "learning_rate": 7.681483013242275e-06, "loss": 0.0092, "step": 16580 }, { "epoch": 5.17, "grad_norm": 0.3629557490348816, "learning_rate": 7.676187751865266e-06, "loss": 0.0084, "step": 16585 }, { "epoch": 5.17, "grad_norm": 0.30036285519599915, "learning_rate": 7.6708931791861e-06, "loss": 0.0088, "step": 16590 }, { "epoch": 5.18, "grad_norm": 0.5219706296920776, "learning_rate": 7.665599296773911e-06, "loss": 0.0074, "step": 16595 }, { "epoch": 5.18, "grad_norm": 0.4829447567462921, "learning_rate": 7.660306106197623e-06, "loss": 0.0044, "step": 16600 }, { "epoch": 5.18, "grad_norm": 0.3575102984905243, "learning_rate": 7.655013609025952e-06, "loss": 0.0078, "step": 16605 }, { "epoch": 5.18, "grad_norm": 0.36241811513900757, "learning_rate": 7.649721806827413e-06, "loss": 0.0064, "step": 16610 }, { "epoch": 5.18, "grad_norm": 0.4477905035018921, "learning_rate": 7.644430701170314e-06, "loss": 0.0054, "step": 16615 }, { "epoch": 5.18, "grad_norm": 0.5018873810768127, "learning_rate": 7.639140293622755e-06, "loss": 0.0085, "step": 16620 }, { "epoch": 5.19, "grad_norm": 0.1742485612630844, "learning_rate": 7.633850585752634e-06, "loss": 0.0064, "step": 16625 }, { "epoch": 5.19, "grad_norm": 0.408871591091156, "learning_rate": 7.628561579127635e-06, "loss": 0.0069, "step": 16630 }, { "epoch": 5.19, "grad_norm": 0.4098290801048279, "learning_rate": 7.623273275315237e-06, "loss": 0.0051, "step": 16635 }, { "epoch": 5.19, "grad_norm": 0.3893098533153534, "learning_rate": 7.617985675882716e-06, "loss": 0.0071, "step": 16640 }, { "epoch": 5.19, "grad_norm": 0.2370014637708664, "learning_rate": 7.612698782397128e-06, "loss": 0.0065, "step": 16645 }, { "epoch": 5.19, "grad_norm": 0.3227474093437195, "learning_rate": 7.607412596425327e-06, "loss": 0.0053, "step": 16650 }, { "epoch": 5.19, "grad_norm": 0.3771594166755676, "learning_rate": 7.60212711953396e-06, "loss": 0.0074, "step": 16655 }, { "epoch": 5.2, "grad_norm": 0.28053686022758484, "learning_rate": 7.596842353289455e-06, "loss": 0.0074, "step": 16660 }, { "epoch": 5.2, "grad_norm": 0.30859634280204773, "learning_rate": 7.5915582992580375e-06, "loss": 0.0062, "step": 16665 }, { "epoch": 5.2, "grad_norm": 0.3582562506198883, "learning_rate": 7.586274959005719e-06, "loss": 0.0077, "step": 16670 }, { "epoch": 5.2, "grad_norm": 0.5159679055213928, "learning_rate": 7.580992334098305e-06, "loss": 0.0093, "step": 16675 }, { "epoch": 5.2, "grad_norm": 0.3801780343055725, "learning_rate": 7.57571042610137e-06, "loss": 0.0064, "step": 16680 }, { "epoch": 5.2, "grad_norm": 0.3430151045322418, "learning_rate": 7.5704292365803e-06, "loss": 0.0072, "step": 16685 }, { "epoch": 5.21, "grad_norm": 0.25590091943740845, "learning_rate": 7.565148767100254e-06, "loss": 0.0068, "step": 16690 }, { "epoch": 5.21, "grad_norm": 0.42597323656082153, "learning_rate": 7.559869019226182e-06, "loss": 0.0068, "step": 16695 }, { "epoch": 5.21, "grad_norm": 0.19612255692481995, "learning_rate": 7.55458999452282e-06, "loss": 0.0068, "step": 16700 }, { "epoch": 5.21, "grad_norm": 0.16416482627391815, "learning_rate": 7.549311694554684e-06, "loss": 0.0055, "step": 16705 }, { "epoch": 5.21, "grad_norm": 0.4428281784057617, "learning_rate": 7.5440341208860915e-06, "loss": 0.009, "step": 16710 }, { "epoch": 5.21, "grad_norm": 0.3866334855556488, "learning_rate": 7.538757275081121e-06, "loss": 0.0053, "step": 16715 }, { "epoch": 5.21, "grad_norm": 0.5221220850944519, "learning_rate": 7.5334811587036536e-06, "loss": 0.0095, "step": 16720 }, { "epoch": 5.22, "grad_norm": 0.5159658193588257, "learning_rate": 7.528205773317345e-06, "loss": 0.0079, "step": 16725 }, { "epoch": 5.22, "grad_norm": 0.36763742566108704, "learning_rate": 7.522931120485643e-06, "loss": 0.005, "step": 16730 }, { "epoch": 5.22, "grad_norm": 0.7494738101959229, "learning_rate": 7.517657201771768e-06, "loss": 0.0051, "step": 16735 }, { "epoch": 5.22, "grad_norm": 0.41462403535842896, "learning_rate": 7.512384018738731e-06, "loss": 0.0089, "step": 16740 }, { "epoch": 5.22, "grad_norm": 0.36195388436317444, "learning_rate": 7.507111572949323e-06, "loss": 0.0053, "step": 16745 }, { "epoch": 5.22, "grad_norm": 0.48771071434020996, "learning_rate": 7.501839865966109e-06, "loss": 0.008, "step": 16750 }, { "epoch": 5.23, "grad_norm": 0.4612463414669037, "learning_rate": 7.496568899351445e-06, "loss": 0.01, "step": 16755 }, { "epoch": 5.23, "grad_norm": 0.19453437626361847, "learning_rate": 7.491298674667464e-06, "loss": 0.0054, "step": 16760 }, { "epoch": 5.23, "grad_norm": 0.5014907121658325, "learning_rate": 7.486029193476075e-06, "loss": 0.0068, "step": 16765 }, { "epoch": 5.23, "grad_norm": 0.7443044781684875, "learning_rate": 7.480760457338978e-06, "loss": 0.0096, "step": 16770 }, { "epoch": 5.23, "grad_norm": 0.2169743776321411, "learning_rate": 7.47549246781764e-06, "loss": 0.0117, "step": 16775 }, { "epoch": 5.23, "grad_norm": 0.5546299815177917, "learning_rate": 7.470225226473311e-06, "loss": 0.0072, "step": 16780 }, { "epoch": 5.24, "grad_norm": 0.8305453658103943, "learning_rate": 7.464958734867025e-06, "loss": 0.0061, "step": 16785 }, { "epoch": 5.24, "grad_norm": 0.44384169578552246, "learning_rate": 7.459692994559582e-06, "loss": 0.008, "step": 16790 }, { "epoch": 5.24, "grad_norm": 0.44968289136886597, "learning_rate": 7.4544280071115684e-06, "loss": 0.0084, "step": 16795 }, { "epoch": 5.24, "grad_norm": 0.5661505460739136, "learning_rate": 7.449163774083345e-06, "loss": 0.0059, "step": 16800 }, { "epoch": 5.24, "grad_norm": 0.669439435005188, "learning_rate": 7.443900297035049e-06, "loss": 0.0097, "step": 16805 }, { "epoch": 5.24, "grad_norm": 0.5095635056495667, "learning_rate": 7.4386375775265916e-06, "loss": 0.0069, "step": 16810 }, { "epoch": 5.24, "grad_norm": 0.3544565439224243, "learning_rate": 7.433375617117666e-06, "loss": 0.0067, "step": 16815 }, { "epoch": 5.25, "grad_norm": 0.48801660537719727, "learning_rate": 7.428114417367738e-06, "loss": 0.0079, "step": 16820 }, { "epoch": 5.25, "grad_norm": 0.4711840748786926, "learning_rate": 7.422853979836035e-06, "loss": 0.0078, "step": 16825 }, { "epoch": 5.25, "grad_norm": 0.43352481722831726, "learning_rate": 7.417594306081577e-06, "loss": 0.0075, "step": 16830 }, { "epoch": 5.25, "grad_norm": 0.3749412000179291, "learning_rate": 7.412335397663149e-06, "loss": 0.0089, "step": 16835 }, { "epoch": 5.25, "grad_norm": 0.17959199845790863, "learning_rate": 7.407077256139309e-06, "loss": 0.0073, "step": 16840 }, { "epoch": 5.25, "grad_norm": 0.40305107831954956, "learning_rate": 7.4018198830683905e-06, "loss": 0.0074, "step": 16845 }, { "epoch": 5.26, "grad_norm": 0.30933234095573425, "learning_rate": 7.396563280008494e-06, "loss": 0.0094, "step": 16850 }, { "epoch": 5.26, "grad_norm": 0.267681360244751, "learning_rate": 7.391307448517506e-06, "loss": 0.0074, "step": 16855 }, { "epoch": 5.26, "grad_norm": 0.33332905173301697, "learning_rate": 7.38605239015306e-06, "loss": 0.0093, "step": 16860 }, { "epoch": 5.26, "grad_norm": 0.18610811233520508, "learning_rate": 7.380798106472582e-06, "loss": 0.0094, "step": 16865 }, { "epoch": 5.26, "grad_norm": 0.5006966590881348, "learning_rate": 7.375544599033255e-06, "loss": 0.0074, "step": 16870 }, { "epoch": 5.26, "grad_norm": 0.37795162200927734, "learning_rate": 7.370291869392045e-06, "loss": 0.0091, "step": 16875 }, { "epoch": 5.26, "grad_norm": 0.5025229454040527, "learning_rate": 7.365039919105675e-06, "loss": 0.0085, "step": 16880 }, { "epoch": 5.27, "grad_norm": 0.3851081430912018, "learning_rate": 7.3597887497306455e-06, "loss": 0.0092, "step": 16885 }, { "epoch": 5.27, "grad_norm": 0.2460136115550995, "learning_rate": 7.35453836282322e-06, "loss": 0.0071, "step": 16890 }, { "epoch": 5.27, "grad_norm": 0.3819717764854431, "learning_rate": 7.34928875993943e-06, "loss": 0.0067, "step": 16895 }, { "epoch": 5.27, "grad_norm": 0.40334203839302063, "learning_rate": 7.34403994263508e-06, "loss": 0.0101, "step": 16900 }, { "epoch": 5.27, "grad_norm": 0.22943516075611115, "learning_rate": 7.338791912465737e-06, "loss": 0.0094, "step": 16905 }, { "epoch": 5.27, "grad_norm": 0.2664775848388672, "learning_rate": 7.333544670986734e-06, "loss": 0.0081, "step": 16910 }, { "epoch": 5.28, "grad_norm": 0.3135198950767517, "learning_rate": 7.328298219753177e-06, "loss": 0.0081, "step": 16915 }, { "epoch": 5.28, "grad_norm": 0.5245127081871033, "learning_rate": 7.323052560319932e-06, "loss": 0.0089, "step": 16920 }, { "epoch": 5.28, "grad_norm": 0.3170236647129059, "learning_rate": 7.317807694241635e-06, "loss": 0.0057, "step": 16925 }, { "epoch": 5.28, "grad_norm": 0.3144915997982025, "learning_rate": 7.3125636230726745e-06, "loss": 0.0076, "step": 16930 }, { "epoch": 5.28, "grad_norm": 0.4207755923271179, "learning_rate": 7.307320348367219e-06, "loss": 0.0059, "step": 16935 }, { "epoch": 5.28, "grad_norm": 0.6968371868133545, "learning_rate": 7.302077871679194e-06, "loss": 0.0104, "step": 16940 }, { "epoch": 5.29, "grad_norm": 0.3698020577430725, "learning_rate": 7.296836194562287e-06, "loss": 0.0128, "step": 16945 }, { "epoch": 5.29, "grad_norm": 0.5103458166122437, "learning_rate": 7.291595318569951e-06, "loss": 0.007, "step": 16950 }, { "epoch": 5.29, "grad_norm": 0.2294590026140213, "learning_rate": 7.286355245255399e-06, "loss": 0.0065, "step": 16955 }, { "epoch": 5.29, "grad_norm": 0.8034264445304871, "learning_rate": 7.281115976171612e-06, "loss": 0.0068, "step": 16960 }, { "epoch": 5.29, "grad_norm": 0.5444835424423218, "learning_rate": 7.2758775128713325e-06, "loss": 0.006, "step": 16965 }, { "epoch": 5.29, "grad_norm": 0.4064411520957947, "learning_rate": 7.270639856907047e-06, "loss": 0.0084, "step": 16970 }, { "epoch": 5.29, "grad_norm": 0.17976729571819305, "learning_rate": 7.265403009831027e-06, "loss": 0.0062, "step": 16975 }, { "epoch": 5.3, "grad_norm": 0.5045095086097717, "learning_rate": 7.260166973195292e-06, "loss": 0.0076, "step": 16980 }, { "epoch": 5.3, "grad_norm": 0.32540497183799744, "learning_rate": 7.254931748551619e-06, "loss": 0.0069, "step": 16985 }, { "epoch": 5.3, "grad_norm": 0.5425432920455933, "learning_rate": 7.2496973374515535e-06, "loss": 0.0081, "step": 16990 }, { "epoch": 5.3, "grad_norm": 0.34949442744255066, "learning_rate": 7.244463741446387e-06, "loss": 0.0074, "step": 16995 }, { "epoch": 5.3, "grad_norm": 0.37236133217811584, "learning_rate": 7.239230962087189e-06, "loss": 0.0093, "step": 17000 }, { "epoch": 5.3, "grad_norm": 0.3425101339817047, "learning_rate": 7.233999000924763e-06, "loss": 0.0067, "step": 17005 }, { "epoch": 5.31, "grad_norm": 0.5776107311248779, "learning_rate": 7.2287678595096865e-06, "loss": 0.0098, "step": 17010 }, { "epoch": 5.31, "grad_norm": 0.39168134331703186, "learning_rate": 7.223537539392287e-06, "loss": 0.0114, "step": 17015 }, { "epoch": 5.31, "grad_norm": 0.5078431963920593, "learning_rate": 7.218308042122655e-06, "loss": 0.0099, "step": 17020 }, { "epoch": 5.31, "grad_norm": 0.19572730362415314, "learning_rate": 7.213079369250632e-06, "loss": 0.006, "step": 17025 }, { "epoch": 5.31, "grad_norm": 0.2532220780849457, "learning_rate": 7.2078515223258165e-06, "loss": 0.0066, "step": 17030 }, { "epoch": 5.31, "grad_norm": 0.38014352321624756, "learning_rate": 7.202624502897564e-06, "loss": 0.0078, "step": 17035 }, { "epoch": 5.31, "grad_norm": 0.39346054196357727, "learning_rate": 7.197398312514978e-06, "loss": 0.0074, "step": 17040 }, { "epoch": 5.32, "grad_norm": 0.3673287630081177, "learning_rate": 7.192172952726923e-06, "loss": 0.0074, "step": 17045 }, { "epoch": 5.32, "grad_norm": 0.42266541719436646, "learning_rate": 7.186948425082017e-06, "loss": 0.0072, "step": 17050 }, { "epoch": 5.32, "grad_norm": 0.7222540378570557, "learning_rate": 7.181724731128627e-06, "loss": 0.0065, "step": 17055 }, { "epoch": 5.32, "grad_norm": 0.35174280405044556, "learning_rate": 7.176501872414881e-06, "loss": 0.0091, "step": 17060 }, { "epoch": 5.32, "grad_norm": 0.591694712638855, "learning_rate": 7.1712798504886506e-06, "loss": 0.0071, "step": 17065 }, { "epoch": 5.32, "grad_norm": 0.6140367984771729, "learning_rate": 7.166058666897565e-06, "loss": 0.0107, "step": 17070 }, { "epoch": 5.33, "grad_norm": 0.2917439341545105, "learning_rate": 7.160838323189001e-06, "loss": 0.0056, "step": 17075 }, { "epoch": 5.33, "grad_norm": 0.4179360866546631, "learning_rate": 7.1556188209100884e-06, "loss": 0.0087, "step": 17080 }, { "epoch": 5.33, "grad_norm": 0.5467280149459839, "learning_rate": 7.150400161607709e-06, "loss": 0.0091, "step": 17085 }, { "epoch": 5.33, "grad_norm": 0.7273814082145691, "learning_rate": 7.145182346828492e-06, "loss": 0.0076, "step": 17090 }, { "epoch": 5.33, "grad_norm": 0.42867380380630493, "learning_rate": 7.139965378118821e-06, "loss": 0.0068, "step": 17095 }, { "epoch": 5.33, "grad_norm": 0.2701338529586792, "learning_rate": 7.13474925702482e-06, "loss": 0.0088, "step": 17100 }, { "epoch": 5.33, "grad_norm": 0.5208498239517212, "learning_rate": 7.129533985092378e-06, "loss": 0.0063, "step": 17105 }, { "epoch": 5.34, "grad_norm": 0.47218087315559387, "learning_rate": 7.124319563867112e-06, "loss": 0.0056, "step": 17110 }, { "epoch": 5.34, "grad_norm": 0.6032540202140808, "learning_rate": 7.119105994894398e-06, "loss": 0.0091, "step": 17115 }, { "epoch": 5.34, "grad_norm": 0.2994047701358795, "learning_rate": 7.113893279719359e-06, "loss": 0.0071, "step": 17120 }, { "epoch": 5.34, "grad_norm": 0.4658941924571991, "learning_rate": 7.108681419886867e-06, "loss": 0.0075, "step": 17125 }, { "epoch": 5.34, "grad_norm": 0.17439764738082886, "learning_rate": 7.103470416941536e-06, "loss": 0.0064, "step": 17130 }, { "epoch": 5.34, "grad_norm": 0.3298243582248688, "learning_rate": 7.098260272427726e-06, "loss": 0.0063, "step": 17135 }, { "epoch": 5.35, "grad_norm": 0.5897051095962524, "learning_rate": 7.093050987889547e-06, "loss": 0.0052, "step": 17140 }, { "epoch": 5.35, "grad_norm": 0.3087965250015259, "learning_rate": 7.087842564870851e-06, "loss": 0.0057, "step": 17145 }, { "epoch": 5.35, "grad_norm": 0.19054736196994781, "learning_rate": 7.08263500491523e-06, "loss": 0.0102, "step": 17150 }, { "epoch": 5.35, "grad_norm": 0.3062085211277008, "learning_rate": 7.077428309566033e-06, "loss": 0.0068, "step": 17155 }, { "epoch": 5.35, "grad_norm": 0.3566223382949829, "learning_rate": 7.072222480366335e-06, "loss": 0.0059, "step": 17160 }, { "epoch": 5.35, "grad_norm": 0.33739137649536133, "learning_rate": 7.067017518858975e-06, "loss": 0.0079, "step": 17165 }, { "epoch": 5.36, "grad_norm": 0.3130680322647095, "learning_rate": 7.061813426586519e-06, "loss": 0.0075, "step": 17170 }, { "epoch": 5.36, "grad_norm": 0.15110720694065094, "learning_rate": 7.056610205091281e-06, "loss": 0.0073, "step": 17175 }, { "epoch": 5.36, "grad_norm": 0.5237319469451904, "learning_rate": 7.05140785591532e-06, "loss": 0.0064, "step": 17180 }, { "epoch": 5.36, "grad_norm": 0.5438427925109863, "learning_rate": 7.0462063806004255e-06, "loss": 0.0072, "step": 17185 }, { "epoch": 5.36, "grad_norm": 0.3356355130672455, "learning_rate": 7.041005780688142e-06, "loss": 0.006, "step": 17190 }, { "epoch": 5.36, "grad_norm": 0.338467538356781, "learning_rate": 7.035806057719743e-06, "loss": 0.0068, "step": 17195 }, { "epoch": 5.36, "grad_norm": 0.17586380243301392, "learning_rate": 7.030607213236249e-06, "loss": 0.0064, "step": 17200 }, { "epoch": 5.37, "grad_norm": 0.47041377425193787, "learning_rate": 7.025409248778421e-06, "loss": 0.0071, "step": 17205 }, { "epoch": 5.37, "grad_norm": 0.37697193026542664, "learning_rate": 7.020212165886755e-06, "loss": 0.0067, "step": 17210 }, { "epoch": 5.37, "grad_norm": 0.4052976965904236, "learning_rate": 7.015015966101488e-06, "loss": 0.0092, "step": 17215 }, { "epoch": 5.37, "grad_norm": 0.5404878258705139, "learning_rate": 7.009820650962593e-06, "loss": 0.0097, "step": 17220 }, { "epoch": 5.37, "grad_norm": 0.3760913908481598, "learning_rate": 7.004626222009782e-06, "loss": 0.0066, "step": 17225 }, { "epoch": 5.37, "grad_norm": 0.5538203120231628, "learning_rate": 6.9994326807825045e-06, "loss": 0.0085, "step": 17230 }, { "epoch": 5.38, "grad_norm": 0.5948095917701721, "learning_rate": 6.994240028819947e-06, "loss": 0.0079, "step": 17235 }, { "epoch": 5.38, "grad_norm": 0.5411348938941956, "learning_rate": 6.989048267661036e-06, "loss": 0.0085, "step": 17240 }, { "epoch": 5.38, "grad_norm": 0.4167710244655609, "learning_rate": 6.983857398844425e-06, "loss": 0.0067, "step": 17245 }, { "epoch": 5.38, "grad_norm": 0.5900302529335022, "learning_rate": 6.978667423908517e-06, "loss": 0.0066, "step": 17250 }, { "epoch": 5.38, "grad_norm": 0.3182935118675232, "learning_rate": 6.973478344391433e-06, "loss": 0.0097, "step": 17255 }, { "epoch": 5.38, "grad_norm": 0.4234834313392639, "learning_rate": 6.9682901618310385e-06, "loss": 0.0087, "step": 17260 }, { "epoch": 5.38, "grad_norm": 0.34514597058296204, "learning_rate": 6.963102877764936e-06, "loss": 0.0066, "step": 17265 }, { "epoch": 5.39, "grad_norm": 0.3347316086292267, "learning_rate": 6.9579164937304544e-06, "loss": 0.0072, "step": 17270 }, { "epoch": 5.39, "grad_norm": 0.364444375038147, "learning_rate": 6.952731011264662e-06, "loss": 0.0048, "step": 17275 }, { "epoch": 5.39, "grad_norm": 0.3441850543022156, "learning_rate": 6.9475464319043554e-06, "loss": 0.0068, "step": 17280 }, { "epoch": 5.39, "grad_norm": 0.5691609978675842, "learning_rate": 6.942362757186065e-06, "loss": 0.0094, "step": 17285 }, { "epoch": 5.39, "grad_norm": 0.5621780157089233, "learning_rate": 6.937179988646056e-06, "loss": 0.0072, "step": 17290 }, { "epoch": 5.39, "grad_norm": 0.4421490430831909, "learning_rate": 6.931998127820316e-06, "loss": 0.0089, "step": 17295 }, { "epoch": 5.4, "grad_norm": 0.5314481258392334, "learning_rate": 6.926817176244576e-06, "loss": 0.0112, "step": 17300 }, { "epoch": 5.4, "grad_norm": 0.2780129313468933, "learning_rate": 6.921637135454286e-06, "loss": 0.0075, "step": 17305 }, { "epoch": 5.4, "grad_norm": 0.5449675917625427, "learning_rate": 6.916458006984639e-06, "loss": 0.0063, "step": 17310 }, { "epoch": 5.4, "grad_norm": 0.5825093388557434, "learning_rate": 6.9112797923705445e-06, "loss": 0.009, "step": 17315 }, { "epoch": 5.4, "grad_norm": 1.126198410987854, "learning_rate": 6.906102493146648e-06, "loss": 0.0096, "step": 17320 }, { "epoch": 5.4, "grad_norm": 0.5498197078704834, "learning_rate": 6.9009261108473255e-06, "loss": 0.0053, "step": 17325 }, { "epoch": 5.41, "grad_norm": 0.38959771394729614, "learning_rate": 6.895750647006674e-06, "loss": 0.0081, "step": 17330 }, { "epoch": 5.41, "grad_norm": 0.4668501913547516, "learning_rate": 6.890576103158524e-06, "loss": 0.0076, "step": 17335 }, { "epoch": 5.41, "grad_norm": 0.4254097640514374, "learning_rate": 6.885402480836431e-06, "loss": 0.008, "step": 17340 }, { "epoch": 5.41, "grad_norm": 0.38289356231689453, "learning_rate": 6.880229781573678e-06, "loss": 0.0083, "step": 17345 }, { "epoch": 5.41, "grad_norm": 0.3994235694408417, "learning_rate": 6.875058006903279e-06, "loss": 0.0076, "step": 17350 }, { "epoch": 5.41, "grad_norm": 0.33938008546829224, "learning_rate": 6.8698871583579666e-06, "loss": 0.008, "step": 17355 }, { "epoch": 5.41, "grad_norm": 0.5851308703422546, "learning_rate": 6.864717237470205e-06, "loss": 0.0071, "step": 17360 }, { "epoch": 5.42, "grad_norm": 0.5873516798019409, "learning_rate": 6.859548245772174e-06, "loss": 0.0088, "step": 17365 }, { "epoch": 5.42, "grad_norm": 0.1811552196741104, "learning_rate": 6.854380184795789e-06, "loss": 0.0054, "step": 17370 }, { "epoch": 5.42, "grad_norm": 0.8535553812980652, "learning_rate": 6.849213056072685e-06, "loss": 0.0057, "step": 17375 }, { "epoch": 5.42, "grad_norm": 0.2805171608924866, "learning_rate": 6.844046861134221e-06, "loss": 0.0073, "step": 17380 }, { "epoch": 5.42, "grad_norm": 0.388770192861557, "learning_rate": 6.838881601511478e-06, "loss": 0.006, "step": 17385 }, { "epoch": 5.42, "grad_norm": 0.7432388663291931, "learning_rate": 6.83371727873526e-06, "loss": 0.008, "step": 17390 }, { "epoch": 5.43, "grad_norm": 0.41411373019218445, "learning_rate": 6.828553894336102e-06, "loss": 0.0083, "step": 17395 }, { "epoch": 5.43, "grad_norm": 0.3691873550415039, "learning_rate": 6.823391449844242e-06, "loss": 0.0092, "step": 17400 }, { "epoch": 5.43, "grad_norm": 0.21740001440048218, "learning_rate": 6.818229946789654e-06, "loss": 0.0069, "step": 17405 }, { "epoch": 5.43, "grad_norm": 0.20824594795703888, "learning_rate": 6.8130693867020325e-06, "loss": 0.0049, "step": 17410 }, { "epoch": 5.43, "grad_norm": 0.1659618467092514, "learning_rate": 6.807909771110788e-06, "loss": 0.0071, "step": 17415 }, { "epoch": 5.43, "grad_norm": 0.71225905418396, "learning_rate": 6.802751101545053e-06, "loss": 0.0084, "step": 17420 }, { "epoch": 5.43, "grad_norm": 0.3749540448188782, "learning_rate": 6.7975933795336805e-06, "loss": 0.0067, "step": 17425 }, { "epoch": 5.44, "grad_norm": 0.45130300521850586, "learning_rate": 6.792436606605242e-06, "loss": 0.006, "step": 17430 }, { "epoch": 5.44, "grad_norm": 0.4924355745315552, "learning_rate": 6.787280784288022e-06, "loss": 0.0095, "step": 17435 }, { "epoch": 5.44, "grad_norm": 0.5896020531654358, "learning_rate": 6.7821259141100326e-06, "loss": 0.009, "step": 17440 }, { "epoch": 5.44, "grad_norm": 0.43783894181251526, "learning_rate": 6.776971997598999e-06, "loss": 0.0106, "step": 17445 }, { "epoch": 5.44, "grad_norm": 0.5060574412345886, "learning_rate": 6.771819036282362e-06, "loss": 0.0075, "step": 17450 }, { "epoch": 5.44, "grad_norm": 0.4454652667045593, "learning_rate": 6.766667031687286e-06, "loss": 0.0075, "step": 17455 }, { "epoch": 5.45, "grad_norm": 0.17911115288734436, "learning_rate": 6.761515985340647e-06, "loss": 0.0086, "step": 17460 }, { "epoch": 5.45, "grad_norm": 0.4743806719779968, "learning_rate": 6.756365898769035e-06, "loss": 0.0078, "step": 17465 }, { "epoch": 5.45, "grad_norm": 0.8190107941627502, "learning_rate": 6.751216773498762e-06, "loss": 0.0089, "step": 17470 }, { "epoch": 5.45, "grad_norm": 0.19955362379550934, "learning_rate": 6.746068611055847e-06, "loss": 0.0078, "step": 17475 }, { "epoch": 5.45, "grad_norm": 0.265946626663208, "learning_rate": 6.740921412966029e-06, "loss": 0.0067, "step": 17480 }, { "epoch": 5.45, "grad_norm": 0.22414854168891907, "learning_rate": 6.73577518075476e-06, "loss": 0.0084, "step": 17485 }, { "epoch": 5.46, "grad_norm": 1.177064299583435, "learning_rate": 6.7306299159472045e-06, "loss": 0.0066, "step": 17490 }, { "epoch": 5.46, "grad_norm": 0.3832259476184845, "learning_rate": 6.7254856200682465e-06, "loss": 0.0085, "step": 17495 }, { "epoch": 5.46, "grad_norm": 0.281180202960968, "learning_rate": 6.720342294642474e-06, "loss": 0.0067, "step": 17500 }, { "epoch": 5.46, "grad_norm": 0.3122906982898712, "learning_rate": 6.715199941194196e-06, "loss": 0.0092, "step": 17505 }, { "epoch": 5.46, "grad_norm": 0.33859917521476746, "learning_rate": 6.71005856124742e-06, "loss": 0.0065, "step": 17510 }, { "epoch": 5.46, "grad_norm": 0.3163743019104004, "learning_rate": 6.70491815632588e-06, "loss": 0.0075, "step": 17515 }, { "epoch": 5.46, "grad_norm": 0.39969271421432495, "learning_rate": 6.699778727953015e-06, "loss": 0.0059, "step": 17520 }, { "epoch": 5.47, "grad_norm": 0.31698331236839294, "learning_rate": 6.694640277651973e-06, "loss": 0.0094, "step": 17525 }, { "epoch": 5.47, "grad_norm": 0.393361359834671, "learning_rate": 6.6895028069456135e-06, "loss": 0.008, "step": 17530 }, { "epoch": 5.47, "grad_norm": 0.3493746817111969, "learning_rate": 6.684366317356503e-06, "loss": 0.0056, "step": 17535 }, { "epoch": 5.47, "grad_norm": 0.25108638405799866, "learning_rate": 6.67923081040693e-06, "loss": 0.0044, "step": 17540 }, { "epoch": 5.47, "grad_norm": 0.5176618695259094, "learning_rate": 6.674096287618869e-06, "loss": 0.0055, "step": 17545 }, { "epoch": 5.47, "grad_norm": 0.3980843424797058, "learning_rate": 6.668962750514022e-06, "loss": 0.0088, "step": 17550 }, { "epoch": 5.48, "grad_norm": 0.37212374806404114, "learning_rate": 6.663830200613792e-06, "loss": 0.0067, "step": 17555 }, { "epoch": 5.48, "grad_norm": 0.334362655878067, "learning_rate": 6.658698639439289e-06, "loss": 0.0099, "step": 17560 }, { "epoch": 5.48, "grad_norm": 0.6859321594238281, "learning_rate": 6.6535680685113325e-06, "loss": 0.0096, "step": 17565 }, { "epoch": 5.48, "grad_norm": 0.7242924571037292, "learning_rate": 6.648438489350445e-06, "loss": 0.0092, "step": 17570 }, { "epoch": 5.48, "grad_norm": 0.35120564699172974, "learning_rate": 6.64330990347686e-06, "loss": 0.0072, "step": 17575 }, { "epoch": 5.48, "grad_norm": 0.27487900853157043, "learning_rate": 6.638182312410509e-06, "loss": 0.0047, "step": 17580 }, { "epoch": 5.48, "grad_norm": 0.8260858654975891, "learning_rate": 6.633055717671035e-06, "loss": 0.0079, "step": 17585 }, { "epoch": 5.49, "grad_norm": 0.2853456139564514, "learning_rate": 6.627930120777786e-06, "loss": 0.0062, "step": 17590 }, { "epoch": 5.49, "grad_norm": 0.4730463922023773, "learning_rate": 6.622805523249807e-06, "loss": 0.0071, "step": 17595 }, { "epoch": 5.49, "grad_norm": 0.4280153214931488, "learning_rate": 6.617681926605859e-06, "loss": 0.0101, "step": 17600 }, { "epoch": 5.49, "grad_norm": 0.4409542679786682, "learning_rate": 6.612559332364395e-06, "loss": 0.0078, "step": 17605 }, { "epoch": 5.49, "grad_norm": 0.47959208488464355, "learning_rate": 6.607437742043577e-06, "loss": 0.0087, "step": 17610 }, { "epoch": 5.49, "grad_norm": 0.32082483172416687, "learning_rate": 6.602317157161269e-06, "loss": 0.0072, "step": 17615 }, { "epoch": 5.5, "grad_norm": 0.4335188865661621, "learning_rate": 6.597197579235033e-06, "loss": 0.0092, "step": 17620 }, { "epoch": 5.5, "grad_norm": 0.3355504870414734, "learning_rate": 6.592079009782135e-06, "loss": 0.0067, "step": 17625 }, { "epoch": 5.5, "grad_norm": 0.4183447062969208, "learning_rate": 6.586961450319544e-06, "loss": 0.0061, "step": 17630 }, { "epoch": 5.5, "grad_norm": 0.4092869162559509, "learning_rate": 6.58184490236393e-06, "loss": 0.0084, "step": 17635 }, { "epoch": 5.5, "grad_norm": 0.7920547127723694, "learning_rate": 6.576729367431658e-06, "loss": 0.0067, "step": 17640 }, { "epoch": 5.5, "grad_norm": 0.35771897435188293, "learning_rate": 6.571614847038798e-06, "loss": 0.0113, "step": 17645 }, { "epoch": 5.5, "grad_norm": 0.46013301610946655, "learning_rate": 6.566501342701124e-06, "loss": 0.009, "step": 17650 }, { "epoch": 5.51, "grad_norm": 0.39704209566116333, "learning_rate": 6.561388855934088e-06, "loss": 0.006, "step": 17655 }, { "epoch": 5.51, "grad_norm": 0.4999745190143585, "learning_rate": 6.5562773882528655e-06, "loss": 0.0085, "step": 17660 }, { "epoch": 5.51, "grad_norm": 0.4186748266220093, "learning_rate": 6.5511669411723165e-06, "loss": 0.006, "step": 17665 }, { "epoch": 5.51, "grad_norm": 0.5002567172050476, "learning_rate": 6.546057516207002e-06, "loss": 0.0109, "step": 17670 }, { "epoch": 5.51, "grad_norm": 0.5059384107589722, "learning_rate": 6.540949114871178e-06, "loss": 0.01, "step": 17675 }, { "epoch": 5.51, "grad_norm": 0.36181607842445374, "learning_rate": 6.535841738678798e-06, "loss": 0.0081, "step": 17680 }, { "epoch": 5.52, "grad_norm": 0.4996640086174011, "learning_rate": 6.53073538914352e-06, "loss": 0.0096, "step": 17685 }, { "epoch": 5.52, "grad_norm": 0.430021196603775, "learning_rate": 6.525630067778679e-06, "loss": 0.0076, "step": 17690 }, { "epoch": 5.52, "grad_norm": 0.5806915163993835, "learning_rate": 6.52052577609732e-06, "loss": 0.0095, "step": 17695 }, { "epoch": 5.52, "grad_norm": 0.16090747714042664, "learning_rate": 6.515422515612181e-06, "loss": 0.0054, "step": 17700 }, { "epoch": 5.52, "grad_norm": 0.35832875967025757, "learning_rate": 6.510320287835692e-06, "loss": 0.0078, "step": 17705 }, { "epoch": 5.52, "grad_norm": 0.5224231481552124, "learning_rate": 6.5052190942799774e-06, "loss": 0.0077, "step": 17710 }, { "epoch": 5.53, "grad_norm": 0.6822590827941895, "learning_rate": 6.500118936456854e-06, "loss": 0.0063, "step": 17715 }, { "epoch": 5.53, "grad_norm": 0.22760039567947388, "learning_rate": 6.495019815877836e-06, "loss": 0.0086, "step": 17720 }, { "epoch": 5.53, "grad_norm": 0.47752487659454346, "learning_rate": 6.489921734054122e-06, "loss": 0.0079, "step": 17725 }, { "epoch": 5.53, "grad_norm": 0.44208142161369324, "learning_rate": 6.484824692496611e-06, "loss": 0.0086, "step": 17730 }, { "epoch": 5.53, "grad_norm": 0.3971075713634491, "learning_rate": 6.479728692715889e-06, "loss": 0.0091, "step": 17735 }, { "epoch": 5.53, "grad_norm": 0.48173531889915466, "learning_rate": 6.474633736222234e-06, "loss": 0.0104, "step": 17740 }, { "epoch": 5.53, "grad_norm": 0.3530065417289734, "learning_rate": 6.469539824525617e-06, "loss": 0.0077, "step": 17745 }, { "epoch": 5.54, "grad_norm": 0.6809261441230774, "learning_rate": 6.4644469591357e-06, "loss": 0.0073, "step": 17750 }, { "epoch": 5.54, "grad_norm": 0.2195349484682083, "learning_rate": 6.45935514156183e-06, "loss": 0.006, "step": 17755 }, { "epoch": 5.54, "grad_norm": 0.3710613548755646, "learning_rate": 6.454264373313046e-06, "loss": 0.0092, "step": 17760 }, { "epoch": 5.54, "grad_norm": 0.37743639945983887, "learning_rate": 6.449174655898076e-06, "loss": 0.0073, "step": 17765 }, { "epoch": 5.54, "grad_norm": 0.28303930163383484, "learning_rate": 6.444085990825338e-06, "loss": 0.0072, "step": 17770 }, { "epoch": 5.54, "grad_norm": 0.5262421369552612, "learning_rate": 6.4389983796029365e-06, "loss": 0.0106, "step": 17775 }, { "epoch": 5.55, "grad_norm": 0.4436008036136627, "learning_rate": 6.433911823738663e-06, "loss": 0.0103, "step": 17780 }, { "epoch": 5.55, "grad_norm": 0.4982847571372986, "learning_rate": 6.428826324739999e-06, "loss": 0.0065, "step": 17785 }, { "epoch": 5.55, "grad_norm": 0.1745348423719406, "learning_rate": 6.42374188411411e-06, "loss": 0.0067, "step": 17790 }, { "epoch": 5.55, "grad_norm": 0.8984200358390808, "learning_rate": 6.418658503367854e-06, "loss": 0.009, "step": 17795 }, { "epoch": 5.55, "grad_norm": 0.415552020072937, "learning_rate": 6.41357618400776e-06, "loss": 0.0089, "step": 17800 }, { "epoch": 5.55, "grad_norm": 0.3550618886947632, "learning_rate": 6.4084949275400585e-06, "loss": 0.0062, "step": 17805 }, { "epoch": 5.55, "grad_norm": 0.32670262455940247, "learning_rate": 6.403414735470659e-06, "loss": 0.0093, "step": 17810 }, { "epoch": 5.56, "grad_norm": 0.18545816838741302, "learning_rate": 6.398335609305153e-06, "loss": 0.0064, "step": 17815 }, { "epoch": 5.56, "grad_norm": 0.3328353464603424, "learning_rate": 6.39325755054882e-06, "loss": 0.008, "step": 17820 }, { "epoch": 5.56, "grad_norm": 0.2967495620250702, "learning_rate": 6.388180560706619e-06, "loss": 0.0054, "step": 17825 }, { "epoch": 5.56, "grad_norm": 0.2916224002838135, "learning_rate": 6.383104641283203e-06, "loss": 0.0063, "step": 17830 }, { "epoch": 5.56, "grad_norm": 0.4712311923503876, "learning_rate": 6.378029793782889e-06, "loss": 0.0073, "step": 17835 }, { "epoch": 5.56, "grad_norm": 0.712516725063324, "learning_rate": 6.37295601970969e-06, "loss": 0.0078, "step": 17840 }, { "epoch": 5.57, "grad_norm": 0.32948067784309387, "learning_rate": 6.3678833205673004e-06, "loss": 0.0076, "step": 17845 }, { "epoch": 5.57, "grad_norm": 1.3793821334838867, "learning_rate": 6.362811697859093e-06, "loss": 0.0141, "step": 17850 }, { "epoch": 5.57, "grad_norm": 0.2590395212173462, "learning_rate": 6.357741153088119e-06, "loss": 0.007, "step": 17855 }, { "epoch": 5.57, "grad_norm": 0.37518686056137085, "learning_rate": 6.352671687757116e-06, "loss": 0.0074, "step": 17860 }, { "epoch": 5.57, "grad_norm": 0.4013412594795227, "learning_rate": 6.347603303368499e-06, "loss": 0.008, "step": 17865 }, { "epoch": 5.57, "grad_norm": 0.5398988723754883, "learning_rate": 6.342536001424361e-06, "loss": 0.0092, "step": 17870 }, { "epoch": 5.58, "grad_norm": 0.4342360496520996, "learning_rate": 6.337469783426474e-06, "loss": 0.0111, "step": 17875 }, { "epoch": 5.58, "grad_norm": 0.3777037560939789, "learning_rate": 6.332404650876294e-06, "loss": 0.0052, "step": 17880 }, { "epoch": 5.58, "grad_norm": 0.3465896248817444, "learning_rate": 6.327340605274945e-06, "loss": 0.0076, "step": 17885 }, { "epoch": 5.58, "grad_norm": 0.43914374709129333, "learning_rate": 6.322277648123245e-06, "loss": 0.0079, "step": 17890 }, { "epoch": 5.58, "grad_norm": 0.1666235476732254, "learning_rate": 6.317215780921675e-06, "loss": 0.0077, "step": 17895 }, { "epoch": 5.58, "grad_norm": 0.3394373059272766, "learning_rate": 6.3121550051704e-06, "loss": 0.0067, "step": 17900 }, { "epoch": 5.58, "grad_norm": 0.37623217701911926, "learning_rate": 6.3070953223692554e-06, "loss": 0.01, "step": 17905 }, { "epoch": 5.59, "grad_norm": 0.49671798944473267, "learning_rate": 6.30203673401776e-06, "loss": 0.01, "step": 17910 }, { "epoch": 5.59, "grad_norm": 0.6104849576950073, "learning_rate": 6.296979241615105e-06, "loss": 0.0113, "step": 17915 }, { "epoch": 5.59, "grad_norm": 0.4916791617870331, "learning_rate": 6.291922846660154e-06, "loss": 0.0092, "step": 17920 }, { "epoch": 5.59, "grad_norm": 0.32638782262802124, "learning_rate": 6.28686755065145e-06, "loss": 0.0064, "step": 17925 }, { "epoch": 5.59, "grad_norm": 0.44251349568367004, "learning_rate": 6.281813355087207e-06, "loss": 0.0058, "step": 17930 }, { "epoch": 5.59, "grad_norm": 0.35004180669784546, "learning_rate": 6.276760261465325e-06, "loss": 0.0108, "step": 17935 }, { "epoch": 5.6, "grad_norm": 0.526656985282898, "learning_rate": 6.27170827128335e-06, "loss": 0.007, "step": 17940 }, { "epoch": 5.6, "grad_norm": 0.928345799446106, "learning_rate": 6.266657386038526e-06, "loss": 0.0077, "step": 17945 }, { "epoch": 5.6, "grad_norm": 0.5705955624580383, "learning_rate": 6.261607607227761e-06, "loss": 0.0095, "step": 17950 }, { "epoch": 5.6, "grad_norm": 0.5848002433776855, "learning_rate": 6.256558936347636e-06, "loss": 0.0098, "step": 17955 }, { "epoch": 5.6, "grad_norm": 0.2608962655067444, "learning_rate": 6.251511374894403e-06, "loss": 0.0077, "step": 17960 }, { "epoch": 5.6, "grad_norm": 0.2794182300567627, "learning_rate": 6.246464924363983e-06, "loss": 0.0049, "step": 17965 }, { "epoch": 5.6, "grad_norm": 0.4642208218574524, "learning_rate": 6.24141958625197e-06, "loss": 0.0089, "step": 17970 }, { "epoch": 5.61, "grad_norm": 0.5016393661499023, "learning_rate": 6.236375362053635e-06, "loss": 0.007, "step": 17975 }, { "epoch": 5.61, "grad_norm": 1.0015122890472412, "learning_rate": 6.231332253263904e-06, "loss": 0.0093, "step": 17980 }, { "epoch": 5.61, "grad_norm": 0.5549787282943726, "learning_rate": 6.226290261377381e-06, "loss": 0.0065, "step": 17985 }, { "epoch": 5.61, "grad_norm": 0.456961452960968, "learning_rate": 6.221249387888343e-06, "loss": 0.0084, "step": 17990 }, { "epoch": 5.61, "grad_norm": 0.4469737708568573, "learning_rate": 6.216209634290729e-06, "loss": 0.0056, "step": 17995 }, { "epoch": 5.61, "grad_norm": 0.3027060329914093, "learning_rate": 6.211171002078149e-06, "loss": 0.0042, "step": 18000 }, { "epoch": 5.62, "grad_norm": 0.2818998396396637, "learning_rate": 6.2061334927438784e-06, "loss": 0.0073, "step": 18005 }, { "epoch": 5.62, "grad_norm": 0.31544992327690125, "learning_rate": 6.201097107780865e-06, "loss": 0.0076, "step": 18010 }, { "epoch": 5.62, "grad_norm": 0.2734914720058441, "learning_rate": 6.1960618486817145e-06, "loss": 0.0065, "step": 18015 }, { "epoch": 5.62, "grad_norm": 0.21664875745773315, "learning_rate": 6.191027716938706e-06, "loss": 0.0061, "step": 18020 }, { "epoch": 5.62, "grad_norm": 0.5399858951568604, "learning_rate": 6.185994714043786e-06, "loss": 0.0076, "step": 18025 }, { "epoch": 5.62, "grad_norm": 0.5281535387039185, "learning_rate": 6.180962841488556e-06, "loss": 0.0101, "step": 18030 }, { "epoch": 5.63, "grad_norm": 0.4660307765007019, "learning_rate": 6.175932100764296e-06, "loss": 0.0087, "step": 18035 }, { "epoch": 5.63, "grad_norm": 0.42953482270240784, "learning_rate": 6.170902493361941e-06, "loss": 0.0063, "step": 18040 }, { "epoch": 5.63, "grad_norm": 0.21518750488758087, "learning_rate": 6.1658740207720975e-06, "loss": 0.009, "step": 18045 }, { "epoch": 5.63, "grad_norm": 0.35465386509895325, "learning_rate": 6.160846684485027e-06, "loss": 0.0077, "step": 18050 }, { "epoch": 5.63, "grad_norm": 0.8630399107933044, "learning_rate": 6.155820485990658e-06, "loss": 0.0074, "step": 18055 }, { "epoch": 5.63, "grad_norm": 0.34728437662124634, "learning_rate": 6.1507954267785865e-06, "loss": 0.006, "step": 18060 }, { "epoch": 5.63, "grad_norm": 0.7103444933891296, "learning_rate": 6.145771508338063e-06, "loss": 0.0077, "step": 18065 }, { "epoch": 5.64, "grad_norm": 0.4810100793838501, "learning_rate": 6.140748732158005e-06, "loss": 0.0091, "step": 18070 }, { "epoch": 5.64, "grad_norm": 0.2638356685638428, "learning_rate": 6.1357270997269894e-06, "loss": 0.0059, "step": 18075 }, { "epoch": 5.64, "grad_norm": 0.2502276599407196, "learning_rate": 6.130706612533262e-06, "loss": 0.0073, "step": 18080 }, { "epoch": 5.64, "grad_norm": 0.4023531675338745, "learning_rate": 6.125687272064713e-06, "loss": 0.0055, "step": 18085 }, { "epoch": 5.64, "grad_norm": 0.4165780544281006, "learning_rate": 6.120669079808902e-06, "loss": 0.0063, "step": 18090 }, { "epoch": 5.64, "grad_norm": 0.3536226451396942, "learning_rate": 6.115652037253054e-06, "loss": 0.0084, "step": 18095 }, { "epoch": 5.65, "grad_norm": 0.2316283881664276, "learning_rate": 6.110636145884045e-06, "loss": 0.0072, "step": 18100 }, { "epoch": 5.65, "grad_norm": 0.09715107083320618, "learning_rate": 6.105621407188411e-06, "loss": 0.0048, "step": 18105 }, { "epoch": 5.65, "grad_norm": 0.6509740948677063, "learning_rate": 6.100607822652351e-06, "loss": 0.0071, "step": 18110 }, { "epoch": 5.65, "grad_norm": 0.48909834027290344, "learning_rate": 6.095595393761713e-06, "loss": 0.0058, "step": 18115 }, { "epoch": 5.65, "grad_norm": 0.44172388315200806, "learning_rate": 6.090584122002018e-06, "loss": 0.0085, "step": 18120 }, { "epoch": 5.65, "grad_norm": 0.5661274790763855, "learning_rate": 6.085574008858423e-06, "loss": 0.006, "step": 18125 }, { "epoch": 5.65, "grad_norm": 0.5164355635643005, "learning_rate": 6.0805650558157595e-06, "loss": 0.0064, "step": 18130 }, { "epoch": 5.66, "grad_norm": 0.35191330313682556, "learning_rate": 6.075557264358504e-06, "loss": 0.0055, "step": 18135 }, { "epoch": 5.66, "grad_norm": 0.46775275468826294, "learning_rate": 6.070550635970798e-06, "loss": 0.0096, "step": 18140 }, { "epoch": 5.66, "grad_norm": 0.3156464993953705, "learning_rate": 6.065545172136431e-06, "loss": 0.0073, "step": 18145 }, { "epoch": 5.66, "grad_norm": 0.27590519189834595, "learning_rate": 6.060540874338849e-06, "loss": 0.0068, "step": 18150 }, { "epoch": 5.66, "grad_norm": 0.28529655933380127, "learning_rate": 6.05553774406116e-06, "loss": 0.0088, "step": 18155 }, { "epoch": 5.66, "grad_norm": 0.48952776193618774, "learning_rate": 6.050535782786111e-06, "loss": 0.007, "step": 18160 }, { "epoch": 5.67, "grad_norm": 0.587226390838623, "learning_rate": 6.045534991996114e-06, "loss": 0.0072, "step": 18165 }, { "epoch": 5.67, "grad_norm": 0.2564910650253296, "learning_rate": 6.0405353731732296e-06, "loss": 0.0084, "step": 18170 }, { "epoch": 5.67, "grad_norm": 0.5341393351554871, "learning_rate": 6.035536927799173e-06, "loss": 0.0096, "step": 18175 }, { "epoch": 5.67, "grad_norm": 0.4170636832714081, "learning_rate": 6.0305396573553145e-06, "loss": 0.0063, "step": 18180 }, { "epoch": 5.67, "grad_norm": 0.5394135117530823, "learning_rate": 6.025543563322671e-06, "loss": 0.0093, "step": 18185 }, { "epoch": 5.67, "grad_norm": 0.14204931259155273, "learning_rate": 6.0205486471819115e-06, "loss": 0.0068, "step": 18190 }, { "epoch": 5.67, "grad_norm": 0.34632179141044617, "learning_rate": 6.015554910413358e-06, "loss": 0.0086, "step": 18195 }, { "epoch": 5.68, "grad_norm": 0.3683947026729584, "learning_rate": 6.010562354496981e-06, "loss": 0.0065, "step": 18200 }, { "epoch": 5.68, "grad_norm": 0.47035160660743713, "learning_rate": 6.005570980912401e-06, "loss": 0.0046, "step": 18205 }, { "epoch": 5.68, "grad_norm": 0.4578624665737152, "learning_rate": 6.000580791138891e-06, "loss": 0.0073, "step": 18210 }, { "epoch": 5.68, "grad_norm": 0.28652694821357727, "learning_rate": 5.99559178665537e-06, "loss": 0.0085, "step": 18215 }, { "epoch": 5.68, "grad_norm": 0.5046875476837158, "learning_rate": 5.990603968940406e-06, "loss": 0.0065, "step": 18220 }, { "epoch": 5.68, "grad_norm": 0.45822709798812866, "learning_rate": 5.985617339472225e-06, "loss": 0.0067, "step": 18225 }, { "epoch": 5.69, "grad_norm": 0.4730364680290222, "learning_rate": 5.980631899728679e-06, "loss": 0.0116, "step": 18230 }, { "epoch": 5.69, "grad_norm": 0.6081570386886597, "learning_rate": 5.975647651187285e-06, "loss": 0.0111, "step": 18235 }, { "epoch": 5.69, "grad_norm": 0.47355684638023376, "learning_rate": 5.970664595325206e-06, "loss": 0.0071, "step": 18240 }, { "epoch": 5.69, "grad_norm": 0.4323113262653351, "learning_rate": 5.965682733619246e-06, "loss": 0.0056, "step": 18245 }, { "epoch": 5.69, "grad_norm": 0.5472829937934875, "learning_rate": 5.960702067545856e-06, "loss": 0.0084, "step": 18250 }, { "epoch": 5.69, "grad_norm": 0.5550527572631836, "learning_rate": 5.955722598581135e-06, "loss": 0.0122, "step": 18255 }, { "epoch": 5.7, "grad_norm": 0.40815654397010803, "learning_rate": 5.9507443282008285e-06, "loss": 0.0065, "step": 18260 }, { "epoch": 5.7, "grad_norm": 0.4066593647003174, "learning_rate": 5.94576725788032e-06, "loss": 0.0071, "step": 18265 }, { "epoch": 5.7, "grad_norm": 0.3816416263580322, "learning_rate": 5.94079138909464e-06, "loss": 0.0064, "step": 18270 }, { "epoch": 5.7, "grad_norm": 0.5269685983657837, "learning_rate": 5.93581672331847e-06, "loss": 0.006, "step": 18275 }, { "epoch": 5.7, "grad_norm": 0.7609176635742188, "learning_rate": 5.930843262026122e-06, "loss": 0.0066, "step": 18280 }, { "epoch": 5.7, "grad_norm": 0.6766870617866516, "learning_rate": 5.925871006691568e-06, "loss": 0.0062, "step": 18285 }, { "epoch": 5.7, "grad_norm": 0.4741158187389374, "learning_rate": 5.920899958788408e-06, "loss": 0.0094, "step": 18290 }, { "epoch": 5.71, "grad_norm": 0.21361322700977325, "learning_rate": 5.915930119789889e-06, "loss": 0.007, "step": 18295 }, { "epoch": 5.71, "grad_norm": 0.5186450481414795, "learning_rate": 5.910961491168903e-06, "loss": 0.0086, "step": 18300 }, { "epoch": 5.71, "grad_norm": 0.23191529512405396, "learning_rate": 5.905994074397975e-06, "loss": 0.0104, "step": 18305 }, { "epoch": 5.71, "grad_norm": 0.2721780240535736, "learning_rate": 5.9010278709492796e-06, "loss": 0.0087, "step": 18310 }, { "epoch": 5.71, "grad_norm": 0.6018416285514832, "learning_rate": 5.896062882294628e-06, "loss": 0.0071, "step": 18315 }, { "epoch": 5.71, "grad_norm": 0.20528219640254974, "learning_rate": 5.891099109905468e-06, "loss": 0.0077, "step": 18320 }, { "epoch": 5.72, "grad_norm": 0.41088053584098816, "learning_rate": 5.886136555252897e-06, "loss": 0.0089, "step": 18325 }, { "epoch": 5.72, "grad_norm": 0.29491591453552246, "learning_rate": 5.881175219807642e-06, "loss": 0.0074, "step": 18330 }, { "epoch": 5.72, "grad_norm": 0.9241437911987305, "learning_rate": 5.876215105040074e-06, "loss": 0.0081, "step": 18335 }, { "epoch": 5.72, "grad_norm": 0.5419356822967529, "learning_rate": 5.871256212420195e-06, "loss": 0.0097, "step": 18340 }, { "epoch": 5.72, "grad_norm": 0.8657314777374268, "learning_rate": 5.8662985434176545e-06, "loss": 0.0096, "step": 18345 }, { "epoch": 5.72, "grad_norm": 0.5296528339385986, "learning_rate": 5.861342099501732e-06, "loss": 0.0066, "step": 18350 }, { "epoch": 5.72, "grad_norm": 0.3924877643585205, "learning_rate": 5.856386882141349e-06, "loss": 0.0068, "step": 18355 }, { "epoch": 5.73, "grad_norm": 0.5159767866134644, "learning_rate": 5.851432892805058e-06, "loss": 0.0042, "step": 18360 }, { "epoch": 5.73, "grad_norm": 0.39266231656074524, "learning_rate": 5.84648013296105e-06, "loss": 0.0093, "step": 18365 }, { "epoch": 5.73, "grad_norm": 0.7416058778762817, "learning_rate": 5.841528604077162e-06, "loss": 0.0075, "step": 18370 }, { "epoch": 5.73, "grad_norm": 0.6685163378715515, "learning_rate": 5.836578307620845e-06, "loss": 0.0079, "step": 18375 }, { "epoch": 5.73, "grad_norm": 0.9518123269081116, "learning_rate": 5.8316292450592e-06, "loss": 0.0093, "step": 18380 }, { "epoch": 5.73, "grad_norm": 0.47857388854026794, "learning_rate": 5.8266814178589595e-06, "loss": 0.0066, "step": 18385 }, { "epoch": 5.74, "grad_norm": 0.5052703022956848, "learning_rate": 5.821734827486487e-06, "loss": 0.0093, "step": 18390 }, { "epoch": 5.74, "grad_norm": 0.35116446018218994, "learning_rate": 5.81678947540778e-06, "loss": 0.0082, "step": 18395 }, { "epoch": 5.74, "grad_norm": 0.3625754117965698, "learning_rate": 5.811845363088477e-06, "loss": 0.0069, "step": 18400 }, { "epoch": 5.74, "grad_norm": 0.6056107878684998, "learning_rate": 5.8069024919938425e-06, "loss": 0.0099, "step": 18405 }, { "epoch": 5.74, "grad_norm": 0.3495260179042816, "learning_rate": 5.801960863588765e-06, "loss": 0.0073, "step": 18410 }, { "epoch": 5.74, "grad_norm": 0.4010307490825653, "learning_rate": 5.797020479337777e-06, "loss": 0.0067, "step": 18415 }, { "epoch": 5.75, "grad_norm": 0.16561423242092133, "learning_rate": 5.792081340705038e-06, "loss": 0.0069, "step": 18420 }, { "epoch": 5.75, "grad_norm": 0.4529399871826172, "learning_rate": 5.78714344915434e-06, "loss": 0.008, "step": 18425 }, { "epoch": 5.75, "grad_norm": 0.3863770067691803, "learning_rate": 5.7822068061491e-06, "loss": 0.0045, "step": 18430 }, { "epoch": 5.75, "grad_norm": 0.8393487930297852, "learning_rate": 5.777271413152369e-06, "loss": 0.0086, "step": 18435 }, { "epoch": 5.75, "grad_norm": 0.37971916794776917, "learning_rate": 5.772337271626833e-06, "loss": 0.0088, "step": 18440 }, { "epoch": 5.75, "grad_norm": 0.46120578050613403, "learning_rate": 5.7674043830348016e-06, "loss": 0.0077, "step": 18445 }, { "epoch": 5.75, "grad_norm": 0.18467526137828827, "learning_rate": 5.762472748838206e-06, "loss": 0.0055, "step": 18450 }, { "epoch": 5.76, "grad_norm": 0.36761900782585144, "learning_rate": 5.757542370498614e-06, "loss": 0.007, "step": 18455 }, { "epoch": 5.76, "grad_norm": 0.6084608435630798, "learning_rate": 5.752613249477223e-06, "loss": 0.0077, "step": 18460 }, { "epoch": 5.76, "grad_norm": 0.4849078357219696, "learning_rate": 5.747685387234855e-06, "loss": 0.0076, "step": 18465 }, { "epoch": 5.76, "grad_norm": 0.44836604595184326, "learning_rate": 5.742758785231954e-06, "loss": 0.0075, "step": 18470 }, { "epoch": 5.76, "grad_norm": 0.6127037405967712, "learning_rate": 5.7378334449286e-06, "loss": 0.0075, "step": 18475 }, { "epoch": 5.76, "grad_norm": 0.43997830152511597, "learning_rate": 5.732909367784492e-06, "loss": 0.0081, "step": 18480 }, { "epoch": 5.77, "grad_norm": 0.28263017535209656, "learning_rate": 5.727986555258955e-06, "loss": 0.0084, "step": 18485 }, { "epoch": 5.77, "grad_norm": 0.33662813901901245, "learning_rate": 5.723065008810942e-06, "loss": 0.0099, "step": 18490 }, { "epoch": 5.77, "grad_norm": 0.2875825762748718, "learning_rate": 5.718144729899031e-06, "loss": 0.0058, "step": 18495 }, { "epoch": 5.77, "grad_norm": 0.20810118317604065, "learning_rate": 5.713225719981424e-06, "loss": 0.0068, "step": 18500 }, { "epoch": 5.77, "grad_norm": 0.3628634214401245, "learning_rate": 5.708307980515943e-06, "loss": 0.007, "step": 18505 }, { "epoch": 5.77, "grad_norm": 0.7062156796455383, "learning_rate": 5.703391512960037e-06, "loss": 0.0057, "step": 18510 }, { "epoch": 5.77, "grad_norm": 0.43663591146469116, "learning_rate": 5.698476318770778e-06, "loss": 0.0087, "step": 18515 }, { "epoch": 5.78, "grad_norm": 0.3063855767250061, "learning_rate": 5.693562399404862e-06, "loss": 0.0072, "step": 18520 }, { "epoch": 5.78, "grad_norm": 0.3865858316421509, "learning_rate": 5.688649756318603e-06, "loss": 0.0093, "step": 18525 }, { "epoch": 5.78, "grad_norm": 1.110260248184204, "learning_rate": 5.68373839096794e-06, "loss": 0.009, "step": 18530 }, { "epoch": 5.78, "grad_norm": 0.5447413921356201, "learning_rate": 5.678828304808433e-06, "loss": 0.0067, "step": 18535 }, { "epoch": 5.78, "grad_norm": 0.44069159030914307, "learning_rate": 5.673919499295259e-06, "loss": 0.0087, "step": 18540 }, { "epoch": 5.78, "grad_norm": 0.8088098764419556, "learning_rate": 5.669011975883223e-06, "loss": 0.0091, "step": 18545 }, { "epoch": 5.79, "grad_norm": 1.602562427520752, "learning_rate": 5.664105736026747e-06, "loss": 0.0063, "step": 18550 }, { "epoch": 5.79, "grad_norm": 0.43349578976631165, "learning_rate": 5.659200781179861e-06, "loss": 0.0061, "step": 18555 }, { "epoch": 5.79, "grad_norm": 0.4047665596008301, "learning_rate": 5.6542971127962354e-06, "loss": 0.0071, "step": 18560 }, { "epoch": 5.79, "grad_norm": 0.7945146560668945, "learning_rate": 5.649394732329146e-06, "loss": 0.0075, "step": 18565 }, { "epoch": 5.79, "grad_norm": 0.4201711118221283, "learning_rate": 5.6444936412314865e-06, "loss": 0.0096, "step": 18570 }, { "epoch": 5.79, "grad_norm": 0.41688671708106995, "learning_rate": 5.639593840955774e-06, "loss": 0.005, "step": 18575 }, { "epoch": 5.8, "grad_norm": 0.7006680965423584, "learning_rate": 5.6346953329541394e-06, "loss": 0.0088, "step": 18580 }, { "epoch": 5.8, "grad_norm": 0.27040454745292664, "learning_rate": 5.629798118678335e-06, "loss": 0.0081, "step": 18585 }, { "epoch": 5.8, "grad_norm": 0.32489171624183655, "learning_rate": 5.624902199579721e-06, "loss": 0.0071, "step": 18590 }, { "epoch": 5.8, "grad_norm": 0.5387438535690308, "learning_rate": 5.620007577109281e-06, "loss": 0.008, "step": 18595 }, { "epoch": 5.8, "grad_norm": 0.6050368547439575, "learning_rate": 5.6151142527176074e-06, "loss": 0.0101, "step": 18600 }, { "epoch": 5.8, "grad_norm": 0.23941612243652344, "learning_rate": 5.6102222278549226e-06, "loss": 0.0047, "step": 18605 }, { "epoch": 5.8, "grad_norm": 0.6153684258460999, "learning_rate": 5.6053315039710495e-06, "loss": 0.0099, "step": 18610 }, { "epoch": 5.81, "grad_norm": 0.35738781094551086, "learning_rate": 5.6004420825154295e-06, "loss": 0.0072, "step": 18615 }, { "epoch": 5.81, "grad_norm": 0.5066648721694946, "learning_rate": 5.59555396493712e-06, "loss": 0.0067, "step": 18620 }, { "epoch": 5.81, "grad_norm": 0.3617473244667053, "learning_rate": 5.590667152684792e-06, "loss": 0.0081, "step": 18625 }, { "epoch": 5.81, "grad_norm": 0.6097965240478516, "learning_rate": 5.585781647206724e-06, "loss": 0.0059, "step": 18630 }, { "epoch": 5.81, "grad_norm": 0.5452948808670044, "learning_rate": 5.580897449950813e-06, "loss": 0.0082, "step": 18635 }, { "epoch": 5.81, "grad_norm": 0.5515166521072388, "learning_rate": 5.576014562364562e-06, "loss": 0.0079, "step": 18640 }, { "epoch": 5.82, "grad_norm": 0.3684811294078827, "learning_rate": 5.5711329858951e-06, "loss": 0.0082, "step": 18645 }, { "epoch": 5.82, "grad_norm": 0.5962668061256409, "learning_rate": 5.5662527219891535e-06, "loss": 0.0077, "step": 18650 }, { "epoch": 5.82, "grad_norm": 0.2659561038017273, "learning_rate": 5.561373772093065e-06, "loss": 0.0088, "step": 18655 }, { "epoch": 5.82, "grad_norm": 0.34423190355300903, "learning_rate": 5.5564961376527904e-06, "loss": 0.007, "step": 18660 }, { "epoch": 5.82, "grad_norm": 0.40950068831443787, "learning_rate": 5.551619820113885e-06, "loss": 0.0049, "step": 18665 }, { "epoch": 5.82, "grad_norm": 0.3321197032928467, "learning_rate": 5.5467448209215246e-06, "loss": 0.0079, "step": 18670 }, { "epoch": 5.82, "grad_norm": 0.4530077576637268, "learning_rate": 5.541871141520492e-06, "loss": 0.0099, "step": 18675 }, { "epoch": 5.83, "grad_norm": 0.485211580991745, "learning_rate": 5.536998783355175e-06, "loss": 0.0058, "step": 18680 }, { "epoch": 5.83, "grad_norm": 0.5379996299743652, "learning_rate": 5.532127747869572e-06, "loss": 0.01, "step": 18685 }, { "epoch": 5.83, "grad_norm": 0.564601719379425, "learning_rate": 5.527258036507297e-06, "loss": 0.0078, "step": 18690 }, { "epoch": 5.83, "grad_norm": 0.29679882526397705, "learning_rate": 5.522389650711564e-06, "loss": 0.0068, "step": 18695 }, { "epoch": 5.83, "grad_norm": 0.29927995800971985, "learning_rate": 5.5175225919251866e-06, "loss": 0.0066, "step": 18700 }, { "epoch": 5.83, "grad_norm": 0.5875388383865356, "learning_rate": 5.512656861590597e-06, "loss": 0.0058, "step": 18705 }, { "epoch": 5.84, "grad_norm": 0.3787662982940674, "learning_rate": 5.5077924611498305e-06, "loss": 0.0086, "step": 18710 }, { "epoch": 5.84, "grad_norm": 0.41421857476234436, "learning_rate": 5.502929392044528e-06, "loss": 0.0056, "step": 18715 }, { "epoch": 5.84, "grad_norm": 0.16817878186702728, "learning_rate": 5.498067655715935e-06, "loss": 0.0075, "step": 18720 }, { "epoch": 5.84, "grad_norm": 0.46259284019470215, "learning_rate": 5.4932072536049045e-06, "loss": 0.0088, "step": 18725 }, { "epoch": 5.84, "grad_norm": 0.5328652858734131, "learning_rate": 5.488348187151888e-06, "loss": 0.0063, "step": 18730 }, { "epoch": 5.84, "grad_norm": 0.3828234076499939, "learning_rate": 5.483490457796949e-06, "loss": 0.0083, "step": 18735 }, { "epoch": 5.84, "grad_norm": 0.29909276962280273, "learning_rate": 5.478634066979751e-06, "loss": 0.0084, "step": 18740 }, { "epoch": 5.85, "grad_norm": 0.37078049778938293, "learning_rate": 5.473779016139557e-06, "loss": 0.0109, "step": 18745 }, { "epoch": 5.85, "grad_norm": 0.603055477142334, "learning_rate": 5.468925306715243e-06, "loss": 0.008, "step": 18750 }, { "epoch": 5.85, "grad_norm": 0.7020739316940308, "learning_rate": 5.464072940145274e-06, "loss": 0.0077, "step": 18755 }, { "epoch": 5.85, "grad_norm": 0.8944587707519531, "learning_rate": 5.459221917867729e-06, "loss": 0.0058, "step": 18760 }, { "epoch": 5.85, "grad_norm": 0.306552916765213, "learning_rate": 5.454372241320283e-06, "loss": 0.0087, "step": 18765 }, { "epoch": 5.85, "grad_norm": 0.35202908515930176, "learning_rate": 5.449523911940211e-06, "loss": 0.006, "step": 18770 }, { "epoch": 5.86, "grad_norm": 0.4559305012226105, "learning_rate": 5.444676931164393e-06, "loss": 0.0082, "step": 18775 }, { "epoch": 5.86, "grad_norm": 0.2316230684518814, "learning_rate": 5.4398313004293056e-06, "loss": 0.0056, "step": 18780 }, { "epoch": 5.86, "grad_norm": 0.5010417699813843, "learning_rate": 5.4349870211710285e-06, "loss": 0.0096, "step": 18785 }, { "epoch": 5.86, "grad_norm": 0.5433796048164368, "learning_rate": 5.430144094825238e-06, "loss": 0.0107, "step": 18790 }, { "epoch": 5.86, "grad_norm": 0.40989163517951965, "learning_rate": 5.425302522827209e-06, "loss": 0.0061, "step": 18795 }, { "epoch": 5.86, "grad_norm": 0.3090606927871704, "learning_rate": 5.420462306611818e-06, "loss": 0.0061, "step": 18800 }, { "epoch": 5.87, "grad_norm": 0.5436885356903076, "learning_rate": 5.4156234476135385e-06, "loss": 0.0105, "step": 18805 }, { "epoch": 5.87, "grad_norm": 0.6983522176742554, "learning_rate": 5.410785947266442e-06, "loss": 0.0068, "step": 18810 }, { "epoch": 5.87, "grad_norm": 0.24617521464824677, "learning_rate": 5.405949807004196e-06, "loss": 0.0071, "step": 18815 }, { "epoch": 5.87, "grad_norm": 0.5607662200927734, "learning_rate": 5.401115028260064e-06, "loss": 0.0064, "step": 18820 }, { "epoch": 5.87, "grad_norm": 0.6128187775611877, "learning_rate": 5.396281612466912e-06, "loss": 0.0073, "step": 18825 }, { "epoch": 5.87, "grad_norm": 0.46771490573883057, "learning_rate": 5.391449561057194e-06, "loss": 0.0109, "step": 18830 }, { "epoch": 5.87, "grad_norm": 0.19731804728507996, "learning_rate": 5.386618875462963e-06, "loss": 0.0082, "step": 18835 }, { "epoch": 5.88, "grad_norm": 0.39063313603401184, "learning_rate": 5.381789557115877e-06, "loss": 0.0079, "step": 18840 }, { "epoch": 5.88, "grad_norm": 0.2991364300251007, "learning_rate": 5.376961607447162e-06, "loss": 0.0076, "step": 18845 }, { "epoch": 5.88, "grad_norm": 0.3813847005367279, "learning_rate": 5.37213502788767e-06, "loss": 0.0089, "step": 18850 }, { "epoch": 5.88, "grad_norm": 0.46439966559410095, "learning_rate": 5.36730981986783e-06, "loss": 0.0108, "step": 18855 }, { "epoch": 5.88, "grad_norm": 0.4384636878967285, "learning_rate": 5.3624859848176645e-06, "loss": 0.0098, "step": 18860 }, { "epoch": 5.88, "grad_norm": 0.2956441342830658, "learning_rate": 5.357663524166794e-06, "loss": 0.008, "step": 18865 }, { "epoch": 5.89, "grad_norm": 0.3376927971839905, "learning_rate": 5.35284243934443e-06, "loss": 0.0066, "step": 18870 }, { "epoch": 5.89, "grad_norm": 0.6353251934051514, "learning_rate": 5.348022731779377e-06, "loss": 0.0064, "step": 18875 }, { "epoch": 5.89, "grad_norm": 0.651057779788971, "learning_rate": 5.3432044029000255e-06, "loss": 0.0079, "step": 18880 }, { "epoch": 5.89, "grad_norm": 0.3579331338405609, "learning_rate": 5.338387454134364e-06, "loss": 0.0077, "step": 18885 }, { "epoch": 5.89, "grad_norm": 0.1430189609527588, "learning_rate": 5.3335718869099674e-06, "loss": 0.008, "step": 18890 }, { "epoch": 5.89, "grad_norm": 0.538234531879425, "learning_rate": 5.328757702654011e-06, "loss": 0.0102, "step": 18895 }, { "epoch": 5.89, "grad_norm": 0.5153356194496155, "learning_rate": 5.323944902793248e-06, "loss": 0.0084, "step": 18900 }, { "epoch": 5.9, "grad_norm": 0.3739708960056305, "learning_rate": 5.3191334887540295e-06, "loss": 0.0077, "step": 18905 }, { "epoch": 5.9, "grad_norm": 0.6305508613586426, "learning_rate": 5.314323461962294e-06, "loss": 0.0055, "step": 18910 }, { "epoch": 5.9, "grad_norm": 0.4070693254470825, "learning_rate": 5.30951482384356e-06, "loss": 0.0077, "step": 18915 }, { "epoch": 5.9, "grad_norm": 0.5026772618293762, "learning_rate": 5.304707575822947e-06, "loss": 0.0099, "step": 18920 }, { "epoch": 5.9, "grad_norm": 0.4167676866054535, "learning_rate": 5.299901719325158e-06, "loss": 0.0074, "step": 18925 }, { "epoch": 5.9, "grad_norm": 0.3035011291503906, "learning_rate": 5.295097255774477e-06, "loss": 0.0056, "step": 18930 }, { "epoch": 5.91, "grad_norm": 0.4963723421096802, "learning_rate": 5.290294186594791e-06, "loss": 0.0086, "step": 18935 }, { "epoch": 5.91, "grad_norm": 0.38422855734825134, "learning_rate": 5.28549251320956e-06, "loss": 0.0088, "step": 18940 }, { "epoch": 5.91, "grad_norm": 0.39815953373908997, "learning_rate": 5.280692237041836e-06, "loss": 0.0086, "step": 18945 }, { "epoch": 5.91, "grad_norm": 0.36266201734542847, "learning_rate": 5.275893359514256e-06, "loss": 0.0072, "step": 18950 }, { "epoch": 5.91, "grad_norm": 0.20838455855846405, "learning_rate": 5.271095882049035e-06, "loss": 0.0063, "step": 18955 }, { "epoch": 5.91, "grad_norm": 1.448961853981018, "learning_rate": 5.266299806067987e-06, "loss": 0.0086, "step": 18960 }, { "epoch": 5.92, "grad_norm": 0.29663020372390747, "learning_rate": 5.261505132992499e-06, "loss": 0.0077, "step": 18965 }, { "epoch": 5.92, "grad_norm": 0.8893621563911438, "learning_rate": 5.2567118642435515e-06, "loss": 0.0088, "step": 18970 }, { "epoch": 5.92, "grad_norm": 0.4229799211025238, "learning_rate": 5.251920001241695e-06, "loss": 0.0095, "step": 18975 }, { "epoch": 5.92, "grad_norm": 0.8327857255935669, "learning_rate": 5.247129545407085e-06, "loss": 0.0091, "step": 18980 }, { "epoch": 5.92, "grad_norm": 0.147551029920578, "learning_rate": 5.242340498159444e-06, "loss": 0.0077, "step": 18985 }, { "epoch": 5.92, "grad_norm": 0.40383464097976685, "learning_rate": 5.237552860918076e-06, "loss": 0.0079, "step": 18990 }, { "epoch": 5.92, "grad_norm": 0.16825222969055176, "learning_rate": 5.232766635101872e-06, "loss": 0.0059, "step": 18995 }, { "epoch": 5.93, "grad_norm": 0.8230346441268921, "learning_rate": 5.227981822129307e-06, "loss": 0.0074, "step": 19000 }, { "epoch": 5.93, "grad_norm": 0.3880881071090698, "learning_rate": 5.223198423418434e-06, "loss": 0.0077, "step": 19005 }, { "epoch": 5.93, "grad_norm": 0.38730117678642273, "learning_rate": 5.218416440386888e-06, "loss": 0.0074, "step": 19010 }, { "epoch": 5.93, "grad_norm": 0.38056811690330505, "learning_rate": 5.213635874451881e-06, "loss": 0.0081, "step": 19015 }, { "epoch": 5.93, "grad_norm": 0.21690897643566132, "learning_rate": 5.208856727030212e-06, "loss": 0.0058, "step": 19020 }, { "epoch": 5.93, "grad_norm": 0.46819308400154114, "learning_rate": 5.2040789995382535e-06, "loss": 0.0088, "step": 19025 }, { "epoch": 5.94, "grad_norm": 0.30583274364471436, "learning_rate": 5.199302693391958e-06, "loss": 0.0063, "step": 19030 }, { "epoch": 5.94, "grad_norm": 0.5496878027915955, "learning_rate": 5.194527810006862e-06, "loss": 0.0078, "step": 19035 }, { "epoch": 5.94, "grad_norm": 0.38829562067985535, "learning_rate": 5.1897543507980715e-06, "loss": 0.0076, "step": 19040 }, { "epoch": 5.94, "grad_norm": 0.5420771241188049, "learning_rate": 5.184982317180278e-06, "loss": 0.0082, "step": 19045 }, { "epoch": 5.94, "grad_norm": 0.7623363137245178, "learning_rate": 5.180211710567749e-06, "loss": 0.0094, "step": 19050 }, { "epoch": 5.94, "grad_norm": 0.3645646870136261, "learning_rate": 5.175442532374324e-06, "loss": 0.0074, "step": 19055 }, { "epoch": 5.94, "grad_norm": 0.2958046793937683, "learning_rate": 5.170674784013425e-06, "loss": 0.0043, "step": 19060 }, { "epoch": 5.95, "grad_norm": 0.20783288776874542, "learning_rate": 5.1659084668980505e-06, "loss": 0.006, "step": 19065 }, { "epoch": 5.95, "grad_norm": 0.5735898613929749, "learning_rate": 5.16114358244077e-06, "loss": 0.0074, "step": 19070 }, { "epoch": 5.95, "grad_norm": 0.3978206217288971, "learning_rate": 5.156380132053731e-06, "loss": 0.007, "step": 19075 }, { "epoch": 5.95, "grad_norm": 0.15805083513259888, "learning_rate": 5.1516181171486555e-06, "loss": 0.007, "step": 19080 }, { "epoch": 5.95, "grad_norm": 0.33696630597114563, "learning_rate": 5.1468575391368424e-06, "loss": 0.008, "step": 19085 }, { "epoch": 5.95, "grad_norm": 0.5924596190452576, "learning_rate": 5.142098399429167e-06, "loss": 0.008, "step": 19090 }, { "epoch": 5.96, "grad_norm": 0.5464776754379272, "learning_rate": 5.1373406994360605e-06, "loss": 0.0076, "step": 19095 }, { "epoch": 5.96, "grad_norm": 0.5945857167243958, "learning_rate": 5.132584440567554e-06, "loss": 0.0081, "step": 19100 }, { "epoch": 5.96, "grad_norm": 0.28529831767082214, "learning_rate": 5.127829624233236e-06, "loss": 0.0071, "step": 19105 }, { "epoch": 5.96, "grad_norm": 0.3450835943222046, "learning_rate": 5.123076251842267e-06, "loss": 0.0084, "step": 19110 }, { "epoch": 5.96, "grad_norm": 1.0156694650650024, "learning_rate": 5.118324324803386e-06, "loss": 0.0066, "step": 19115 }, { "epoch": 5.96, "grad_norm": 0.6432920098304749, "learning_rate": 5.113573844524899e-06, "loss": 0.0056, "step": 19120 }, { "epoch": 5.97, "grad_norm": 0.20642875134944916, "learning_rate": 5.1088248124146835e-06, "loss": 0.0056, "step": 19125 }, { "epoch": 5.97, "grad_norm": 0.39436250925064087, "learning_rate": 5.1040772298801946e-06, "loss": 0.0073, "step": 19130 }, { "epoch": 5.97, "grad_norm": 0.2230510264635086, "learning_rate": 5.099331098328439e-06, "loss": 0.0088, "step": 19135 }, { "epoch": 5.97, "grad_norm": 0.4602661430835724, "learning_rate": 5.09458641916602e-06, "loss": 0.0064, "step": 19140 }, { "epoch": 5.97, "grad_norm": 0.29295966029167175, "learning_rate": 5.089843193799091e-06, "loss": 0.0054, "step": 19145 }, { "epoch": 5.97, "grad_norm": 0.5813841819763184, "learning_rate": 5.085101423633383e-06, "loss": 0.012, "step": 19150 }, { "epoch": 5.97, "grad_norm": 0.3695999085903168, "learning_rate": 5.08036111007419e-06, "loss": 0.0067, "step": 19155 }, { "epoch": 5.98, "grad_norm": 0.3806309998035431, "learning_rate": 5.07562225452638e-06, "loss": 0.0075, "step": 19160 }, { "epoch": 5.98, "grad_norm": 0.36197715997695923, "learning_rate": 5.0708848583943895e-06, "loss": 0.0082, "step": 19165 }, { "epoch": 5.98, "grad_norm": 0.39807888865470886, "learning_rate": 5.066148923082211e-06, "loss": 0.0084, "step": 19170 }, { "epoch": 5.98, "grad_norm": 0.32840362191200256, "learning_rate": 5.061414449993417e-06, "loss": 0.0049, "step": 19175 }, { "epoch": 5.98, "grad_norm": 0.5978043675422668, "learning_rate": 5.0566814405311394e-06, "loss": 0.0101, "step": 19180 }, { "epoch": 5.98, "grad_norm": 0.3670240342617035, "learning_rate": 5.051949896098085e-06, "loss": 0.0069, "step": 19185 }, { "epoch": 5.99, "grad_norm": 0.379352331161499, "learning_rate": 5.047219818096516e-06, "loss": 0.0052, "step": 19190 }, { "epoch": 5.99, "grad_norm": 0.4825327694416046, "learning_rate": 5.042491207928267e-06, "loss": 0.0075, "step": 19195 }, { "epoch": 5.99, "grad_norm": 0.21042650938034058, "learning_rate": 5.037764066994738e-06, "loss": 0.0072, "step": 19200 }, { "epoch": 5.99, "grad_norm": 0.18937867879867554, "learning_rate": 5.033038396696882e-06, "loss": 0.009, "step": 19205 }, { "epoch": 5.99, "grad_norm": 1.988599419593811, "learning_rate": 5.028314198435229e-06, "loss": 0.0098, "step": 19210 }, { "epoch": 5.99, "grad_norm": 0.38489198684692383, "learning_rate": 5.02359147360987e-06, "loss": 0.0075, "step": 19215 }, { "epoch": 5.99, "grad_norm": 0.5206966996192932, "learning_rate": 5.0188702236204555e-06, "loss": 0.0114, "step": 19220 }, { "epoch": 6.0, "grad_norm": 0.477316290140152, "learning_rate": 5.014150449866197e-06, "loss": 0.0052, "step": 19225 }, { "epoch": 6.0, "grad_norm": 0.34607836604118347, "learning_rate": 5.0094321537458836e-06, "loss": 0.0084, "step": 19230 }, { "epoch": 6.0, "grad_norm": 0.21586057543754578, "learning_rate": 5.004715336657855e-06, "loss": 0.0065, "step": 19235 }, { "epoch": 6.0, "grad_norm": 0.41035139560699463, "learning_rate": 5.000000000000003e-06, "loss": 0.0059, "step": 19240 }, { "epoch": 6.0, "grad_norm": 0.13773435354232788, "learning_rate": 4.995286145169796e-06, "loss": 0.0041, "step": 19245 }, { "epoch": 6.0, "grad_norm": 0.1690952032804489, "learning_rate": 4.9905737735642575e-06, "loss": 0.0043, "step": 19250 }, { "epoch": 6.01, "grad_norm": 0.15634578466415405, "learning_rate": 4.985862886579974e-06, "loss": 0.0034, "step": 19255 }, { "epoch": 6.01, "grad_norm": 0.14361192286014557, "learning_rate": 4.981153485613088e-06, "loss": 0.0048, "step": 19260 }, { "epoch": 6.01, "grad_norm": 0.21888059377670288, "learning_rate": 4.976445572059301e-06, "loss": 0.0042, "step": 19265 }, { "epoch": 6.01, "grad_norm": 0.07288442552089691, "learning_rate": 4.971739147313883e-06, "loss": 0.0024, "step": 19270 }, { "epoch": 6.01, "grad_norm": 0.8259077072143555, "learning_rate": 4.967034212771656e-06, "loss": 0.0044, "step": 19275 }, { "epoch": 6.01, "grad_norm": 0.28250551223754883, "learning_rate": 4.962330769826995e-06, "loss": 0.0045, "step": 19280 }, { "epoch": 6.01, "grad_norm": 0.1364012509584427, "learning_rate": 4.957628819873838e-06, "loss": 0.003, "step": 19285 }, { "epoch": 6.02, "grad_norm": 0.24540041387081146, "learning_rate": 4.952928364305686e-06, "loss": 0.005, "step": 19290 }, { "epoch": 6.02, "grad_norm": 0.31267842650413513, "learning_rate": 4.948229404515588e-06, "loss": 0.004, "step": 19295 }, { "epoch": 6.02, "grad_norm": 0.25432685017585754, "learning_rate": 4.943531941896155e-06, "loss": 0.0036, "step": 19300 }, { "epoch": 6.02, "grad_norm": 0.38814499974250793, "learning_rate": 4.938835977839552e-06, "loss": 0.0026, "step": 19305 }, { "epoch": 6.02, "grad_norm": 0.21670231223106384, "learning_rate": 4.934141513737504e-06, "loss": 0.0034, "step": 19310 }, { "epoch": 6.02, "grad_norm": 0.295858234167099, "learning_rate": 4.929448550981284e-06, "loss": 0.0036, "step": 19315 }, { "epoch": 6.03, "grad_norm": 0.29953882098197937, "learning_rate": 4.924757090961728e-06, "loss": 0.0021, "step": 19320 }, { "epoch": 6.03, "grad_norm": 0.17823810875415802, "learning_rate": 4.920067135069221e-06, "loss": 0.0034, "step": 19325 }, { "epoch": 6.03, "grad_norm": 0.1693033128976822, "learning_rate": 4.915378684693705e-06, "loss": 0.004, "step": 19330 }, { "epoch": 6.03, "grad_norm": 0.1305498629808426, "learning_rate": 4.910691741224675e-06, "loss": 0.0031, "step": 19335 }, { "epoch": 6.03, "grad_norm": 0.30055421590805054, "learning_rate": 4.90600630605118e-06, "loss": 0.003, "step": 19340 }, { "epoch": 6.03, "grad_norm": 0.5468003153800964, "learning_rate": 4.90132238056182e-06, "loss": 0.004, "step": 19345 }, { "epoch": 6.04, "grad_norm": 0.17088231444358826, "learning_rate": 4.896639966144751e-06, "loss": 0.0032, "step": 19350 }, { "epoch": 6.04, "grad_norm": 0.5591351985931396, "learning_rate": 4.891959064187678e-06, "loss": 0.0047, "step": 19355 }, { "epoch": 6.04, "grad_norm": 0.27993714809417725, "learning_rate": 4.8872796760778594e-06, "loss": 0.0036, "step": 19360 }, { "epoch": 6.04, "grad_norm": 0.336864709854126, "learning_rate": 4.882601803202104e-06, "loss": 0.0045, "step": 19365 }, { "epoch": 6.04, "grad_norm": 0.1561504602432251, "learning_rate": 4.877925446946772e-06, "loss": 0.0034, "step": 19370 }, { "epoch": 6.04, "grad_norm": 0.21346211433410645, "learning_rate": 4.873250608697775e-06, "loss": 0.0052, "step": 19375 }, { "epoch": 6.04, "grad_norm": 0.12421856820583344, "learning_rate": 4.868577289840577e-06, "loss": 0.0037, "step": 19380 }, { "epoch": 6.05, "grad_norm": 0.2269437611103058, "learning_rate": 4.863905491760178e-06, "loss": 0.0042, "step": 19385 }, { "epoch": 6.05, "grad_norm": 0.36611998081207275, "learning_rate": 4.859235215841148e-06, "loss": 0.0048, "step": 19390 }, { "epoch": 6.05, "grad_norm": 0.28265631198883057, "learning_rate": 4.854566463467594e-06, "loss": 0.0033, "step": 19395 }, { "epoch": 6.05, "grad_norm": 0.26229220628738403, "learning_rate": 4.8498992360231716e-06, "loss": 0.0035, "step": 19400 }, { "epoch": 6.05, "grad_norm": 0.1218532919883728, "learning_rate": 4.845233534891086e-06, "loss": 0.0033, "step": 19405 }, { "epoch": 6.05, "grad_norm": 0.31334492564201355, "learning_rate": 4.840569361454092e-06, "loss": 0.0043, "step": 19410 }, { "epoch": 6.06, "grad_norm": 0.22502319514751434, "learning_rate": 4.835906717094491e-06, "loss": 0.0031, "step": 19415 }, { "epoch": 6.06, "grad_norm": 0.3406829237937927, "learning_rate": 4.831245603194125e-06, "loss": 0.0029, "step": 19420 }, { "epoch": 6.06, "grad_norm": 0.16087867319583893, "learning_rate": 4.826586021134385e-06, "loss": 0.0032, "step": 19425 }, { "epoch": 6.06, "grad_norm": 0.4197031259536743, "learning_rate": 4.821927972296221e-06, "loss": 0.0033, "step": 19430 }, { "epoch": 6.06, "grad_norm": 0.34231555461883545, "learning_rate": 4.817271458060112e-06, "loss": 0.0042, "step": 19435 }, { "epoch": 6.06, "grad_norm": 0.3761197030544281, "learning_rate": 4.8126164798060895e-06, "loss": 0.0035, "step": 19440 }, { "epoch": 6.06, "grad_norm": 0.2574174702167511, "learning_rate": 4.807963038913728e-06, "loss": 0.0039, "step": 19445 }, { "epoch": 6.07, "grad_norm": 0.3793030083179474, "learning_rate": 4.8033111367621475e-06, "loss": 0.0037, "step": 19450 }, { "epoch": 6.07, "grad_norm": 0.10669315606355667, "learning_rate": 4.798660774730014e-06, "loss": 0.0041, "step": 19455 }, { "epoch": 6.07, "grad_norm": 0.11876823753118515, "learning_rate": 4.794011954195528e-06, "loss": 0.0038, "step": 19460 }, { "epoch": 6.07, "grad_norm": 0.49461352825164795, "learning_rate": 4.789364676536444e-06, "loss": 0.0036, "step": 19465 }, { "epoch": 6.07, "grad_norm": 0.12926629185676575, "learning_rate": 4.784718943130051e-06, "loss": 0.0027, "step": 19470 }, { "epoch": 6.07, "grad_norm": 0.26242661476135254, "learning_rate": 4.78007475535319e-06, "loss": 0.0037, "step": 19475 }, { "epoch": 6.08, "grad_norm": 0.27047592401504517, "learning_rate": 4.7754321145822355e-06, "loss": 0.0037, "step": 19480 }, { "epoch": 6.08, "grad_norm": 0.3222498595714569, "learning_rate": 4.7707910221931065e-06, "loss": 0.004, "step": 19485 }, { "epoch": 6.08, "grad_norm": 0.10554604232311249, "learning_rate": 4.766151479561267e-06, "loss": 0.0033, "step": 19490 }, { "epoch": 6.08, "grad_norm": 0.8339462876319885, "learning_rate": 4.761513488061709e-06, "loss": 0.0027, "step": 19495 }, { "epoch": 6.08, "grad_norm": 0.20556354522705078, "learning_rate": 4.756877049068977e-06, "loss": 0.003, "step": 19500 }, { "epoch": 6.08, "grad_norm": 0.4011286199092865, "learning_rate": 4.752242163957151e-06, "loss": 0.0034, "step": 19505 }, { "epoch": 6.09, "grad_norm": 0.1158180832862854, "learning_rate": 4.747608834099854e-06, "loss": 0.0029, "step": 19510 }, { "epoch": 6.09, "grad_norm": 0.33003300428390503, "learning_rate": 4.7429770608702395e-06, "loss": 0.0037, "step": 19515 }, { "epoch": 6.09, "grad_norm": 0.266290545463562, "learning_rate": 4.738346845641013e-06, "loss": 0.005, "step": 19520 }, { "epoch": 6.09, "grad_norm": 0.38621214032173157, "learning_rate": 4.733718189784411e-06, "loss": 0.0065, "step": 19525 }, { "epoch": 6.09, "grad_norm": 0.38451021909713745, "learning_rate": 4.729091094672199e-06, "loss": 0.004, "step": 19530 }, { "epoch": 6.09, "grad_norm": 0.46946078538894653, "learning_rate": 4.7244655616756944e-06, "loss": 0.0064, "step": 19535 }, { "epoch": 6.09, "grad_norm": 0.10865452140569687, "learning_rate": 4.719841592165743e-06, "loss": 0.0044, "step": 19540 }, { "epoch": 6.1, "grad_norm": 0.23039183020591736, "learning_rate": 4.715219187512733e-06, "loss": 0.0056, "step": 19545 }, { "epoch": 6.1, "grad_norm": 0.29119929671287537, "learning_rate": 4.710598349086584e-06, "loss": 0.0028, "step": 19550 }, { "epoch": 6.1, "grad_norm": 0.25930312275886536, "learning_rate": 4.7059790782567484e-06, "loss": 0.0037, "step": 19555 }, { "epoch": 6.1, "grad_norm": 0.24252845346927643, "learning_rate": 4.701361376392231e-06, "loss": 0.0028, "step": 19560 }, { "epoch": 6.1, "grad_norm": 0.4317205548286438, "learning_rate": 4.696745244861552e-06, "loss": 0.0053, "step": 19565 }, { "epoch": 6.1, "grad_norm": 0.35169634222984314, "learning_rate": 4.692130685032771e-06, "loss": 0.0039, "step": 19570 }, { "epoch": 6.11, "grad_norm": 0.3276447653770447, "learning_rate": 4.687517698273488e-06, "loss": 0.0026, "step": 19575 }, { "epoch": 6.11, "grad_norm": 0.2100311517715454, "learning_rate": 4.682906285950834e-06, "loss": 0.0035, "step": 19580 }, { "epoch": 6.11, "grad_norm": 0.30278119444847107, "learning_rate": 4.67829644943147e-06, "loss": 0.0039, "step": 19585 }, { "epoch": 6.11, "grad_norm": 0.34402674436569214, "learning_rate": 4.673688190081594e-06, "loss": 0.0041, "step": 19590 }, { "epoch": 6.11, "grad_norm": 0.3510453402996063, "learning_rate": 4.669081509266936e-06, "loss": 0.0035, "step": 19595 }, { "epoch": 6.11, "grad_norm": 0.11933588981628418, "learning_rate": 4.664476408352757e-06, "loss": 0.0031, "step": 19600 }, { "epoch": 6.11, "grad_norm": 0.23259113729000092, "learning_rate": 4.659872888703849e-06, "loss": 0.0042, "step": 19605 }, { "epoch": 6.12, "grad_norm": 0.31132641434669495, "learning_rate": 4.6552709516845385e-06, "loss": 0.0051, "step": 19610 }, { "epoch": 6.12, "grad_norm": 0.3782808780670166, "learning_rate": 4.650670598658679e-06, "loss": 0.0038, "step": 19615 }, { "epoch": 6.12, "grad_norm": 0.3041776716709137, "learning_rate": 4.646071830989658e-06, "loss": 0.0026, "step": 19620 }, { "epoch": 6.12, "grad_norm": 0.09638538956642151, "learning_rate": 4.6414746500403904e-06, "loss": 0.0023, "step": 19625 }, { "epoch": 6.12, "grad_norm": 0.38467106223106384, "learning_rate": 4.6368790571733234e-06, "loss": 0.0032, "step": 19630 }, { "epoch": 6.12, "grad_norm": 0.15978440642356873, "learning_rate": 4.632285053750432e-06, "loss": 0.0033, "step": 19635 }, { "epoch": 6.13, "grad_norm": 0.4148300588130951, "learning_rate": 4.627692641133219e-06, "loss": 0.0062, "step": 19640 }, { "epoch": 6.13, "grad_norm": 0.29224053025245667, "learning_rate": 4.62310182068272e-06, "loss": 0.0028, "step": 19645 }, { "epoch": 6.13, "grad_norm": 0.13670198619365692, "learning_rate": 4.618512593759493e-06, "loss": 0.0038, "step": 19650 }, { "epoch": 6.13, "grad_norm": 0.279555082321167, "learning_rate": 4.613924961723629e-06, "loss": 0.003, "step": 19655 }, { "epoch": 6.13, "grad_norm": 0.4272875487804413, "learning_rate": 4.609338925934743e-06, "loss": 0.0034, "step": 19660 }, { "epoch": 6.13, "grad_norm": 0.18500889837741852, "learning_rate": 4.604754487751978e-06, "loss": 0.003, "step": 19665 }, { "epoch": 6.14, "grad_norm": 0.184945747256279, "learning_rate": 4.600171648534008e-06, "loss": 0.003, "step": 19670 }, { "epoch": 6.14, "grad_norm": 0.28593793511390686, "learning_rate": 4.595590409639014e-06, "loss": 0.0035, "step": 19675 }, { "epoch": 6.14, "grad_norm": 0.6415123343467712, "learning_rate": 4.591010772424733e-06, "loss": 0.0033, "step": 19680 }, { "epoch": 6.14, "grad_norm": 0.2268776297569275, "learning_rate": 4.586432738248405e-06, "loss": 0.0029, "step": 19685 }, { "epoch": 6.14, "grad_norm": 0.1716085523366928, "learning_rate": 4.581856308466804e-06, "loss": 0.0039, "step": 19690 }, { "epoch": 6.14, "grad_norm": 0.6511481404304504, "learning_rate": 4.577281484436223e-06, "loss": 0.0028, "step": 19695 }, { "epoch": 6.14, "grad_norm": 0.35789069533348083, "learning_rate": 4.572708267512484e-06, "loss": 0.0059, "step": 19700 }, { "epoch": 6.15, "grad_norm": 0.2436264157295227, "learning_rate": 4.568136659050933e-06, "loss": 0.0032, "step": 19705 }, { "epoch": 6.15, "grad_norm": 0.49384281039237976, "learning_rate": 4.563566660406433e-06, "loss": 0.0056, "step": 19710 }, { "epoch": 6.15, "grad_norm": 0.2979516386985779, "learning_rate": 4.558998272933375e-06, "loss": 0.0039, "step": 19715 }, { "epoch": 6.15, "grad_norm": 0.23512761294841766, "learning_rate": 4.554431497985667e-06, "loss": 0.0064, "step": 19720 }, { "epoch": 6.15, "grad_norm": 0.137066051363945, "learning_rate": 4.549866336916755e-06, "loss": 0.0039, "step": 19725 }, { "epoch": 6.15, "grad_norm": 0.13359549641609192, "learning_rate": 4.545302791079588e-06, "loss": 0.0034, "step": 19730 }, { "epoch": 6.16, "grad_norm": 0.17334330081939697, "learning_rate": 4.540740861826644e-06, "loss": 0.002, "step": 19735 }, { "epoch": 6.16, "grad_norm": 0.2552857995033264, "learning_rate": 4.536180550509928e-06, "loss": 0.0035, "step": 19740 }, { "epoch": 6.16, "grad_norm": 0.6206842660903931, "learning_rate": 4.531621858480949e-06, "loss": 0.0043, "step": 19745 }, { "epoch": 6.16, "grad_norm": 0.30290666222572327, "learning_rate": 4.527064787090752e-06, "loss": 0.0033, "step": 19750 }, { "epoch": 6.16, "grad_norm": 0.2942062020301819, "learning_rate": 4.522509337689893e-06, "loss": 0.0039, "step": 19755 }, { "epoch": 6.16, "grad_norm": 0.28894585371017456, "learning_rate": 4.517955511628449e-06, "loss": 0.0041, "step": 19760 }, { "epoch": 6.16, "grad_norm": 0.3294394612312317, "learning_rate": 4.513403310256023e-06, "loss": 0.0052, "step": 19765 }, { "epoch": 6.17, "grad_norm": 0.17340120673179626, "learning_rate": 4.5088527349217286e-06, "loss": 0.0035, "step": 19770 }, { "epoch": 6.17, "grad_norm": 0.789074182510376, "learning_rate": 4.5043037869741965e-06, "loss": 0.0036, "step": 19775 }, { "epoch": 6.17, "grad_norm": 0.5654256343841553, "learning_rate": 4.499756467761585e-06, "loss": 0.0036, "step": 19780 }, { "epoch": 6.17, "grad_norm": 0.11048149317502975, "learning_rate": 4.495210778631553e-06, "loss": 0.0032, "step": 19785 }, { "epoch": 6.17, "grad_norm": 0.09231693297624588, "learning_rate": 4.490666720931293e-06, "loss": 0.0028, "step": 19790 }, { "epoch": 6.17, "grad_norm": 0.17293661832809448, "learning_rate": 4.486124296007502e-06, "loss": 0.0038, "step": 19795 }, { "epoch": 6.18, "grad_norm": 0.24658824503421783, "learning_rate": 4.481583505206403e-06, "loss": 0.0041, "step": 19800 }, { "epoch": 6.18, "grad_norm": 0.18947665393352509, "learning_rate": 4.4770443498737224e-06, "loss": 0.0037, "step": 19805 }, { "epoch": 6.18, "grad_norm": 0.40546736121177673, "learning_rate": 4.47250683135472e-06, "loss": 0.0043, "step": 19810 }, { "epoch": 6.18, "grad_norm": 0.5654178857803345, "learning_rate": 4.4679709509941585e-06, "loss": 0.0042, "step": 19815 }, { "epoch": 6.18, "grad_norm": 0.4417353868484497, "learning_rate": 4.463436710136308e-06, "loss": 0.0061, "step": 19820 }, { "epoch": 6.18, "grad_norm": 0.4045957922935486, "learning_rate": 4.458904110124965e-06, "loss": 0.0042, "step": 19825 }, { "epoch": 6.18, "grad_norm": 0.054574161767959595, "learning_rate": 4.454373152303439e-06, "loss": 0.0041, "step": 19830 }, { "epoch": 6.19, "grad_norm": 0.10774843394756317, "learning_rate": 4.449843838014546e-06, "loss": 0.0036, "step": 19835 }, { "epoch": 6.19, "grad_norm": 0.3569006621837616, "learning_rate": 4.445316168600621e-06, "loss": 0.0045, "step": 19840 }, { "epoch": 6.19, "grad_norm": 0.12751227617263794, "learning_rate": 4.440790145403503e-06, "loss": 0.0043, "step": 19845 }, { "epoch": 6.19, "grad_norm": 0.43211138248443604, "learning_rate": 4.4362657697645624e-06, "loss": 0.004, "step": 19850 }, { "epoch": 6.19, "grad_norm": 0.3285439908504486, "learning_rate": 4.431743043024658e-06, "loss": 0.0043, "step": 19855 }, { "epoch": 6.19, "grad_norm": 0.4181636571884155, "learning_rate": 4.427221966524171e-06, "loss": 0.0044, "step": 19860 }, { "epoch": 6.2, "grad_norm": 0.26568159461021423, "learning_rate": 4.422702541602994e-06, "loss": 0.0045, "step": 19865 }, { "epoch": 6.2, "grad_norm": 0.13646848499774933, "learning_rate": 4.41818476960053e-06, "loss": 0.0032, "step": 19870 }, { "epoch": 6.2, "grad_norm": 0.45093557238578796, "learning_rate": 4.4136686518556884e-06, "loss": 0.0033, "step": 19875 }, { "epoch": 6.2, "grad_norm": 0.22524970769882202, "learning_rate": 4.409154189706892e-06, "loss": 0.0025, "step": 19880 }, { "epoch": 6.2, "grad_norm": 0.6135339140892029, "learning_rate": 4.404641384492073e-06, "loss": 0.0035, "step": 19885 }, { "epoch": 6.2, "grad_norm": 0.29468056559562683, "learning_rate": 4.400130237548669e-06, "loss": 0.0035, "step": 19890 }, { "epoch": 6.21, "grad_norm": 0.2552112340927124, "learning_rate": 4.39562075021363e-06, "loss": 0.007, "step": 19895 }, { "epoch": 6.21, "grad_norm": 0.38863444328308105, "learning_rate": 4.391112923823413e-06, "loss": 0.0044, "step": 19900 }, { "epoch": 6.21, "grad_norm": 0.36713364720344543, "learning_rate": 4.38660675971398e-06, "loss": 0.0036, "step": 19905 }, { "epoch": 6.21, "grad_norm": 0.7707926034927368, "learning_rate": 4.382102259220805e-06, "loss": 0.0032, "step": 19910 }, { "epoch": 6.21, "grad_norm": 0.1946425884962082, "learning_rate": 4.377599423678867e-06, "loss": 0.0042, "step": 19915 }, { "epoch": 6.21, "grad_norm": 0.1757972240447998, "learning_rate": 4.373098254422649e-06, "loss": 0.0056, "step": 19920 }, { "epoch": 6.21, "grad_norm": 0.21779265999794006, "learning_rate": 4.368598752786143e-06, "loss": 0.0027, "step": 19925 }, { "epoch": 6.22, "grad_norm": 0.24872009456157684, "learning_rate": 4.364100920102847e-06, "loss": 0.0043, "step": 19930 }, { "epoch": 6.22, "grad_norm": 0.08531658351421356, "learning_rate": 4.359604757705764e-06, "loss": 0.0028, "step": 19935 }, { "epoch": 6.22, "grad_norm": 0.17155839502811432, "learning_rate": 4.3551102669274e-06, "loss": 0.0028, "step": 19940 }, { "epoch": 6.22, "grad_norm": 0.136628195643425, "learning_rate": 4.350617449099769e-06, "loss": 0.0039, "step": 19945 }, { "epoch": 6.22, "grad_norm": 0.6224244236946106, "learning_rate": 4.346126305554385e-06, "loss": 0.0022, "step": 19950 }, { "epoch": 6.22, "grad_norm": 0.41527363657951355, "learning_rate": 4.341636837622272e-06, "loss": 0.0033, "step": 19955 }, { "epoch": 6.23, "grad_norm": 0.25839653611183167, "learning_rate": 4.337149046633953e-06, "loss": 0.0027, "step": 19960 }, { "epoch": 6.23, "grad_norm": 0.23557740449905396, "learning_rate": 4.332662933919445e-06, "loss": 0.0045, "step": 19965 }, { "epoch": 6.23, "grad_norm": 0.2542274296283722, "learning_rate": 4.328178500808289e-06, "loss": 0.005, "step": 19970 }, { "epoch": 6.23, "grad_norm": 0.16733460128307343, "learning_rate": 4.3236957486295115e-06, "loss": 0.0034, "step": 19975 }, { "epoch": 6.23, "grad_norm": 0.28059637546539307, "learning_rate": 4.319214678711646e-06, "loss": 0.0037, "step": 19980 }, { "epoch": 6.23, "grad_norm": 0.3275239169597626, "learning_rate": 4.314735292382729e-06, "loss": 0.0031, "step": 19985 }, { "epoch": 6.23, "grad_norm": 0.15801599621772766, "learning_rate": 4.310257590970294e-06, "loss": 0.0035, "step": 19990 }, { "epoch": 6.24, "grad_norm": 0.07836773246526718, "learning_rate": 4.305781575801381e-06, "loss": 0.0035, "step": 19995 }, { "epoch": 6.24, "grad_norm": 0.16531780362129211, "learning_rate": 4.301307248202521e-06, "loss": 0.0027, "step": 20000 }, { "epoch": 6.24, "grad_norm": 0.16912704706192017, "learning_rate": 4.296834609499752e-06, "loss": 0.0032, "step": 20005 }, { "epoch": 6.24, "grad_norm": 0.3261689245700836, "learning_rate": 4.2923636610186084e-06, "loss": 0.0042, "step": 20010 }, { "epoch": 6.24, "grad_norm": 0.19443699717521667, "learning_rate": 4.287894404084131e-06, "loss": 0.0043, "step": 20015 }, { "epoch": 6.24, "grad_norm": 0.23142337799072266, "learning_rate": 4.28342684002085e-06, "loss": 0.0033, "step": 20020 }, { "epoch": 6.25, "grad_norm": 0.2110389918088913, "learning_rate": 4.278960970152798e-06, "loss": 0.0034, "step": 20025 }, { "epoch": 6.25, "grad_norm": 0.47038784623146057, "learning_rate": 4.274496795803509e-06, "loss": 0.0033, "step": 20030 }, { "epoch": 6.25, "grad_norm": 0.5487150549888611, "learning_rate": 4.2700343182960016e-06, "loss": 0.003, "step": 20035 }, { "epoch": 6.25, "grad_norm": 0.21167035400867462, "learning_rate": 4.265573538952805e-06, "loss": 0.0046, "step": 20040 }, { "epoch": 6.25, "grad_norm": 0.17947198450565338, "learning_rate": 4.26111445909594e-06, "loss": 0.0044, "step": 20045 }, { "epoch": 6.25, "grad_norm": 0.373744934797287, "learning_rate": 4.256657080046921e-06, "loss": 0.0037, "step": 20050 }, { "epoch": 6.26, "grad_norm": 0.3738728165626526, "learning_rate": 4.252201403126768e-06, "loss": 0.0041, "step": 20055 }, { "epoch": 6.26, "grad_norm": 0.44824865460395813, "learning_rate": 4.247747429655987e-06, "loss": 0.0067, "step": 20060 }, { "epoch": 6.26, "grad_norm": 0.23117795586585999, "learning_rate": 4.243295160954586e-06, "loss": 0.0049, "step": 20065 }, { "epoch": 6.26, "grad_norm": 0.2192441076040268, "learning_rate": 4.238844598342056e-06, "loss": 0.0032, "step": 20070 }, { "epoch": 6.26, "grad_norm": 0.27675142884254456, "learning_rate": 4.234395743137395e-06, "loss": 0.0041, "step": 20075 }, { "epoch": 6.26, "grad_norm": 0.1432168185710907, "learning_rate": 4.2299485966590895e-06, "loss": 0.0031, "step": 20080 }, { "epoch": 6.26, "grad_norm": 0.18300987780094147, "learning_rate": 4.225503160225119e-06, "loss": 0.0028, "step": 20085 }, { "epoch": 6.27, "grad_norm": 0.34145647287368774, "learning_rate": 4.221059435152961e-06, "loss": 0.0042, "step": 20090 }, { "epoch": 6.27, "grad_norm": 0.4050016403198242, "learning_rate": 4.216617422759577e-06, "loss": 0.0047, "step": 20095 }, { "epoch": 6.27, "grad_norm": 0.2938024401664734, "learning_rate": 4.212177124361433e-06, "loss": 0.0026, "step": 20100 }, { "epoch": 6.27, "grad_norm": 0.13880467414855957, "learning_rate": 4.207738541274482e-06, "loss": 0.0033, "step": 20105 }, { "epoch": 6.27, "grad_norm": 0.2138069123029709, "learning_rate": 4.203301674814158e-06, "loss": 0.0018, "step": 20110 }, { "epoch": 6.27, "grad_norm": 0.3359643816947937, "learning_rate": 4.1988665262954e-06, "loss": 0.0045, "step": 20115 }, { "epoch": 6.28, "grad_norm": 0.2553737163543701, "learning_rate": 4.194433097032632e-06, "loss": 0.0041, "step": 20120 }, { "epoch": 6.28, "grad_norm": 0.4595305919647217, "learning_rate": 4.190001388339771e-06, "loss": 0.0037, "step": 20125 }, { "epoch": 6.28, "grad_norm": 0.11691176146268845, "learning_rate": 4.185571401530223e-06, "loss": 0.0046, "step": 20130 }, { "epoch": 6.28, "grad_norm": 0.29672908782958984, "learning_rate": 4.1811431379168824e-06, "loss": 0.0039, "step": 20135 }, { "epoch": 6.28, "grad_norm": 0.30570900440216064, "learning_rate": 4.176716598812133e-06, "loss": 0.0027, "step": 20140 }, { "epoch": 6.28, "grad_norm": 0.2994268238544464, "learning_rate": 4.17229178552785e-06, "loss": 0.0032, "step": 20145 }, { "epoch": 6.28, "grad_norm": 0.284789115190506, "learning_rate": 4.167868699375394e-06, "loss": 0.0033, "step": 20150 }, { "epoch": 6.29, "grad_norm": 0.33170077204704285, "learning_rate": 4.1634473416656175e-06, "loss": 0.0035, "step": 20155 }, { "epoch": 6.29, "grad_norm": 0.22531452775001526, "learning_rate": 4.159027713708858e-06, "loss": 0.0036, "step": 20160 }, { "epoch": 6.29, "grad_norm": 0.3662478029727936, "learning_rate": 4.154609816814938e-06, "loss": 0.0035, "step": 20165 }, { "epoch": 6.29, "grad_norm": 0.24185219407081604, "learning_rate": 4.150193652293173e-06, "loss": 0.0045, "step": 20170 }, { "epoch": 6.29, "grad_norm": 0.1317419558763504, "learning_rate": 4.1457792214523606e-06, "loss": 0.0036, "step": 20175 }, { "epoch": 6.29, "grad_norm": 0.25156107544898987, "learning_rate": 4.141366525600786e-06, "loss": 0.0038, "step": 20180 }, { "epoch": 6.3, "grad_norm": 0.4421023428440094, "learning_rate": 4.136955566046221e-06, "loss": 0.0041, "step": 20185 }, { "epoch": 6.3, "grad_norm": 0.09637313336133957, "learning_rate": 4.132546344095922e-06, "loss": 0.003, "step": 20190 }, { "epoch": 6.3, "grad_norm": 0.3749695420265198, "learning_rate": 4.128138861056629e-06, "loss": 0.0058, "step": 20195 }, { "epoch": 6.3, "grad_norm": 0.13544899225234985, "learning_rate": 4.123733118234569e-06, "loss": 0.0033, "step": 20200 }, { "epoch": 6.3, "grad_norm": 0.289870023727417, "learning_rate": 4.119329116935451e-06, "loss": 0.0041, "step": 20205 }, { "epoch": 6.3, "grad_norm": 0.18580834567546844, "learning_rate": 4.114926858464474e-06, "loss": 0.0035, "step": 20210 }, { "epoch": 6.3, "grad_norm": 0.18573562800884247, "learning_rate": 4.110526344126306e-06, "loss": 0.0047, "step": 20215 }, { "epoch": 6.31, "grad_norm": 0.4938793480396271, "learning_rate": 4.106127575225116e-06, "loss": 0.0043, "step": 20220 }, { "epoch": 6.31, "grad_norm": 0.22334390878677368, "learning_rate": 4.101730553064546e-06, "loss": 0.0038, "step": 20225 }, { "epoch": 6.31, "grad_norm": 0.25191017985343933, "learning_rate": 4.097335278947721e-06, "loss": 0.0033, "step": 20230 }, { "epoch": 6.31, "grad_norm": 0.27087491750717163, "learning_rate": 4.0929417541772474e-06, "loss": 0.0034, "step": 20235 }, { "epoch": 6.31, "grad_norm": 0.32818400859832764, "learning_rate": 4.088549980055218e-06, "loss": 0.0034, "step": 20240 }, { "epoch": 6.31, "grad_norm": 0.36982041597366333, "learning_rate": 4.084159957883202e-06, "loss": 0.0048, "step": 20245 }, { "epoch": 6.32, "grad_norm": 0.18681477010250092, "learning_rate": 4.079771688962248e-06, "loss": 0.0025, "step": 20250 }, { "epoch": 6.32, "grad_norm": 0.18717291951179504, "learning_rate": 4.0753851745928856e-06, "loss": 0.0053, "step": 20255 }, { "epoch": 6.32, "grad_norm": 0.4687967002391815, "learning_rate": 4.071000416075133e-06, "loss": 0.0038, "step": 20260 }, { "epoch": 6.32, "grad_norm": 0.2571309804916382, "learning_rate": 4.0666174147084796e-06, "loss": 0.0043, "step": 20265 }, { "epoch": 6.32, "grad_norm": 0.1658802479505539, "learning_rate": 4.0622361717918935e-06, "loss": 0.0052, "step": 20270 }, { "epoch": 6.32, "grad_norm": 0.14285005629062653, "learning_rate": 4.057856688623827e-06, "loss": 0.0042, "step": 20275 }, { "epoch": 6.33, "grad_norm": 0.40070050954818726, "learning_rate": 4.053478966502205e-06, "loss": 0.0034, "step": 20280 }, { "epoch": 6.33, "grad_norm": 0.1511838585138321, "learning_rate": 4.049103006724439e-06, "loss": 0.0029, "step": 20285 }, { "epoch": 6.33, "grad_norm": 0.40002161264419556, "learning_rate": 4.044728810587406e-06, "loss": 0.0057, "step": 20290 }, { "epoch": 6.33, "grad_norm": 1.249631643295288, "learning_rate": 4.040356379387469e-06, "loss": 0.0044, "step": 20295 }, { "epoch": 6.33, "grad_norm": 0.3465880751609802, "learning_rate": 4.0359857144204625e-06, "loss": 0.0031, "step": 20300 }, { "epoch": 6.33, "grad_norm": 0.25544366240501404, "learning_rate": 4.031616816981708e-06, "loss": 0.0037, "step": 20305 }, { "epoch": 6.33, "grad_norm": 0.38923880457878113, "learning_rate": 4.027249688365994e-06, "loss": 0.0041, "step": 20310 }, { "epoch": 6.34, "grad_norm": 0.1947145313024521, "learning_rate": 4.022884329867586e-06, "loss": 0.0034, "step": 20315 }, { "epoch": 6.34, "grad_norm": 0.154936745762825, "learning_rate": 4.018520742780227e-06, "loss": 0.0025, "step": 20320 }, { "epoch": 6.34, "grad_norm": 0.29328685998916626, "learning_rate": 4.01415892839713e-06, "loss": 0.0039, "step": 20325 }, { "epoch": 6.34, "grad_norm": 0.2891095280647278, "learning_rate": 4.00979888801099e-06, "loss": 0.0031, "step": 20330 }, { "epoch": 6.34, "grad_norm": 0.15493731200695038, "learning_rate": 4.0054406229139706e-06, "loss": 0.003, "step": 20335 }, { "epoch": 6.34, "grad_norm": 0.17908723652362823, "learning_rate": 4.001084134397708e-06, "loss": 0.0036, "step": 20340 }, { "epoch": 6.35, "grad_norm": 0.5203627943992615, "learning_rate": 3.996729423753324e-06, "loss": 0.0056, "step": 20345 }, { "epoch": 6.35, "grad_norm": 0.2771199345588684, "learning_rate": 3.9923764922714e-06, "loss": 0.0037, "step": 20350 }, { "epoch": 6.35, "grad_norm": 0.3039291501045227, "learning_rate": 3.988025341241999e-06, "loss": 0.0042, "step": 20355 }, { "epoch": 6.35, "grad_norm": 0.22865301370620728, "learning_rate": 3.983675971954643e-06, "loss": 0.0045, "step": 20360 }, { "epoch": 6.35, "grad_norm": 0.47996705770492554, "learning_rate": 3.979328385698342e-06, "loss": 0.0047, "step": 20365 }, { "epoch": 6.35, "grad_norm": 0.41893118619918823, "learning_rate": 3.974982583761568e-06, "loss": 0.0026, "step": 20370 }, { "epoch": 6.35, "grad_norm": 0.6908000111579895, "learning_rate": 3.970638567432269e-06, "loss": 0.0033, "step": 20375 }, { "epoch": 6.36, "grad_norm": 0.1881827861070633, "learning_rate": 3.966296337997859e-06, "loss": 0.0041, "step": 20380 }, { "epoch": 6.36, "grad_norm": 0.2765253782272339, "learning_rate": 3.961955896745224e-06, "loss": 0.0043, "step": 20385 }, { "epoch": 6.36, "grad_norm": 0.5398893356323242, "learning_rate": 3.957617244960732e-06, "loss": 0.0044, "step": 20390 }, { "epoch": 6.36, "grad_norm": 0.13423627614974976, "learning_rate": 3.953280383930197e-06, "loss": 0.0028, "step": 20395 }, { "epoch": 6.36, "grad_norm": 0.11971479654312134, "learning_rate": 3.94894531493892e-06, "loss": 0.0025, "step": 20400 }, { "epoch": 6.36, "grad_norm": 0.17684215307235718, "learning_rate": 3.944612039271666e-06, "loss": 0.0032, "step": 20405 }, { "epoch": 6.37, "grad_norm": 0.37312769889831543, "learning_rate": 3.940280558212669e-06, "loss": 0.004, "step": 20410 }, { "epoch": 6.37, "grad_norm": 0.45159024000167847, "learning_rate": 3.935950873045629e-06, "loss": 0.0041, "step": 20415 }, { "epoch": 6.37, "grad_norm": 0.08123624324798584, "learning_rate": 3.931622985053717e-06, "loss": 0.0031, "step": 20420 }, { "epoch": 6.37, "grad_norm": 0.2570103406906128, "learning_rate": 3.927296895519569e-06, "loss": 0.0055, "step": 20425 }, { "epoch": 6.37, "grad_norm": 0.2280176728963852, "learning_rate": 3.922972605725291e-06, "loss": 0.0026, "step": 20430 }, { "epoch": 6.37, "grad_norm": 0.34249600768089294, "learning_rate": 3.91865011695245e-06, "loss": 0.0051, "step": 20435 }, { "epoch": 6.38, "grad_norm": 0.5048180818557739, "learning_rate": 3.914329430482083e-06, "loss": 0.0038, "step": 20440 }, { "epoch": 6.38, "grad_norm": 0.32122793793678284, "learning_rate": 3.910010547594695e-06, "loss": 0.0035, "step": 20445 }, { "epoch": 6.38, "grad_norm": 0.039529550820589066, "learning_rate": 3.905693469570253e-06, "loss": 0.0047, "step": 20450 }, { "epoch": 6.38, "grad_norm": 0.23486529290676117, "learning_rate": 3.901378197688191e-06, "loss": 0.0039, "step": 20455 }, { "epoch": 6.38, "grad_norm": 0.12477610260248184, "learning_rate": 3.897064733227404e-06, "loss": 0.0023, "step": 20460 }, { "epoch": 6.38, "grad_norm": 0.30318892002105713, "learning_rate": 3.892753077466256e-06, "loss": 0.0034, "step": 20465 }, { "epoch": 6.38, "grad_norm": 0.10283812880516052, "learning_rate": 3.888443231682573e-06, "loss": 0.0028, "step": 20470 }, { "epoch": 6.39, "grad_norm": 0.252560019493103, "learning_rate": 3.884135197153645e-06, "loss": 0.0047, "step": 20475 }, { "epoch": 6.39, "grad_norm": 0.27102649211883545, "learning_rate": 3.879828975156225e-06, "loss": 0.0032, "step": 20480 }, { "epoch": 6.39, "grad_norm": 0.4798802137374878, "learning_rate": 3.875524566966529e-06, "loss": 0.0053, "step": 20485 }, { "epoch": 6.39, "grad_norm": 0.5022309422492981, "learning_rate": 3.871221973860234e-06, "loss": 0.0035, "step": 20490 }, { "epoch": 6.39, "grad_norm": 0.2926425039768219, "learning_rate": 3.866921197112481e-06, "loss": 0.0052, "step": 20495 }, { "epoch": 6.39, "grad_norm": 0.28833624720573425, "learning_rate": 3.862622237997875e-06, "loss": 0.004, "step": 20500 }, { "epoch": 6.4, "grad_norm": 0.24428056180477142, "learning_rate": 3.85832509779047e-06, "loss": 0.0047, "step": 20505 }, { "epoch": 6.4, "grad_norm": 0.2550247013568878, "learning_rate": 3.854029777763799e-06, "loss": 0.0032, "step": 20510 }, { "epoch": 6.4, "grad_norm": 0.5598758459091187, "learning_rate": 3.849736279190846e-06, "loss": 0.0028, "step": 20515 }, { "epoch": 6.4, "grad_norm": 0.14802761375904083, "learning_rate": 3.845444603344053e-06, "loss": 0.0038, "step": 20520 }, { "epoch": 6.4, "grad_norm": 0.23852317035198212, "learning_rate": 3.841154751495326e-06, "loss": 0.0039, "step": 20525 }, { "epoch": 6.4, "grad_norm": 0.17386658489704132, "learning_rate": 3.836866724916029e-06, "loss": 0.0031, "step": 20530 }, { "epoch": 6.4, "grad_norm": 0.2971315383911133, "learning_rate": 3.8325805248769896e-06, "loss": 0.0038, "step": 20535 }, { "epoch": 6.41, "grad_norm": 0.2956434190273285, "learning_rate": 3.8282961526484815e-06, "loss": 0.0049, "step": 20540 }, { "epoch": 6.41, "grad_norm": 0.07207696884870529, "learning_rate": 3.8240136095002465e-06, "loss": 0.003, "step": 20545 }, { "epoch": 6.41, "grad_norm": 0.09542746841907501, "learning_rate": 3.819732896701488e-06, "loss": 0.0033, "step": 20550 }, { "epoch": 6.41, "grad_norm": 0.1593623012304306, "learning_rate": 3.81545401552086e-06, "loss": 0.0034, "step": 20555 }, { "epoch": 6.41, "grad_norm": 0.13820824027061462, "learning_rate": 3.811176967226473e-06, "loss": 0.0037, "step": 20560 }, { "epoch": 6.41, "grad_norm": 0.37225475907325745, "learning_rate": 3.8069017530859e-06, "loss": 0.0051, "step": 20565 }, { "epoch": 6.42, "grad_norm": 0.07505133002996445, "learning_rate": 3.8026283743661673e-06, "loss": 0.0033, "step": 20570 }, { "epoch": 6.42, "grad_norm": 0.2258065938949585, "learning_rate": 3.7983568323337518e-06, "loss": 0.0031, "step": 20575 }, { "epoch": 6.42, "grad_norm": 0.27532634139060974, "learning_rate": 3.7940871282545945e-06, "loss": 0.005, "step": 20580 }, { "epoch": 6.42, "grad_norm": 0.24397745728492737, "learning_rate": 3.789819263394088e-06, "loss": 0.0042, "step": 20585 }, { "epoch": 6.42, "grad_norm": 0.34583353996276855, "learning_rate": 3.785553239017078e-06, "loss": 0.0038, "step": 20590 }, { "epoch": 6.42, "grad_norm": 0.331437349319458, "learning_rate": 3.7812890563878736e-06, "loss": 0.0045, "step": 20595 }, { "epoch": 6.43, "grad_norm": 0.14943893253803253, "learning_rate": 3.777026716770227e-06, "loss": 0.0026, "step": 20600 }, { "epoch": 6.43, "grad_norm": 0.2038610726594925, "learning_rate": 3.7727662214273496e-06, "loss": 0.0036, "step": 20605 }, { "epoch": 6.43, "grad_norm": 0.4535045623779297, "learning_rate": 3.768507571621909e-06, "loss": 0.0032, "step": 20610 }, { "epoch": 6.43, "grad_norm": 0.1757279336452484, "learning_rate": 3.7642507686160155e-06, "loss": 0.0033, "step": 20615 }, { "epoch": 6.43, "grad_norm": 0.2368738204240799, "learning_rate": 3.759995813671241e-06, "loss": 0.004, "step": 20620 }, { "epoch": 6.43, "grad_norm": 0.211713045835495, "learning_rate": 3.7557427080486098e-06, "loss": 0.0039, "step": 20625 }, { "epoch": 6.43, "grad_norm": 0.25176042318344116, "learning_rate": 3.751491453008592e-06, "loss": 0.0029, "step": 20630 }, { "epoch": 6.44, "grad_norm": 0.4962671399116516, "learning_rate": 3.747242049811114e-06, "loss": 0.0036, "step": 20635 }, { "epoch": 6.44, "grad_norm": 0.2933531105518341, "learning_rate": 3.7429944997155565e-06, "loss": 0.0053, "step": 20640 }, { "epoch": 6.44, "grad_norm": 0.3499653935432434, "learning_rate": 3.7387488039807473e-06, "loss": 0.0041, "step": 20645 }, { "epoch": 6.44, "grad_norm": 0.806042492389679, "learning_rate": 3.7345049638649577e-06, "loss": 0.005, "step": 20650 }, { "epoch": 6.44, "grad_norm": 0.36196035146713257, "learning_rate": 3.7302629806259184e-06, "loss": 0.0041, "step": 20655 }, { "epoch": 6.44, "grad_norm": 0.14944805204868317, "learning_rate": 3.7268707318307895e-06, "loss": 0.0033, "step": 20660 }, { "epoch": 6.45, "grad_norm": 0.2249460369348526, "learning_rate": 3.7226320941376224e-06, "loss": 0.0046, "step": 20665 }, { "epoch": 6.45, "grad_norm": 0.36011096835136414, "learning_rate": 3.71839531683991e-06, "loss": 0.0042, "step": 20670 }, { "epoch": 6.45, "grad_norm": 0.09397727996110916, "learning_rate": 3.7141604011932965e-06, "loss": 0.0033, "step": 20675 }, { "epoch": 6.45, "grad_norm": 0.15068700909614563, "learning_rate": 3.709927348452861e-06, "loss": 0.0037, "step": 20680 }, { "epoch": 6.45, "grad_norm": 0.28642067313194275, "learning_rate": 3.7056961598731335e-06, "loss": 0.004, "step": 20685 }, { "epoch": 6.45, "grad_norm": 0.09968727827072144, "learning_rate": 3.701466836708094e-06, "loss": 0.0032, "step": 20690 }, { "epoch": 6.45, "grad_norm": 0.17942407727241516, "learning_rate": 3.697239380211167e-06, "loss": 0.0032, "step": 20695 }, { "epoch": 6.46, "grad_norm": 0.21173350512981415, "learning_rate": 3.6930137916352258e-06, "loss": 0.0047, "step": 20700 }, { "epoch": 6.46, "grad_norm": 0.2591868042945862, "learning_rate": 3.688790072232592e-06, "loss": 0.0043, "step": 20705 }, { "epoch": 6.46, "grad_norm": 0.16367943584918976, "learning_rate": 3.684568223255024e-06, "loss": 0.0032, "step": 20710 }, { "epoch": 6.46, "grad_norm": 0.3665323257446289, "learning_rate": 3.680348245953732e-06, "loss": 0.0024, "step": 20715 }, { "epoch": 6.46, "grad_norm": 0.31445151567459106, "learning_rate": 3.6761301415793784e-06, "loss": 0.0032, "step": 20720 }, { "epoch": 6.46, "grad_norm": 0.30521151423454285, "learning_rate": 3.671913911382061e-06, "loss": 0.0034, "step": 20725 }, { "epoch": 6.47, "grad_norm": 0.4186500608921051, "learning_rate": 3.667699556611325e-06, "loss": 0.0034, "step": 20730 }, { "epoch": 6.47, "grad_norm": 0.13586103916168213, "learning_rate": 3.6634870785161604e-06, "loss": 0.0058, "step": 20735 }, { "epoch": 6.47, "grad_norm": 0.28456878662109375, "learning_rate": 3.659276478345002e-06, "loss": 0.0046, "step": 20740 }, { "epoch": 6.47, "grad_norm": 0.7787228226661682, "learning_rate": 3.6550677573457218e-06, "loss": 0.0032, "step": 20745 }, { "epoch": 6.47, "grad_norm": 0.4641948640346527, "learning_rate": 3.650860916765643e-06, "loss": 0.0044, "step": 20750 }, { "epoch": 6.47, "grad_norm": 0.3779393136501312, "learning_rate": 3.6466559578515226e-06, "loss": 0.0041, "step": 20755 }, { "epoch": 6.47, "grad_norm": 0.18350698053836823, "learning_rate": 3.6424528818495752e-06, "loss": 0.0039, "step": 20760 }, { "epoch": 6.48, "grad_norm": 0.18837334215641022, "learning_rate": 3.638251690005442e-06, "loss": 0.0041, "step": 20765 }, { "epoch": 6.48, "grad_norm": 0.1566227376461029, "learning_rate": 3.6340523835642114e-06, "loss": 0.0043, "step": 20770 }, { "epoch": 6.48, "grad_norm": 0.36541804671287537, "learning_rate": 3.6298549637704184e-06, "loss": 0.0041, "step": 20775 }, { "epoch": 6.48, "grad_norm": 0.20943257212638855, "learning_rate": 3.6256594318680247e-06, "loss": 0.0028, "step": 20780 }, { "epoch": 6.48, "grad_norm": 0.17792825400829315, "learning_rate": 3.621465789100447e-06, "loss": 0.0041, "step": 20785 }, { "epoch": 6.48, "grad_norm": 0.7338669300079346, "learning_rate": 3.617274036710534e-06, "loss": 0.0042, "step": 20790 }, { "epoch": 6.49, "grad_norm": 0.19768068194389343, "learning_rate": 3.6130841759405776e-06, "loss": 0.0037, "step": 20795 }, { "epoch": 6.49, "grad_norm": 0.21714939177036285, "learning_rate": 3.6088962080323055e-06, "loss": 0.0031, "step": 20800 }, { "epoch": 6.49, "grad_norm": 0.2594071626663208, "learning_rate": 3.604710134226892e-06, "loss": 0.0036, "step": 20805 }, { "epoch": 6.49, "grad_norm": 0.3518996834754944, "learning_rate": 3.6005259557649473e-06, "loss": 0.0032, "step": 20810 }, { "epoch": 6.49, "grad_norm": 0.12475259602069855, "learning_rate": 3.59634367388651e-06, "loss": 0.003, "step": 20815 }, { "epoch": 6.49, "grad_norm": 0.3000490367412567, "learning_rate": 3.5921632898310666e-06, "loss": 0.0039, "step": 20820 }, { "epoch": 6.5, "grad_norm": 0.8753775954246521, "learning_rate": 3.58798480483754e-06, "loss": 0.0041, "step": 20825 }, { "epoch": 6.5, "grad_norm": 0.2704598009586334, "learning_rate": 3.583808220144288e-06, "loss": 0.0054, "step": 20830 }, { "epoch": 6.5, "grad_norm": 0.40405184030532837, "learning_rate": 3.5796335369891076e-06, "loss": 0.0032, "step": 20835 }, { "epoch": 6.5, "grad_norm": 0.321799099445343, "learning_rate": 3.5754607566092313e-06, "loss": 0.0044, "step": 20840 }, { "epoch": 6.5, "grad_norm": 0.3521104156970978, "learning_rate": 3.5712898802413266e-06, "loss": 0.006, "step": 20845 }, { "epoch": 6.5, "grad_norm": 0.17451238632202148, "learning_rate": 3.5671209091214963e-06, "loss": 0.0025, "step": 20850 }, { "epoch": 6.5, "grad_norm": 0.2520848512649536, "learning_rate": 3.562953844485282e-06, "loss": 0.0028, "step": 20855 }, { "epoch": 6.51, "grad_norm": 0.41883954405784607, "learning_rate": 3.558788687567656e-06, "loss": 0.0046, "step": 20860 }, { "epoch": 6.51, "grad_norm": 0.30903568863868713, "learning_rate": 3.554625439603029e-06, "loss": 0.0025, "step": 20865 }, { "epoch": 6.51, "grad_norm": 0.6085941791534424, "learning_rate": 3.550464101825243e-06, "loss": 0.0053, "step": 20870 }, { "epoch": 6.51, "grad_norm": 0.1391933262348175, "learning_rate": 3.5463046754675744e-06, "loss": 0.004, "step": 20875 }, { "epoch": 6.51, "grad_norm": 0.22042496502399445, "learning_rate": 3.5421471617627356e-06, "loss": 0.0028, "step": 20880 }, { "epoch": 6.51, "grad_norm": 0.2457202672958374, "learning_rate": 3.5379915619428697e-06, "loss": 0.0041, "step": 20885 }, { "epoch": 6.52, "grad_norm": 0.07952205091714859, "learning_rate": 3.5338378772395532e-06, "loss": 0.002, "step": 20890 }, { "epoch": 6.52, "grad_norm": 0.163711816072464, "learning_rate": 3.529686108883794e-06, "loss": 0.0022, "step": 20895 }, { "epoch": 6.52, "grad_norm": 0.136203795671463, "learning_rate": 3.525536258106035e-06, "loss": 0.0026, "step": 20900 }, { "epoch": 6.52, "grad_norm": 0.21723493933677673, "learning_rate": 3.521388326136148e-06, "loss": 0.0037, "step": 20905 }, { "epoch": 6.52, "grad_norm": 0.5146577954292297, "learning_rate": 3.5180713629280194e-06, "loss": 0.0039, "step": 20910 }, { "epoch": 6.52, "grad_norm": 0.271622896194458, "learning_rate": 3.5139268879097656e-06, "loss": 0.0048, "step": 20915 }, { "epoch": 6.52, "grad_norm": 0.434649795293808, "learning_rate": 3.509784335140001e-06, "loss": 0.0049, "step": 20920 }, { "epoch": 6.53, "grad_norm": 0.5571953058242798, "learning_rate": 3.505643705846432e-06, "loss": 0.005, "step": 20925 }, { "epoch": 6.53, "grad_norm": 0.33921700716018677, "learning_rate": 3.5015050012562047e-06, "loss": 0.0034, "step": 20930 }, { "epoch": 6.53, "grad_norm": 0.3331754505634308, "learning_rate": 3.497368222595886e-06, "loss": 0.005, "step": 20935 }, { "epoch": 6.53, "grad_norm": 0.21557758748531342, "learning_rate": 3.493233371091477e-06, "loss": 0.0052, "step": 20940 }, { "epoch": 6.53, "grad_norm": 0.23702630400657654, "learning_rate": 3.4891004479683986e-06, "loss": 0.003, "step": 20945 }, { "epoch": 6.53, "grad_norm": 0.2295365035533905, "learning_rate": 3.484969454451511e-06, "loss": 0.0037, "step": 20950 }, { "epoch": 6.54, "grad_norm": 0.41207176446914673, "learning_rate": 3.480840391765098e-06, "loss": 0.0046, "step": 20955 }, { "epoch": 6.54, "grad_norm": 0.15661972761154175, "learning_rate": 3.47671326113287e-06, "loss": 0.004, "step": 20960 }, { "epoch": 6.54, "grad_norm": 0.12692229449748993, "learning_rate": 3.472588063777964e-06, "loss": 0.0039, "step": 20965 }, { "epoch": 6.54, "grad_norm": 0.11728105694055557, "learning_rate": 3.468464800922954e-06, "loss": 0.0028, "step": 20970 }, { "epoch": 6.54, "grad_norm": 0.12148414552211761, "learning_rate": 3.4643434737898286e-06, "loss": 0.0047, "step": 20975 }, { "epoch": 6.54, "grad_norm": 0.19293716549873352, "learning_rate": 3.460224083600009e-06, "loss": 0.0026, "step": 20980 }, { "epoch": 6.55, "grad_norm": 0.22977282106876373, "learning_rate": 3.4561066315743365e-06, "loss": 0.0026, "step": 20985 }, { "epoch": 6.55, "grad_norm": 0.30549490451812744, "learning_rate": 3.4519911189330834e-06, "loss": 0.0048, "step": 20990 }, { "epoch": 6.55, "grad_norm": 0.2782987952232361, "learning_rate": 3.447877546895946e-06, "loss": 0.0042, "step": 20995 }, { "epoch": 6.55, "grad_norm": 0.29214322566986084, "learning_rate": 3.4437659166820457e-06, "loss": 0.0048, "step": 21000 }, { "epoch": 6.55, "grad_norm": 0.15185247361660004, "learning_rate": 3.439656229509929e-06, "loss": 0.0049, "step": 21005 }, { "epoch": 6.55, "grad_norm": 0.14987649023532867, "learning_rate": 3.4355484865975596e-06, "loss": 0.0038, "step": 21010 }, { "epoch": 6.55, "grad_norm": 0.2198924869298935, "learning_rate": 3.4314426891623432e-06, "loss": 0.0043, "step": 21015 }, { "epoch": 6.56, "grad_norm": 0.37174761295318604, "learning_rate": 3.4273388384210858e-06, "loss": 0.0039, "step": 21020 }, { "epoch": 6.56, "grad_norm": 0.38870739936828613, "learning_rate": 3.423236935590031e-06, "loss": 0.0034, "step": 21025 }, { "epoch": 6.56, "grad_norm": 0.2492036074399948, "learning_rate": 3.4191369818848396e-06, "loss": 0.0033, "step": 21030 }, { "epoch": 6.56, "grad_norm": 0.17112156748771667, "learning_rate": 3.4150389785205996e-06, "loss": 0.0048, "step": 21035 }, { "epoch": 6.56, "grad_norm": 0.2990744113922119, "learning_rate": 3.4109429267118155e-06, "loss": 0.0042, "step": 21040 }, { "epoch": 6.56, "grad_norm": 0.3892471492290497, "learning_rate": 3.4068488276724166e-06, "loss": 0.005, "step": 21045 }, { "epoch": 6.57, "grad_norm": 0.26357969641685486, "learning_rate": 3.402756682615752e-06, "loss": 0.0036, "step": 21050 }, { "epoch": 6.57, "grad_norm": 0.19580000638961792, "learning_rate": 3.3986664927545943e-06, "loss": 0.0046, "step": 21055 }, { "epoch": 6.57, "grad_norm": 0.2106107473373413, "learning_rate": 3.3945782593011334e-06, "loss": 0.0041, "step": 21060 }, { "epoch": 6.57, "grad_norm": 0.16694635152816772, "learning_rate": 3.3904919834669813e-06, "loss": 0.0031, "step": 21065 }, { "epoch": 6.57, "grad_norm": 0.08712832629680634, "learning_rate": 3.3864076664631694e-06, "loss": 0.0022, "step": 21070 }, { "epoch": 6.57, "grad_norm": 0.12394944578409195, "learning_rate": 3.3823253095001476e-06, "loss": 0.0023, "step": 21075 }, { "epoch": 6.57, "grad_norm": 0.18365685641765594, "learning_rate": 3.378244913787787e-06, "loss": 0.0034, "step": 21080 }, { "epoch": 6.58, "grad_norm": 0.05024956166744232, "learning_rate": 3.3741664805353746e-06, "loss": 0.0044, "step": 21085 }, { "epoch": 6.58, "grad_norm": 0.15042851865291595, "learning_rate": 3.3700900109516188e-06, "loss": 0.0029, "step": 21090 }, { "epoch": 6.58, "grad_norm": 0.27349185943603516, "learning_rate": 3.366015506244644e-06, "loss": 0.0033, "step": 21095 }, { "epoch": 6.58, "grad_norm": 0.25597962737083435, "learning_rate": 3.3619429676219937e-06, "loss": 0.0051, "step": 21100 }, { "epoch": 6.58, "grad_norm": 0.11255136132240295, "learning_rate": 3.357872396290628e-06, "loss": 0.0044, "step": 21105 }, { "epoch": 6.58, "grad_norm": 0.2635140120983124, "learning_rate": 3.3538037934569223e-06, "loss": 0.0043, "step": 21110 }, { "epoch": 6.59, "grad_norm": 0.35105404257774353, "learning_rate": 3.3497371603266714e-06, "loss": 0.0053, "step": 21115 }, { "epoch": 6.59, "grad_norm": 0.12501968443393707, "learning_rate": 3.345672498105088e-06, "loss": 0.0023, "step": 21120 }, { "epoch": 6.59, "grad_norm": 0.24185575544834137, "learning_rate": 3.341609807996793e-06, "loss": 0.0048, "step": 21125 }, { "epoch": 6.59, "grad_norm": 0.5106483101844788, "learning_rate": 3.337549091205826e-06, "loss": 0.0033, "step": 21130 }, { "epoch": 6.59, "grad_norm": 0.2359280288219452, "learning_rate": 3.3334903489356496e-06, "loss": 0.0022, "step": 21135 }, { "epoch": 6.59, "grad_norm": 0.2506748139858246, "learning_rate": 3.329433582389133e-06, "loss": 0.0033, "step": 21140 }, { "epoch": 6.6, "grad_norm": 0.1529313325881958, "learning_rate": 3.3253787927685622e-06, "loss": 0.0027, "step": 21145 }, { "epoch": 6.6, "grad_norm": 0.18696674704551697, "learning_rate": 3.321325981275636e-06, "loss": 0.0053, "step": 21150 }, { "epoch": 6.6, "grad_norm": 0.6021310091018677, "learning_rate": 3.3172751491114686e-06, "loss": 0.0055, "step": 21155 }, { "epoch": 6.6, "grad_norm": 0.3504422903060913, "learning_rate": 3.3132262974765906e-06, "loss": 0.0034, "step": 21160 }, { "epoch": 6.6, "grad_norm": 0.15375612676143646, "learning_rate": 3.3091794275709345e-06, "loss": 0.0032, "step": 21165 }, { "epoch": 6.6, "grad_norm": 0.12261166423559189, "learning_rate": 3.305134540593854e-06, "loss": 0.0026, "step": 21170 }, { "epoch": 6.6, "grad_norm": 0.14208921790122986, "learning_rate": 3.3010916377441207e-06, "loss": 0.0032, "step": 21175 }, { "epoch": 6.61, "grad_norm": 0.47172245383262634, "learning_rate": 3.2970507202199077e-06, "loss": 0.0047, "step": 21180 }, { "epoch": 6.61, "grad_norm": 0.2029520571231842, "learning_rate": 3.293011789218803e-06, "loss": 0.0034, "step": 21185 }, { "epoch": 6.61, "grad_norm": 0.2652159333229065, "learning_rate": 3.288974845937809e-06, "loss": 0.003, "step": 21190 }, { "epoch": 6.61, "grad_norm": 0.22905781865119934, "learning_rate": 3.2849398915733376e-06, "loss": 0.0038, "step": 21195 }, { "epoch": 6.61, "grad_norm": 0.20727509260177612, "learning_rate": 3.2809069273212047e-06, "loss": 0.0036, "step": 21200 }, { "epoch": 6.61, "grad_norm": 0.12545184791088104, "learning_rate": 3.276875954376644e-06, "loss": 0.0024, "step": 21205 }, { "epoch": 6.62, "grad_norm": 0.13865812122821808, "learning_rate": 3.2728469739342984e-06, "loss": 0.0035, "step": 21210 }, { "epoch": 6.62, "grad_norm": 0.17256861925125122, "learning_rate": 3.2688199871882155e-06, "loss": 0.0032, "step": 21215 }, { "epoch": 6.62, "grad_norm": 0.24499864876270294, "learning_rate": 3.26479499533186e-06, "loss": 0.0037, "step": 21220 }, { "epoch": 6.62, "grad_norm": 0.47705909609794617, "learning_rate": 3.2607719995580998e-06, "loss": 0.004, "step": 21225 }, { "epoch": 6.62, "grad_norm": 0.48891186714172363, "learning_rate": 3.256751001059214e-06, "loss": 0.0036, "step": 21230 }, { "epoch": 6.62, "grad_norm": 0.19393102824687958, "learning_rate": 3.2527320010268816e-06, "loss": 0.0043, "step": 21235 }, { "epoch": 6.62, "grad_norm": 0.1835356056690216, "learning_rate": 3.2487150006521994e-06, "loss": 0.0036, "step": 21240 }, { "epoch": 6.63, "grad_norm": 0.26606136560440063, "learning_rate": 3.244700001125668e-06, "loss": 0.0038, "step": 21245 }, { "epoch": 6.63, "grad_norm": 0.41781163215637207, "learning_rate": 3.2406870036371942e-06, "loss": 0.0033, "step": 21250 }, { "epoch": 6.63, "grad_norm": 0.20610876381397247, "learning_rate": 3.2366760093760885e-06, "loss": 0.0037, "step": 21255 }, { "epoch": 6.63, "grad_norm": 0.04123732075095177, "learning_rate": 3.232667019531078e-06, "loss": 0.0022, "step": 21260 }, { "epoch": 6.63, "grad_norm": 0.1551009863615036, "learning_rate": 3.228660035290291e-06, "loss": 0.0029, "step": 21265 }, { "epoch": 6.63, "grad_norm": 0.32036370038986206, "learning_rate": 3.2246550578412517e-06, "loss": 0.0051, "step": 21270 }, { "epoch": 6.64, "grad_norm": 0.22211022675037384, "learning_rate": 3.220652088370899e-06, "loss": 0.0042, "step": 21275 }, { "epoch": 6.64, "grad_norm": 0.1752278208732605, "learning_rate": 3.2166511280655775e-06, "loss": 0.0026, "step": 21280 }, { "epoch": 6.64, "grad_norm": 0.23283261060714722, "learning_rate": 3.212652178111033e-06, "loss": 0.0049, "step": 21285 }, { "epoch": 6.64, "grad_norm": 0.07854650169610977, "learning_rate": 3.2086552396924166e-06, "loss": 0.004, "step": 21290 }, { "epoch": 6.64, "grad_norm": 0.1206623762845993, "learning_rate": 3.2046603139942823e-06, "loss": 0.0037, "step": 21295 }, { "epoch": 6.64, "grad_norm": 0.16894768178462982, "learning_rate": 3.2006674022005857e-06, "loss": 0.0024, "step": 21300 }, { "epoch": 6.64, "grad_norm": 0.2977941036224365, "learning_rate": 3.196676505494698e-06, "loss": 0.0031, "step": 21305 }, { "epoch": 6.65, "grad_norm": 0.9024665951728821, "learning_rate": 3.192687625059373e-06, "loss": 0.0039, "step": 21310 }, { "epoch": 6.65, "grad_norm": 0.41954556107521057, "learning_rate": 3.188700762076782e-06, "loss": 0.0037, "step": 21315 }, { "epoch": 6.65, "grad_norm": 0.6290353536605835, "learning_rate": 3.1847159177284926e-06, "loss": 0.0047, "step": 21320 }, { "epoch": 6.65, "grad_norm": 0.17991706728935242, "learning_rate": 3.180733093195476e-06, "loss": 0.0052, "step": 21325 }, { "epoch": 6.65, "grad_norm": 0.2722851634025574, "learning_rate": 3.176752289658104e-06, "loss": 0.0024, "step": 21330 }, { "epoch": 6.65, "grad_norm": 0.2850939929485321, "learning_rate": 3.1727735082961485e-06, "loss": 0.0039, "step": 21335 }, { "epoch": 6.66, "grad_norm": 0.08436838537454605, "learning_rate": 3.168796750288784e-06, "loss": 0.0028, "step": 21340 }, { "epoch": 6.66, "grad_norm": 0.5143541693687439, "learning_rate": 3.164822016814585e-06, "loss": 0.004, "step": 21345 }, { "epoch": 6.66, "grad_norm": 0.08529569953680038, "learning_rate": 3.1608493090515245e-06, "loss": 0.0039, "step": 21350 }, { "epoch": 6.66, "grad_norm": 0.36727142333984375, "learning_rate": 3.156878628176977e-06, "loss": 0.0058, "step": 21355 }, { "epoch": 6.66, "grad_norm": 0.2226283848285675, "learning_rate": 3.152909975367715e-06, "loss": 0.004, "step": 21360 }, { "epoch": 6.66, "grad_norm": 0.26580193638801575, "learning_rate": 3.1489433517999102e-06, "loss": 0.0045, "step": 21365 }, { "epoch": 6.67, "grad_norm": 0.08652780950069427, "learning_rate": 3.144978758649133e-06, "loss": 0.0046, "step": 21370 }, { "epoch": 6.67, "grad_norm": 0.2908978760242462, "learning_rate": 3.141016197090353e-06, "loss": 0.0033, "step": 21375 }, { "epoch": 6.67, "grad_norm": 0.13436950743198395, "learning_rate": 3.137055668297936e-06, "loss": 0.0023, "step": 21380 }, { "epoch": 6.67, "grad_norm": 0.27255043387413025, "learning_rate": 3.133097173445646e-06, "loss": 0.0033, "step": 21385 }, { "epoch": 6.67, "grad_norm": 0.2276810258626938, "learning_rate": 3.1291407137066453e-06, "loss": 0.0048, "step": 21390 }, { "epoch": 6.67, "grad_norm": 0.29353004693984985, "learning_rate": 3.1251862902534926e-06, "loss": 0.0025, "step": 21395 }, { "epoch": 6.67, "grad_norm": 0.30397000908851624, "learning_rate": 3.121233904258141e-06, "loss": 0.0032, "step": 21400 }, { "epoch": 6.68, "grad_norm": 0.30446457862854004, "learning_rate": 3.1172835568919425e-06, "loss": 0.0051, "step": 21405 }, { "epoch": 6.68, "grad_norm": 0.1925806999206543, "learning_rate": 3.1133352493256475e-06, "loss": 0.0023, "step": 21410 }, { "epoch": 6.68, "grad_norm": 0.26819539070129395, "learning_rate": 3.109388982729391e-06, "loss": 0.003, "step": 21415 }, { "epoch": 6.68, "grad_norm": 0.42312636971473694, "learning_rate": 3.1054447582727098e-06, "loss": 0.0037, "step": 21420 }, { "epoch": 6.68, "grad_norm": 0.23445560038089752, "learning_rate": 3.1015025771245446e-06, "loss": 0.0024, "step": 21425 }, { "epoch": 6.68, "grad_norm": 0.47701147198677063, "learning_rate": 3.0975624404532167e-06, "loss": 0.0042, "step": 21430 }, { "epoch": 6.69, "grad_norm": 0.24110835790634155, "learning_rate": 3.0936243494264485e-06, "loss": 0.0042, "step": 21435 }, { "epoch": 6.69, "grad_norm": 0.14478828012943268, "learning_rate": 3.0896883052113526e-06, "loss": 0.0031, "step": 21440 }, { "epoch": 6.69, "grad_norm": 0.19179287552833557, "learning_rate": 3.0857543089744424e-06, "loss": 0.003, "step": 21445 }, { "epoch": 6.69, "grad_norm": 0.13948087394237518, "learning_rate": 3.08182236188161e-06, "loss": 0.0021, "step": 21450 }, { "epoch": 6.69, "grad_norm": 0.17611132562160492, "learning_rate": 3.0778924650981536e-06, "loss": 0.0034, "step": 21455 }, { "epoch": 6.69, "grad_norm": 0.2598230540752411, "learning_rate": 3.073964619788755e-06, "loss": 0.0028, "step": 21460 }, { "epoch": 6.69, "grad_norm": 0.21520379185676575, "learning_rate": 3.0700388271174985e-06, "loss": 0.0027, "step": 21465 }, { "epoch": 6.7, "grad_norm": 0.6616179347038269, "learning_rate": 3.066115088247852e-06, "loss": 0.0032, "step": 21470 }, { "epoch": 6.7, "grad_norm": 0.15049836039543152, "learning_rate": 3.062193404342676e-06, "loss": 0.002, "step": 21475 }, { "epoch": 6.7, "grad_norm": 0.3814724087715149, "learning_rate": 3.058273776564221e-06, "loss": 0.004, "step": 21480 }, { "epoch": 6.7, "grad_norm": 0.39669278264045715, "learning_rate": 3.0543562060741337e-06, "loss": 0.0047, "step": 21485 }, { "epoch": 6.7, "grad_norm": 0.09728311747312546, "learning_rate": 3.0504406940334418e-06, "loss": 0.0039, "step": 21490 }, { "epoch": 6.7, "grad_norm": 0.19073884189128876, "learning_rate": 3.0465272416025695e-06, "loss": 0.003, "step": 21495 }, { "epoch": 6.71, "grad_norm": 0.20341838896274567, "learning_rate": 3.0426158499413294e-06, "loss": 0.0032, "step": 21500 }, { "epoch": 6.71, "grad_norm": 0.25398117303848267, "learning_rate": 3.0387065202089204e-06, "loss": 0.0034, "step": 21505 }, { "epoch": 6.71, "grad_norm": 0.2423587143421173, "learning_rate": 3.034799253563939e-06, "loss": 0.0034, "step": 21510 }, { "epoch": 6.71, "grad_norm": 0.2504832446575165, "learning_rate": 3.0308940511643627e-06, "loss": 0.0039, "step": 21515 }, { "epoch": 6.71, "grad_norm": 0.07413171231746674, "learning_rate": 3.026990914167559e-06, "loss": 0.0029, "step": 21520 }, { "epoch": 6.71, "grad_norm": 0.4909358322620392, "learning_rate": 3.02308984373028e-06, "loss": 0.003, "step": 21525 }, { "epoch": 6.72, "grad_norm": 0.319919615983963, "learning_rate": 3.019190841008671e-06, "loss": 0.0033, "step": 21530 }, { "epoch": 6.72, "grad_norm": 0.2074088305234909, "learning_rate": 3.01529390715826e-06, "loss": 0.0042, "step": 21535 }, { "epoch": 6.72, "grad_norm": 0.24591241776943207, "learning_rate": 3.011399043333967e-06, "loss": 0.0032, "step": 21540 }, { "epoch": 6.72, "grad_norm": 0.33461540937423706, "learning_rate": 3.007506250690092e-06, "loss": 0.0038, "step": 21545 }, { "epoch": 6.72, "grad_norm": 0.27527114748954773, "learning_rate": 3.003615530380324e-06, "loss": 0.0024, "step": 21550 }, { "epoch": 6.72, "grad_norm": 0.10472951829433441, "learning_rate": 2.9997268835577454e-06, "loss": 0.0031, "step": 21555 }, { "epoch": 6.72, "grad_norm": 0.22347362339496613, "learning_rate": 2.9958403113748104e-06, "loss": 0.0047, "step": 21560 }, { "epoch": 6.73, "grad_norm": 0.3212512135505676, "learning_rate": 2.991955814983367e-06, "loss": 0.0057, "step": 21565 }, { "epoch": 6.73, "grad_norm": 0.1527688354253769, "learning_rate": 2.988073395534645e-06, "loss": 0.0027, "step": 21570 }, { "epoch": 6.73, "grad_norm": 0.10932507365942001, "learning_rate": 2.984193054179262e-06, "loss": 0.0034, "step": 21575 }, { "epoch": 6.73, "grad_norm": 0.3322175443172455, "learning_rate": 2.980314792067215e-06, "loss": 0.0048, "step": 21580 }, { "epoch": 6.73, "grad_norm": 0.2838209271430969, "learning_rate": 2.976438610347887e-06, "loss": 0.0058, "step": 21585 }, { "epoch": 6.73, "grad_norm": 0.266382098197937, "learning_rate": 2.9725645101700473e-06, "loss": 0.0045, "step": 21590 }, { "epoch": 6.74, "grad_norm": 0.1659298539161682, "learning_rate": 2.968692492681844e-06, "loss": 0.0038, "step": 21595 }, { "epoch": 6.74, "grad_norm": 0.19411054253578186, "learning_rate": 2.96482255903081e-06, "loss": 0.0044, "step": 21600 }, { "epoch": 6.74, "grad_norm": 0.28912949562072754, "learning_rate": 2.9609547103638603e-06, "loss": 0.0032, "step": 21605 }, { "epoch": 6.74, "grad_norm": 0.3964943289756775, "learning_rate": 2.9570889478272914e-06, "loss": 0.0043, "step": 21610 }, { "epoch": 6.74, "grad_norm": 0.18239091336727142, "learning_rate": 2.953225272566782e-06, "loss": 0.0023, "step": 21615 }, { "epoch": 6.74, "grad_norm": 0.21042893826961517, "learning_rate": 2.949363685727392e-06, "loss": 0.0044, "step": 21620 }, { "epoch": 6.74, "grad_norm": 0.3126097023487091, "learning_rate": 2.945504188453564e-06, "loss": 0.0021, "step": 21625 }, { "epoch": 6.75, "grad_norm": 0.2078578770160675, "learning_rate": 2.941646781889119e-06, "loss": 0.0034, "step": 21630 }, { "epoch": 6.75, "grad_norm": 0.29286807775497437, "learning_rate": 2.93779146717726e-06, "loss": 0.0042, "step": 21635 }, { "epoch": 6.75, "grad_norm": 0.34374094009399414, "learning_rate": 2.933938245460568e-06, "loss": 0.0044, "step": 21640 }, { "epoch": 6.75, "grad_norm": 0.33126339316368103, "learning_rate": 2.9300871178810065e-06, "loss": 0.0045, "step": 21645 }, { "epoch": 6.75, "grad_norm": 0.29246968030929565, "learning_rate": 2.9262380855799165e-06, "loss": 0.0029, "step": 21650 }, { "epoch": 6.75, "grad_norm": 0.7621261477470398, "learning_rate": 2.9223911496980184e-06, "loss": 0.0027, "step": 21655 }, { "epoch": 6.76, "grad_norm": 0.2603248655796051, "learning_rate": 2.918546311375411e-06, "loss": 0.003, "step": 21660 }, { "epoch": 6.76, "grad_norm": 0.3011554777622223, "learning_rate": 2.9147035717515725e-06, "loss": 0.0036, "step": 21665 }, { "epoch": 6.76, "grad_norm": 0.27007898688316345, "learning_rate": 2.910862931965358e-06, "loss": 0.004, "step": 21670 }, { "epoch": 6.76, "grad_norm": 0.2060871124267578, "learning_rate": 2.9070243931550013e-06, "loss": 0.0032, "step": 21675 }, { "epoch": 6.76, "grad_norm": 0.6070762872695923, "learning_rate": 2.9031879564581135e-06, "loss": 0.006, "step": 21680 }, { "epoch": 6.76, "grad_norm": 0.3831270933151245, "learning_rate": 2.899353623011679e-06, "loss": 0.0042, "step": 21685 }, { "epoch": 6.77, "grad_norm": 0.2492065280675888, "learning_rate": 2.8955213939520664e-06, "loss": 0.003, "step": 21690 }, { "epoch": 6.77, "grad_norm": 0.456470787525177, "learning_rate": 2.8916912704150136e-06, "loss": 0.0037, "step": 21695 }, { "epoch": 6.77, "grad_norm": 0.2670809328556061, "learning_rate": 2.887863253535641e-06, "loss": 0.0038, "step": 21700 }, { "epoch": 6.77, "grad_norm": 0.3004983961582184, "learning_rate": 2.8840373444484347e-06, "loss": 0.0032, "step": 21705 }, { "epoch": 6.77, "grad_norm": 0.18564684689044952, "learning_rate": 2.8802135442872627e-06, "loss": 0.0036, "step": 21710 }, { "epoch": 6.77, "grad_norm": 0.4440821409225464, "learning_rate": 2.876391854185373e-06, "loss": 0.0046, "step": 21715 }, { "epoch": 6.77, "grad_norm": 0.2004132866859436, "learning_rate": 2.8725722752753795e-06, "loss": 0.0042, "step": 21720 }, { "epoch": 6.78, "grad_norm": 0.3126099109649658, "learning_rate": 2.8687548086892734e-06, "loss": 0.0041, "step": 21725 }, { "epoch": 6.78, "grad_norm": 0.3466210961341858, "learning_rate": 2.864939455558423e-06, "loss": 0.0026, "step": 21730 }, { "epoch": 6.78, "grad_norm": 0.18223507702350616, "learning_rate": 2.8611262170135666e-06, "loss": 0.0035, "step": 21735 }, { "epoch": 6.78, "grad_norm": 0.5391222834587097, "learning_rate": 2.8573150941848138e-06, "loss": 0.0043, "step": 21740 }, { "epoch": 6.78, "grad_norm": 0.30332350730895996, "learning_rate": 2.853506088201652e-06, "loss": 0.003, "step": 21745 }, { "epoch": 6.78, "grad_norm": 0.20412799715995789, "learning_rate": 2.8496992001929347e-06, "loss": 0.0034, "step": 21750 }, { "epoch": 6.79, "grad_norm": 0.5126622319221497, "learning_rate": 2.8458944312868998e-06, "loss": 0.0035, "step": 21755 }, { "epoch": 6.79, "grad_norm": 0.25375452637672424, "learning_rate": 2.8420917826111473e-06, "loss": 0.0042, "step": 21760 }, { "epoch": 6.79, "grad_norm": 0.21945394575595856, "learning_rate": 2.8382912552926488e-06, "loss": 0.003, "step": 21765 }, { "epoch": 6.79, "grad_norm": 0.33257466554641724, "learning_rate": 2.8344928504577563e-06, "loss": 0.0034, "step": 21770 }, { "epoch": 6.79, "grad_norm": 0.3007264733314514, "learning_rate": 2.8306965692321764e-06, "loss": 0.0038, "step": 21775 }, { "epoch": 6.79, "grad_norm": 0.3190644085407257, "learning_rate": 2.8269024127410004e-06, "loss": 0.0032, "step": 21780 }, { "epoch": 6.79, "grad_norm": 0.13521669805049896, "learning_rate": 2.8231103821086856e-06, "loss": 0.0029, "step": 21785 }, { "epoch": 6.8, "grad_norm": 0.25221729278564453, "learning_rate": 2.81932047845906e-06, "loss": 0.0045, "step": 21790 }, { "epoch": 6.8, "grad_norm": 0.21736393868923187, "learning_rate": 2.8155327029153155e-06, "loss": 0.0029, "step": 21795 }, { "epoch": 6.8, "grad_norm": 0.1783631443977356, "learning_rate": 2.811747056600026e-06, "loss": 0.0037, "step": 21800 }, { "epoch": 6.8, "grad_norm": 0.45112115144729614, "learning_rate": 2.8079635406351226e-06, "loss": 0.0039, "step": 21805 }, { "epoch": 6.8, "grad_norm": 0.2695203423500061, "learning_rate": 2.8041821561419126e-06, "loss": 0.004, "step": 21810 }, { "epoch": 6.8, "grad_norm": 0.19870398938655853, "learning_rate": 2.8004029042410608e-06, "loss": 0.0021, "step": 21815 }, { "epoch": 6.81, "grad_norm": 0.5985007882118225, "learning_rate": 2.7966257860526115e-06, "loss": 0.0035, "step": 21820 }, { "epoch": 6.81, "grad_norm": 0.23408366739749908, "learning_rate": 2.792850802695971e-06, "loss": 0.0038, "step": 21825 }, { "epoch": 6.81, "grad_norm": 0.19258131086826324, "learning_rate": 2.789077955289916e-06, "loss": 0.0036, "step": 21830 }, { "epoch": 6.81, "grad_norm": 0.28733906149864197, "learning_rate": 2.7853072449525875e-06, "loss": 0.0041, "step": 21835 }, { "epoch": 6.81, "grad_norm": 0.07309596240520477, "learning_rate": 2.78153867280149e-06, "loss": 0.0033, "step": 21840 }, { "epoch": 6.81, "grad_norm": 0.2608742117881775, "learning_rate": 2.77777223995351e-06, "loss": 0.0038, "step": 21845 }, { "epoch": 6.81, "grad_norm": 0.29153764247894287, "learning_rate": 2.7740079475248773e-06, "loss": 0.0021, "step": 21850 }, { "epoch": 6.82, "grad_norm": 0.3125184178352356, "learning_rate": 2.770245796631201e-06, "loss": 0.0038, "step": 21855 }, { "epoch": 6.82, "grad_norm": 0.2416171431541443, "learning_rate": 2.766485788387455e-06, "loss": 0.0042, "step": 21860 }, { "epoch": 6.82, "grad_norm": 0.07885327935218811, "learning_rate": 2.762727923907974e-06, "loss": 0.0036, "step": 21865 }, { "epoch": 6.82, "grad_norm": 0.2367239147424698, "learning_rate": 2.7589722043064613e-06, "loss": 0.0037, "step": 21870 }, { "epoch": 6.82, "grad_norm": 0.2745641767978668, "learning_rate": 2.7552186306959815e-06, "loss": 0.0046, "step": 21875 }, { "epoch": 6.82, "grad_norm": 0.5628076195716858, "learning_rate": 2.7514672041889657e-06, "loss": 0.0032, "step": 21880 }, { "epoch": 6.83, "grad_norm": 0.2814357578754425, "learning_rate": 2.7477179258972053e-06, "loss": 0.0035, "step": 21885 }, { "epoch": 6.83, "grad_norm": 0.2630438208580017, "learning_rate": 2.7439707969318585e-06, "loss": 0.0037, "step": 21890 }, { "epoch": 6.83, "grad_norm": 0.3329617381095886, "learning_rate": 2.740225818403446e-06, "loss": 0.0032, "step": 21895 }, { "epoch": 6.83, "grad_norm": 0.1600303202867508, "learning_rate": 2.7364829914218493e-06, "loss": 0.0027, "step": 21900 }, { "epoch": 6.83, "grad_norm": 1.125551462173462, "learning_rate": 2.732742317096313e-06, "loss": 0.006, "step": 21905 }, { "epoch": 6.83, "grad_norm": 0.2530844509601593, "learning_rate": 2.7290037965354445e-06, "loss": 0.0038, "step": 21910 }, { "epoch": 6.84, "grad_norm": 0.18102207779884338, "learning_rate": 2.7252674308472115e-06, "loss": 0.0036, "step": 21915 }, { "epoch": 6.84, "grad_norm": 0.678311824798584, "learning_rate": 2.7215332211389454e-06, "loss": 0.0048, "step": 21920 }, { "epoch": 6.84, "grad_norm": 0.24131645262241364, "learning_rate": 2.717801168517337e-06, "loss": 0.0049, "step": 21925 }, { "epoch": 6.84, "grad_norm": 0.1348024308681488, "learning_rate": 2.714071274088438e-06, "loss": 0.004, "step": 21930 }, { "epoch": 6.84, "grad_norm": 0.18559803068637848, "learning_rate": 2.710343538957659e-06, "loss": 0.0046, "step": 21935 }, { "epoch": 6.84, "grad_norm": 0.25179290771484375, "learning_rate": 2.706617964229775e-06, "loss": 0.0041, "step": 21940 }, { "epoch": 6.84, "grad_norm": 0.15272609889507294, "learning_rate": 2.7028945510089157e-06, "loss": 0.0046, "step": 21945 }, { "epoch": 6.85, "grad_norm": 0.12700574100017548, "learning_rate": 2.6991733003985752e-06, "loss": 0.0031, "step": 21950 }, { "epoch": 6.85, "grad_norm": 0.1488189995288849, "learning_rate": 2.695454213501597e-06, "loss": 0.0021, "step": 21955 }, { "epoch": 6.85, "grad_norm": 0.27931275963783264, "learning_rate": 2.6917372914201976e-06, "loss": 0.0042, "step": 21960 }, { "epoch": 6.85, "grad_norm": 0.28481337428092957, "learning_rate": 2.688022535255942e-06, "loss": 0.0049, "step": 21965 }, { "epoch": 6.85, "grad_norm": 0.39484626054763794, "learning_rate": 2.6843099461097566e-06, "loss": 0.0041, "step": 21970 }, { "epoch": 6.85, "grad_norm": 0.11301475763320923, "learning_rate": 2.680599525081924e-06, "loss": 0.003, "step": 21975 }, { "epoch": 6.86, "grad_norm": 0.32207757234573364, "learning_rate": 2.676891273272084e-06, "loss": 0.0056, "step": 21980 }, { "epoch": 6.86, "grad_norm": 0.20087608695030212, "learning_rate": 2.673185191779237e-06, "loss": 0.0041, "step": 21985 }, { "epoch": 6.86, "grad_norm": 0.10096064954996109, "learning_rate": 2.669481281701739e-06, "loss": 0.0039, "step": 21990 }, { "epoch": 6.86, "grad_norm": 0.13234908878803253, "learning_rate": 2.6657795441372967e-06, "loss": 0.0031, "step": 21995 }, { "epoch": 6.86, "grad_norm": 0.4627056419849396, "learning_rate": 2.6620799801829766e-06, "loss": 0.0042, "step": 22000 }, { "epoch": 6.86, "grad_norm": 0.28749316930770874, "learning_rate": 2.658382590935208e-06, "loss": 0.0029, "step": 22005 }, { "epoch": 6.86, "grad_norm": 0.21665598452091217, "learning_rate": 2.6546873774897663e-06, "loss": 0.0033, "step": 22010 }, { "epoch": 6.87, "grad_norm": 0.33342960476875305, "learning_rate": 2.6509943409417858e-06, "loss": 0.0032, "step": 22015 }, { "epoch": 6.87, "grad_norm": 0.32708919048309326, "learning_rate": 2.647303482385756e-06, "loss": 0.0043, "step": 22020 }, { "epoch": 6.87, "grad_norm": 0.20584945380687714, "learning_rate": 2.6436148029155216e-06, "loss": 0.0029, "step": 22025 }, { "epoch": 6.87, "grad_norm": 0.1696053147315979, "learning_rate": 2.639928303624275e-06, "loss": 0.0038, "step": 22030 }, { "epoch": 6.87, "grad_norm": 0.07086633145809174, "learning_rate": 2.6362439856045707e-06, "loss": 0.0026, "step": 22035 }, { "epoch": 6.87, "grad_norm": 0.20959654450416565, "learning_rate": 2.632561849948313e-06, "loss": 0.0035, "step": 22040 }, { "epoch": 6.88, "grad_norm": 0.4945712685585022, "learning_rate": 2.6288818977467568e-06, "loss": 0.004, "step": 22045 }, { "epoch": 6.88, "grad_norm": 0.21944449841976166, "learning_rate": 2.6252041300905197e-06, "loss": 0.0034, "step": 22050 }, { "epoch": 6.88, "grad_norm": 0.2166629284620285, "learning_rate": 2.6215285480695618e-06, "loss": 0.0031, "step": 22055 }, { "epoch": 6.88, "grad_norm": 0.2644397020339966, "learning_rate": 2.617855152773202e-06, "loss": 0.0046, "step": 22060 }, { "epoch": 6.88, "grad_norm": 0.08215432614088058, "learning_rate": 2.614183945290103e-06, "loss": 0.0034, "step": 22065 }, { "epoch": 6.88, "grad_norm": 0.09645352512598038, "learning_rate": 2.610514926708285e-06, "loss": 0.0033, "step": 22070 }, { "epoch": 6.89, "grad_norm": 0.49109041690826416, "learning_rate": 2.606848098115121e-06, "loss": 0.0034, "step": 22075 }, { "epoch": 6.89, "grad_norm": 0.3283572494983673, "learning_rate": 2.6031834605973315e-06, "loss": 0.0033, "step": 22080 }, { "epoch": 6.89, "grad_norm": 0.10526677221059799, "learning_rate": 2.5995210152409855e-06, "loss": 0.0033, "step": 22085 }, { "epoch": 6.89, "grad_norm": 0.4159996509552002, "learning_rate": 2.5958607631315113e-06, "loss": 0.0031, "step": 22090 }, { "epoch": 6.89, "grad_norm": 0.42396876215934753, "learning_rate": 2.5922027053536823e-06, "loss": 0.0039, "step": 22095 }, { "epoch": 6.89, "grad_norm": 0.6997884511947632, "learning_rate": 2.588546842991615e-06, "loss": 0.0041, "step": 22100 }, { "epoch": 6.89, "grad_norm": 0.1890052706003189, "learning_rate": 2.584893177128782e-06, "loss": 0.0035, "step": 22105 }, { "epoch": 6.9, "grad_norm": 0.3274327516555786, "learning_rate": 2.5812417088480067e-06, "loss": 0.0037, "step": 22110 }, { "epoch": 6.9, "grad_norm": 0.12549816071987152, "learning_rate": 2.5775924392314567e-06, "loss": 0.0031, "step": 22115 }, { "epoch": 6.9, "grad_norm": 0.21259631216526031, "learning_rate": 2.57394536936065e-06, "loss": 0.0034, "step": 22120 }, { "epoch": 6.9, "grad_norm": 0.13228316605091095, "learning_rate": 2.570300500316452e-06, "loss": 0.003, "step": 22125 }, { "epoch": 6.9, "grad_norm": 0.3453643023967743, "learning_rate": 2.5666578331790735e-06, "loss": 0.0044, "step": 22130 }, { "epoch": 6.9, "grad_norm": 0.29129546880722046, "learning_rate": 2.5630173690280856e-06, "loss": 0.0062, "step": 22135 }, { "epoch": 6.91, "grad_norm": 0.4494245946407318, "learning_rate": 2.559379108942386e-06, "loss": 0.0049, "step": 22140 }, { "epoch": 6.91, "grad_norm": 0.2582186460494995, "learning_rate": 2.555743054000234e-06, "loss": 0.0044, "step": 22145 }, { "epoch": 6.91, "grad_norm": 0.24486398696899414, "learning_rate": 2.5521092052792286e-06, "loss": 0.0033, "step": 22150 }, { "epoch": 6.91, "grad_norm": 0.1669885218143463, "learning_rate": 2.548477563856319e-06, "loss": 0.003, "step": 22155 }, { "epoch": 6.91, "grad_norm": 0.3093135952949524, "learning_rate": 2.5448481308077976e-06, "loss": 0.0041, "step": 22160 }, { "epoch": 6.91, "grad_norm": 0.3074520528316498, "learning_rate": 2.541220907209304e-06, "loss": 0.0036, "step": 22165 }, { "epoch": 6.91, "grad_norm": 0.1470278799533844, "learning_rate": 2.5375958941358224e-06, "loss": 0.0031, "step": 22170 }, { "epoch": 6.92, "grad_norm": 0.2791728377342224, "learning_rate": 2.5339730926616823e-06, "loss": 0.0048, "step": 22175 }, { "epoch": 6.92, "grad_norm": 0.1116819679737091, "learning_rate": 2.530352503860556e-06, "loss": 0.0025, "step": 22180 }, { "epoch": 6.92, "grad_norm": 0.2816881835460663, "learning_rate": 2.52673412880546e-06, "loss": 0.0032, "step": 22185 }, { "epoch": 6.92, "grad_norm": 0.35410192608833313, "learning_rate": 2.523117968568759e-06, "loss": 0.0044, "step": 22190 }, { "epoch": 6.92, "grad_norm": 0.3361516296863556, "learning_rate": 2.519504024222156e-06, "loss": 0.0038, "step": 22195 }, { "epoch": 6.92, "grad_norm": 0.3186194598674774, "learning_rate": 2.5158922968366995e-06, "loss": 0.0038, "step": 22200 }, { "epoch": 6.93, "grad_norm": 0.156856507062912, "learning_rate": 2.5122827874827825e-06, "loss": 0.004, "step": 22205 }, { "epoch": 6.93, "grad_norm": 0.07649488747119904, "learning_rate": 2.5086754972301386e-06, "loss": 0.0028, "step": 22210 }, { "epoch": 6.93, "grad_norm": 0.4045189917087555, "learning_rate": 2.505070427147843e-06, "loss": 0.0042, "step": 22215 }, { "epoch": 6.93, "grad_norm": 0.41002410650253296, "learning_rate": 2.5014675783043165e-06, "loss": 0.0048, "step": 22220 }, { "epoch": 6.93, "grad_norm": 0.3689776062965393, "learning_rate": 2.497866951767317e-06, "loss": 0.0056, "step": 22225 }, { "epoch": 6.93, "grad_norm": 0.3803596496582031, "learning_rate": 2.4942685486039487e-06, "loss": 0.0023, "step": 22230 }, { "epoch": 6.94, "grad_norm": 0.08025051653385162, "learning_rate": 2.490672369880651e-06, "loss": 0.004, "step": 22235 }, { "epoch": 6.94, "grad_norm": 1.6498674154281616, "learning_rate": 2.4870784166632123e-06, "loss": 0.0066, "step": 22240 }, { "epoch": 6.94, "grad_norm": 0.2917429506778717, "learning_rate": 2.4834866900167478e-06, "loss": 0.0032, "step": 22245 }, { "epoch": 6.94, "grad_norm": 0.4845334589481354, "learning_rate": 2.479897191005729e-06, "loss": 0.004, "step": 22250 }, { "epoch": 6.94, "grad_norm": 0.2840844988822937, "learning_rate": 2.476309920693958e-06, "loss": 0.004, "step": 22255 }, { "epoch": 6.94, "grad_norm": 0.19367815554141998, "learning_rate": 2.4727248801445768e-06, "loss": 0.0047, "step": 22260 }, { "epoch": 6.94, "grad_norm": 0.20746980607509613, "learning_rate": 2.4691420704200696e-06, "loss": 0.0024, "step": 22265 }, { "epoch": 6.95, "grad_norm": 0.5077010989189148, "learning_rate": 2.4655614925822558e-06, "loss": 0.0032, "step": 22270 }, { "epoch": 6.95, "grad_norm": 0.28518030047416687, "learning_rate": 2.4619831476922994e-06, "loss": 0.0045, "step": 22275 }, { "epoch": 6.95, "grad_norm": 0.19493155181407928, "learning_rate": 2.458407036810693e-06, "loss": 0.004, "step": 22280 }, { "epoch": 6.95, "grad_norm": 0.1684066355228424, "learning_rate": 2.454833160997273e-06, "loss": 0.0034, "step": 22285 }, { "epoch": 6.95, "grad_norm": 0.1775389462709427, "learning_rate": 2.4512615213112133e-06, "loss": 0.0037, "step": 22290 }, { "epoch": 6.95, "grad_norm": 0.4576650857925415, "learning_rate": 2.4476921188110293e-06, "loss": 0.0025, "step": 22295 }, { "epoch": 6.96, "grad_norm": 0.35909825563430786, "learning_rate": 2.4441249545545653e-06, "loss": 0.0043, "step": 22300 }, { "epoch": 6.96, "grad_norm": 0.23739157617092133, "learning_rate": 2.440560029599006e-06, "loss": 0.0027, "step": 22305 }, { "epoch": 6.96, "grad_norm": 0.11318200081586838, "learning_rate": 2.4369973450008734e-06, "loss": 0.0033, "step": 22310 }, { "epoch": 6.96, "grad_norm": 0.3762189745903015, "learning_rate": 2.4334369018160262e-06, "loss": 0.0037, "step": 22315 }, { "epoch": 6.96, "grad_norm": 0.15604671835899353, "learning_rate": 2.4298787010996517e-06, "loss": 0.0033, "step": 22320 }, { "epoch": 6.96, "grad_norm": 0.25424784421920776, "learning_rate": 2.42632274390628e-06, "loss": 0.0031, "step": 22325 }, { "epoch": 6.96, "grad_norm": 0.38101381063461304, "learning_rate": 2.4227690312897754e-06, "loss": 0.0028, "step": 22330 }, { "epoch": 6.97, "grad_norm": 0.25989487767219543, "learning_rate": 2.419217564303332e-06, "loss": 0.0037, "step": 22335 }, { "epoch": 6.97, "grad_norm": 0.12496748566627502, "learning_rate": 2.41566834399949e-06, "loss": 0.0031, "step": 22340 }, { "epoch": 6.97, "grad_norm": 0.28795716166496277, "learning_rate": 2.4121213714301096e-06, "loss": 0.0028, "step": 22345 }, { "epoch": 6.97, "grad_norm": 0.13154681026935577, "learning_rate": 2.408576647646397e-06, "loss": 0.0032, "step": 22350 }, { "epoch": 6.97, "grad_norm": 0.11087685078382492, "learning_rate": 2.4050341736988793e-06, "loss": 0.0042, "step": 22355 }, { "epoch": 6.97, "grad_norm": 0.23004786670207977, "learning_rate": 2.4014939506374267e-06, "loss": 0.0043, "step": 22360 }, { "epoch": 6.98, "grad_norm": 0.28975680470466614, "learning_rate": 2.3979559795112384e-06, "loss": 0.0048, "step": 22365 }, { "epoch": 6.98, "grad_norm": 0.12339877337217331, "learning_rate": 2.394420261368848e-06, "loss": 0.0031, "step": 22370 }, { "epoch": 6.98, "grad_norm": 0.15025168657302856, "learning_rate": 2.3908867972581186e-06, "loss": 0.0035, "step": 22375 }, { "epoch": 6.98, "grad_norm": 0.3735179305076599, "learning_rate": 2.387355588226251e-06, "loss": 0.0035, "step": 22380 }, { "epoch": 6.98, "grad_norm": 0.1667015254497528, "learning_rate": 2.383826635319775e-06, "loss": 0.0031, "step": 22385 }, { "epoch": 6.98, "grad_norm": 0.17676593363285065, "learning_rate": 2.380299939584544e-06, "loss": 0.0038, "step": 22390 }, { "epoch": 6.98, "grad_norm": 0.30443859100341797, "learning_rate": 2.3767755020657523e-06, "loss": 0.0026, "step": 22395 }, { "epoch": 6.99, "grad_norm": 0.3084462583065033, "learning_rate": 2.373253323807921e-06, "loss": 0.0029, "step": 22400 }, { "epoch": 6.99, "grad_norm": 0.32841742038726807, "learning_rate": 2.3697334058549047e-06, "loss": 0.0035, "step": 22405 }, { "epoch": 6.99, "grad_norm": 0.28747549653053284, "learning_rate": 2.3662157492498838e-06, "loss": 0.0049, "step": 22410 }, { "epoch": 6.99, "grad_norm": 0.15251532196998596, "learning_rate": 2.36270035503537e-06, "loss": 0.0041, "step": 22415 }, { "epoch": 6.99, "grad_norm": 0.1651524305343628, "learning_rate": 2.3591872242532066e-06, "loss": 0.0024, "step": 22420 }, { "epoch": 6.99, "grad_norm": 0.27897897362709045, "learning_rate": 2.355676357944563e-06, "loss": 0.004, "step": 22425 }, { "epoch": 7.0, "grad_norm": 0.2509746253490448, "learning_rate": 2.35216775714994e-06, "loss": 0.0043, "step": 22430 }, { "epoch": 7.0, "grad_norm": 0.39022454619407654, "learning_rate": 2.3486614229091654e-06, "loss": 0.0048, "step": 22435 }, { "epoch": 7.0, "grad_norm": 0.2762365937232971, "learning_rate": 2.345157356261395e-06, "loss": 0.0035, "step": 22440 }, { "epoch": 7.0, "grad_norm": 0.1320386826992035, "learning_rate": 2.341655558245115e-06, "loss": 0.0041, "step": 22445 }, { "epoch": 7.0, "grad_norm": 0.12403634190559387, "learning_rate": 2.338156029898138e-06, "loss": 0.0023, "step": 22450 }, { "epoch": 7.0, "grad_norm": 0.10191764682531357, "learning_rate": 2.3346587722576007e-06, "loss": 0.0022, "step": 22455 }, { "epoch": 7.01, "grad_norm": 0.08048174530267715, "learning_rate": 2.3311637863599724e-06, "loss": 0.002, "step": 22460 }, { "epoch": 7.01, "grad_norm": 0.0919155478477478, "learning_rate": 2.327671073241046e-06, "loss": 0.0017, "step": 22465 }, { "epoch": 7.01, "grad_norm": 0.08900649100542068, "learning_rate": 2.3241806339359397e-06, "loss": 0.0019, "step": 22470 }, { "epoch": 7.01, "grad_norm": 0.17529134452342987, "learning_rate": 2.320692469479101e-06, "loss": 0.0024, "step": 22475 }, { "epoch": 7.01, "grad_norm": 0.03381878510117531, "learning_rate": 2.317206580904301e-06, "loss": 0.0015, "step": 22480 }, { "epoch": 7.01, "grad_norm": 0.09003644436597824, "learning_rate": 2.3137229692446373e-06, "loss": 0.0014, "step": 22485 }, { "epoch": 7.01, "grad_norm": 0.18930363655090332, "learning_rate": 2.3102416355325307e-06, "loss": 0.0013, "step": 22490 }, { "epoch": 7.02, "grad_norm": 0.05011576786637306, "learning_rate": 2.306762580799732e-06, "loss": 0.0013, "step": 22495 }, { "epoch": 7.02, "grad_norm": 0.13884611427783966, "learning_rate": 2.3032858060773087e-06, "loss": 0.0013, "step": 22500 }, { "epoch": 7.02, "grad_norm": 0.13446393609046936, "learning_rate": 2.2998113123956613e-06, "loss": 0.0021, "step": 22505 }, { "epoch": 7.02, "grad_norm": 0.2293945550918579, "learning_rate": 2.296339100784506e-06, "loss": 0.0034, "step": 22510 }, { "epoch": 7.02, "grad_norm": 0.09749463200569153, "learning_rate": 2.292869172272889e-06, "loss": 0.0013, "step": 22515 }, { "epoch": 7.02, "grad_norm": 0.24164359271526337, "learning_rate": 2.289401527889178e-06, "loss": 0.0019, "step": 22520 }, { "epoch": 7.03, "grad_norm": 0.0538080595433712, "learning_rate": 2.2859361686610617e-06, "loss": 0.0012, "step": 22525 }, { "epoch": 7.03, "grad_norm": 0.08052002638578415, "learning_rate": 2.282473095615556e-06, "loss": 0.002, "step": 22530 }, { "epoch": 7.03, "grad_norm": 0.020526913926005363, "learning_rate": 2.279012309778992e-06, "loss": 0.0012, "step": 22535 }, { "epoch": 7.03, "grad_norm": 0.5815721154212952, "learning_rate": 2.275553812177026e-06, "loss": 0.0017, "step": 22540 }, { "epoch": 7.03, "grad_norm": 0.08703240007162094, "learning_rate": 2.272097603834643e-06, "loss": 0.0025, "step": 22545 }, { "epoch": 7.03, "grad_norm": 0.12789790332317352, "learning_rate": 2.268643685776143e-06, "loss": 0.0014, "step": 22550 }, { "epoch": 7.03, "grad_norm": 0.15096785128116608, "learning_rate": 2.2651920590251463e-06, "loss": 0.0022, "step": 22555 }, { "epoch": 7.04, "grad_norm": 0.09342913329601288, "learning_rate": 2.2617427246045976e-06, "loss": 0.0012, "step": 22560 }, { "epoch": 7.04, "grad_norm": 0.0484846793115139, "learning_rate": 2.2582956835367632e-06, "loss": 0.0021, "step": 22565 }, { "epoch": 7.04, "grad_norm": 0.04795683175325394, "learning_rate": 2.25485093684322e-06, "loss": 0.0019, "step": 22570 }, { "epoch": 7.04, "grad_norm": 0.033159200102090836, "learning_rate": 2.251408485544877e-06, "loss": 0.0018, "step": 22575 }, { "epoch": 7.04, "grad_norm": 0.22821782529354095, "learning_rate": 2.2479683306619537e-06, "loss": 0.0019, "step": 22580 }, { "epoch": 7.04, "grad_norm": 0.12525643408298492, "learning_rate": 2.2445304732140005e-06, "loss": 0.0014, "step": 22585 }, { "epoch": 7.05, "grad_norm": 0.1769990772008896, "learning_rate": 2.2410949142198766e-06, "loss": 0.0019, "step": 22590 }, { "epoch": 7.05, "grad_norm": 0.19607287645339966, "learning_rate": 2.2376616546977635e-06, "loss": 0.0022, "step": 22595 }, { "epoch": 7.05, "grad_norm": 0.06251411139965057, "learning_rate": 2.234230695665163e-06, "loss": 0.0017, "step": 22600 }, { "epoch": 7.05, "grad_norm": 0.15792545676231384, "learning_rate": 2.2308020381388874e-06, "loss": 0.0014, "step": 22605 }, { "epoch": 7.05, "grad_norm": 0.18448935449123383, "learning_rate": 2.2273756831350755e-06, "loss": 0.001, "step": 22610 }, { "epoch": 7.05, "grad_norm": 0.04084780439734459, "learning_rate": 2.2239516316691812e-06, "loss": 0.0012, "step": 22615 }, { "epoch": 7.06, "grad_norm": 0.21777600049972534, "learning_rate": 2.2205298847559762e-06, "loss": 0.0016, "step": 22620 }, { "epoch": 7.06, "grad_norm": 0.22892186045646667, "learning_rate": 2.2171104434095436e-06, "loss": 0.0016, "step": 22625 }, { "epoch": 7.06, "grad_norm": 0.36050963401794434, "learning_rate": 2.2136933086432956e-06, "loss": 0.0022, "step": 22630 }, { "epoch": 7.06, "grad_norm": 0.0779469683766365, "learning_rate": 2.2102784814699483e-06, "loss": 0.0015, "step": 22635 }, { "epoch": 7.06, "grad_norm": 0.17603763937950134, "learning_rate": 2.2068659629015433e-06, "loss": 0.0025, "step": 22640 }, { "epoch": 7.06, "grad_norm": 0.15554018318653107, "learning_rate": 2.203455753949426e-06, "loss": 0.002, "step": 22645 }, { "epoch": 7.06, "grad_norm": 0.06508053839206696, "learning_rate": 2.20004785562427e-06, "loss": 0.0019, "step": 22650 }, { "epoch": 7.07, "grad_norm": 0.10713350772857666, "learning_rate": 2.196642268936057e-06, "loss": 0.0016, "step": 22655 }, { "epoch": 7.07, "grad_norm": 0.15166723728179932, "learning_rate": 2.193238994894086e-06, "loss": 0.0015, "step": 22660 }, { "epoch": 7.07, "grad_norm": 0.07224074751138687, "learning_rate": 2.1898380345069703e-06, "loss": 0.0015, "step": 22665 }, { "epoch": 7.07, "grad_norm": 0.024913206696510315, "learning_rate": 2.1864393887826352e-06, "loss": 0.0015, "step": 22670 }, { "epoch": 7.07, "grad_norm": 0.4481954276561737, "learning_rate": 2.183043058728329e-06, "loss": 0.003, "step": 22675 }, { "epoch": 7.07, "grad_norm": 0.32068902254104614, "learning_rate": 2.179649045350599e-06, "loss": 0.0019, "step": 22680 }, { "epoch": 7.08, "grad_norm": 0.07001259177923203, "learning_rate": 2.1762573496553176e-06, "loss": 0.0017, "step": 22685 }, { "epoch": 7.08, "grad_norm": 0.11420892924070358, "learning_rate": 2.172867972647664e-06, "loss": 0.0022, "step": 22690 }, { "epoch": 7.08, "grad_norm": 0.17854531109333038, "learning_rate": 2.169480915332136e-06, "loss": 0.0021, "step": 22695 }, { "epoch": 7.08, "grad_norm": 0.21439093351364136, "learning_rate": 2.166096178712539e-06, "loss": 0.002, "step": 22700 }, { "epoch": 7.08, "grad_norm": 0.1033504381775856, "learning_rate": 2.1627137637919916e-06, "loss": 0.0013, "step": 22705 }, { "epoch": 7.08, "grad_norm": 0.059978220611810684, "learning_rate": 2.1593336715729264e-06, "loss": 0.0022, "step": 22710 }, { "epoch": 7.08, "grad_norm": 0.12991735339164734, "learning_rate": 2.155955903057084e-06, "loss": 0.0018, "step": 22715 }, { "epoch": 7.09, "grad_norm": 0.14945195615291595, "learning_rate": 2.1525804592455226e-06, "loss": 0.0017, "step": 22720 }, { "epoch": 7.09, "grad_norm": 0.10099814832210541, "learning_rate": 2.149207341138603e-06, "loss": 0.0016, "step": 22725 }, { "epoch": 7.09, "grad_norm": 0.04819072037935257, "learning_rate": 2.145836549736003e-06, "loss": 0.0018, "step": 22730 }, { "epoch": 7.09, "grad_norm": 0.14807431399822235, "learning_rate": 2.1424680860367086e-06, "loss": 0.0018, "step": 22735 }, { "epoch": 7.09, "grad_norm": 0.1759273111820221, "learning_rate": 2.1391019510390177e-06, "loss": 0.0015, "step": 22740 }, { "epoch": 7.09, "grad_norm": 0.11430688947439194, "learning_rate": 2.135738145740536e-06, "loss": 0.0023, "step": 22745 }, { "epoch": 7.1, "grad_norm": 0.09345373511314392, "learning_rate": 2.1323766711381776e-06, "loss": 0.0013, "step": 22750 }, { "epoch": 7.1, "grad_norm": 0.1269146054983139, "learning_rate": 2.12901752822817e-06, "loss": 0.0027, "step": 22755 }, { "epoch": 7.1, "grad_norm": 0.22538627684116364, "learning_rate": 2.1256607180060464e-06, "loss": 0.0015, "step": 22760 }, { "epoch": 7.1, "grad_norm": 0.14450940489768982, "learning_rate": 2.122306241466651e-06, "loss": 0.0018, "step": 22765 }, { "epoch": 7.1, "grad_norm": 0.06322424858808517, "learning_rate": 2.1189540996041314e-06, "loss": 0.0012, "step": 22770 }, { "epoch": 7.1, "grad_norm": 0.14859558641910553, "learning_rate": 2.1156042934119514e-06, "loss": 0.0022, "step": 22775 }, { "epoch": 7.11, "grad_norm": 0.08851836621761322, "learning_rate": 2.112256823882878e-06, "loss": 0.0015, "step": 22780 }, { "epoch": 7.11, "grad_norm": 0.08810979872941971, "learning_rate": 2.108911692008978e-06, "loss": 0.0018, "step": 22785 }, { "epoch": 7.11, "grad_norm": 0.3387254476547241, "learning_rate": 2.1055688987816414e-06, "loss": 0.0021, "step": 22790 }, { "epoch": 7.11, "grad_norm": 0.02320532500743866, "learning_rate": 2.1022284451915554e-06, "loss": 0.0014, "step": 22795 }, { "epoch": 7.11, "grad_norm": 0.09600692987442017, "learning_rate": 2.0988903322287137e-06, "loss": 0.0013, "step": 22800 }, { "epoch": 7.11, "grad_norm": 0.41077396273612976, "learning_rate": 2.095554560882418e-06, "loss": 0.0041, "step": 22805 }, { "epoch": 7.11, "grad_norm": 0.08172262459993362, "learning_rate": 2.092221132141278e-06, "loss": 0.0018, "step": 22810 }, { "epoch": 7.12, "grad_norm": 0.16185802221298218, "learning_rate": 2.088890046993204e-06, "loss": 0.0015, "step": 22815 }, { "epoch": 7.12, "grad_norm": 0.44276872277259827, "learning_rate": 2.0855613064254198e-06, "loss": 0.0022, "step": 22820 }, { "epoch": 7.12, "grad_norm": 0.169739231467247, "learning_rate": 2.0822349114244434e-06, "loss": 0.0016, "step": 22825 }, { "epoch": 7.12, "grad_norm": 0.06298299133777618, "learning_rate": 2.0789108629761033e-06, "loss": 0.0009, "step": 22830 }, { "epoch": 7.12, "grad_norm": 0.10760955512523651, "learning_rate": 2.075589162065538e-06, "loss": 0.0021, "step": 22835 }, { "epoch": 7.12, "grad_norm": 0.17187266051769257, "learning_rate": 2.072269809677183e-06, "loss": 0.0028, "step": 22840 }, { "epoch": 7.13, "grad_norm": 0.07554920017719269, "learning_rate": 2.0689528067947805e-06, "loss": 0.0015, "step": 22845 }, { "epoch": 7.13, "grad_norm": 0.20058053731918335, "learning_rate": 2.0656381544013736e-06, "loss": 0.002, "step": 22850 }, { "epoch": 7.13, "grad_norm": 0.1426592618227005, "learning_rate": 2.0623258534793167e-06, "loss": 0.0033, "step": 22855 }, { "epoch": 7.13, "grad_norm": 0.15479739010334015, "learning_rate": 2.0590159050102543e-06, "loss": 0.0018, "step": 22860 }, { "epoch": 7.13, "grad_norm": 0.13816289603710175, "learning_rate": 2.055708309975144e-06, "loss": 0.002, "step": 22865 }, { "epoch": 7.13, "grad_norm": 0.29392099380493164, "learning_rate": 2.05240306935424e-06, "loss": 0.0024, "step": 22870 }, { "epoch": 7.13, "grad_norm": 0.15190020203590393, "learning_rate": 2.0491001841271073e-06, "loss": 0.0017, "step": 22875 }, { "epoch": 7.14, "grad_norm": 0.07080844044685364, "learning_rate": 2.0457996552726058e-06, "loss": 0.0011, "step": 22880 }, { "epoch": 7.14, "grad_norm": 0.04252322018146515, "learning_rate": 2.042501483768897e-06, "loss": 0.002, "step": 22885 }, { "epoch": 7.14, "grad_norm": 0.06752241402864456, "learning_rate": 2.039205670593448e-06, "loss": 0.0019, "step": 22890 }, { "epoch": 7.14, "grad_norm": 0.11792897433042526, "learning_rate": 2.0359122167230194e-06, "loss": 0.0017, "step": 22895 }, { "epoch": 7.14, "grad_norm": 0.24622458219528198, "learning_rate": 2.03262112313368e-06, "loss": 0.0014, "step": 22900 }, { "epoch": 7.14, "grad_norm": 0.08839282393455505, "learning_rate": 2.0293323908007957e-06, "loss": 0.0018, "step": 22905 }, { "epoch": 7.15, "grad_norm": 0.16991908848285675, "learning_rate": 2.026046020699035e-06, "loss": 0.0024, "step": 22910 }, { "epoch": 7.15, "grad_norm": 0.055997684597969055, "learning_rate": 2.0227620138023605e-06, "loss": 0.0021, "step": 22915 }, { "epoch": 7.15, "grad_norm": 0.25313636660575867, "learning_rate": 2.019480371084045e-06, "loss": 0.0016, "step": 22920 }, { "epoch": 7.15, "grad_norm": 0.35071924328804016, "learning_rate": 2.016201093516653e-06, "loss": 0.002, "step": 22925 }, { "epoch": 7.15, "grad_norm": 0.10866184532642365, "learning_rate": 2.0129241820720457e-06, "loss": 0.0024, "step": 22930 }, { "epoch": 7.15, "grad_norm": 0.04748568683862686, "learning_rate": 2.009649637721388e-06, "loss": 0.0014, "step": 22935 }, { "epoch": 7.15, "grad_norm": 0.13335078954696655, "learning_rate": 2.0063774614351427e-06, "loss": 0.0025, "step": 22940 }, { "epoch": 7.16, "grad_norm": 0.08357970416545868, "learning_rate": 2.0031076541830696e-06, "loss": 0.0015, "step": 22945 }, { "epoch": 7.16, "grad_norm": 0.12237964570522308, "learning_rate": 1.9998402169342266e-06, "loss": 0.0019, "step": 22950 }, { "epoch": 7.16, "grad_norm": 0.15404756367206573, "learning_rate": 1.996575150656971e-06, "loss": 0.0021, "step": 22955 }, { "epoch": 7.16, "grad_norm": 0.09013190865516663, "learning_rate": 1.9933124563189523e-06, "loss": 0.0016, "step": 22960 }, { "epoch": 7.16, "grad_norm": 0.09105832129716873, "learning_rate": 1.9900521348871273e-06, "loss": 0.0019, "step": 22965 }, { "epoch": 7.16, "grad_norm": 0.05199562385678291, "learning_rate": 1.9867941873277365e-06, "loss": 0.0017, "step": 22970 }, { "epoch": 7.17, "grad_norm": 0.0494653545320034, "learning_rate": 1.9835386146063263e-06, "loss": 0.0012, "step": 22975 }, { "epoch": 7.17, "grad_norm": 0.1796211451292038, "learning_rate": 1.980285417687735e-06, "loss": 0.0014, "step": 22980 }, { "epoch": 7.17, "grad_norm": 0.03675558790564537, "learning_rate": 1.977034597536098e-06, "loss": 0.002, "step": 22985 }, { "epoch": 7.17, "grad_norm": 0.04541827738285065, "learning_rate": 1.9737861551148474e-06, "loss": 0.0011, "step": 22990 }, { "epoch": 7.17, "grad_norm": 0.1748480200767517, "learning_rate": 1.970540091386709e-06, "loss": 0.0013, "step": 22995 }, { "epoch": 7.17, "grad_norm": 0.2021503746509552, "learning_rate": 1.9672964073137037e-06, "loss": 0.0025, "step": 23000 }, { "epoch": 7.18, "grad_norm": 0.09402123093605042, "learning_rate": 1.9640551038571476e-06, "loss": 0.0016, "step": 23005 }, { "epoch": 7.18, "grad_norm": 0.0501425638794899, "learning_rate": 1.9608161819776527e-06, "loss": 0.0021, "step": 23010 }, { "epoch": 7.18, "grad_norm": 0.04663001373410225, "learning_rate": 1.9575796426351233e-06, "loss": 0.0011, "step": 23015 }, { "epoch": 7.18, "grad_norm": 0.10955876111984253, "learning_rate": 1.9543454867887568e-06, "loss": 0.0016, "step": 23020 }, { "epoch": 7.18, "grad_norm": 0.24366258084774017, "learning_rate": 1.9511137153970473e-06, "loss": 0.0024, "step": 23025 }, { "epoch": 7.18, "grad_norm": 0.04715977981686592, "learning_rate": 1.9478843294177785e-06, "loss": 0.0012, "step": 23030 }, { "epoch": 7.18, "grad_norm": 0.2500501871109009, "learning_rate": 1.9446573298080308e-06, "loss": 0.0022, "step": 23035 }, { "epoch": 7.19, "grad_norm": 0.07603548467159271, "learning_rate": 1.9414327175241755e-06, "loss": 0.0011, "step": 23040 }, { "epoch": 7.19, "grad_norm": 0.08409067988395691, "learning_rate": 1.938210493521875e-06, "loss": 0.0017, "step": 23045 }, { "epoch": 7.19, "grad_norm": 0.3819182217121124, "learning_rate": 1.934990658756086e-06, "loss": 0.0022, "step": 23050 }, { "epoch": 7.19, "grad_norm": 0.10924136638641357, "learning_rate": 1.9317732141810587e-06, "loss": 0.0014, "step": 23055 }, { "epoch": 7.19, "grad_norm": 0.11690499633550644, "learning_rate": 1.92855816075033e-06, "loss": 0.0023, "step": 23060 }, { "epoch": 7.19, "grad_norm": 0.1321161836385727, "learning_rate": 1.925345499416732e-06, "loss": 0.0015, "step": 23065 }, { "epoch": 7.2, "grad_norm": 0.12577715516090393, "learning_rate": 1.9221352311323893e-06, "loss": 0.0017, "step": 23070 }, { "epoch": 7.2, "grad_norm": 0.1400633454322815, "learning_rate": 1.9189273568487068e-06, "loss": 0.0023, "step": 23075 }, { "epoch": 7.2, "grad_norm": 0.16724464297294617, "learning_rate": 1.915721877516397e-06, "loss": 0.0018, "step": 23080 }, { "epoch": 7.2, "grad_norm": 0.29586324095726013, "learning_rate": 1.9125187940854484e-06, "loss": 0.0022, "step": 23085 }, { "epoch": 7.2, "grad_norm": 0.1833949089050293, "learning_rate": 1.909318107505146e-06, "loss": 0.0013, "step": 23090 }, { "epoch": 7.2, "grad_norm": 0.10151824355125427, "learning_rate": 1.9061198187240626e-06, "loss": 0.0015, "step": 23095 }, { "epoch": 7.2, "grad_norm": 0.2685157358646393, "learning_rate": 1.9029239286900614e-06, "loss": 0.0019, "step": 23100 }, { "epoch": 7.21, "grad_norm": 0.13566923141479492, "learning_rate": 1.8997304383502956e-06, "loss": 0.0016, "step": 23105 }, { "epoch": 7.21, "grad_norm": 0.03023446351289749, "learning_rate": 1.8965393486512019e-06, "loss": 0.0011, "step": 23110 }, { "epoch": 7.21, "grad_norm": 0.10596200823783875, "learning_rate": 1.893350660538511e-06, "loss": 0.0016, "step": 23115 }, { "epoch": 7.21, "grad_norm": 0.15463019907474518, "learning_rate": 1.8901643749572373e-06, "loss": 0.0022, "step": 23120 }, { "epoch": 7.21, "grad_norm": 0.9097394347190857, "learning_rate": 1.886980492851691e-06, "loss": 0.0032, "step": 23125 }, { "epoch": 7.21, "grad_norm": 0.10497675836086273, "learning_rate": 1.8837990151654639e-06, "loss": 0.0016, "step": 23130 }, { "epoch": 7.22, "grad_norm": 0.04598972573876381, "learning_rate": 1.880619942841435e-06, "loss": 0.0023, "step": 23135 }, { "epoch": 7.22, "grad_norm": 0.15432889759540558, "learning_rate": 1.877443276821771e-06, "loss": 0.0019, "step": 23140 }, { "epoch": 7.22, "grad_norm": 0.08259057253599167, "learning_rate": 1.8742690180479307e-06, "loss": 0.0017, "step": 23145 }, { "epoch": 7.22, "grad_norm": 0.05540916323661804, "learning_rate": 1.8710971674606493e-06, "loss": 0.0011, "step": 23150 }, { "epoch": 7.22, "grad_norm": 0.15354594588279724, "learning_rate": 1.867927725999956e-06, "loss": 0.0012, "step": 23155 }, { "epoch": 7.22, "grad_norm": 0.13194885849952698, "learning_rate": 1.8647606946051634e-06, "loss": 0.0025, "step": 23160 }, { "epoch": 7.23, "grad_norm": 0.1791795939207077, "learning_rate": 1.8615960742148699e-06, "loss": 0.0015, "step": 23165 }, { "epoch": 7.23, "grad_norm": 0.09989296644926071, "learning_rate": 1.8584338657669631e-06, "loss": 0.0019, "step": 23170 }, { "epoch": 7.23, "grad_norm": 0.32999667525291443, "learning_rate": 1.8552740701986106e-06, "loss": 0.0017, "step": 23175 }, { "epoch": 7.23, "grad_norm": 0.11784721165895462, "learning_rate": 1.8521166884462693e-06, "loss": 0.0011, "step": 23180 }, { "epoch": 7.23, "grad_norm": 0.1490192860364914, "learning_rate": 1.8489617214456746e-06, "loss": 0.002, "step": 23185 }, { "epoch": 7.23, "grad_norm": 0.05966825410723686, "learning_rate": 1.8458091701318504e-06, "loss": 0.0016, "step": 23190 }, { "epoch": 7.23, "grad_norm": 0.08577445149421692, "learning_rate": 1.842659035439106e-06, "loss": 0.0013, "step": 23195 }, { "epoch": 7.24, "grad_norm": 0.18630562722682953, "learning_rate": 1.8395113183010316e-06, "loss": 0.0026, "step": 23200 }, { "epoch": 7.24, "grad_norm": 0.08478522300720215, "learning_rate": 1.8363660196504995e-06, "loss": 0.0014, "step": 23205 }, { "epoch": 7.24, "grad_norm": 0.05588646978139877, "learning_rate": 1.8332231404196732e-06, "loss": 0.0013, "step": 23210 }, { "epoch": 7.24, "grad_norm": 0.15565073490142822, "learning_rate": 1.8300826815399941e-06, "loss": 0.0013, "step": 23215 }, { "epoch": 7.24, "grad_norm": 0.1549144834280014, "learning_rate": 1.826944643942181e-06, "loss": 0.0021, "step": 23220 }, { "epoch": 7.24, "grad_norm": 0.25133708119392395, "learning_rate": 1.8238090285562416e-06, "loss": 0.002, "step": 23225 }, { "epoch": 7.25, "grad_norm": 0.12042286247015, "learning_rate": 1.8206758363114652e-06, "loss": 0.0013, "step": 23230 }, { "epoch": 7.25, "grad_norm": 0.06990408152341843, "learning_rate": 1.8175450681364216e-06, "loss": 0.0019, "step": 23235 }, { "epoch": 7.25, "grad_norm": 0.1288883239030838, "learning_rate": 1.8144167249589629e-06, "loss": 0.0017, "step": 23240 }, { "epoch": 7.25, "grad_norm": 0.13696298003196716, "learning_rate": 1.8112908077062208e-06, "loss": 0.0015, "step": 23245 }, { "epoch": 7.25, "grad_norm": 0.1362457573413849, "learning_rate": 1.808167317304611e-06, "loss": 0.0019, "step": 23250 }, { "epoch": 7.25, "grad_norm": 0.12248032540082932, "learning_rate": 1.8050462546798275e-06, "loss": 0.0015, "step": 23255 }, { "epoch": 7.25, "grad_norm": 0.21087703108787537, "learning_rate": 1.8019276207568471e-06, "loss": 0.0018, "step": 23260 }, { "epoch": 7.26, "grad_norm": 0.09722114354372025, "learning_rate": 1.7988114164599236e-06, "loss": 0.002, "step": 23265 }, { "epoch": 7.26, "grad_norm": 0.07935526967048645, "learning_rate": 1.7956976427125927e-06, "loss": 0.0016, "step": 23270 }, { "epoch": 7.26, "grad_norm": 0.3887324929237366, "learning_rate": 1.7925863004376709e-06, "loss": 0.0021, "step": 23275 }, { "epoch": 7.26, "grad_norm": 0.18374229967594147, "learning_rate": 1.7894773905572517e-06, "loss": 0.002, "step": 23280 }, { "epoch": 7.26, "grad_norm": 0.05458586663007736, "learning_rate": 1.78637091399271e-06, "loss": 0.0016, "step": 23285 }, { "epoch": 7.26, "grad_norm": 0.26444464921951294, "learning_rate": 1.7832668716646961e-06, "loss": 0.0011, "step": 23290 }, { "epoch": 7.27, "grad_norm": 0.15571677684783936, "learning_rate": 1.7801652644931433e-06, "loss": 0.0021, "step": 23295 }, { "epoch": 7.27, "grad_norm": 0.1157768964767456, "learning_rate": 1.7770660933972606e-06, "loss": 0.0013, "step": 23300 }, { "epoch": 7.27, "grad_norm": 0.25759342312812805, "learning_rate": 1.7739693592955353e-06, "loss": 0.0034, "step": 23305 }, { "epoch": 7.27, "grad_norm": 0.12619584798812866, "learning_rate": 1.7708750631057325e-06, "loss": 0.0014, "step": 23310 }, { "epoch": 7.27, "grad_norm": 0.23353789746761322, "learning_rate": 1.7677832057448952e-06, "loss": 0.0026, "step": 23315 }, { "epoch": 7.27, "grad_norm": 0.11836845427751541, "learning_rate": 1.7646937881293425e-06, "loss": 0.002, "step": 23320 }, { "epoch": 7.27, "grad_norm": 0.13385270535945892, "learning_rate": 1.761606811174672e-06, "loss": 0.0028, "step": 23325 }, { "epoch": 7.28, "grad_norm": 0.1433686912059784, "learning_rate": 1.7585222757957576e-06, "loss": 0.002, "step": 23330 }, { "epoch": 7.28, "grad_norm": 0.06003287434577942, "learning_rate": 1.755440182906748e-06, "loss": 0.0012, "step": 23335 }, { "epoch": 7.28, "grad_norm": 0.04114048182964325, "learning_rate": 1.7523605334210713e-06, "loss": 0.0014, "step": 23340 }, { "epoch": 7.28, "grad_norm": 0.2964211702346802, "learning_rate": 1.7492833282514278e-06, "loss": 0.002, "step": 23345 }, { "epoch": 7.28, "grad_norm": 0.15680080652236938, "learning_rate": 1.7462085683097952e-06, "loss": 0.0024, "step": 23350 }, { "epoch": 7.28, "grad_norm": 0.07513896375894547, "learning_rate": 1.743136254507427e-06, "loss": 0.0017, "step": 23355 }, { "epoch": 7.29, "grad_norm": 0.38666918873786926, "learning_rate": 1.740066387754853e-06, "loss": 0.0029, "step": 23360 }, { "epoch": 7.29, "grad_norm": 0.20412775874137878, "learning_rate": 1.7369989689618683e-06, "loss": 0.0017, "step": 23365 }, { "epoch": 7.29, "grad_norm": 0.18670332431793213, "learning_rate": 1.7339339990375592e-06, "loss": 0.0021, "step": 23370 }, { "epoch": 7.29, "grad_norm": 0.09254976361989975, "learning_rate": 1.7308714788902726e-06, "loss": 0.0025, "step": 23375 }, { "epoch": 7.29, "grad_norm": 0.22622790932655334, "learning_rate": 1.7278114094276343e-06, "loss": 0.0014, "step": 23380 }, { "epoch": 7.29, "grad_norm": 0.4574730098247528, "learning_rate": 1.7247537915565449e-06, "loss": 0.0026, "step": 23385 }, { "epoch": 7.3, "grad_norm": 0.08487337082624435, "learning_rate": 1.7216986261831747e-06, "loss": 0.0016, "step": 23390 }, { "epoch": 7.3, "grad_norm": 0.2237267643213272, "learning_rate": 1.718645914212973e-06, "loss": 0.0032, "step": 23395 }, { "epoch": 7.3, "grad_norm": 0.08069462329149246, "learning_rate": 1.715595656550655e-06, "loss": 0.0018, "step": 23400 }, { "epoch": 7.3, "grad_norm": 0.10751515626907349, "learning_rate": 1.7125478541002117e-06, "loss": 0.0034, "step": 23405 }, { "epoch": 7.3, "grad_norm": 0.20734716951847076, "learning_rate": 1.7095025077649053e-06, "loss": 0.002, "step": 23410 }, { "epoch": 7.3, "grad_norm": 0.13567696511745453, "learning_rate": 1.7064596184472759e-06, "loss": 0.0012, "step": 23415 }, { "epoch": 7.3, "grad_norm": 0.17350482940673828, "learning_rate": 1.7034191870491301e-06, "loss": 0.0022, "step": 23420 }, { "epoch": 7.31, "grad_norm": 0.13199970126152039, "learning_rate": 1.7003812144715459e-06, "loss": 0.0016, "step": 23425 }, { "epoch": 7.31, "grad_norm": 0.10658343881368637, "learning_rate": 1.6973457016148764e-06, "loss": 0.0024, "step": 23430 }, { "epoch": 7.31, "grad_norm": 0.18849949538707733, "learning_rate": 1.6943126493787377e-06, "loss": 0.002, "step": 23435 }, { "epoch": 7.31, "grad_norm": 0.031828440725803375, "learning_rate": 1.6912820586620237e-06, "loss": 0.0012, "step": 23440 }, { "epoch": 7.31, "grad_norm": 0.06511376798152924, "learning_rate": 1.6882539303628987e-06, "loss": 0.0021, "step": 23445 }, { "epoch": 7.31, "grad_norm": 0.14078600704669952, "learning_rate": 1.6852282653787943e-06, "loss": 0.0021, "step": 23450 }, { "epoch": 7.32, "grad_norm": 0.09009654074907303, "learning_rate": 1.6822050646064113e-06, "loss": 0.0011, "step": 23455 }, { "epoch": 7.32, "grad_norm": 0.11348678171634674, "learning_rate": 1.6791843289417275e-06, "loss": 0.0021, "step": 23460 }, { "epoch": 7.32, "grad_norm": 0.13618040084838867, "learning_rate": 1.6761660592799822e-06, "loss": 0.0029, "step": 23465 }, { "epoch": 7.32, "grad_norm": 0.16246213018894196, "learning_rate": 1.6731502565156875e-06, "loss": 0.0018, "step": 23470 }, { "epoch": 7.32, "grad_norm": 0.04980913922190666, "learning_rate": 1.670136921542621e-06, "loss": 0.0031, "step": 23475 }, { "epoch": 7.32, "grad_norm": 0.22228434681892395, "learning_rate": 1.6671260552538315e-06, "loss": 0.0017, "step": 23480 }, { "epoch": 7.32, "grad_norm": 0.07236877083778381, "learning_rate": 1.6641176585416375e-06, "loss": 0.001, "step": 23485 }, { "epoch": 7.33, "grad_norm": 0.24225230515003204, "learning_rate": 1.6611117322976222e-06, "loss": 0.0024, "step": 23490 }, { "epoch": 7.33, "grad_norm": 0.2151099145412445, "learning_rate": 1.658108277412639e-06, "loss": 0.002, "step": 23495 }, { "epoch": 7.33, "grad_norm": 0.08060291409492493, "learning_rate": 1.655107294776811e-06, "loss": 0.0022, "step": 23500 }, { "epoch": 7.33, "grad_norm": 0.08732081949710846, "learning_rate": 1.652108785279526e-06, "loss": 0.0016, "step": 23505 }, { "epoch": 7.33, "grad_norm": 0.3340969979763031, "learning_rate": 1.649112749809435e-06, "loss": 0.0027, "step": 23510 }, { "epoch": 7.33, "grad_norm": 0.1414659023284912, "learning_rate": 1.6461191892544615e-06, "loss": 0.0023, "step": 23515 }, { "epoch": 7.34, "grad_norm": 0.062250565737485886, "learning_rate": 1.6431281045017945e-06, "loss": 0.0014, "step": 23520 }, { "epoch": 7.34, "grad_norm": 0.18974672257900238, "learning_rate": 1.640139496437887e-06, "loss": 0.0015, "step": 23525 }, { "epoch": 7.34, "grad_norm": 0.09417407959699631, "learning_rate": 1.6371533659484606e-06, "loss": 0.0013, "step": 23530 }, { "epoch": 7.34, "grad_norm": 0.13603679835796356, "learning_rate": 1.6341697139185008e-06, "loss": 0.0017, "step": 23535 }, { "epoch": 7.34, "grad_norm": 0.030808500945568085, "learning_rate": 1.6311885412322602e-06, "loss": 0.002, "step": 23540 }, { "epoch": 7.34, "grad_norm": 0.024461254477500916, "learning_rate": 1.6282098487732545e-06, "loss": 0.0011, "step": 23545 }, { "epoch": 7.35, "grad_norm": 0.17799636721611023, "learning_rate": 1.6252336374242649e-06, "loss": 0.0016, "step": 23550 }, { "epoch": 7.35, "grad_norm": 0.13428995013237, "learning_rate": 1.6222599080673395e-06, "loss": 0.002, "step": 23555 }, { "epoch": 7.35, "grad_norm": 0.07780268788337708, "learning_rate": 1.6192886615837888e-06, "loss": 0.0012, "step": 23560 }, { "epoch": 7.35, "grad_norm": 0.38052231073379517, "learning_rate": 1.6163198988541862e-06, "loss": 0.002, "step": 23565 }, { "epoch": 7.35, "grad_norm": 0.10485098510980606, "learning_rate": 1.6133536207583734e-06, "loss": 0.0011, "step": 23570 }, { "epoch": 7.35, "grad_norm": 0.14786922931671143, "learning_rate": 1.6103898281754504e-06, "loss": 0.0016, "step": 23575 }, { "epoch": 7.35, "grad_norm": 0.11026573926210403, "learning_rate": 1.6074285219837848e-06, "loss": 0.0011, "step": 23580 }, { "epoch": 7.36, "grad_norm": 0.22885969281196594, "learning_rate": 1.6044697030610045e-06, "loss": 0.0021, "step": 23585 }, { "epoch": 7.36, "grad_norm": 0.6174030900001526, "learning_rate": 1.6015133722840026e-06, "loss": 0.0017, "step": 23590 }, { "epoch": 7.36, "grad_norm": 0.07516336441040039, "learning_rate": 1.5985595305289337e-06, "loss": 0.0018, "step": 23595 }, { "epoch": 7.36, "grad_norm": 0.4718790054321289, "learning_rate": 1.5956081786712129e-06, "loss": 0.0026, "step": 23600 }, { "epoch": 7.36, "grad_norm": 0.08145270496606827, "learning_rate": 1.592659317585521e-06, "loss": 0.0018, "step": 23605 }, { "epoch": 7.36, "grad_norm": 0.14953170716762543, "learning_rate": 1.5897129481457995e-06, "loss": 0.0012, "step": 23610 }, { "epoch": 7.37, "grad_norm": 0.07766649127006531, "learning_rate": 1.5867690712252459e-06, "loss": 0.0014, "step": 23615 }, { "epoch": 7.37, "grad_norm": 0.13531377911567688, "learning_rate": 1.5838276876963288e-06, "loss": 0.003, "step": 23620 }, { "epoch": 7.37, "grad_norm": 0.2835178077220917, "learning_rate": 1.580888798430772e-06, "loss": 0.0012, "step": 23625 }, { "epoch": 7.37, "grad_norm": 0.07614782452583313, "learning_rate": 1.5779524042995597e-06, "loss": 0.002, "step": 23630 }, { "epoch": 7.37, "grad_norm": 0.10155941545963287, "learning_rate": 1.5750185061729395e-06, "loss": 0.0016, "step": 23635 }, { "epoch": 7.37, "grad_norm": 0.09648201614618301, "learning_rate": 1.5720871049204156e-06, "loss": 0.0019, "step": 23640 }, { "epoch": 7.37, "grad_norm": 0.1506904512643814, "learning_rate": 1.5691582014107553e-06, "loss": 0.002, "step": 23645 }, { "epoch": 7.38, "grad_norm": 0.053801387548446655, "learning_rate": 1.5662317965119877e-06, "loss": 0.0016, "step": 23650 }, { "epoch": 7.38, "grad_norm": 0.37937116622924805, "learning_rate": 1.5633078910913934e-06, "loss": 0.0023, "step": 23655 }, { "epoch": 7.38, "grad_norm": 0.3482864201068878, "learning_rate": 1.560386486015516e-06, "loss": 0.0019, "step": 23660 }, { "epoch": 7.38, "grad_norm": 0.16894619166851044, "learning_rate": 1.557467582150165e-06, "loss": 0.0022, "step": 23665 }, { "epoch": 7.38, "grad_norm": 0.11991281807422638, "learning_rate": 1.5545511803603997e-06, "loss": 0.0016, "step": 23670 }, { "epoch": 7.38, "grad_norm": 0.18368425965309143, "learning_rate": 1.5516372815105429e-06, "loss": 0.0028, "step": 23675 }, { "epoch": 7.39, "grad_norm": 0.12280163913965225, "learning_rate": 1.548725886464172e-06, "loss": 0.0022, "step": 23680 }, { "epoch": 7.39, "grad_norm": 0.022240638732910156, "learning_rate": 1.5458169960841264e-06, "loss": 0.0018, "step": 23685 }, { "epoch": 7.39, "grad_norm": 0.4456542134284973, "learning_rate": 1.5429106112324976e-06, "loss": 0.0019, "step": 23690 }, { "epoch": 7.39, "grad_norm": 0.06904762983322144, "learning_rate": 1.5400067327706392e-06, "loss": 0.0014, "step": 23695 }, { "epoch": 7.39, "grad_norm": 0.1527893841266632, "learning_rate": 1.537105361559158e-06, "loss": 0.0027, "step": 23700 }, { "epoch": 7.39, "grad_norm": 0.07344002276659012, "learning_rate": 1.5342064984579263e-06, "loss": 0.0014, "step": 23705 }, { "epoch": 7.4, "grad_norm": 0.10091239213943481, "learning_rate": 1.5313101443260637e-06, "loss": 0.0019, "step": 23710 }, { "epoch": 7.4, "grad_norm": 0.0808711051940918, "learning_rate": 1.528416300021951e-06, "loss": 0.0017, "step": 23715 }, { "epoch": 7.4, "grad_norm": 0.1359367072582245, "learning_rate": 1.5255249664032256e-06, "loss": 0.0026, "step": 23720 }, { "epoch": 7.4, "grad_norm": 0.2536752223968506, "learning_rate": 1.5226361443267735e-06, "loss": 0.0016, "step": 23725 }, { "epoch": 7.4, "grad_norm": 0.1135883554816246, "learning_rate": 1.519749834648746e-06, "loss": 0.0016, "step": 23730 }, { "epoch": 7.4, "grad_norm": 0.101815365254879, "learning_rate": 1.516866038224545e-06, "loss": 0.0017, "step": 23735 }, { "epoch": 7.4, "grad_norm": 0.06885939836502075, "learning_rate": 1.5139847559088283e-06, "loss": 0.0012, "step": 23740 }, { "epoch": 7.41, "grad_norm": 0.3209657073020935, "learning_rate": 1.5111059885555057e-06, "loss": 0.0023, "step": 23745 }, { "epoch": 7.41, "grad_norm": 0.22403649985790253, "learning_rate": 1.5082297370177502e-06, "loss": 0.0027, "step": 23750 }, { "epoch": 7.41, "grad_norm": 0.20601578056812286, "learning_rate": 1.5053560021479829e-06, "loss": 0.0029, "step": 23755 }, { "epoch": 7.41, "grad_norm": 0.07541821897029877, "learning_rate": 1.5024847847978763e-06, "loss": 0.0014, "step": 23760 }, { "epoch": 7.41, "grad_norm": 0.08014453947544098, "learning_rate": 1.4996160858183617e-06, "loss": 0.0018, "step": 23765 }, { "epoch": 7.41, "grad_norm": 0.10281440615653992, "learning_rate": 1.4967499060596224e-06, "loss": 0.0021, "step": 23770 }, { "epoch": 7.42, "grad_norm": 0.1816490888595581, "learning_rate": 1.4938862463710956e-06, "loss": 0.0023, "step": 23775 }, { "epoch": 7.42, "grad_norm": 0.05056086555123329, "learning_rate": 1.4910251076014714e-06, "loss": 0.0023, "step": 23780 }, { "epoch": 7.42, "grad_norm": 0.05771959200501442, "learning_rate": 1.4881664905986903e-06, "loss": 0.0014, "step": 23785 }, { "epoch": 7.42, "grad_norm": 0.06854867935180664, "learning_rate": 1.4853103962099558e-06, "loss": 0.0016, "step": 23790 }, { "epoch": 7.42, "grad_norm": 0.1187528446316719, "learning_rate": 1.4824568252817062e-06, "loss": 0.0017, "step": 23795 }, { "epoch": 7.42, "grad_norm": 0.01285573374480009, "learning_rate": 1.4796057786596464e-06, "loss": 0.0018, "step": 23800 }, { "epoch": 7.42, "grad_norm": 0.16395248472690582, "learning_rate": 1.4767572571887268e-06, "loss": 0.0032, "step": 23805 }, { "epoch": 7.43, "grad_norm": 0.08021502941846848, "learning_rate": 1.4739112617131523e-06, "loss": 0.0016, "step": 23810 }, { "epoch": 7.43, "grad_norm": 0.07731685787439346, "learning_rate": 1.4710677930763773e-06, "loss": 0.0014, "step": 23815 }, { "epoch": 7.43, "grad_norm": 0.17272412776947021, "learning_rate": 1.4682268521211075e-06, "loss": 0.0014, "step": 23820 }, { "epoch": 7.43, "grad_norm": 0.1399872601032257, "learning_rate": 1.4653884396892992e-06, "loss": 0.002, "step": 23825 }, { "epoch": 7.43, "grad_norm": 0.11964300274848938, "learning_rate": 1.462552556622162e-06, "loss": 0.001, "step": 23830 }, { "epoch": 7.43, "grad_norm": 0.08896835148334503, "learning_rate": 1.4597192037601526e-06, "loss": 0.0019, "step": 23835 }, { "epoch": 7.44, "grad_norm": 0.11974415928125381, "learning_rate": 1.4568883819429803e-06, "loss": 0.0016, "step": 23840 }, { "epoch": 7.44, "grad_norm": 0.25031784176826477, "learning_rate": 1.4540600920096003e-06, "loss": 0.0027, "step": 23845 }, { "epoch": 7.44, "grad_norm": 0.06933608651161194, "learning_rate": 1.451234334798224e-06, "loss": 0.0027, "step": 23850 }, { "epoch": 7.44, "grad_norm": 0.16940376162528992, "learning_rate": 1.448411111146305e-06, "loss": 0.0016, "step": 23855 }, { "epoch": 7.44, "grad_norm": 0.07995875924825668, "learning_rate": 1.4455904218905514e-06, "loss": 0.0023, "step": 23860 }, { "epoch": 7.44, "grad_norm": 0.28575241565704346, "learning_rate": 1.4427722678669188e-06, "loss": 0.0022, "step": 23865 }, { "epoch": 7.44, "grad_norm": 0.0801328793168068, "learning_rate": 1.4399566499106089e-06, "loss": 0.002, "step": 23870 }, { "epoch": 7.45, "grad_norm": 0.059965942054986954, "learning_rate": 1.4371435688560753e-06, "loss": 0.0012, "step": 23875 }, { "epoch": 7.45, "grad_norm": 0.27908244729042053, "learning_rate": 1.434333025537018e-06, "loss": 0.0026, "step": 23880 }, { "epoch": 7.45, "grad_norm": 0.11046892404556274, "learning_rate": 1.431525020786385e-06, "loss": 0.0025, "step": 23885 }, { "epoch": 7.45, "grad_norm": 0.2373012900352478, "learning_rate": 1.428719555436372e-06, "loss": 0.0016, "step": 23890 }, { "epoch": 7.45, "grad_norm": 0.09902806580066681, "learning_rate": 1.4259166303184212e-06, "loss": 0.002, "step": 23895 }, { "epoch": 7.45, "grad_norm": 0.07448076456785202, "learning_rate": 1.4231162462632276e-06, "loss": 0.0016, "step": 23900 }, { "epoch": 7.46, "grad_norm": 0.1125267893075943, "learning_rate": 1.42031840410072e-06, "loss": 0.0023, "step": 23905 }, { "epoch": 7.46, "grad_norm": 0.1669638454914093, "learning_rate": 1.4175231046600902e-06, "loss": 0.0022, "step": 23910 }, { "epoch": 7.46, "grad_norm": 0.1995648890733719, "learning_rate": 1.4147303487697661e-06, "loss": 0.002, "step": 23915 }, { "epoch": 7.46, "grad_norm": 0.15967807173728943, "learning_rate": 1.4119401372574226e-06, "loss": 0.0021, "step": 23920 }, { "epoch": 7.46, "grad_norm": 0.16623800992965698, "learning_rate": 1.4091524709499849e-06, "loss": 0.0023, "step": 23925 }, { "epoch": 7.46, "grad_norm": 0.047116223722696304, "learning_rate": 1.4063673506736198e-06, "loss": 0.0024, "step": 23930 }, { "epoch": 7.47, "grad_norm": 0.06346458941698074, "learning_rate": 1.4035847772537426e-06, "loss": 0.002, "step": 23935 }, { "epoch": 7.47, "grad_norm": 0.2038680911064148, "learning_rate": 1.400804751515008e-06, "loss": 0.0015, "step": 23940 }, { "epoch": 7.47, "grad_norm": 0.046279482543468475, "learning_rate": 1.3980272742813228e-06, "loss": 0.0021, "step": 23945 }, { "epoch": 7.47, "grad_norm": 0.12787197530269623, "learning_rate": 1.3952523463758327e-06, "loss": 0.0022, "step": 23950 }, { "epoch": 7.47, "grad_norm": 0.11368878930807114, "learning_rate": 1.3924799686209357e-06, "loss": 0.0032, "step": 23955 }, { "epoch": 7.47, "grad_norm": 0.04303009808063507, "learning_rate": 1.3897101418382664e-06, "loss": 0.0016, "step": 23960 }, { "epoch": 7.47, "grad_norm": 0.23386818170547485, "learning_rate": 1.3869428668487061e-06, "loss": 0.0029, "step": 23965 }, { "epoch": 7.48, "grad_norm": 0.06971883773803711, "learning_rate": 1.384178144472379e-06, "loss": 0.0017, "step": 23970 }, { "epoch": 7.48, "grad_norm": 0.08481031656265259, "learning_rate": 1.381415975528656e-06, "loss": 0.0016, "step": 23975 }, { "epoch": 7.48, "grad_norm": 0.14777174592018127, "learning_rate": 1.3786563608361447e-06, "loss": 0.0017, "step": 23980 }, { "epoch": 7.48, "grad_norm": 0.16118240356445312, "learning_rate": 1.375899301212703e-06, "loss": 0.0023, "step": 23985 }, { "epoch": 7.48, "grad_norm": 0.19115978479385376, "learning_rate": 1.3731447974754241e-06, "loss": 0.0015, "step": 23990 }, { "epoch": 7.48, "grad_norm": 0.09597334265708923, "learning_rate": 1.3703928504406527e-06, "loss": 0.0012, "step": 23995 }, { "epoch": 7.49, "grad_norm": 0.2174786627292633, "learning_rate": 1.3676434609239708e-06, "loss": 0.0015, "step": 24000 }, { "epoch": 7.49, "grad_norm": 0.2653753161430359, "learning_rate": 1.3648966297402e-06, "loss": 0.0023, "step": 24005 }, { "epoch": 7.49, "grad_norm": 0.15789607167243958, "learning_rate": 1.3621523577034102e-06, "loss": 0.0023, "step": 24010 }, { "epoch": 7.49, "grad_norm": 0.34349653124809265, "learning_rate": 1.3594106456269052e-06, "loss": 0.0021, "step": 24015 }, { "epoch": 7.49, "grad_norm": 0.07134407013654709, "learning_rate": 1.356671494323234e-06, "loss": 0.0013, "step": 24020 }, { "epoch": 7.49, "grad_norm": 0.11368986964225769, "learning_rate": 1.3539349046041882e-06, "loss": 0.0014, "step": 24025 }, { "epoch": 7.49, "grad_norm": 0.12296315282583237, "learning_rate": 1.3512008772807994e-06, "loss": 0.0017, "step": 24030 }, { "epoch": 7.5, "grad_norm": 0.17269238829612732, "learning_rate": 1.3484694131633346e-06, "loss": 0.0016, "step": 24035 }, { "epoch": 7.5, "grad_norm": 0.07103969156742096, "learning_rate": 1.345740513061311e-06, "loss": 0.0013, "step": 24040 }, { "epoch": 7.5, "grad_norm": 0.2738235294818878, "learning_rate": 1.3430141777834815e-06, "loss": 0.0021, "step": 24045 }, { "epoch": 7.5, "grad_norm": 0.10616691410541534, "learning_rate": 1.3402904081378333e-06, "loss": 0.0014, "step": 24050 }, { "epoch": 7.5, "grad_norm": 0.07980950176715851, "learning_rate": 1.337569204931598e-06, "loss": 0.0017, "step": 24055 }, { "epoch": 7.5, "grad_norm": 0.09553997963666916, "learning_rate": 1.3348505689712488e-06, "loss": 0.0015, "step": 24060 }, { "epoch": 7.51, "grad_norm": 0.13931411504745483, "learning_rate": 1.3321345010624941e-06, "loss": 0.0013, "step": 24065 }, { "epoch": 7.51, "grad_norm": 0.23361235857009888, "learning_rate": 1.3294210020102837e-06, "loss": 0.0019, "step": 24070 }, { "epoch": 7.51, "grad_norm": 0.16457533836364746, "learning_rate": 1.3267100726188053e-06, "loss": 0.003, "step": 24075 }, { "epoch": 7.51, "grad_norm": 0.125288724899292, "learning_rate": 1.3240017136914852e-06, "loss": 0.0017, "step": 24080 }, { "epoch": 7.51, "grad_norm": 0.2684983015060425, "learning_rate": 1.3212959260309855e-06, "loss": 0.0019, "step": 24085 }, { "epoch": 7.51, "grad_norm": 0.17238128185272217, "learning_rate": 1.3185927104392105e-06, "loss": 0.0017, "step": 24090 }, { "epoch": 7.52, "grad_norm": 0.21431346237659454, "learning_rate": 1.3158920677172994e-06, "loss": 0.0018, "step": 24095 }, { "epoch": 7.52, "grad_norm": 0.1683589220046997, "learning_rate": 1.3131939986656305e-06, "loss": 0.0019, "step": 24100 }, { "epoch": 7.52, "grad_norm": 0.07739071547985077, "learning_rate": 1.310498504083817e-06, "loss": 0.0015, "step": 24105 }, { "epoch": 7.52, "grad_norm": 0.07120134681463242, "learning_rate": 1.3078055847707116e-06, "loss": 0.0014, "step": 24110 }, { "epoch": 7.52, "grad_norm": 0.09688661247491837, "learning_rate": 1.3051152415244016e-06, "loss": 0.0014, "step": 24115 }, { "epoch": 7.52, "grad_norm": 0.15954263508319855, "learning_rate": 1.302427475142214e-06, "loss": 0.0028, "step": 24120 }, { "epoch": 7.52, "grad_norm": 0.07054508477449417, "learning_rate": 1.2997422864207076e-06, "loss": 0.0016, "step": 24125 }, { "epoch": 7.53, "grad_norm": 0.1399802416563034, "learning_rate": 1.297059676155682e-06, "loss": 0.002, "step": 24130 }, { "epoch": 7.53, "grad_norm": 0.2348816841840744, "learning_rate": 1.2943796451421686e-06, "loss": 0.0022, "step": 24135 }, { "epoch": 7.53, "grad_norm": 0.1977377086877823, "learning_rate": 1.291702194174438e-06, "loss": 0.0026, "step": 24140 }, { "epoch": 7.53, "grad_norm": 0.41480663418769836, "learning_rate": 1.2890273240459938e-06, "loss": 0.0018, "step": 24145 }, { "epoch": 7.53, "grad_norm": 0.13588984310626984, "learning_rate": 1.2863550355495735e-06, "loss": 0.0017, "step": 24150 }, { "epoch": 7.53, "grad_norm": 0.2099747657775879, "learning_rate": 1.2836853294771545e-06, "loss": 0.0017, "step": 24155 }, { "epoch": 7.54, "grad_norm": 0.13870127499103546, "learning_rate": 1.2810182066199429e-06, "loss": 0.0018, "step": 24160 }, { "epoch": 7.54, "grad_norm": 0.21662606298923492, "learning_rate": 1.2783536677683828e-06, "loss": 0.0019, "step": 24165 }, { "epoch": 7.54, "grad_norm": 0.11504267156124115, "learning_rate": 1.2756917137121527e-06, "loss": 0.0019, "step": 24170 }, { "epoch": 7.54, "grad_norm": 0.1169959232211113, "learning_rate": 1.2730323452401627e-06, "loss": 0.0019, "step": 24175 }, { "epoch": 7.54, "grad_norm": 0.07010991871356964, "learning_rate": 1.2703755631405589e-06, "loss": 0.0016, "step": 24180 }, { "epoch": 7.54, "grad_norm": 0.1383741796016693, "learning_rate": 1.2677213682007195e-06, "loss": 0.0015, "step": 24185 }, { "epoch": 7.54, "grad_norm": 0.22670747339725494, "learning_rate": 1.2650697612072582e-06, "loss": 0.0017, "step": 24190 }, { "epoch": 7.55, "grad_norm": 0.06694146990776062, "learning_rate": 1.2624207429460134e-06, "loss": 0.0023, "step": 24195 }, { "epoch": 7.55, "grad_norm": 0.13071152567863464, "learning_rate": 1.2597743142020703e-06, "loss": 0.0018, "step": 24200 }, { "epoch": 7.55, "grad_norm": 0.06935352832078934, "learning_rate": 1.2571304757597357e-06, "loss": 0.0009, "step": 24205 }, { "epoch": 7.55, "grad_norm": 0.06360245496034622, "learning_rate": 1.2544892284025523e-06, "loss": 0.0031, "step": 24210 }, { "epoch": 7.55, "grad_norm": 0.17494170367717743, "learning_rate": 1.2518505729132969e-06, "loss": 0.0015, "step": 24215 }, { "epoch": 7.55, "grad_norm": 0.08135698735713959, "learning_rate": 1.2492145100739728e-06, "loss": 0.0017, "step": 24220 }, { "epoch": 7.56, "grad_norm": 0.2048783153295517, "learning_rate": 1.246581040665823e-06, "loss": 0.002, "step": 24225 }, { "epoch": 7.56, "grad_norm": 0.1028202474117279, "learning_rate": 1.243950165469311e-06, "loss": 0.0014, "step": 24230 }, { "epoch": 7.56, "grad_norm": 0.13023538887500763, "learning_rate": 1.2413218852641406e-06, "loss": 0.0018, "step": 24235 }, { "epoch": 7.56, "grad_norm": 0.10343587398529053, "learning_rate": 1.2386962008292413e-06, "loss": 0.0024, "step": 24240 }, { "epoch": 7.56, "grad_norm": 0.12373864650726318, "learning_rate": 1.2360731129427794e-06, "loss": 0.0029, "step": 24245 }, { "epoch": 7.56, "grad_norm": 0.12057089060544968, "learning_rate": 1.2334526223821453e-06, "loss": 0.0017, "step": 24250 }, { "epoch": 7.57, "grad_norm": 0.1910213679075241, "learning_rate": 1.230834729923963e-06, "loss": 0.0022, "step": 24255 }, { "epoch": 7.57, "grad_norm": 0.4182658791542053, "learning_rate": 1.228219436344087e-06, "loss": 0.002, "step": 24260 }, { "epoch": 7.57, "grad_norm": 0.22596170008182526, "learning_rate": 1.2256067424175956e-06, "loss": 0.0027, "step": 24265 }, { "epoch": 7.57, "grad_norm": 0.09142956137657166, "learning_rate": 1.2229966489188038e-06, "loss": 0.0017, "step": 24270 }, { "epoch": 7.57, "grad_norm": 0.1430046558380127, "learning_rate": 1.2203891566212545e-06, "loss": 0.0023, "step": 24275 }, { "epoch": 7.57, "grad_norm": 0.1019957885146141, "learning_rate": 1.2177842662977136e-06, "loss": 0.0017, "step": 24280 }, { "epoch": 7.57, "grad_norm": 0.2015380859375, "learning_rate": 1.215181978720188e-06, "loss": 0.0019, "step": 24285 }, { "epoch": 7.58, "grad_norm": 0.15892979502677917, "learning_rate": 1.212582294659902e-06, "loss": 0.0019, "step": 24290 }, { "epoch": 7.58, "grad_norm": 0.11718596518039703, "learning_rate": 1.2099852148873136e-06, "loss": 0.0018, "step": 24295 }, { "epoch": 7.58, "grad_norm": 0.08311883360147476, "learning_rate": 1.2073907401721086e-06, "loss": 0.0015, "step": 24300 }, { "epoch": 7.58, "grad_norm": 0.1394026130437851, "learning_rate": 1.2047988712831959e-06, "loss": 0.0013, "step": 24305 }, { "epoch": 7.58, "grad_norm": 0.20909558236598969, "learning_rate": 1.2022096089887191e-06, "loss": 0.0019, "step": 24310 }, { "epoch": 7.58, "grad_norm": 0.17596064507961273, "learning_rate": 1.1996229540560456e-06, "loss": 0.0018, "step": 24315 }, { "epoch": 7.59, "grad_norm": 0.11002202332019806, "learning_rate": 1.1970389072517707e-06, "loss": 0.0025, "step": 24320 }, { "epoch": 7.59, "grad_norm": 0.17328931391239166, "learning_rate": 1.1944574693417155e-06, "loss": 0.0017, "step": 24325 }, { "epoch": 7.59, "grad_norm": 0.3190286159515381, "learning_rate": 1.1918786410909323e-06, "loss": 0.0015, "step": 24330 }, { "epoch": 7.59, "grad_norm": 0.20087780058383942, "learning_rate": 1.1893024232636974e-06, "loss": 0.0025, "step": 24335 }, { "epoch": 7.59, "grad_norm": 0.24992135167121887, "learning_rate": 1.1867288166235091e-06, "loss": 0.0016, "step": 24340 }, { "epoch": 7.59, "grad_norm": 0.14544323086738586, "learning_rate": 1.1841578219330974e-06, "loss": 0.0027, "step": 24345 }, { "epoch": 7.59, "grad_norm": 0.04758672043681145, "learning_rate": 1.181589439954416e-06, "loss": 0.0017, "step": 24350 }, { "epoch": 7.6, "grad_norm": 0.1231321468949318, "learning_rate": 1.1790236714486459e-06, "loss": 0.0021, "step": 24355 }, { "epoch": 7.6, "grad_norm": 0.14308707416057587, "learning_rate": 1.1764605171761923e-06, "loss": 0.0015, "step": 24360 }, { "epoch": 7.6, "grad_norm": 0.24447542428970337, "learning_rate": 1.1738999778966842e-06, "loss": 0.0018, "step": 24365 }, { "epoch": 7.6, "grad_norm": 0.14588113129138947, "learning_rate": 1.171342054368979e-06, "loss": 0.0016, "step": 24370 }, { "epoch": 7.6, "grad_norm": 0.17098912596702576, "learning_rate": 1.168786747351156e-06, "loss": 0.0014, "step": 24375 }, { "epoch": 7.6, "grad_norm": 0.10740490257740021, "learning_rate": 1.1662340576005215e-06, "loss": 0.0023, "step": 24380 }, { "epoch": 7.61, "grad_norm": 0.1715841144323349, "learning_rate": 1.1636839858736027e-06, "loss": 0.0013, "step": 24385 }, { "epoch": 7.61, "grad_norm": 0.2061709761619568, "learning_rate": 1.1611365329261549e-06, "loss": 0.0014, "step": 24390 }, { "epoch": 7.61, "grad_norm": 0.11730609089136124, "learning_rate": 1.158591699513154e-06, "loss": 0.0016, "step": 24395 }, { "epoch": 7.61, "grad_norm": 0.09845942258834839, "learning_rate": 1.1560494863888006e-06, "loss": 0.0014, "step": 24400 }, { "epoch": 7.61, "grad_norm": 0.18577240407466888, "learning_rate": 1.1535098943065204e-06, "loss": 0.0018, "step": 24405 }, { "epoch": 7.61, "grad_norm": 0.1618165522813797, "learning_rate": 1.1509729240189605e-06, "loss": 0.0018, "step": 24410 }, { "epoch": 7.61, "grad_norm": 0.11084864288568497, "learning_rate": 1.1484385762779914e-06, "loss": 0.0017, "step": 24415 }, { "epoch": 7.62, "grad_norm": 0.23109754920005798, "learning_rate": 1.1459068518347062e-06, "loss": 0.0021, "step": 24420 }, { "epoch": 7.62, "grad_norm": 0.04823751002550125, "learning_rate": 1.1433777514394206e-06, "loss": 0.0013, "step": 24425 }, { "epoch": 7.62, "grad_norm": 0.08719401061534882, "learning_rate": 1.1408512758416735e-06, "loss": 0.0015, "step": 24430 }, { "epoch": 7.62, "grad_norm": 0.08335618674755096, "learning_rate": 1.1383274257902244e-06, "loss": 0.002, "step": 24435 }, { "epoch": 7.62, "grad_norm": 0.29434606432914734, "learning_rate": 1.1358062020330574e-06, "loss": 0.0028, "step": 24440 }, { "epoch": 7.62, "grad_norm": 0.19055098295211792, "learning_rate": 1.1332876053173714e-06, "loss": 0.0017, "step": 24445 }, { "epoch": 7.63, "grad_norm": 0.05965162441134453, "learning_rate": 1.130771636389596e-06, "loss": 0.0019, "step": 24450 }, { "epoch": 7.63, "grad_norm": 0.18123778700828552, "learning_rate": 1.1282582959953769e-06, "loss": 0.0015, "step": 24455 }, { "epoch": 7.63, "grad_norm": 0.027868177741765976, "learning_rate": 1.1257475848795818e-06, "loss": 0.0018, "step": 24460 }, { "epoch": 7.63, "grad_norm": 0.14499281346797943, "learning_rate": 1.1232395037862974e-06, "loss": 0.0023, "step": 24465 }, { "epoch": 7.63, "grad_norm": 0.16413047909736633, "learning_rate": 1.120734053458834e-06, "loss": 0.0022, "step": 24470 }, { "epoch": 7.63, "grad_norm": 0.13355393707752228, "learning_rate": 1.1182312346397205e-06, "loss": 0.0021, "step": 24475 }, { "epoch": 7.64, "grad_norm": 0.16823016107082367, "learning_rate": 1.1157310480707073e-06, "loss": 0.001, "step": 24480 }, { "epoch": 7.64, "grad_norm": 0.1044691801071167, "learning_rate": 1.1132334944927582e-06, "loss": 0.0014, "step": 24485 }, { "epoch": 7.64, "grad_norm": 0.13820968568325043, "learning_rate": 1.1107385746460686e-06, "loss": 0.0021, "step": 24490 }, { "epoch": 7.64, "grad_norm": 0.18508176505565643, "learning_rate": 1.108246289270043e-06, "loss": 0.0033, "step": 24495 }, { "epoch": 7.64, "grad_norm": 0.05543508380651474, "learning_rate": 1.1057566391033092e-06, "loss": 0.0016, "step": 24500 }, { "epoch": 7.64, "grad_norm": 0.07221606373786926, "learning_rate": 1.103269624883715e-06, "loss": 0.0018, "step": 24505 }, { "epoch": 7.64, "grad_norm": 0.08910844475030899, "learning_rate": 1.100785247348325e-06, "loss": 0.0011, "step": 24510 }, { "epoch": 7.65, "grad_norm": 0.16187050938606262, "learning_rate": 1.0983035072334248e-06, "loss": 0.0018, "step": 24515 }, { "epoch": 7.65, "grad_norm": 0.1790592521429062, "learning_rate": 1.0958244052745127e-06, "loss": 0.0022, "step": 24520 }, { "epoch": 7.65, "grad_norm": 0.0456264354288578, "learning_rate": 1.093347942206312e-06, "loss": 0.0013, "step": 24525 }, { "epoch": 7.65, "grad_norm": 0.09067858755588531, "learning_rate": 1.090874118762757e-06, "loss": 0.0021, "step": 24530 }, { "epoch": 7.65, "grad_norm": 0.08108704537153244, "learning_rate": 1.08840293567701e-06, "loss": 0.0023, "step": 24535 }, { "epoch": 7.65, "grad_norm": 0.10392729192972183, "learning_rate": 1.0859343936814416e-06, "loss": 0.0012, "step": 24540 }, { "epoch": 7.66, "grad_norm": 0.3361983299255371, "learning_rate": 1.083468493507641e-06, "loss": 0.0018, "step": 24545 }, { "epoch": 7.66, "grad_norm": 0.07768458127975464, "learning_rate": 1.08100523588642e-06, "loss": 0.0018, "step": 24550 }, { "epoch": 7.66, "grad_norm": 0.11710520833730698, "learning_rate": 1.078544621547799e-06, "loss": 0.001, "step": 24555 }, { "epoch": 7.66, "grad_norm": 0.08064442873001099, "learning_rate": 1.07608665122102e-06, "loss": 0.0022, "step": 24560 }, { "epoch": 7.66, "grad_norm": 0.11553987860679626, "learning_rate": 1.073631325634542e-06, "loss": 0.0014, "step": 24565 }, { "epoch": 7.66, "grad_norm": 0.22669553756713867, "learning_rate": 1.0711786455160389e-06, "loss": 0.0014, "step": 24570 }, { "epoch": 7.66, "grad_norm": 0.13220682740211487, "learning_rate": 1.0687286115923979e-06, "loss": 0.0016, "step": 24575 }, { "epoch": 7.67, "grad_norm": 0.2977820336818695, "learning_rate": 1.066281224589728e-06, "loss": 0.0026, "step": 24580 }, { "epoch": 7.67, "grad_norm": 0.04379139095544815, "learning_rate": 1.0638364852333516e-06, "loss": 0.0015, "step": 24585 }, { "epoch": 7.67, "grad_norm": 0.18310536444187164, "learning_rate": 1.0613943942478e-06, "loss": 0.0022, "step": 24590 }, { "epoch": 7.67, "grad_norm": 0.12095720320940018, "learning_rate": 1.0589549523568287e-06, "loss": 0.002, "step": 24595 }, { "epoch": 7.67, "grad_norm": 0.12880194187164307, "learning_rate": 1.0565181602834017e-06, "loss": 0.0029, "step": 24600 }, { "epoch": 7.67, "grad_norm": 0.1454593986272812, "learning_rate": 1.0540840187497026e-06, "loss": 0.0021, "step": 24605 }, { "epoch": 7.68, "grad_norm": 0.15364618599414825, "learning_rate": 1.0516525284771252e-06, "loss": 0.0022, "step": 24610 }, { "epoch": 7.68, "grad_norm": 0.23326414823532104, "learning_rate": 1.0492236901862774e-06, "loss": 0.0024, "step": 24615 }, { "epoch": 7.68, "grad_norm": 0.17910537123680115, "learning_rate": 1.0467975045969902e-06, "loss": 0.0014, "step": 24620 }, { "epoch": 7.68, "grad_norm": 0.12565740942955017, "learning_rate": 1.0443739724282954e-06, "loss": 0.0023, "step": 24625 }, { "epoch": 7.68, "grad_norm": 0.0883830189704895, "learning_rate": 1.0419530943984458e-06, "loss": 0.0014, "step": 24630 }, { "epoch": 7.68, "grad_norm": 0.0909748524427414, "learning_rate": 1.0395348712249064e-06, "loss": 0.0019, "step": 24635 }, { "epoch": 7.69, "grad_norm": 0.10346776992082596, "learning_rate": 1.0371193036243554e-06, "loss": 0.0012, "step": 24640 }, { "epoch": 7.69, "grad_norm": 0.18627353012561798, "learning_rate": 1.0347063923126832e-06, "loss": 0.0023, "step": 24645 }, { "epoch": 7.69, "grad_norm": 0.11687027662992477, "learning_rate": 1.0322961380049935e-06, "loss": 0.0026, "step": 24650 }, { "epoch": 7.69, "grad_norm": 0.17399488389492035, "learning_rate": 1.0298885414156034e-06, "loss": 0.0019, "step": 24655 }, { "epoch": 7.69, "grad_norm": 0.1125384047627449, "learning_rate": 1.0274836032580416e-06, "loss": 0.0008, "step": 24660 }, { "epoch": 7.69, "grad_norm": 0.11277636140584946, "learning_rate": 1.0250813242450474e-06, "loss": 0.0024, "step": 24665 }, { "epoch": 7.69, "grad_norm": 0.17943529784679413, "learning_rate": 1.022681705088575e-06, "loss": 0.0017, "step": 24670 }, { "epoch": 7.7, "grad_norm": 0.037092261016368866, "learning_rate": 1.020284746499788e-06, "loss": 0.0014, "step": 24675 }, { "epoch": 7.7, "grad_norm": 0.1271539330482483, "learning_rate": 1.0178904491890628e-06, "loss": 0.0019, "step": 24680 }, { "epoch": 7.7, "grad_norm": 0.08718859404325485, "learning_rate": 1.015498813865986e-06, "loss": 0.0019, "step": 24685 }, { "epoch": 7.7, "grad_norm": 0.1267717331647873, "learning_rate": 1.013109841239356e-06, "loss": 0.0016, "step": 24690 }, { "epoch": 7.7, "grad_norm": 0.2325306236743927, "learning_rate": 1.0107235320171826e-06, "loss": 0.0026, "step": 24695 }, { "epoch": 7.7, "grad_norm": 0.06632918864488602, "learning_rate": 1.0083398869066852e-06, "loss": 0.0022, "step": 24700 }, { "epoch": 7.71, "grad_norm": 0.21014709770679474, "learning_rate": 1.005958906614294e-06, "loss": 0.0018, "step": 24705 }, { "epoch": 7.71, "grad_norm": 0.17265684902668, "learning_rate": 1.0035805918456487e-06, "loss": 0.004, "step": 24710 }, { "epoch": 7.71, "grad_norm": 0.13958702981472015, "learning_rate": 1.0012049433056015e-06, "loss": 0.0022, "step": 24715 }, { "epoch": 7.71, "grad_norm": 0.14307720959186554, "learning_rate": 9.988319616982112e-07, "loss": 0.0025, "step": 24720 }, { "epoch": 7.71, "grad_norm": 0.18551354110240936, "learning_rate": 9.964616477267486e-07, "loss": 0.0031, "step": 24725 }, { "epoch": 7.71, "grad_norm": 0.16873350739479065, "learning_rate": 9.940940020936952e-07, "loss": 0.0017, "step": 24730 }, { "epoch": 7.71, "grad_norm": 0.10657307505607605, "learning_rate": 9.917290255007327e-07, "loss": 0.0019, "step": 24735 }, { "epoch": 7.72, "grad_norm": 0.09567416459321976, "learning_rate": 9.893667186487654e-07, "loss": 0.002, "step": 24740 }, { "epoch": 7.72, "grad_norm": 0.05228479579091072, "learning_rate": 9.870070822378974e-07, "loss": 0.0015, "step": 24745 }, { "epoch": 7.72, "grad_norm": 0.13476140797138214, "learning_rate": 9.84650116967444e-07, "loss": 0.0027, "step": 24750 }, { "epoch": 7.72, "grad_norm": 0.12251874804496765, "learning_rate": 9.82295823535927e-07, "loss": 0.0018, "step": 24755 }, { "epoch": 7.72, "grad_norm": 0.08515207469463348, "learning_rate": 9.799442026410788e-07, "loss": 0.0011, "step": 24760 }, { "epoch": 7.72, "grad_norm": 0.6399232745170593, "learning_rate": 9.775952549798406e-07, "loss": 0.0038, "step": 24765 }, { "epoch": 7.73, "grad_norm": 0.19654053449630737, "learning_rate": 9.752489812483567e-07, "loss": 0.0031, "step": 24770 }, { "epoch": 7.73, "grad_norm": 0.09422720223665237, "learning_rate": 9.729053821419797e-07, "loss": 0.002, "step": 24775 }, { "epoch": 7.73, "grad_norm": 0.2208988070487976, "learning_rate": 9.705644583552775e-07, "loss": 0.001, "step": 24780 }, { "epoch": 7.73, "grad_norm": 0.09688408672809601, "learning_rate": 9.682262105820161e-07, "loss": 0.0024, "step": 24785 }, { "epoch": 7.73, "grad_norm": 0.07366550713777542, "learning_rate": 9.658906395151713e-07, "loss": 0.0023, "step": 24790 }, { "epoch": 7.73, "grad_norm": 0.17863616347312927, "learning_rate": 9.63557745846927e-07, "loss": 0.0021, "step": 24795 }, { "epoch": 7.74, "grad_norm": 0.08455432206392288, "learning_rate": 9.612275302686714e-07, "loss": 0.0021, "step": 24800 }, { "epoch": 7.74, "grad_norm": 0.23154126107692719, "learning_rate": 9.58899993471003e-07, "loss": 0.0019, "step": 24805 }, { "epoch": 7.74, "grad_norm": 0.11111778020858765, "learning_rate": 9.56575136143718e-07, "loss": 0.0014, "step": 24810 }, { "epoch": 7.74, "grad_norm": 0.1214340329170227, "learning_rate": 9.542529589758276e-07, "loss": 0.0023, "step": 24815 }, { "epoch": 7.74, "grad_norm": 0.13460777699947357, "learning_rate": 9.519334626555421e-07, "loss": 0.0016, "step": 24820 }, { "epoch": 7.74, "grad_norm": 0.16119147837162018, "learning_rate": 9.496166478702851e-07, "loss": 0.0021, "step": 24825 }, { "epoch": 7.74, "grad_norm": 0.09674394875764847, "learning_rate": 9.473025153066773e-07, "loss": 0.0019, "step": 24830 }, { "epoch": 7.75, "grad_norm": 0.15101002156734467, "learning_rate": 9.44991065650549e-07, "loss": 0.0025, "step": 24835 }, { "epoch": 7.75, "grad_norm": 0.06771969050168991, "learning_rate": 9.426822995869367e-07, "loss": 0.0013, "step": 24840 }, { "epoch": 7.75, "grad_norm": 0.12194545567035675, "learning_rate": 9.403762178000731e-07, "loss": 0.0021, "step": 24845 }, { "epoch": 7.75, "grad_norm": 0.10539015382528305, "learning_rate": 9.380728209734047e-07, "loss": 0.0018, "step": 24850 }, { "epoch": 7.75, "grad_norm": 0.10646653920412064, "learning_rate": 9.357721097895778e-07, "loss": 0.0018, "step": 24855 }, { "epoch": 7.75, "grad_norm": 0.35590100288391113, "learning_rate": 9.334740849304457e-07, "loss": 0.0021, "step": 24860 }, { "epoch": 7.76, "grad_norm": 0.09334748238325119, "learning_rate": 9.311787470770606e-07, "loss": 0.0021, "step": 24865 }, { "epoch": 7.76, "grad_norm": 0.3298105001449585, "learning_rate": 9.288860969096858e-07, "loss": 0.0031, "step": 24870 }, { "epoch": 7.76, "grad_norm": 0.0878896489739418, "learning_rate": 9.265961351077835e-07, "loss": 0.002, "step": 24875 }, { "epoch": 7.76, "grad_norm": 0.05391193926334381, "learning_rate": 9.243088623500163e-07, "loss": 0.0014, "step": 24880 }, { "epoch": 7.76, "grad_norm": 0.137955442070961, "learning_rate": 9.22024279314253e-07, "loss": 0.0019, "step": 24885 }, { "epoch": 7.76, "grad_norm": 0.13348926603794098, "learning_rate": 9.197423866775679e-07, "loss": 0.0017, "step": 24890 }, { "epoch": 7.76, "grad_norm": 0.604273796081543, "learning_rate": 9.17463185116233e-07, "loss": 0.0019, "step": 24895 }, { "epoch": 7.77, "grad_norm": 0.21288979053497314, "learning_rate": 9.15186675305727e-07, "loss": 0.0015, "step": 24900 }, { "epoch": 7.77, "grad_norm": 0.2926914095878601, "learning_rate": 9.129128579207258e-07, "loss": 0.0025, "step": 24905 }, { "epoch": 7.77, "grad_norm": 0.0603574700653553, "learning_rate": 9.106417336351159e-07, "loss": 0.0013, "step": 24910 }, { "epoch": 7.77, "grad_norm": 0.15772417187690735, "learning_rate": 9.083733031219765e-07, "loss": 0.0018, "step": 24915 }, { "epoch": 7.77, "grad_norm": 0.04270298406481743, "learning_rate": 9.061075670535913e-07, "loss": 0.0011, "step": 24920 }, { "epoch": 7.77, "grad_norm": 0.11631837487220764, "learning_rate": 9.038445261014484e-07, "loss": 0.0022, "step": 24925 }, { "epoch": 7.78, "grad_norm": 0.18365252017974854, "learning_rate": 9.015841809362347e-07, "loss": 0.0021, "step": 24930 }, { "epoch": 7.78, "grad_norm": 0.1408557891845703, "learning_rate": 8.993265322278389e-07, "loss": 0.0018, "step": 24935 }, { "epoch": 7.78, "grad_norm": 0.10175014287233353, "learning_rate": 8.97071580645349e-07, "loss": 0.0021, "step": 24940 }, { "epoch": 7.78, "grad_norm": 0.048780281096696854, "learning_rate": 8.94819326857056e-07, "loss": 0.001, "step": 24945 }, { "epoch": 7.78, "grad_norm": 0.15094539523124695, "learning_rate": 8.925697715304504e-07, "loss": 0.0017, "step": 24950 }, { "epoch": 7.78, "grad_norm": 0.1805548220872879, "learning_rate": 8.903229153322212e-07, "loss": 0.0018, "step": 24955 }, { "epoch": 7.78, "grad_norm": 0.07314382493495941, "learning_rate": 8.880787589282613e-07, "loss": 0.0021, "step": 24960 }, { "epoch": 7.79, "grad_norm": 0.12884323298931122, "learning_rate": 8.858373029836609e-07, "loss": 0.0007, "step": 24965 }, { "epoch": 7.79, "grad_norm": 0.19089964032173157, "learning_rate": 8.835985481627097e-07, "loss": 0.0017, "step": 24970 }, { "epoch": 7.79, "grad_norm": 0.13515068590641022, "learning_rate": 8.813624951288969e-07, "loss": 0.0013, "step": 24975 }, { "epoch": 7.79, "grad_norm": 0.2111157923936844, "learning_rate": 8.791291445449124e-07, "loss": 0.0023, "step": 24980 }, { "epoch": 7.79, "grad_norm": 0.1934300810098648, "learning_rate": 8.768984970726435e-07, "loss": 0.0022, "step": 24985 }, { "epoch": 7.79, "grad_norm": 0.15313348174095154, "learning_rate": 8.746705533731792e-07, "loss": 0.0019, "step": 24990 }, { "epoch": 7.8, "grad_norm": 0.07578973472118378, "learning_rate": 8.724453141068023e-07, "loss": 0.0011, "step": 24995 }, { "epoch": 7.8, "grad_norm": 0.09974081814289093, "learning_rate": 8.702227799329999e-07, "loss": 0.002, "step": 25000 }, { "epoch": 7.8, "grad_norm": 0.03508811444044113, "learning_rate": 8.680029515104516e-07, "loss": 0.0013, "step": 25005 }, { "epoch": 7.8, "grad_norm": 0.14680829644203186, "learning_rate": 8.657858294970411e-07, "loss": 0.0026, "step": 25010 }, { "epoch": 7.8, "grad_norm": 0.19211848080158234, "learning_rate": 8.635714145498442e-07, "loss": 0.0024, "step": 25015 }, { "epoch": 7.8, "grad_norm": 0.12090691179037094, "learning_rate": 8.613597073251412e-07, "loss": 0.0017, "step": 25020 }, { "epoch": 7.81, "grad_norm": 0.1656752973794937, "learning_rate": 8.59150708478399e-07, "loss": 0.002, "step": 25025 }, { "epoch": 7.81, "grad_norm": 0.20714865624904633, "learning_rate": 8.56944418664295e-07, "loss": 0.0021, "step": 25030 }, { "epoch": 7.81, "grad_norm": 0.25840499997138977, "learning_rate": 8.547408385366951e-07, "loss": 0.0013, "step": 25035 }, { "epoch": 7.81, "grad_norm": 0.1592530459165573, "learning_rate": 8.525399687486635e-07, "loss": 0.0021, "step": 25040 }, { "epoch": 7.81, "grad_norm": 0.182389497756958, "learning_rate": 8.503418099524641e-07, "loss": 0.0018, "step": 25045 }, { "epoch": 7.81, "grad_norm": 0.040837742388248444, "learning_rate": 8.481463627995545e-07, "loss": 0.0021, "step": 25050 }, { "epoch": 7.81, "grad_norm": 0.10436588525772095, "learning_rate": 8.459536279405922e-07, "loss": 0.0026, "step": 25055 }, { "epoch": 7.82, "grad_norm": 0.18838675320148468, "learning_rate": 8.437636060254228e-07, "loss": 0.0019, "step": 25060 }, { "epoch": 7.82, "grad_norm": 0.2783716917037964, "learning_rate": 8.415762977030961e-07, "loss": 0.0024, "step": 25065 }, { "epoch": 7.82, "grad_norm": 0.3167223036289215, "learning_rate": 8.393917036218535e-07, "loss": 0.0019, "step": 25070 }, { "epoch": 7.82, "grad_norm": 0.11779921501874924, "learning_rate": 8.372098244291371e-07, "loss": 0.0022, "step": 25075 }, { "epoch": 7.82, "grad_norm": 0.2449861466884613, "learning_rate": 8.350306607715774e-07, "loss": 0.0022, "step": 25080 }, { "epoch": 7.82, "grad_norm": 0.08655525743961334, "learning_rate": 8.328542132950046e-07, "loss": 0.0014, "step": 25085 }, { "epoch": 7.83, "grad_norm": 0.06086605787277222, "learning_rate": 8.306804826444448e-07, "loss": 0.0011, "step": 25090 }, { "epoch": 7.83, "grad_norm": 0.0736154168844223, "learning_rate": 8.285094694641127e-07, "loss": 0.002, "step": 25095 }, { "epoch": 7.83, "grad_norm": 0.38189685344696045, "learning_rate": 8.263411743974226e-07, "loss": 0.003, "step": 25100 }, { "epoch": 7.83, "grad_norm": 0.02803908847272396, "learning_rate": 8.241755980869837e-07, "loss": 0.0013, "step": 25105 }, { "epoch": 7.83, "grad_norm": 0.10847136378288269, "learning_rate": 8.220127411745971e-07, "loss": 0.0016, "step": 25110 }, { "epoch": 7.83, "grad_norm": 0.2512924373149872, "learning_rate": 8.198526043012611e-07, "loss": 0.0017, "step": 25115 }, { "epoch": 7.83, "grad_norm": 0.06729435920715332, "learning_rate": 8.176951881071638e-07, "loss": 0.002, "step": 25120 }, { "epoch": 7.84, "grad_norm": 0.1283143311738968, "learning_rate": 8.155404932316902e-07, "loss": 0.0015, "step": 25125 }, { "epoch": 7.84, "grad_norm": 0.2080482542514801, "learning_rate": 8.133885203134184e-07, "loss": 0.0011, "step": 25130 }, { "epoch": 7.84, "grad_norm": 0.21707133948802948, "learning_rate": 8.112392699901172e-07, "loss": 0.0018, "step": 25135 }, { "epoch": 7.84, "grad_norm": 0.5315659642219543, "learning_rate": 8.090927428987494e-07, "loss": 0.0019, "step": 25140 }, { "epoch": 7.84, "grad_norm": 0.1359284371137619, "learning_rate": 8.069489396754738e-07, "loss": 0.0025, "step": 25145 }, { "epoch": 7.84, "grad_norm": 0.13661302626132965, "learning_rate": 8.048078609556387e-07, "loss": 0.0021, "step": 25150 }, { "epoch": 7.85, "grad_norm": 0.2878587245941162, "learning_rate": 8.026695073737845e-07, "loss": 0.0023, "step": 25155 }, { "epoch": 7.85, "grad_norm": 0.09561365097761154, "learning_rate": 8.005338795636497e-07, "loss": 0.0019, "step": 25160 }, { "epoch": 7.85, "grad_norm": 0.1620081216096878, "learning_rate": 7.984009781581592e-07, "loss": 0.0022, "step": 25165 }, { "epoch": 7.85, "grad_norm": 0.08829033374786377, "learning_rate": 7.962708037894296e-07, "loss": 0.0012, "step": 25170 }, { "epoch": 7.85, "grad_norm": 0.17688344419002533, "learning_rate": 7.941433570887714e-07, "loss": 0.0025, "step": 25175 }, { "epoch": 7.85, "grad_norm": 0.18411220610141754, "learning_rate": 7.92018638686688e-07, "loss": 0.0019, "step": 25180 }, { "epoch": 7.86, "grad_norm": 0.24484209716320038, "learning_rate": 7.898966492128724e-07, "loss": 0.002, "step": 25185 }, { "epoch": 7.86, "grad_norm": 0.18548721075057983, "learning_rate": 7.87777389296207e-07, "loss": 0.001, "step": 25190 }, { "epoch": 7.86, "grad_norm": 0.5427772402763367, "learning_rate": 7.856608595647686e-07, "loss": 0.0025, "step": 25195 }, { "epoch": 7.86, "grad_norm": 0.13414634764194489, "learning_rate": 7.835470606458273e-07, "loss": 0.0022, "step": 25200 }, { "epoch": 7.86, "grad_norm": 0.11223254352807999, "learning_rate": 7.814359931658356e-07, "loss": 0.0013, "step": 25205 }, { "epoch": 7.86, "grad_norm": 0.10069742798805237, "learning_rate": 7.793276577504428e-07, "loss": 0.0012, "step": 25210 }, { "epoch": 7.86, "grad_norm": 0.4649175703525543, "learning_rate": 7.772220550244858e-07, "loss": 0.0024, "step": 25215 }, { "epoch": 7.87, "grad_norm": 0.25964003801345825, "learning_rate": 7.751191856119933e-07, "loss": 0.0013, "step": 25220 }, { "epoch": 7.87, "grad_norm": 0.15606264770030975, "learning_rate": 7.730190501361845e-07, "loss": 0.0015, "step": 25225 }, { "epoch": 7.87, "grad_norm": 0.12841029465198517, "learning_rate": 7.70921649219466e-07, "loss": 0.0029, "step": 25230 }, { "epoch": 7.87, "grad_norm": 0.13047796487808228, "learning_rate": 7.688269834834349e-07, "loss": 0.0013, "step": 25235 }, { "epoch": 7.87, "grad_norm": 0.08756542950868607, "learning_rate": 7.667350535488782e-07, "loss": 0.0018, "step": 25240 }, { "epoch": 7.87, "grad_norm": 0.12574045360088348, "learning_rate": 7.64645860035772e-07, "loss": 0.0015, "step": 25245 }, { "epoch": 7.88, "grad_norm": 0.08015564829111099, "learning_rate": 7.625594035632821e-07, "loss": 0.002, "step": 25250 }, { "epoch": 7.88, "grad_norm": 0.10074813663959503, "learning_rate": 7.604756847497596e-07, "loss": 0.0024, "step": 25255 }, { "epoch": 7.88, "grad_norm": 0.1459551900625229, "learning_rate": 7.583947042127504e-07, "loss": 0.0021, "step": 25260 }, { "epoch": 7.88, "grad_norm": 0.13360217213630676, "learning_rate": 7.563164625689823e-07, "loss": 0.0023, "step": 25265 }, { "epoch": 7.88, "grad_norm": 0.16135920584201813, "learning_rate": 7.54240960434377e-07, "loss": 0.0023, "step": 25270 }, { "epoch": 7.88, "grad_norm": 0.19439354538917542, "learning_rate": 7.521681984240403e-07, "loss": 0.002, "step": 25275 }, { "epoch": 7.88, "grad_norm": 0.09518568962812424, "learning_rate": 7.500981771522675e-07, "loss": 0.0018, "step": 25280 }, { "epoch": 7.89, "grad_norm": 0.17542310059070587, "learning_rate": 7.480308972325423e-07, "loss": 0.0023, "step": 25285 }, { "epoch": 7.89, "grad_norm": 0.03590959310531616, "learning_rate": 7.459663592775335e-07, "loss": 0.0014, "step": 25290 }, { "epoch": 7.89, "grad_norm": 0.1374923437833786, "learning_rate": 7.439045638991016e-07, "loss": 0.0016, "step": 25295 }, { "epoch": 7.89, "grad_norm": 0.18914851546287537, "learning_rate": 7.418455117082901e-07, "loss": 0.0013, "step": 25300 }, { "epoch": 7.89, "grad_norm": 0.13574159145355225, "learning_rate": 7.397892033153309e-07, "loss": 0.002, "step": 25305 }, { "epoch": 7.89, "grad_norm": 0.12614206969738007, "learning_rate": 7.377356393296442e-07, "loss": 0.0026, "step": 25310 }, { "epoch": 7.9, "grad_norm": 0.3813757598400116, "learning_rate": 7.356848203598321e-07, "loss": 0.0023, "step": 25315 }, { "epoch": 7.9, "grad_norm": 0.18741311132907867, "learning_rate": 7.336367470136906e-07, "loss": 0.0017, "step": 25320 }, { "epoch": 7.9, "grad_norm": 0.10486096888780594, "learning_rate": 7.315914198981977e-07, "loss": 0.002, "step": 25325 }, { "epoch": 7.9, "grad_norm": 0.20126007497310638, "learning_rate": 7.295488396195161e-07, "loss": 0.0023, "step": 25330 }, { "epoch": 7.9, "grad_norm": 0.1647581160068512, "learning_rate": 7.275090067829971e-07, "loss": 0.0017, "step": 25335 }, { "epoch": 7.9, "grad_norm": 0.12988118827342987, "learning_rate": 7.254719219931761e-07, "loss": 0.0015, "step": 25340 }, { "epoch": 7.91, "grad_norm": 0.19390501081943512, "learning_rate": 7.234375858537768e-07, "loss": 0.0019, "step": 25345 }, { "epoch": 7.91, "grad_norm": 0.2685386836528778, "learning_rate": 7.214059989677036e-07, "loss": 0.0021, "step": 25350 }, { "epoch": 7.91, "grad_norm": 0.13090895116329193, "learning_rate": 7.193771619370504e-07, "loss": 0.0018, "step": 25355 }, { "epoch": 7.91, "grad_norm": 0.1114158183336258, "learning_rate": 7.173510753630919e-07, "loss": 0.0024, "step": 25360 }, { "epoch": 7.91, "grad_norm": 0.03875642642378807, "learning_rate": 7.153277398462955e-07, "loss": 0.0025, "step": 25365 }, { "epoch": 7.91, "grad_norm": 0.17294959723949432, "learning_rate": 7.133071559863048e-07, "loss": 0.0022, "step": 25370 }, { "epoch": 7.91, "grad_norm": 0.25580188632011414, "learning_rate": 7.112893243819508e-07, "loss": 0.0025, "step": 25375 }, { "epoch": 7.92, "grad_norm": 0.17490358650684357, "learning_rate": 7.092742456312518e-07, "loss": 0.0014, "step": 25380 }, { "epoch": 7.92, "grad_norm": 0.12375898659229279, "learning_rate": 7.072619203314057e-07, "loss": 0.0016, "step": 25385 }, { "epoch": 7.92, "grad_norm": 0.11281836777925491, "learning_rate": 7.052523490787954e-07, "loss": 0.0022, "step": 25390 }, { "epoch": 7.92, "grad_norm": 0.15400633215904236, "learning_rate": 7.032455324689902e-07, "loss": 0.0022, "step": 25395 }, { "epoch": 7.92, "grad_norm": 0.0841684639453888, "learning_rate": 7.012414710967386e-07, "loss": 0.0016, "step": 25400 }, { "epoch": 7.92, "grad_norm": 0.09495735913515091, "learning_rate": 6.9924016555598e-07, "loss": 0.001, "step": 25405 }, { "epoch": 7.93, "grad_norm": 0.10059231519699097, "learning_rate": 6.972416164398299e-07, "loss": 0.0014, "step": 25410 }, { "epoch": 7.93, "grad_norm": 0.39723658561706543, "learning_rate": 6.952458243405913e-07, "loss": 0.0016, "step": 25415 }, { "epoch": 7.93, "grad_norm": 0.15220953524112701, "learning_rate": 6.932527898497443e-07, "loss": 0.002, "step": 25420 }, { "epoch": 7.93, "grad_norm": 0.06055636703968048, "learning_rate": 6.912625135579587e-07, "loss": 0.0018, "step": 25425 }, { "epoch": 7.93, "grad_norm": 0.486737459897995, "learning_rate": 6.892749960550815e-07, "loss": 0.0028, "step": 25430 }, { "epoch": 7.93, "grad_norm": 0.09291423112154007, "learning_rate": 6.872902379301461e-07, "loss": 0.0013, "step": 25435 }, { "epoch": 7.93, "grad_norm": 0.07296492159366608, "learning_rate": 6.853082397713662e-07, "loss": 0.0024, "step": 25440 }, { "epoch": 7.94, "grad_norm": 0.057733118534088135, "learning_rate": 6.833290021661354e-07, "loss": 0.0017, "step": 25445 }, { "epoch": 7.94, "grad_norm": 0.09245966374874115, "learning_rate": 6.813525257010378e-07, "loss": 0.0019, "step": 25450 }, { "epoch": 7.94, "grad_norm": 0.14521659910678864, "learning_rate": 6.79378810961826e-07, "loss": 0.002, "step": 25455 }, { "epoch": 7.94, "grad_norm": 0.2611040472984314, "learning_rate": 6.774078585334443e-07, "loss": 0.0026, "step": 25460 }, { "epoch": 7.94, "grad_norm": 0.08619168400764465, "learning_rate": 6.754396690000142e-07, "loss": 0.0012, "step": 25465 }, { "epoch": 7.94, "grad_norm": 0.12047097831964493, "learning_rate": 6.734742429448394e-07, "loss": 0.0018, "step": 25470 }, { "epoch": 7.95, "grad_norm": 0.14527033269405365, "learning_rate": 6.715115809504047e-07, "loss": 0.0014, "step": 25475 }, { "epoch": 7.95, "grad_norm": 0.08826136589050293, "learning_rate": 6.695516835983751e-07, "loss": 0.0018, "step": 25480 }, { "epoch": 7.95, "grad_norm": 0.11847338825464249, "learning_rate": 6.67594551469597e-07, "loss": 0.0013, "step": 25485 }, { "epoch": 7.95, "grad_norm": 0.12033139169216156, "learning_rate": 6.656401851440963e-07, "loss": 0.0018, "step": 25490 }, { "epoch": 7.95, "grad_norm": 0.2235201746225357, "learning_rate": 6.636885852010799e-07, "loss": 0.0014, "step": 25495 }, { "epoch": 7.95, "grad_norm": 0.21314990520477295, "learning_rate": 6.617397522189361e-07, "loss": 0.0019, "step": 25500 }, { "epoch": 7.95, "grad_norm": 0.036323484033346176, "learning_rate": 6.597936867752297e-07, "loss": 0.0013, "step": 25505 }, { "epoch": 7.96, "grad_norm": 0.1177893653512001, "learning_rate": 6.578503894467092e-07, "loss": 0.0016, "step": 25510 }, { "epoch": 7.96, "grad_norm": 0.09829962998628616, "learning_rate": 6.559098608092996e-07, "loss": 0.0019, "step": 25515 }, { "epoch": 7.96, "grad_norm": 0.11559101194143295, "learning_rate": 6.539721014381073e-07, "loss": 0.0013, "step": 25520 }, { "epoch": 7.96, "grad_norm": 0.03609297797083855, "learning_rate": 6.520371119074187e-07, "loss": 0.0027, "step": 25525 }, { "epoch": 7.96, "grad_norm": 0.10191293805837631, "learning_rate": 6.50104892790695e-07, "loss": 0.0034, "step": 25530 }, { "epoch": 7.96, "grad_norm": 0.23136061429977417, "learning_rate": 6.481754446605825e-07, "loss": 0.0019, "step": 25535 }, { "epoch": 7.97, "grad_norm": 0.1560174822807312, "learning_rate": 6.462487680889007e-07, "loss": 0.0015, "step": 25540 }, { "epoch": 7.97, "grad_norm": 0.0929771214723587, "learning_rate": 6.443248636466515e-07, "loss": 0.0017, "step": 25545 }, { "epoch": 7.97, "grad_norm": 0.08027144521474838, "learning_rate": 6.424037319040122e-07, "loss": 0.0021, "step": 25550 }, { "epoch": 7.97, "grad_norm": 0.09332633018493652, "learning_rate": 6.404853734303429e-07, "loss": 0.0017, "step": 25555 }, { "epoch": 7.97, "grad_norm": 0.07819889485836029, "learning_rate": 6.385697887941778e-07, "loss": 0.0015, "step": 25560 }, { "epoch": 7.97, "grad_norm": 0.06993052363395691, "learning_rate": 6.36656978563227e-07, "loss": 0.0012, "step": 25565 }, { "epoch": 7.98, "grad_norm": 0.23164385557174683, "learning_rate": 6.347469433043852e-07, "loss": 0.002, "step": 25570 }, { "epoch": 7.98, "grad_norm": 0.4761601686477661, "learning_rate": 6.328396835837203e-07, "loss": 0.0018, "step": 25575 }, { "epoch": 7.98, "grad_norm": 0.07373147457838058, "learning_rate": 6.309351999664781e-07, "loss": 0.0016, "step": 25580 }, { "epoch": 7.98, "grad_norm": 0.14360737800598145, "learning_rate": 6.290334930170816e-07, "loss": 0.0017, "step": 25585 }, { "epoch": 7.98, "grad_norm": 0.15142029523849487, "learning_rate": 6.271345632991321e-07, "loss": 0.0024, "step": 25590 }, { "epoch": 7.98, "grad_norm": 0.11168301105499268, "learning_rate": 6.252384113754073e-07, "loss": 0.0018, "step": 25595 }, { "epoch": 7.98, "grad_norm": 0.085924431681633, "learning_rate": 6.233450378078576e-07, "loss": 0.0016, "step": 25600 }, { "epoch": 7.99, "grad_norm": 0.1271122246980667, "learning_rate": 6.214544431576163e-07, "loss": 0.0023, "step": 25605 }, { "epoch": 7.99, "grad_norm": 0.06772235035896301, "learning_rate": 6.19566627984991e-07, "loss": 0.0014, "step": 25610 }, { "epoch": 7.99, "grad_norm": 0.18694095313549042, "learning_rate": 6.176815928494639e-07, "loss": 0.0019, "step": 25615 }, { "epoch": 7.99, "grad_norm": 0.8700315356254578, "learning_rate": 6.15799338309696e-07, "loss": 0.0025, "step": 25620 }, { "epoch": 7.99, "grad_norm": 0.08709261566400528, "learning_rate": 6.139198649235212e-07, "loss": 0.0025, "step": 25625 }, { "epoch": 7.99, "grad_norm": 0.11673015356063843, "learning_rate": 6.120431732479515e-07, "loss": 0.0022, "step": 25630 }, { "epoch": 8.0, "grad_norm": 0.3640889823436737, "learning_rate": 6.101692638391732e-07, "loss": 0.0024, "step": 25635 }, { "epoch": 8.0, "grad_norm": 0.18475152552127838, "learning_rate": 6.082981372525487e-07, "loss": 0.0017, "step": 25640 }, { "epoch": 8.0, "grad_norm": 0.0432017557322979, "learning_rate": 6.064297940426144e-07, "loss": 0.0008, "step": 25645 }, { "epoch": 8.0, "grad_norm": 0.19196128845214844, "learning_rate": 6.04564234763082e-07, "loss": 0.0016, "step": 25650 }, { "epoch": 8.0, "grad_norm": 0.11198998987674713, "learning_rate": 6.027014599668423e-07, "loss": 0.0012, "step": 25655 }, { "epoch": 8.0, "grad_norm": 0.028984561562538147, "learning_rate": 6.008414702059551e-07, "loss": 0.0007, "step": 25660 }, { "epoch": 8.0, "grad_norm": 0.10032795369625092, "learning_rate": 5.989842660316591e-07, "loss": 0.001, "step": 25665 }, { "epoch": 8.01, "grad_norm": 0.051030367612838745, "learning_rate": 5.971298479943644e-07, "loss": 0.0015, "step": 25670 }, { "epoch": 8.01, "grad_norm": 0.07989995181560516, "learning_rate": 5.952782166436566e-07, "loss": 0.0015, "step": 25675 }, { "epoch": 8.01, "grad_norm": 0.0383356548845768, "learning_rate": 5.934293725282947e-07, "loss": 0.0012, "step": 25680 }, { "epoch": 8.01, "grad_norm": 0.053743164986371994, "learning_rate": 5.915833161962126e-07, "loss": 0.0008, "step": 25685 }, { "epoch": 8.01, "grad_norm": 0.04896370321512222, "learning_rate": 5.89740048194517e-07, "loss": 0.0014, "step": 25690 }, { "epoch": 8.01, "grad_norm": 0.09783028066158295, "learning_rate": 5.878995690694922e-07, "loss": 0.0015, "step": 25695 }, { "epoch": 8.02, "grad_norm": 0.09386777132749557, "learning_rate": 5.860618793665907e-07, "loss": 0.0011, "step": 25700 }, { "epoch": 8.02, "grad_norm": 0.09913419932126999, "learning_rate": 5.842269796304423e-07, "loss": 0.0015, "step": 25705 }, { "epoch": 8.02, "grad_norm": 0.05181480199098587, "learning_rate": 5.823948704048443e-07, "loss": 0.0014, "step": 25710 }, { "epoch": 8.02, "grad_norm": 0.060823723673820496, "learning_rate": 5.805655522327725e-07, "loss": 0.0013, "step": 25715 }, { "epoch": 8.02, "grad_norm": 0.12706847488880157, "learning_rate": 5.78739025656374e-07, "loss": 0.0012, "step": 25720 }, { "epoch": 8.02, "grad_norm": 0.02192131243646145, "learning_rate": 5.769152912169685e-07, "loss": 0.001, "step": 25725 }, { "epoch": 8.03, "grad_norm": 0.03676176071166992, "learning_rate": 5.750943494550465e-07, "loss": 0.0006, "step": 25730 }, { "epoch": 8.03, "grad_norm": 0.07616845518350601, "learning_rate": 5.732762009102732e-07, "loss": 0.0012, "step": 25735 }, { "epoch": 8.03, "grad_norm": 0.09130016714334488, "learning_rate": 5.714608461214888e-07, "loss": 0.0012, "step": 25740 }, { "epoch": 8.03, "grad_norm": 0.018413733690977097, "learning_rate": 5.69648285626696e-07, "loss": 0.0011, "step": 25745 }, { "epoch": 8.03, "grad_norm": 0.06552843004465103, "learning_rate": 5.678385199630787e-07, "loss": 0.0013, "step": 25750 }, { "epoch": 8.03, "grad_norm": 0.05143953859806061, "learning_rate": 5.660315496669888e-07, "loss": 0.0009, "step": 25755 }, { "epoch": 8.03, "grad_norm": 0.043731290847063065, "learning_rate": 5.64227375273948e-07, "loss": 0.001, "step": 25760 }, { "epoch": 8.04, "grad_norm": 0.09969484806060791, "learning_rate": 5.624259973186541e-07, "loss": 0.0012, "step": 25765 }, { "epoch": 8.04, "grad_norm": 0.0643591657280922, "learning_rate": 5.606274163349712e-07, "loss": 0.0007, "step": 25770 }, { "epoch": 8.04, "grad_norm": 0.12061813473701477, "learning_rate": 5.588316328559385e-07, "loss": 0.0017, "step": 25775 }, { "epoch": 8.04, "grad_norm": 0.07178698480129242, "learning_rate": 5.570386474137624e-07, "loss": 0.0017, "step": 25780 }, { "epoch": 8.04, "grad_norm": 0.052433501929044724, "learning_rate": 5.552484605398234e-07, "loss": 0.0023, "step": 25785 }, { "epoch": 8.04, "grad_norm": 0.053316082805395126, "learning_rate": 5.534610727646716e-07, "loss": 0.0008, "step": 25790 }, { "epoch": 8.05, "grad_norm": 0.11161698400974274, "learning_rate": 5.516764846180256e-07, "loss": 0.0012, "step": 25795 }, { "epoch": 8.05, "grad_norm": 0.05765370652079582, "learning_rate": 5.498946966287777e-07, "loss": 0.0011, "step": 25800 }, { "epoch": 8.05, "grad_norm": 0.021409546956419945, "learning_rate": 5.481157093249866e-07, "loss": 0.0009, "step": 25805 }, { "epoch": 8.05, "grad_norm": 0.16189897060394287, "learning_rate": 5.46339523233883e-07, "loss": 0.0012, "step": 25810 }, { "epoch": 8.05, "grad_norm": 0.07351277023553848, "learning_rate": 5.44566138881868e-07, "loss": 0.0006, "step": 25815 }, { "epoch": 8.05, "grad_norm": 0.09774326533079147, "learning_rate": 5.42795556794512e-07, "loss": 0.0008, "step": 25820 }, { "epoch": 8.05, "grad_norm": 0.029827700927853584, "learning_rate": 5.410277774965533e-07, "loss": 0.0012, "step": 25825 }, { "epoch": 8.06, "grad_norm": 0.05322696641087532, "learning_rate": 5.392628015119017e-07, "loss": 0.001, "step": 25830 }, { "epoch": 8.06, "grad_norm": 0.0880359560251236, "learning_rate": 5.375006293636342e-07, "loss": 0.001, "step": 25835 }, { "epoch": 8.06, "grad_norm": 0.02932639606297016, "learning_rate": 5.357412615739999e-07, "loss": 0.0011, "step": 25840 }, { "epoch": 8.06, "grad_norm": 0.09115365147590637, "learning_rate": 5.339846986644126e-07, "loss": 0.001, "step": 25845 }, { "epoch": 8.06, "grad_norm": 0.057407550513744354, "learning_rate": 5.322309411554583e-07, "loss": 0.001, "step": 25850 }, { "epoch": 8.06, "grad_norm": 0.057419128715991974, "learning_rate": 5.304799895668877e-07, "loss": 0.0006, "step": 25855 }, { "epoch": 8.07, "grad_norm": 0.0982402116060257, "learning_rate": 5.287318444176259e-07, "loss": 0.0014, "step": 25860 }, { "epoch": 8.07, "grad_norm": 0.08957464247941971, "learning_rate": 5.269865062257618e-07, "loss": 0.0012, "step": 25865 }, { "epoch": 8.07, "grad_norm": 0.05921388417482376, "learning_rate": 5.252439755085536e-07, "loss": 0.0009, "step": 25870 }, { "epoch": 8.07, "grad_norm": 0.02325126901268959, "learning_rate": 5.235042527824264e-07, "loss": 0.0013, "step": 25875 }, { "epoch": 8.07, "grad_norm": 0.059985339641571045, "learning_rate": 5.21767338562974e-07, "loss": 0.0012, "step": 25880 }, { "epoch": 8.07, "grad_norm": 0.08335838466882706, "learning_rate": 5.200332333649605e-07, "loss": 0.0011, "step": 25885 }, { "epoch": 8.08, "grad_norm": 0.09611143916845322, "learning_rate": 5.183019377023113e-07, "loss": 0.0017, "step": 25890 }, { "epoch": 8.08, "grad_norm": 0.03355991095304489, "learning_rate": 5.165734520881227e-07, "loss": 0.0008, "step": 25895 }, { "epoch": 8.08, "grad_norm": 0.06265156716108322, "learning_rate": 5.148477770346616e-07, "loss": 0.0009, "step": 25900 }, { "epoch": 8.08, "grad_norm": 0.07441845536231995, "learning_rate": 5.131249130533567e-07, "loss": 0.0014, "step": 25905 }, { "epoch": 8.08, "grad_norm": 0.06214327737689018, "learning_rate": 5.114048606548061e-07, "loss": 0.001, "step": 25910 }, { "epoch": 8.08, "grad_norm": 0.059696801006793976, "learning_rate": 5.096876203487733e-07, "loss": 0.001, "step": 25915 }, { "epoch": 8.08, "grad_norm": 0.03962871804833412, "learning_rate": 5.079731926441911e-07, "loss": 0.0014, "step": 25920 }, { "epoch": 8.09, "grad_norm": 0.10176525264978409, "learning_rate": 5.062615780491531e-07, "loss": 0.001, "step": 25925 }, { "epoch": 8.09, "grad_norm": 0.07909473031759262, "learning_rate": 5.045527770709258e-07, "loss": 0.001, "step": 25930 }, { "epoch": 8.09, "grad_norm": 0.09313105791807175, "learning_rate": 5.028467902159373e-07, "loss": 0.0012, "step": 25935 }, { "epoch": 8.09, "grad_norm": 0.06423319876194, "learning_rate": 5.011436179897833e-07, "loss": 0.0007, "step": 25940 }, { "epoch": 8.09, "grad_norm": 0.08090867102146149, "learning_rate": 4.994432608972266e-07, "loss": 0.0008, "step": 25945 }, { "epoch": 8.09, "grad_norm": 0.04727114737033844, "learning_rate": 4.977457194421941e-07, "loss": 0.0008, "step": 25950 }, { "epoch": 8.1, "grad_norm": 0.11219633370637894, "learning_rate": 4.960509941277792e-07, "loss": 0.0013, "step": 25955 }, { "epoch": 8.1, "grad_norm": 0.09417059272527695, "learning_rate": 4.943590854562397e-07, "loss": 0.0016, "step": 25960 }, { "epoch": 8.1, "grad_norm": 0.03398860618472099, "learning_rate": 4.92669993928997e-07, "loss": 0.0009, "step": 25965 }, { "epoch": 8.1, "grad_norm": 0.06553380936384201, "learning_rate": 4.909837200466416e-07, "loss": 0.0017, "step": 25970 }, { "epoch": 8.1, "grad_norm": 0.08675870299339294, "learning_rate": 4.893002643089262e-07, "loss": 0.0014, "step": 25975 }, { "epoch": 8.1, "grad_norm": 0.05225129798054695, "learning_rate": 4.876196272147693e-07, "loss": 0.0008, "step": 25980 }, { "epoch": 8.1, "grad_norm": 0.13486607372760773, "learning_rate": 4.859418092622525e-07, "loss": 0.0015, "step": 25985 }, { "epoch": 8.11, "grad_norm": 0.08578328043222427, "learning_rate": 4.842668109486271e-07, "loss": 0.0011, "step": 25990 }, { "epoch": 8.11, "grad_norm": 0.08599358797073364, "learning_rate": 4.825946327703024e-07, "loss": 0.0015, "step": 25995 }, { "epoch": 8.11, "grad_norm": 0.04679775983095169, "learning_rate": 4.809252752228522e-07, "loss": 0.0007, "step": 26000 }, { "epoch": 8.11, "grad_norm": 0.09810055792331696, "learning_rate": 4.792587388010195e-07, "loss": 0.0015, "step": 26005 }, { "epoch": 8.11, "grad_norm": 0.028725087642669678, "learning_rate": 4.77595023998707e-07, "loss": 0.0014, "step": 26010 }, { "epoch": 8.11, "grad_norm": 0.09821700304746628, "learning_rate": 4.759341313089838e-07, "loss": 0.0011, "step": 26015 }, { "epoch": 8.12, "grad_norm": 0.054021578282117844, "learning_rate": 4.742760612240793e-07, "loss": 0.0011, "step": 26020 }, { "epoch": 8.12, "grad_norm": 0.09391536563634872, "learning_rate": 4.7262081423538717e-07, "loss": 0.0008, "step": 26025 }, { "epoch": 8.12, "grad_norm": 0.05493954196572304, "learning_rate": 4.7096839083347166e-07, "loss": 0.0009, "step": 26030 }, { "epoch": 8.12, "grad_norm": 0.012998754158616066, "learning_rate": 4.693187915080477e-07, "loss": 0.001, "step": 26035 }, { "epoch": 8.12, "grad_norm": 0.10303235054016113, "learning_rate": 4.676720167480031e-07, "loss": 0.0013, "step": 26040 }, { "epoch": 8.12, "grad_norm": 0.05119301378726959, "learning_rate": 4.6602806704138304e-07, "loss": 0.0009, "step": 26045 }, { "epoch": 8.12, "grad_norm": 0.043302636593580246, "learning_rate": 4.643869428753989e-07, "loss": 0.0017, "step": 26050 }, { "epoch": 8.13, "grad_norm": 0.029808012768626213, "learning_rate": 4.6274864473642266e-07, "loss": 0.0007, "step": 26055 }, { "epoch": 8.13, "grad_norm": 0.028385180979967117, "learning_rate": 4.611131731099905e-07, "loss": 0.0012, "step": 26060 }, { "epoch": 8.13, "grad_norm": 0.05698021501302719, "learning_rate": 4.594805284807979e-07, "loss": 0.0012, "step": 26065 }, { "epoch": 8.13, "grad_norm": 0.050949130207300186, "learning_rate": 4.578507113327069e-07, "loss": 0.0009, "step": 26070 }, { "epoch": 8.13, "grad_norm": 0.04433879628777504, "learning_rate": 4.5622372214873644e-07, "loss": 0.0014, "step": 26075 }, { "epoch": 8.13, "grad_norm": 0.052618153393268585, "learning_rate": 4.545995614110721e-07, "loss": 0.001, "step": 26080 }, { "epoch": 8.14, "grad_norm": 0.061884429305791855, "learning_rate": 4.5297822960105877e-07, "loss": 0.0013, "step": 26085 }, { "epoch": 8.14, "grad_norm": 0.05236772447824478, "learning_rate": 4.513597271992032e-07, "loss": 0.0009, "step": 26090 }, { "epoch": 8.14, "grad_norm": 0.08672960102558136, "learning_rate": 4.497440546851728e-07, "loss": 0.0012, "step": 26095 }, { "epoch": 8.14, "grad_norm": 0.02289748750627041, "learning_rate": 4.4813121253779787e-07, "loss": 0.0014, "step": 26100 }, { "epoch": 8.14, "grad_norm": 0.0577850304543972, "learning_rate": 4.465212012350695e-07, "loss": 0.0012, "step": 26105 }, { "epoch": 8.14, "grad_norm": 0.062029387801885605, "learning_rate": 4.4491402125413917e-07, "loss": 0.0019, "step": 26110 }, { "epoch": 8.15, "grad_norm": 0.05040305480360985, "learning_rate": 4.4330967307132043e-07, "loss": 0.0011, "step": 26115 }, { "epoch": 8.15, "grad_norm": 0.08229151368141174, "learning_rate": 4.4170815716208514e-07, "loss": 0.0012, "step": 26120 }, { "epoch": 8.15, "grad_norm": 0.060748420655727386, "learning_rate": 4.401094740010692e-07, "loss": 0.0012, "step": 26125 }, { "epoch": 8.15, "grad_norm": 0.07263582944869995, "learning_rate": 4.3851362406206575e-07, "loss": 0.0012, "step": 26130 }, { "epoch": 8.15, "grad_norm": 0.020791111513972282, "learning_rate": 4.369206078180299e-07, "loss": 0.001, "step": 26135 }, { "epoch": 8.15, "grad_norm": 0.08441232144832611, "learning_rate": 4.3533042574107953e-07, "loss": 0.0015, "step": 26140 }, { "epoch": 8.15, "grad_norm": 0.13795146346092224, "learning_rate": 4.337430783024843e-07, "loss": 0.0012, "step": 26145 }, { "epoch": 8.16, "grad_norm": 0.05792690068483353, "learning_rate": 4.3215856597268346e-07, "loss": 0.0014, "step": 26150 }, { "epoch": 8.16, "grad_norm": 0.014939947985112667, "learning_rate": 4.305768892212714e-07, "loss": 0.0011, "step": 26155 }, { "epoch": 8.16, "grad_norm": 0.12674427032470703, "learning_rate": 4.28998048517002e-07, "loss": 0.0014, "step": 26160 }, { "epoch": 8.16, "grad_norm": 0.09565210342407227, "learning_rate": 4.274220443277888e-07, "loss": 0.0011, "step": 26165 }, { "epoch": 8.16, "grad_norm": 0.10866738110780716, "learning_rate": 4.2584887712070587e-07, "loss": 0.0011, "step": 26170 }, { "epoch": 8.16, "grad_norm": 0.04313337057828903, "learning_rate": 4.2427854736198594e-07, "loss": 0.0006, "step": 26175 }, { "epoch": 8.17, "grad_norm": 0.06837520748376846, "learning_rate": 4.2271105551702e-07, "loss": 0.0011, "step": 26180 }, { "epoch": 8.17, "grad_norm": 0.04312984645366669, "learning_rate": 4.211464020503564e-07, "loss": 0.0009, "step": 26185 }, { "epoch": 8.17, "grad_norm": 0.058144159615039825, "learning_rate": 4.195845874257087e-07, "loss": 0.0005, "step": 26190 }, { "epoch": 8.17, "grad_norm": 0.04488474503159523, "learning_rate": 4.180256121059423e-07, "loss": 0.0011, "step": 26195 }, { "epoch": 8.17, "grad_norm": 0.05889725685119629, "learning_rate": 4.164694765530841e-07, "loss": 0.0007, "step": 26200 }, { "epoch": 8.17, "grad_norm": 0.04025601968169212, "learning_rate": 4.149161812283209e-07, "loss": 0.0012, "step": 26205 }, { "epoch": 8.17, "grad_norm": 0.29714614152908325, "learning_rate": 4.1336572659199435e-07, "loss": 0.0015, "step": 26210 }, { "epoch": 8.18, "grad_norm": 0.2917931377887726, "learning_rate": 4.1181811310360476e-07, "loss": 0.0023, "step": 26215 }, { "epoch": 8.18, "grad_norm": 0.04373086243867874, "learning_rate": 4.1027334122181297e-07, "loss": 0.0006, "step": 26220 }, { "epoch": 8.18, "grad_norm": 0.012117493897676468, "learning_rate": 4.0873141140443496e-07, "loss": 0.0004, "step": 26225 }, { "epoch": 8.18, "grad_norm": 0.10136426985263824, "learning_rate": 4.0719232410844414e-07, "loss": 0.0014, "step": 26230 }, { "epoch": 8.18, "grad_norm": 0.2507619261741638, "learning_rate": 4.0565607978997777e-07, "loss": 0.0017, "step": 26235 }, { "epoch": 8.18, "grad_norm": 0.0383005253970623, "learning_rate": 4.041226789043218e-07, "loss": 0.001, "step": 26240 }, { "epoch": 8.19, "grad_norm": 0.06149076670408249, "learning_rate": 4.02592121905927e-07, "loss": 0.0012, "step": 26245 }, { "epoch": 8.19, "grad_norm": 0.07256074249744415, "learning_rate": 4.0106440924839286e-07, "loss": 0.0014, "step": 26250 }, { "epoch": 8.19, "grad_norm": 0.08970825374126434, "learning_rate": 3.995395413844838e-07, "loss": 0.0009, "step": 26255 }, { "epoch": 8.19, "grad_norm": 0.08046100288629532, "learning_rate": 3.9801751876611616e-07, "loss": 0.0014, "step": 26260 }, { "epoch": 8.19, "grad_norm": 0.06438751518726349, "learning_rate": 3.9649834184436797e-07, "loss": 0.0012, "step": 26265 }, { "epoch": 8.19, "grad_norm": 0.03636743873357773, "learning_rate": 3.94982011069468e-07, "loss": 0.0012, "step": 26270 }, { "epoch": 8.2, "grad_norm": 0.04495951533317566, "learning_rate": 3.9346852689080453e-07, "loss": 0.0008, "step": 26275 }, { "epoch": 8.2, "grad_norm": 0.1346733570098877, "learning_rate": 3.919578897569254e-07, "loss": 0.0017, "step": 26280 }, { "epoch": 8.2, "grad_norm": 0.09756915271282196, "learning_rate": 3.904501001155281e-07, "loss": 0.0015, "step": 26285 }, { "epoch": 8.2, "grad_norm": 0.10456876456737518, "learning_rate": 3.8894515841347067e-07, "loss": 0.0008, "step": 26290 }, { "epoch": 8.2, "grad_norm": 0.03999922797083855, "learning_rate": 3.8744306509676534e-07, "loss": 0.0006, "step": 26295 }, { "epoch": 8.2, "grad_norm": 0.05138784274458885, "learning_rate": 3.8594382061058144e-07, "loss": 0.0009, "step": 26300 }, { "epoch": 8.2, "grad_norm": 0.042921282351017, "learning_rate": 3.8444742539924364e-07, "loss": 0.0016, "step": 26305 }, { "epoch": 8.21, "grad_norm": 0.04632921516895294, "learning_rate": 3.8295387990623156e-07, "loss": 0.0008, "step": 26310 }, { "epoch": 8.21, "grad_norm": 0.13882958889007568, "learning_rate": 3.8146318457417784e-07, "loss": 0.0014, "step": 26315 }, { "epoch": 8.21, "grad_norm": 0.07300465553998947, "learning_rate": 3.7997533984488024e-07, "loss": 0.0015, "step": 26320 }, { "epoch": 8.21, "grad_norm": 0.05495378002524376, "learning_rate": 3.7849034615927706e-07, "loss": 0.0007, "step": 26325 }, { "epoch": 8.21, "grad_norm": 0.06700440496206284, "learning_rate": 3.770082039574741e-07, "loss": 0.0009, "step": 26330 }, { "epoch": 8.21, "grad_norm": 0.08584974706172943, "learning_rate": 3.7552891367872433e-07, "loss": 0.0009, "step": 26335 }, { "epoch": 8.22, "grad_norm": 0.04152682423591614, "learning_rate": 3.7405247576144055e-07, "loss": 0.0016, "step": 26340 }, { "epoch": 8.22, "grad_norm": 0.0706639513373375, "learning_rate": 3.7257889064318596e-07, "loss": 0.0018, "step": 26345 }, { "epoch": 8.22, "grad_norm": 0.08724109083414078, "learning_rate": 3.711081587606824e-07, "loss": 0.0009, "step": 26350 }, { "epoch": 8.22, "grad_norm": 0.014516488648951054, "learning_rate": 3.696402805498034e-07, "loss": 0.0009, "step": 26355 }, { "epoch": 8.22, "grad_norm": 0.10170592367649078, "learning_rate": 3.681752564455765e-07, "loss": 0.0009, "step": 26360 }, { "epoch": 8.22, "grad_norm": 0.0802757740020752, "learning_rate": 3.6671308688218556e-07, "loss": 0.0013, "step": 26365 }, { "epoch": 8.22, "grad_norm": 0.20823164284229279, "learning_rate": 3.652537722929672e-07, "loss": 0.0017, "step": 26370 }, { "epoch": 8.23, "grad_norm": 0.09853595495223999, "learning_rate": 3.637973131104111e-07, "loss": 0.0008, "step": 26375 }, { "epoch": 8.23, "grad_norm": 0.09480353444814682, "learning_rate": 3.6234370976616196e-07, "loss": 0.0012, "step": 26380 }, { "epoch": 8.23, "grad_norm": 0.1411803662776947, "learning_rate": 3.6089296269101736e-07, "loss": 0.0012, "step": 26385 }, { "epoch": 8.23, "grad_norm": 0.1442306637763977, "learning_rate": 3.594450723149301e-07, "loss": 0.0012, "step": 26390 }, { "epoch": 8.23, "grad_norm": 0.03480003401637077, "learning_rate": 3.580000390670035e-07, "loss": 0.0008, "step": 26395 }, { "epoch": 8.23, "grad_norm": 0.059012580662965775, "learning_rate": 3.5655786337549626e-07, "loss": 0.001, "step": 26400 }, { "epoch": 8.24, "grad_norm": 0.1240774467587471, "learning_rate": 3.5511854566781966e-07, "loss": 0.0013, "step": 26405 }, { "epoch": 8.24, "grad_norm": 0.11328074336051941, "learning_rate": 3.53682086370537e-07, "loss": 0.0011, "step": 26410 }, { "epoch": 8.24, "grad_norm": 0.03457697853446007, "learning_rate": 3.522484859093667e-07, "loss": 0.0005, "step": 26415 }, { "epoch": 8.24, "grad_norm": 0.0656462237238884, "learning_rate": 3.5081774470917763e-07, "loss": 0.0009, "step": 26420 }, { "epoch": 8.24, "grad_norm": 0.054810818284749985, "learning_rate": 3.4938986319399295e-07, "loss": 0.0015, "step": 26425 }, { "epoch": 8.24, "grad_norm": 0.20973767340183258, "learning_rate": 3.479648417869863e-07, "loss": 0.0011, "step": 26430 }, { "epoch": 8.24, "grad_norm": 0.058540064841508865, "learning_rate": 3.4654268091048324e-07, "loss": 0.0012, "step": 26435 }, { "epoch": 8.25, "grad_norm": 0.059108808636665344, "learning_rate": 3.451233809859678e-07, "loss": 0.0013, "step": 26440 }, { "epoch": 8.25, "grad_norm": 0.1766173541545868, "learning_rate": 3.437069424340678e-07, "loss": 0.0014, "step": 26445 }, { "epoch": 8.25, "grad_norm": 0.07533004879951477, "learning_rate": 3.4229336567456974e-07, "loss": 0.001, "step": 26450 }, { "epoch": 8.25, "grad_norm": 0.1083541065454483, "learning_rate": 3.408826511264063e-07, "loss": 0.0009, "step": 26455 }, { "epoch": 8.25, "grad_norm": 0.1199837252497673, "learning_rate": 3.394747992076663e-07, "loss": 0.0012, "step": 26460 }, { "epoch": 8.25, "grad_norm": 0.06192876398563385, "learning_rate": 3.380698103355884e-07, "loss": 0.0007, "step": 26465 }, { "epoch": 8.26, "grad_norm": 0.11188165098428726, "learning_rate": 3.3666768492656264e-07, "loss": 0.0009, "step": 26470 }, { "epoch": 8.26, "grad_norm": 0.023300036787986755, "learning_rate": 3.352684233961301e-07, "loss": 0.0009, "step": 26475 }, { "epoch": 8.26, "grad_norm": 0.08266016840934753, "learning_rate": 3.3387202615898226e-07, "loss": 0.001, "step": 26480 }, { "epoch": 8.26, "grad_norm": 0.04422038048505783, "learning_rate": 3.3247849362896713e-07, "loss": 0.0008, "step": 26485 }, { "epoch": 8.26, "grad_norm": 0.09365659952163696, "learning_rate": 3.3108782621907643e-07, "loss": 0.0012, "step": 26490 }, { "epoch": 8.26, "grad_norm": 0.020148232579231262, "learning_rate": 3.2970002434145833e-07, "loss": 0.001, "step": 26495 }, { "epoch": 8.27, "grad_norm": 0.09790826588869095, "learning_rate": 3.283150884074082e-07, "loss": 0.0015, "step": 26500 }, { "epoch": 8.27, "grad_norm": 0.033936236053705215, "learning_rate": 3.269330188273734e-07, "loss": 0.0008, "step": 26505 }, { "epoch": 8.27, "grad_norm": 0.17181117832660675, "learning_rate": 3.2555381601095057e-07, "loss": 0.0018, "step": 26510 }, { "epoch": 8.27, "grad_norm": 0.09773623198270798, "learning_rate": 3.241774803668907e-07, "loss": 0.0008, "step": 26515 }, { "epoch": 8.27, "grad_norm": 0.08878730237483978, "learning_rate": 3.2280401230308845e-07, "loss": 0.0013, "step": 26520 }, { "epoch": 8.27, "grad_norm": 0.09682467579841614, "learning_rate": 3.2143341222659495e-07, "loss": 0.0006, "step": 26525 }, { "epoch": 8.27, "grad_norm": 0.07581570744514465, "learning_rate": 3.2006568054360974e-07, "loss": 0.0009, "step": 26530 }, { "epoch": 8.28, "grad_norm": 0.10100431740283966, "learning_rate": 3.187008176594797e-07, "loss": 0.0011, "step": 26535 }, { "epoch": 8.28, "grad_norm": 0.07675759494304657, "learning_rate": 3.173388239787034e-07, "loss": 0.0012, "step": 26540 }, { "epoch": 8.28, "grad_norm": 0.07665850967168808, "learning_rate": 3.1597969990492804e-07, "loss": 0.0005, "step": 26545 }, { "epoch": 8.28, "grad_norm": 0.030014360323548317, "learning_rate": 3.146234458409525e-07, "loss": 0.001, "step": 26550 }, { "epoch": 8.28, "grad_norm": 0.09450791031122208, "learning_rate": 3.1327006218872303e-07, "loss": 0.001, "step": 26555 }, { "epoch": 8.28, "grad_norm": 0.08888278901576996, "learning_rate": 3.1191954934933657e-07, "loss": 0.0012, "step": 26560 }, { "epoch": 8.29, "grad_norm": 0.06696563959121704, "learning_rate": 3.1057190772303626e-07, "loss": 0.0018, "step": 26565 }, { "epoch": 8.29, "grad_norm": 0.046966444700956345, "learning_rate": 3.0922713770922155e-07, "loss": 0.0008, "step": 26570 }, { "epoch": 8.29, "grad_norm": 0.029699940234422684, "learning_rate": 3.0788523970643027e-07, "loss": 0.0013, "step": 26575 }, { "epoch": 8.29, "grad_norm": 0.07722587883472443, "learning_rate": 3.0654621411235874e-07, "loss": 0.0011, "step": 26580 }, { "epoch": 8.29, "grad_norm": 0.10917071253061295, "learning_rate": 3.0521006132384513e-07, "loss": 0.0007, "step": 26585 }, { "epoch": 8.29, "grad_norm": 0.02738550864160061, "learning_rate": 3.038767817368815e-07, "loss": 0.0009, "step": 26590 }, { "epoch": 8.29, "grad_norm": 0.11273742467164993, "learning_rate": 3.0254637574660406e-07, "loss": 0.0012, "step": 26595 }, { "epoch": 8.3, "grad_norm": 0.024482088163495064, "learning_rate": 3.012188437473007e-07, "loss": 0.0008, "step": 26600 }, { "epoch": 8.3, "grad_norm": 0.0887916088104248, "learning_rate": 2.998941861324056e-07, "loss": 0.0009, "step": 26605 }, { "epoch": 8.3, "grad_norm": 0.04568292200565338, "learning_rate": 2.985724032945003e-07, "loss": 0.0008, "step": 26610 }, { "epoch": 8.3, "grad_norm": 0.07197744399309158, "learning_rate": 2.972534956253181e-07, "loss": 0.0016, "step": 26615 }, { "epoch": 8.3, "grad_norm": 0.11375932395458221, "learning_rate": 2.959374635157364e-07, "loss": 0.0011, "step": 26620 }, { "epoch": 8.3, "grad_norm": 0.07851669937372208, "learning_rate": 2.946243073557831e-07, "loss": 0.0008, "step": 26625 }, { "epoch": 8.31, "grad_norm": 0.15448705852031708, "learning_rate": 2.933140275346291e-07, "loss": 0.0009, "step": 26630 }, { "epoch": 8.31, "grad_norm": 0.06736727058887482, "learning_rate": 2.9200662444060056e-07, "loss": 0.0009, "step": 26635 }, { "epoch": 8.31, "grad_norm": 0.19965125620365143, "learning_rate": 2.9070209846116303e-07, "loss": 0.0014, "step": 26640 }, { "epoch": 8.31, "grad_norm": 0.037576086819171906, "learning_rate": 2.89400449982935e-07, "loss": 0.0007, "step": 26645 }, { "epoch": 8.31, "grad_norm": 0.0317411944270134, "learning_rate": 2.881016793916791e-07, "loss": 0.0006, "step": 26650 }, { "epoch": 8.31, "grad_norm": 0.07414986193180084, "learning_rate": 2.868057870723073e-07, "loss": 0.001, "step": 26655 }, { "epoch": 8.32, "grad_norm": 0.33764782547950745, "learning_rate": 2.8551277340887696e-07, "loss": 0.0011, "step": 26660 }, { "epoch": 8.32, "grad_norm": 0.02444450557231903, "learning_rate": 2.8422263878459256e-07, "loss": 0.0007, "step": 26665 }, { "epoch": 8.32, "grad_norm": 0.0931888222694397, "learning_rate": 2.8293538358180607e-07, "loss": 0.0015, "step": 26670 }, { "epoch": 8.32, "grad_norm": 0.053565751761198044, "learning_rate": 2.816510081820145e-07, "loss": 0.001, "step": 26675 }, { "epoch": 8.32, "grad_norm": 0.1053864061832428, "learning_rate": 2.8036951296586346e-07, "loss": 0.0007, "step": 26680 }, { "epoch": 8.32, "grad_norm": 0.249299556016922, "learning_rate": 2.790908983131435e-07, "loss": 0.0017, "step": 26685 }, { "epoch": 8.32, "grad_norm": 0.07624279707670212, "learning_rate": 2.7781516460279154e-07, "loss": 0.0013, "step": 26690 }, { "epoch": 8.33, "grad_norm": 0.06324794143438339, "learning_rate": 2.7654231221289187e-07, "loss": 0.0012, "step": 26695 }, { "epoch": 8.33, "grad_norm": 0.0402330718934536, "learning_rate": 2.752723415206737e-07, "loss": 0.0013, "step": 26700 }, { "epoch": 8.33, "grad_norm": 0.0825943797826767, "learning_rate": 2.740052529025128e-07, "loss": 0.0013, "step": 26705 }, { "epoch": 8.33, "grad_norm": 0.08207705616950989, "learning_rate": 2.727410467339309e-07, "loss": 0.0011, "step": 26710 }, { "epoch": 8.33, "grad_norm": 0.07926084846258163, "learning_rate": 2.714797233895949e-07, "loss": 0.0014, "step": 26715 }, { "epoch": 8.33, "grad_norm": 0.02722475863993168, "learning_rate": 2.70221283243316e-07, "loss": 0.0012, "step": 26720 }, { "epoch": 8.34, "grad_norm": 0.017125660553574562, "learning_rate": 2.689657266680534e-07, "loss": 0.001, "step": 26725 }, { "epoch": 8.34, "grad_norm": 0.070092111825943, "learning_rate": 2.6771305403591184e-07, "loss": 0.001, "step": 26730 }, { "epoch": 8.34, "grad_norm": 0.1087091714143753, "learning_rate": 2.6646326571813984e-07, "loss": 0.0009, "step": 26735 }, { "epoch": 8.34, "grad_norm": 0.08016277104616165, "learning_rate": 2.652163620851311e-07, "loss": 0.001, "step": 26740 }, { "epoch": 8.34, "grad_norm": 0.07137833535671234, "learning_rate": 2.639723435064245e-07, "loss": 0.0009, "step": 26745 }, { "epoch": 8.34, "grad_norm": 0.031105132773518562, "learning_rate": 2.627312103507063e-07, "loss": 0.0011, "step": 26750 }, { "epoch": 8.34, "grad_norm": 0.0440361313521862, "learning_rate": 2.614929629858043e-07, "loss": 0.0011, "step": 26755 }, { "epoch": 8.35, "grad_norm": 0.007057182025164366, "learning_rate": 2.6025760177869063e-07, "loss": 0.0012, "step": 26760 }, { "epoch": 8.35, "grad_norm": 0.07112181931734085, "learning_rate": 2.590251270954858e-07, "loss": 0.0012, "step": 26765 }, { "epoch": 8.35, "grad_norm": 0.11220315098762512, "learning_rate": 2.5779553930145194e-07, "loss": 0.002, "step": 26770 }, { "epoch": 8.35, "grad_norm": 0.09323884546756744, "learning_rate": 2.565688387609966e-07, "loss": 0.0011, "step": 26775 }, { "epoch": 8.35, "grad_norm": 0.046231284737586975, "learning_rate": 2.553450258376733e-07, "loss": 0.0009, "step": 26780 }, { "epoch": 8.35, "grad_norm": 0.11686007678508759, "learning_rate": 2.5412410089417525e-07, "loss": 0.0011, "step": 26785 }, { "epoch": 8.36, "grad_norm": 0.07207207381725311, "learning_rate": 2.5290606429234534e-07, "loss": 0.0012, "step": 26790 }, { "epoch": 8.36, "grad_norm": 0.008922003209590912, "learning_rate": 2.516909163931658e-07, "loss": 0.001, "step": 26795 }, { "epoch": 8.36, "grad_norm": 0.0872945487499237, "learning_rate": 2.5047865755676413e-07, "loss": 0.0012, "step": 26800 }, { "epoch": 8.36, "grad_norm": 0.12244457006454468, "learning_rate": 2.4926928814241305e-07, "loss": 0.0012, "step": 26805 }, { "epoch": 8.36, "grad_norm": 0.18743190169334412, "learning_rate": 2.480628085085257e-07, "loss": 0.0009, "step": 26810 }, { "epoch": 8.36, "grad_norm": 0.24648462235927582, "learning_rate": 2.4685921901266505e-07, "loss": 0.0008, "step": 26815 }, { "epoch": 8.37, "grad_norm": 0.03587842360138893, "learning_rate": 2.456585200115313e-07, "loss": 0.0007, "step": 26820 }, { "epoch": 8.37, "grad_norm": 0.08156446367502213, "learning_rate": 2.444607118609721e-07, "loss": 0.0008, "step": 26825 }, { "epoch": 8.37, "grad_norm": 0.20609742403030396, "learning_rate": 2.4326579491597335e-07, "loss": 0.0015, "step": 26830 }, { "epoch": 8.37, "grad_norm": 0.0752704069018364, "learning_rate": 2.4207376953066853e-07, "loss": 0.0017, "step": 26835 }, { "epoch": 8.37, "grad_norm": 0.1400754451751709, "learning_rate": 2.408846360583339e-07, "loss": 0.0013, "step": 26840 }, { "epoch": 8.37, "grad_norm": 0.023851117119193077, "learning_rate": 2.3969839485138644e-07, "loss": 0.0008, "step": 26845 }, { "epoch": 8.37, "grad_norm": 0.028327608481049538, "learning_rate": 2.385150462613883e-07, "loss": 0.001, "step": 26850 }, { "epoch": 8.38, "grad_norm": 0.10897330194711685, "learning_rate": 2.3733459063904009e-07, "loss": 0.0009, "step": 26855 }, { "epoch": 8.38, "grad_norm": 0.047635432332754135, "learning_rate": 2.3615702833419297e-07, "loss": 0.0014, "step": 26860 }, { "epoch": 8.38, "grad_norm": 0.19598473608493805, "learning_rate": 2.349823596958334e-07, "loss": 0.0017, "step": 26865 }, { "epoch": 8.38, "grad_norm": 0.051575757563114166, "learning_rate": 2.3381058507209175e-07, "loss": 0.0012, "step": 26870 }, { "epoch": 8.38, "grad_norm": 0.08787445724010468, "learning_rate": 2.326417048102425e-07, "loss": 0.0006, "step": 26875 }, { "epoch": 8.38, "grad_norm": 0.20478199422359467, "learning_rate": 2.3147571925670187e-07, "loss": 0.0015, "step": 26880 }, { "epoch": 8.39, "grad_norm": 0.11912625283002853, "learning_rate": 2.3031262875702787e-07, "loss": 0.0017, "step": 26885 }, { "epoch": 8.39, "grad_norm": 0.11537455767393112, "learning_rate": 2.2915243365591923e-07, "loss": 0.0011, "step": 26890 }, { "epoch": 8.39, "grad_norm": 0.18906544148921967, "learning_rate": 2.2799513429721865e-07, "loss": 0.0014, "step": 26895 }, { "epoch": 8.39, "grad_norm": 0.07237179577350616, "learning_rate": 2.268407310239107e-07, "loss": 0.0008, "step": 26900 }, { "epoch": 8.39, "grad_norm": 0.15105654299259186, "learning_rate": 2.2568922417811835e-07, "loss": 0.001, "step": 26905 }, { "epoch": 8.39, "grad_norm": 0.0675887018442154, "learning_rate": 2.2454061410111083e-07, "loss": 0.0008, "step": 26910 }, { "epoch": 8.39, "grad_norm": 0.14444920420646667, "learning_rate": 2.2339490113329475e-07, "loss": 0.002, "step": 26915 }, { "epoch": 8.4, "grad_norm": 0.0659114420413971, "learning_rate": 2.2225208561422185e-07, "loss": 0.001, "step": 26920 }, { "epoch": 8.4, "grad_norm": 0.09269290417432785, "learning_rate": 2.2111216788258117e-07, "loss": 0.0013, "step": 26925 }, { "epoch": 8.4, "grad_norm": 0.15672466158866882, "learning_rate": 2.1997514827620693e-07, "loss": 0.0014, "step": 26930 }, { "epoch": 8.4, "grad_norm": 0.0962071493268013, "learning_rate": 2.1884102713207177e-07, "loss": 0.001, "step": 26935 }, { "epoch": 8.4, "grad_norm": 0.13821633160114288, "learning_rate": 2.177098047862891e-07, "loss": 0.0011, "step": 26940 }, { "epoch": 8.4, "grad_norm": 0.025947265326976776, "learning_rate": 2.1658148157411628e-07, "loss": 0.0007, "step": 26945 }, { "epoch": 8.41, "grad_norm": 0.04380210489034653, "learning_rate": 2.1545605782994693e-07, "loss": 0.0006, "step": 26950 }, { "epoch": 8.41, "grad_norm": 0.04115161672234535, "learning_rate": 2.143335338873198e-07, "loss": 0.0013, "step": 26955 }, { "epoch": 8.41, "grad_norm": 0.11357036978006363, "learning_rate": 2.1321391007891211e-07, "loss": 0.0012, "step": 26960 }, { "epoch": 8.41, "grad_norm": 0.09247150272130966, "learning_rate": 2.1209718673654177e-07, "loss": 0.0012, "step": 26965 }, { "epoch": 8.41, "grad_norm": 0.039926059544086456, "learning_rate": 2.1098336419116628e-07, "loss": 0.0009, "step": 26970 }, { "epoch": 8.41, "grad_norm": 0.029094960540533066, "learning_rate": 2.0987244277288377e-07, "loss": 0.0009, "step": 26975 }, { "epoch": 8.41, "grad_norm": 0.05168910324573517, "learning_rate": 2.0876442281093423e-07, "loss": 0.0007, "step": 26980 }, { "epoch": 8.42, "grad_norm": 0.04371324181556702, "learning_rate": 2.076593046336972e-07, "loss": 0.0012, "step": 26985 }, { "epoch": 8.42, "grad_norm": 0.03896841034293175, "learning_rate": 2.065570885686896e-07, "loss": 0.0012, "step": 26990 }, { "epoch": 8.42, "grad_norm": 0.07638699561357498, "learning_rate": 2.0545777494257235e-07, "loss": 0.001, "step": 26995 }, { "epoch": 8.42, "grad_norm": 0.0764116644859314, "learning_rate": 2.043613640811426e-07, "loss": 0.0009, "step": 27000 }, { "epoch": 8.42, "grad_norm": 0.07070528715848923, "learning_rate": 2.0326785630934043e-07, "loss": 0.0009, "step": 27005 }, { "epoch": 8.42, "grad_norm": 0.0403977669775486, "learning_rate": 2.0217725195124104e-07, "loss": 0.0014, "step": 27010 }, { "epoch": 8.43, "grad_norm": 0.12113960087299347, "learning_rate": 2.010895513300637e-07, "loss": 0.0013, "step": 27015 }, { "epoch": 8.43, "grad_norm": 0.06756232678890228, "learning_rate": 2.0000475476816495e-07, "loss": 0.0008, "step": 27020 }, { "epoch": 8.43, "grad_norm": 0.1345250904560089, "learning_rate": 1.989228625870432e-07, "loss": 0.0011, "step": 27025 }, { "epoch": 8.43, "grad_norm": 0.05166436731815338, "learning_rate": 1.9784387510733083e-07, "loss": 0.0009, "step": 27030 }, { "epoch": 8.43, "grad_norm": 0.07139864563941956, "learning_rate": 1.967677926488043e-07, "loss": 0.001, "step": 27035 }, { "epoch": 8.43, "grad_norm": 0.06834730505943298, "learning_rate": 1.956946155303785e-07, "loss": 0.001, "step": 27040 }, { "epoch": 8.44, "grad_norm": 0.15904085338115692, "learning_rate": 1.9462434407010345e-07, "loss": 0.0014, "step": 27045 }, { "epoch": 8.44, "grad_norm": 0.07280880957841873, "learning_rate": 1.9355697858517098e-07, "loss": 0.0013, "step": 27050 }, { "epoch": 8.44, "grad_norm": 0.04626739025115967, "learning_rate": 1.924925193919136e-07, "loss": 0.001, "step": 27055 }, { "epoch": 8.44, "grad_norm": 0.05268017202615738, "learning_rate": 1.9143096680579787e-07, "loss": 0.0008, "step": 27060 }, { "epoch": 8.44, "grad_norm": 0.07796227931976318, "learning_rate": 1.9037232114143323e-07, "loss": 0.0008, "step": 27065 }, { "epoch": 8.44, "grad_norm": 0.33021658658981323, "learning_rate": 1.8931658271256426e-07, "loss": 0.0014, "step": 27070 }, { "epoch": 8.44, "grad_norm": 0.077763132750988, "learning_rate": 1.8826375183207847e-07, "loss": 0.0011, "step": 27075 }, { "epoch": 8.45, "grad_norm": 0.17922165989875793, "learning_rate": 1.8721382881199512e-07, "loss": 0.0017, "step": 27080 }, { "epoch": 8.45, "grad_norm": 0.05037185922265053, "learning_rate": 1.8616681396347646e-07, "loss": 0.0006, "step": 27085 }, { "epoch": 8.45, "grad_norm": 0.06594012677669525, "learning_rate": 1.8512270759682204e-07, "loss": 0.0006, "step": 27090 }, { "epoch": 8.45, "grad_norm": 0.07595254480838776, "learning_rate": 1.8408151002146878e-07, "loss": 0.0008, "step": 27095 }, { "epoch": 8.45, "grad_norm": 0.06146182119846344, "learning_rate": 1.8304322154599097e-07, "loss": 0.0011, "step": 27100 }, { "epoch": 8.45, "grad_norm": 0.06487392634153366, "learning_rate": 1.8200784247810243e-07, "loss": 0.0012, "step": 27105 }, { "epoch": 8.46, "grad_norm": 0.11371538043022156, "learning_rate": 1.8097537312465442e-07, "loss": 0.0008, "step": 27110 }, { "epoch": 8.46, "grad_norm": 0.08306070417165756, "learning_rate": 1.7994581379163323e-07, "loss": 0.0007, "step": 27115 }, { "epoch": 8.46, "grad_norm": 0.09612597525119781, "learning_rate": 1.7891916478416593e-07, "loss": 0.0008, "step": 27120 }, { "epoch": 8.46, "grad_norm": 0.037405677139759064, "learning_rate": 1.7789542640651692e-07, "loss": 0.0011, "step": 27125 }, { "epoch": 8.46, "grad_norm": 0.12324880063533783, "learning_rate": 1.768745989620857e-07, "loss": 0.0006, "step": 27130 }, { "epoch": 8.46, "grad_norm": 0.048658937215805054, "learning_rate": 1.7606003308179187e-07, "loss": 0.001, "step": 27135 }, { "epoch": 8.46, "grad_norm": 0.03540576621890068, "learning_rate": 1.7504444607896797e-07, "loss": 0.0013, "step": 27140 }, { "epoch": 8.47, "grad_norm": 0.06150227412581444, "learning_rate": 1.74031770854296e-07, "loss": 0.001, "step": 27145 }, { "epoch": 8.47, "grad_norm": 0.1833973377943039, "learning_rate": 1.730220077078959e-07, "loss": 0.0018, "step": 27150 }, { "epoch": 8.47, "grad_norm": 0.23827354609966278, "learning_rate": 1.7201515693902605e-07, "loss": 0.0011, "step": 27155 }, { "epoch": 8.47, "grad_norm": 0.058667294681072235, "learning_rate": 1.710112188460844e-07, "loss": 0.0009, "step": 27160 }, { "epoch": 8.47, "grad_norm": 0.07912705093622208, "learning_rate": 1.7001019372660298e-07, "loss": 0.0013, "step": 27165 }, { "epoch": 8.47, "grad_norm": 0.09708718210458755, "learning_rate": 1.6901208187725115e-07, "loss": 0.0013, "step": 27170 }, { "epoch": 8.48, "grad_norm": 0.1299380362033844, "learning_rate": 1.680168835938334e-07, "loss": 0.0012, "step": 27175 }, { "epoch": 8.48, "grad_norm": 0.07040596753358841, "learning_rate": 1.6702459917129376e-07, "loss": 0.0011, "step": 27180 }, { "epoch": 8.48, "grad_norm": 0.09709855914115906, "learning_rate": 1.6603522890370927e-07, "loss": 0.0013, "step": 27185 }, { "epoch": 8.48, "grad_norm": 0.027560172602534294, "learning_rate": 1.6504877308429757e-07, "loss": 0.0009, "step": 27190 }, { "epoch": 8.48, "grad_norm": 0.10199445486068726, "learning_rate": 1.6406523200540926e-07, "loss": 0.0009, "step": 27195 }, { "epoch": 8.48, "grad_norm": 0.08776000142097473, "learning_rate": 1.6308460595853227e-07, "loss": 0.0007, "step": 27200 }, { "epoch": 8.49, "grad_norm": 0.046178627759218216, "learning_rate": 1.6210689523428968e-07, "loss": 0.0008, "step": 27205 }, { "epoch": 8.49, "grad_norm": 0.025471661239862442, "learning_rate": 1.6113210012244196e-07, "loss": 0.0011, "step": 27210 }, { "epoch": 8.49, "grad_norm": 0.2755342125892639, "learning_rate": 1.6016022091188354e-07, "loss": 0.0016, "step": 27215 }, { "epoch": 8.49, "grad_norm": 0.1328873336315155, "learning_rate": 1.5919125789064627e-07, "loss": 0.0013, "step": 27220 }, { "epoch": 8.49, "grad_norm": 0.06281613558530807, "learning_rate": 1.5822521134589819e-07, "loss": 0.0011, "step": 27225 }, { "epoch": 8.49, "grad_norm": 0.11914748698472977, "learning_rate": 1.5726208156394252e-07, "loss": 0.0016, "step": 27230 }, { "epoch": 8.49, "grad_norm": 0.04355286806821823, "learning_rate": 1.5630186883021647e-07, "loss": 0.0011, "step": 27235 }, { "epoch": 8.5, "grad_norm": 0.13519792258739471, "learning_rate": 1.5534457342929465e-07, "loss": 0.0011, "step": 27240 }, { "epoch": 8.5, "grad_norm": 0.08627419173717499, "learning_rate": 1.5439019564488677e-07, "loss": 0.0016, "step": 27245 }, { "epoch": 8.5, "grad_norm": 0.14163166284561157, "learning_rate": 1.534387357598366e-07, "loss": 0.0012, "step": 27250 }, { "epoch": 8.5, "grad_norm": 0.0620029978454113, "learning_rate": 1.524901940561252e-07, "loss": 0.0011, "step": 27255 }, { "epoch": 8.5, "grad_norm": 0.11000042408704758, "learning_rate": 1.5154457081486663e-07, "loss": 0.0011, "step": 27260 }, { "epoch": 8.5, "grad_norm": 0.13251151144504547, "learning_rate": 1.5060186631631112e-07, "loss": 0.0014, "step": 27265 }, { "epoch": 8.51, "grad_norm": 0.09971214830875397, "learning_rate": 1.4966208083984302e-07, "loss": 0.0012, "step": 27270 }, { "epoch": 8.51, "grad_norm": 0.08008839935064316, "learning_rate": 1.4872521466398392e-07, "loss": 0.0011, "step": 27275 }, { "epoch": 8.51, "grad_norm": 0.07394137978553772, "learning_rate": 1.4779126806638954e-07, "loss": 0.0009, "step": 27280 }, { "epoch": 8.51, "grad_norm": 0.03994069993495941, "learning_rate": 1.4686024132384736e-07, "loss": 0.0012, "step": 27285 }, { "epoch": 8.51, "grad_norm": 0.13227759301662445, "learning_rate": 1.4593213471228218e-07, "loss": 0.0014, "step": 27290 }, { "epoch": 8.51, "grad_norm": 0.08159665763378143, "learning_rate": 1.450069485067518e-07, "loss": 0.0009, "step": 27295 }, { "epoch": 8.51, "grad_norm": 0.12798109650611877, "learning_rate": 1.4408468298145238e-07, "loss": 0.0014, "step": 27300 }, { "epoch": 8.52, "grad_norm": 0.09440769255161285, "learning_rate": 1.431653384097087e-07, "loss": 0.0014, "step": 27305 }, { "epoch": 8.52, "grad_norm": 0.12320124357938766, "learning_rate": 1.4224891506398386e-07, "loss": 0.0013, "step": 27310 }, { "epoch": 8.52, "grad_norm": 0.19550946354866028, "learning_rate": 1.4133541321587509e-07, "loss": 0.0022, "step": 27315 }, { "epoch": 8.52, "grad_norm": 0.05926552414894104, "learning_rate": 1.4042483313611133e-07, "loss": 0.0011, "step": 27320 }, { "epoch": 8.52, "grad_norm": 0.08274723589420319, "learning_rate": 1.3951717509455788e-07, "loss": 0.0007, "step": 27325 }, { "epoch": 8.52, "grad_norm": 0.022885218262672424, "learning_rate": 1.3861243936021286e-07, "loss": 0.001, "step": 27330 }, { "epoch": 8.53, "grad_norm": 0.01137836929410696, "learning_rate": 1.3771062620120955e-07, "loss": 0.0014, "step": 27335 }, { "epoch": 8.53, "grad_norm": 0.12840622663497925, "learning_rate": 1.3681173588481423e-07, "loss": 0.0014, "step": 27340 }, { "epoch": 8.53, "grad_norm": 0.047768354415893555, "learning_rate": 1.3591576867742594e-07, "loss": 0.0013, "step": 27345 }, { "epoch": 8.53, "grad_norm": 0.024363325908780098, "learning_rate": 1.3502272484458013e-07, "loss": 0.001, "step": 27350 }, { "epoch": 8.53, "grad_norm": 0.029072320088744164, "learning_rate": 1.341326046509428e-07, "loss": 0.0013, "step": 27355 }, { "epoch": 8.53, "grad_norm": 0.0950700044631958, "learning_rate": 1.332454083603152e-07, "loss": 0.0013, "step": 27360 }, { "epoch": 8.54, "grad_norm": 0.06451021879911423, "learning_rate": 1.323611362356314e-07, "loss": 0.0012, "step": 27365 }, { "epoch": 8.54, "grad_norm": 0.1161188930273056, "learning_rate": 1.314797885389607e-07, "loss": 0.002, "step": 27370 }, { "epoch": 8.54, "grad_norm": 0.0990685299038887, "learning_rate": 1.3060136553150194e-07, "loss": 0.0009, "step": 27375 }, { "epoch": 8.54, "grad_norm": 0.07793032377958298, "learning_rate": 1.297258674735913e-07, "loss": 0.0009, "step": 27380 }, { "epoch": 8.54, "grad_norm": 0.060773540288209915, "learning_rate": 1.2885329462469564e-07, "loss": 0.0017, "step": 27385 }, { "epoch": 8.54, "grad_norm": 0.07595169544219971, "learning_rate": 1.2798364724341483e-07, "loss": 0.0012, "step": 27390 }, { "epoch": 8.54, "grad_norm": 0.08787562698125839, "learning_rate": 1.2711692558748268e-07, "loss": 0.0013, "step": 27395 }, { "epoch": 8.55, "grad_norm": 0.05152950808405876, "learning_rate": 1.2625312991376702e-07, "loss": 0.0016, "step": 27400 }, { "epoch": 8.55, "grad_norm": 0.04772697389125824, "learning_rate": 1.2539226047826424e-07, "loss": 0.0008, "step": 27405 }, { "epoch": 8.55, "grad_norm": 0.16844956576824188, "learning_rate": 1.2453431753610913e-07, "loss": 0.0009, "step": 27410 }, { "epoch": 8.55, "grad_norm": 0.06177696958184242, "learning_rate": 1.236793013415649e-07, "loss": 0.0008, "step": 27415 }, { "epoch": 8.55, "grad_norm": 0.021597489714622498, "learning_rate": 1.2282721214803007e-07, "loss": 0.001, "step": 27420 }, { "epoch": 8.55, "grad_norm": 0.11656033992767334, "learning_rate": 1.2197805020803365e-07, "loss": 0.0011, "step": 27425 }, { "epoch": 8.56, "grad_norm": 0.06125600263476372, "learning_rate": 1.2113181577323662e-07, "loss": 0.0015, "step": 27430 }, { "epoch": 8.56, "grad_norm": 0.11162044107913971, "learning_rate": 1.2028850909443612e-07, "loss": 0.0016, "step": 27435 }, { "epoch": 8.56, "grad_norm": 0.10665519535541534, "learning_rate": 1.194481304215589e-07, "loss": 0.0012, "step": 27440 }, { "epoch": 8.56, "grad_norm": 0.09857645630836487, "learning_rate": 1.1861068000366238e-07, "loss": 0.0008, "step": 27445 }, { "epoch": 8.56, "grad_norm": 0.11146674305200577, "learning_rate": 1.1777615808894027e-07, "loss": 0.0011, "step": 27450 }, { "epoch": 8.56, "grad_norm": 0.10069559514522552, "learning_rate": 1.1694456492471362e-07, "loss": 0.0009, "step": 27455 }, { "epoch": 8.56, "grad_norm": 0.07771911472082138, "learning_rate": 1.1611590075744083e-07, "loss": 0.0012, "step": 27460 }, { "epoch": 8.57, "grad_norm": 0.13914179801940918, "learning_rate": 1.1529016583270657e-07, "loss": 0.0012, "step": 27465 }, { "epoch": 8.57, "grad_norm": 0.11388552188873291, "learning_rate": 1.1446736039523176e-07, "loss": 0.0014, "step": 27470 }, { "epoch": 8.57, "grad_norm": 0.038466423749923706, "learning_rate": 1.1364748468886688e-07, "loss": 0.0008, "step": 27475 }, { "epoch": 8.57, "grad_norm": 0.06610891222953796, "learning_rate": 1.1283053895659645e-07, "loss": 0.0006, "step": 27480 }, { "epoch": 8.57, "grad_norm": 0.02082733064889908, "learning_rate": 1.1201652344053349e-07, "loss": 0.0007, "step": 27485 }, { "epoch": 8.57, "grad_norm": 0.12689946591854095, "learning_rate": 1.1120543838192388e-07, "loss": 0.0016, "step": 27490 }, { "epoch": 8.58, "grad_norm": 0.07294479012489319, "learning_rate": 1.1039728402114757e-07, "loss": 0.0013, "step": 27495 }, { "epoch": 8.58, "grad_norm": 0.06044188141822815, "learning_rate": 1.0959206059771077e-07, "loss": 0.0007, "step": 27500 }, { "epoch": 8.58, "grad_norm": 0.0841093435883522, "learning_rate": 1.0878976835025478e-07, "loss": 0.0008, "step": 27505 }, { "epoch": 8.58, "grad_norm": 0.09894666820764542, "learning_rate": 1.0799040751655166e-07, "loss": 0.0014, "step": 27510 }, { "epoch": 8.58, "grad_norm": 0.1542588472366333, "learning_rate": 1.071939783335041e-07, "loss": 0.0012, "step": 27515 }, { "epoch": 8.58, "grad_norm": 0.044521354138851166, "learning_rate": 1.0640048103714773e-07, "loss": 0.001, "step": 27520 }, { "epoch": 8.58, "grad_norm": 0.050902362912893295, "learning_rate": 1.0560991586264669e-07, "loss": 0.0013, "step": 27525 }, { "epoch": 8.59, "grad_norm": 0.060398731380701065, "learning_rate": 1.0482228304429686e-07, "loss": 0.0012, "step": 27530 }, { "epoch": 8.59, "grad_norm": 0.08845556527376175, "learning_rate": 1.0403758281552601e-07, "loss": 0.0011, "step": 27535 }, { "epoch": 8.59, "grad_norm": 0.06878877431154251, "learning_rate": 1.0325581540889251e-07, "loss": 0.0009, "step": 27540 }, { "epoch": 8.59, "grad_norm": 0.05974522605538368, "learning_rate": 1.0247698105608439e-07, "loss": 0.0008, "step": 27545 }, { "epoch": 8.59, "grad_norm": 0.07035703212022781, "learning_rate": 1.0170107998792145e-07, "loss": 0.0013, "step": 27550 }, { "epoch": 8.59, "grad_norm": 0.04951699450612068, "learning_rate": 1.0092811243435529e-07, "loss": 0.001, "step": 27555 }, { "epoch": 8.6, "grad_norm": 0.07079406082630157, "learning_rate": 1.0015807862446381e-07, "loss": 0.0008, "step": 27560 }, { "epoch": 8.6, "grad_norm": 0.046903353184461594, "learning_rate": 9.93909787864622e-08, "loss": 0.0009, "step": 27565 }, { "epoch": 8.6, "grad_norm": 0.036676373332738876, "learning_rate": 9.862681314769196e-08, "loss": 0.0014, "step": 27570 }, { "epoch": 8.6, "grad_norm": 0.0619211420416832, "learning_rate": 9.786558193462304e-08, "loss": 0.0011, "step": 27575 }, { "epoch": 8.6, "grad_norm": 0.061969440430402756, "learning_rate": 9.710728537285829e-08, "loss": 0.0013, "step": 27580 }, { "epoch": 8.6, "grad_norm": 0.08273687213659286, "learning_rate": 9.635192368713354e-08, "loss": 0.001, "step": 27585 }, { "epoch": 8.61, "grad_norm": 0.13152478635311127, "learning_rate": 9.559949710130855e-08, "loss": 0.001, "step": 27590 }, { "epoch": 8.61, "grad_norm": 0.08596333116292953, "learning_rate": 9.485000583837944e-08, "loss": 0.001, "step": 27595 }, { "epoch": 8.61, "grad_norm": 0.07673116028308868, "learning_rate": 9.41034501204674e-08, "loss": 0.0015, "step": 27600 }, { "epoch": 8.61, "grad_norm": 0.14104962348937988, "learning_rate": 9.335983016882766e-08, "loss": 0.0014, "step": 27605 }, { "epoch": 8.61, "grad_norm": 0.10371097177267075, "learning_rate": 9.261914620384171e-08, "loss": 0.0013, "step": 27610 }, { "epoch": 8.61, "grad_norm": 0.015523822978138924, "learning_rate": 9.188139844502508e-08, "loss": 0.0009, "step": 27615 }, { "epoch": 8.61, "grad_norm": 0.08790897578001022, "learning_rate": 9.114658711101842e-08, "loss": 0.0015, "step": 27620 }, { "epoch": 8.62, "grad_norm": 0.05163196474313736, "learning_rate": 9.041471241959531e-08, "loss": 0.0008, "step": 27625 }, { "epoch": 8.62, "grad_norm": 0.08871548622846603, "learning_rate": 8.968577458765893e-08, "loss": 0.001, "step": 27630 }, { "epoch": 8.62, "grad_norm": 0.049854159355163574, "learning_rate": 8.895977383123977e-08, "loss": 0.0012, "step": 27635 }, { "epoch": 8.62, "grad_norm": 0.06681330502033234, "learning_rate": 8.82367103655013e-08, "loss": 0.0007, "step": 27640 }, { "epoch": 8.62, "grad_norm": 0.1309957057237625, "learning_rate": 8.75165844047332e-08, "loss": 0.001, "step": 27645 }, { "epoch": 8.62, "grad_norm": 0.11755676567554474, "learning_rate": 8.6799396162357e-08, "loss": 0.0008, "step": 27650 }, { "epoch": 8.63, "grad_norm": 0.08385858684778214, "learning_rate": 8.608514585092153e-08, "loss": 0.0013, "step": 27655 }, { "epoch": 8.63, "grad_norm": 0.05883089825510979, "learning_rate": 8.537383368210639e-08, "loss": 0.0011, "step": 27660 }, { "epoch": 8.63, "grad_norm": 0.16626568138599396, "learning_rate": 8.46654598667207e-08, "loss": 0.0014, "step": 27665 }, { "epoch": 8.63, "grad_norm": 0.1791432499885559, "learning_rate": 8.396002461470098e-08, "loss": 0.0014, "step": 27670 }, { "epoch": 8.63, "grad_norm": 0.037083640694618225, "learning_rate": 8.325752813511445e-08, "loss": 0.0017, "step": 27675 }, { "epoch": 8.63, "grad_norm": 0.05854767933487892, "learning_rate": 8.255797063615567e-08, "loss": 0.0012, "step": 27680 }, { "epoch": 8.63, "grad_norm": 0.020023584365844727, "learning_rate": 8.186135232515213e-08, "loss": 0.0007, "step": 27685 }, { "epoch": 8.64, "grad_norm": 0.06901459395885468, "learning_rate": 8.116767340855425e-08, "loss": 0.001, "step": 27690 }, { "epoch": 8.64, "grad_norm": 0.10090130567550659, "learning_rate": 8.047693409194757e-08, "loss": 0.0009, "step": 27695 }, { "epoch": 8.64, "grad_norm": 0.016971636563539505, "learning_rate": 7.978913458004167e-08, "loss": 0.0009, "step": 27700 }, { "epoch": 8.64, "grad_norm": 0.0443294420838356, "learning_rate": 7.910427507667573e-08, "loss": 0.0007, "step": 27705 }, { "epoch": 8.64, "grad_norm": 0.08994010090827942, "learning_rate": 7.842235578482182e-08, "loss": 0.0012, "step": 27710 }, { "epoch": 8.64, "grad_norm": 0.11785995960235596, "learning_rate": 7.774337690657274e-08, "loss": 0.0012, "step": 27715 }, { "epoch": 8.65, "grad_norm": 0.08951311558485031, "learning_rate": 7.706733864315752e-08, "loss": 0.0011, "step": 27720 }, { "epoch": 8.65, "grad_norm": 0.08338700979948044, "learning_rate": 7.639424119493033e-08, "loss": 0.0013, "step": 27725 }, { "epoch": 8.65, "grad_norm": 0.07392533868551254, "learning_rate": 7.572408476137383e-08, "loss": 0.0011, "step": 27730 }, { "epoch": 8.65, "grad_norm": 0.09654698520898819, "learning_rate": 7.505686954109915e-08, "loss": 0.0009, "step": 27735 }, { "epoch": 8.65, "grad_norm": 0.02619049698114395, "learning_rate": 7.439259573184476e-08, "loss": 0.0007, "step": 27740 }, { "epoch": 8.65, "grad_norm": 0.06405049562454224, "learning_rate": 7.373126353048099e-08, "loss": 0.0013, "step": 27745 }, { "epoch": 8.66, "grad_norm": 0.08028136193752289, "learning_rate": 7.307287313300216e-08, "loss": 0.0015, "step": 27750 }, { "epoch": 8.66, "grad_norm": 0.048656243830919266, "learning_rate": 7.241742473453217e-08, "loss": 0.0012, "step": 27755 }, { "epoch": 8.66, "grad_norm": 0.04180813208222389, "learning_rate": 7.176491852932455e-08, "loss": 0.0008, "step": 27760 }, { "epoch": 8.66, "grad_norm": 0.05267529562115669, "learning_rate": 7.111535471075792e-08, "loss": 0.0015, "step": 27765 }, { "epoch": 8.66, "grad_norm": 0.08153411746025085, "learning_rate": 7.046873347134275e-08, "loss": 0.0009, "step": 27770 }, { "epoch": 8.66, "grad_norm": 0.05059076100587845, "learning_rate": 6.982505500271464e-08, "loss": 0.0012, "step": 27775 }, { "epoch": 8.66, "grad_norm": 0.0719943568110466, "learning_rate": 6.918431949563654e-08, "loss": 0.0008, "step": 27780 }, { "epoch": 8.67, "grad_norm": 0.045789770781993866, "learning_rate": 6.85465271400021e-08, "loss": 0.0013, "step": 27785 }, { "epoch": 8.67, "grad_norm": 0.05941304191946983, "learning_rate": 6.791167812483013e-08, "loss": 0.0007, "step": 27790 }, { "epoch": 8.67, "grad_norm": 0.10284554958343506, "learning_rate": 6.72797726382668e-08, "loss": 0.0006, "step": 27795 }, { "epoch": 8.67, "grad_norm": 0.07532849162817001, "learning_rate": 6.665081086758896e-08, "loss": 0.0009, "step": 27800 }, { "epoch": 8.67, "grad_norm": 0.04503500089049339, "learning_rate": 6.602479299919861e-08, "loss": 0.0008, "step": 27805 }, { "epoch": 8.67, "grad_norm": 0.06832282245159149, "learning_rate": 6.540171921862515e-08, "loss": 0.0008, "step": 27810 }, { "epoch": 8.68, "grad_norm": 0.050845157355070114, "learning_rate": 6.47815897105264e-08, "loss": 0.0008, "step": 27815 }, { "epoch": 8.68, "grad_norm": 0.08400414139032364, "learning_rate": 6.416440465868979e-08, "loss": 0.0012, "step": 27820 }, { "epoch": 8.68, "grad_norm": 0.0506439134478569, "learning_rate": 6.355016424602346e-08, "loss": 0.0009, "step": 27825 }, { "epoch": 8.68, "grad_norm": 0.13978984951972961, "learning_rate": 6.293886865457067e-08, "loss": 0.0012, "step": 27830 }, { "epoch": 8.68, "grad_norm": 0.05353309586644173, "learning_rate": 6.233051806549651e-08, "loss": 0.001, "step": 27835 }, { "epoch": 8.68, "grad_norm": 0.057319726794958115, "learning_rate": 6.172511265909565e-08, "loss": 0.0013, "step": 27840 }, { "epoch": 8.68, "grad_norm": 0.09258401393890381, "learning_rate": 6.1122652614789e-08, "loss": 0.0011, "step": 27845 }, { "epoch": 8.69, "grad_norm": 0.07895422726869583, "learning_rate": 6.052313811112486e-08, "loss": 0.0013, "step": 27850 }, { "epoch": 8.69, "grad_norm": 0.2097400575876236, "learning_rate": 5.992656932578e-08, "loss": 0.0016, "step": 27855 }, { "epoch": 8.69, "grad_norm": 0.06631385535001755, "learning_rate": 5.9332946435556314e-08, "loss": 0.0013, "step": 27860 }, { "epoch": 8.69, "grad_norm": 0.11734557151794434, "learning_rate": 5.8742269616381965e-08, "loss": 0.0011, "step": 27865 }, { "epoch": 8.69, "grad_norm": 0.037774424999952316, "learning_rate": 5.815453904331359e-08, "loss": 0.0008, "step": 27870 }, { "epoch": 8.69, "grad_norm": 0.07216952741146088, "learning_rate": 5.7569754890535176e-08, "loss": 0.001, "step": 27875 }, { "epoch": 8.7, "grad_norm": 0.10858375579118729, "learning_rate": 5.698791733135589e-08, "loss": 0.0012, "step": 27880 }, { "epoch": 8.7, "grad_norm": 0.011037354357540607, "learning_rate": 5.640902653821334e-08, "loss": 0.0013, "step": 27885 }, { "epoch": 8.7, "grad_norm": 0.11795692145824432, "learning_rate": 5.583308268266918e-08, "loss": 0.0011, "step": 27890 }, { "epoch": 8.7, "grad_norm": 0.06190943345427513, "learning_rate": 5.526008593541465e-08, "loss": 0.0011, "step": 27895 }, { "epoch": 8.7, "grad_norm": 0.023181727156043053, "learning_rate": 5.469003646626503e-08, "loss": 0.0012, "step": 27900 }, { "epoch": 8.7, "grad_norm": 0.08947614580392838, "learning_rate": 5.412293444416405e-08, "loss": 0.0008, "step": 27905 }, { "epoch": 8.71, "grad_norm": 0.17211729288101196, "learning_rate": 5.3558780037181735e-08, "loss": 0.0017, "step": 27910 }, { "epoch": 8.71, "grad_norm": 0.0541967898607254, "learning_rate": 5.299757341251321e-08, "loss": 0.001, "step": 27915 }, { "epoch": 8.71, "grad_norm": 0.05166836455464363, "learning_rate": 5.2439314736481004e-08, "loss": 0.001, "step": 27920 }, { "epoch": 8.71, "grad_norm": 0.06777120381593704, "learning_rate": 5.1884004174533876e-08, "loss": 0.0014, "step": 27925 }, { "epoch": 8.71, "grad_norm": 0.0960666760802269, "learning_rate": 5.1331641891246844e-08, "loss": 0.0011, "step": 27930 }, { "epoch": 8.71, "grad_norm": 0.048764344304800034, "learning_rate": 5.078222805032007e-08, "loss": 0.0013, "step": 27935 }, { "epoch": 8.71, "grad_norm": 0.07357207685709, "learning_rate": 5.02357628145822e-08, "loss": 0.0008, "step": 27940 }, { "epoch": 8.72, "grad_norm": 0.05783311277627945, "learning_rate": 4.9692246345985905e-08, "loss": 0.0007, "step": 27945 }, { "epoch": 8.72, "grad_norm": 0.07628779858350754, "learning_rate": 4.915167880561122e-08, "loss": 0.0017, "step": 27950 }, { "epoch": 8.72, "grad_norm": 0.042343996465206146, "learning_rate": 4.861406035366334e-08, "loss": 0.0007, "step": 27955 }, { "epoch": 8.72, "grad_norm": 0.10046465694904327, "learning_rate": 4.8079391149473686e-08, "loss": 0.001, "step": 27960 }, { "epoch": 8.72, "grad_norm": 0.06461944431066513, "learning_rate": 4.754767135149996e-08, "loss": 0.0011, "step": 27965 }, { "epoch": 8.72, "grad_norm": 0.05853043869137764, "learning_rate": 4.70189011173261e-08, "loss": 0.001, "step": 27970 }, { "epoch": 8.73, "grad_norm": 0.08283140510320663, "learning_rate": 4.649308060366009e-08, "loss": 0.0012, "step": 27975 }, { "epoch": 8.73, "grad_norm": 0.12241838127374649, "learning_rate": 4.597020996633839e-08, "loss": 0.0012, "step": 27980 }, { "epoch": 8.73, "grad_norm": 0.057213619351387024, "learning_rate": 4.545028936032147e-08, "loss": 0.0009, "step": 27985 }, { "epoch": 8.73, "grad_norm": 0.06766070425510406, "learning_rate": 4.4933318939694995e-08, "loss": 0.0006, "step": 27990 }, { "epoch": 8.73, "grad_norm": 0.05963273346424103, "learning_rate": 4.441929885767304e-08, "loss": 0.0013, "step": 27995 }, { "epoch": 8.73, "grad_norm": 0.11648065596818924, "learning_rate": 4.390822926659155e-08, "loss": 0.0017, "step": 28000 }, { "epoch": 8.73, "grad_norm": 0.0579272024333477, "learning_rate": 4.340011031791491e-08, "loss": 0.0007, "step": 28005 }, { "epoch": 8.74, "grad_norm": 0.08932799845933914, "learning_rate": 4.2894942162231555e-08, "loss": 0.0011, "step": 28010 }, { "epoch": 8.74, "grad_norm": 0.08602336794137955, "learning_rate": 4.239272494925617e-08, "loss": 0.0011, "step": 28015 }, { "epoch": 8.74, "grad_norm": 0.11151181161403656, "learning_rate": 4.189345882782858e-08, "loss": 0.0007, "step": 28020 }, { "epoch": 8.74, "grad_norm": 0.06075332313776016, "learning_rate": 4.1397143945913766e-08, "loss": 0.0011, "step": 28025 }, { "epoch": 8.74, "grad_norm": 0.0862547978758812, "learning_rate": 4.090378045060406e-08, "loss": 0.0012, "step": 28030 }, { "epoch": 8.74, "grad_norm": 0.07240119576454163, "learning_rate": 4.04133684881125e-08, "loss": 0.0014, "step": 28035 }, { "epoch": 8.75, "grad_norm": 0.06106055900454521, "learning_rate": 3.992590820378284e-08, "loss": 0.0011, "step": 28040 }, { "epoch": 8.75, "grad_norm": 0.12147120386362076, "learning_rate": 3.944139974208061e-08, "loss": 0.0007, "step": 28045 }, { "epoch": 8.75, "grad_norm": 0.17372600734233856, "learning_rate": 3.8959843246596516e-08, "loss": 0.0013, "step": 28050 }, { "epoch": 8.75, "grad_norm": 0.050082284957170486, "learning_rate": 3.848123886004751e-08, "loss": 0.0009, "step": 28055 }, { "epoch": 8.75, "grad_norm": 0.06022435426712036, "learning_rate": 3.800558672427679e-08, "loss": 0.001, "step": 28060 }, { "epoch": 8.75, "grad_norm": 0.054569125175476074, "learning_rate": 3.7532886980250484e-08, "loss": 0.0007, "step": 28065 }, { "epoch": 8.75, "grad_norm": 0.09262100607156754, "learning_rate": 3.7063139768060976e-08, "loss": 0.0014, "step": 28070 }, { "epoch": 8.76, "grad_norm": 0.09075400233268738, "learning_rate": 3.659634522692357e-08, "loss": 0.0008, "step": 28075 }, { "epoch": 8.76, "grad_norm": 0.09641103446483612, "learning_rate": 3.6132503495182046e-08, "loss": 0.0018, "step": 28080 }, { "epoch": 8.76, "grad_norm": 0.038265615701675415, "learning_rate": 3.5671614710302e-08, "loss": 0.0013, "step": 28085 }, { "epoch": 8.76, "grad_norm": 0.07987763732671738, "learning_rate": 3.521367900887418e-08, "loss": 0.0014, "step": 28090 }, { "epoch": 8.76, "grad_norm": 0.13263726234436035, "learning_rate": 3.4758696526617783e-08, "loss": 0.0012, "step": 28095 }, { "epoch": 8.76, "grad_norm": 0.10438277572393417, "learning_rate": 3.430666739837052e-08, "loss": 0.001, "step": 28100 }, { "epoch": 8.77, "grad_norm": 0.05294902250170708, "learning_rate": 3.385759175809966e-08, "loss": 0.0012, "step": 28105 }, { "epoch": 8.77, "grad_norm": 0.15452563762664795, "learning_rate": 3.341146973889764e-08, "loss": 0.0013, "step": 28110 }, { "epoch": 8.77, "grad_norm": 0.09602925926446915, "learning_rate": 3.296830147297647e-08, "loss": 0.0013, "step": 28115 }, { "epoch": 8.77, "grad_norm": 0.06384595483541489, "learning_rate": 3.252808709167776e-08, "loss": 0.0009, "step": 28120 }, { "epoch": 8.77, "grad_norm": 0.035283368080854416, "learning_rate": 3.209082672546604e-08, "loss": 0.0017, "step": 28125 }, { "epoch": 8.77, "grad_norm": 0.1074545606970787, "learning_rate": 3.1656520503928755e-08, "loss": 0.0009, "step": 28130 }, { "epoch": 8.78, "grad_norm": 0.04797278344631195, "learning_rate": 3.1225168555780725e-08, "loss": 0.0011, "step": 28135 }, { "epoch": 8.78, "grad_norm": 0.07991745322942734, "learning_rate": 3.0796771008858586e-08, "loss": 0.0014, "step": 28140 }, { "epoch": 8.78, "grad_norm": 0.05039117485284805, "learning_rate": 3.037132799012632e-08, "loss": 0.0009, "step": 28145 }, { "epoch": 8.78, "grad_norm": 0.10603125393390656, "learning_rate": 2.9948839625669745e-08, "loss": 0.0006, "step": 28150 }, { "epoch": 8.78, "grad_norm": 0.08775552362203598, "learning_rate": 2.9529306040698703e-08, "loss": 0.0011, "step": 28155 }, { "epoch": 8.78, "grad_norm": 0.0653933510184288, "learning_rate": 2.9112727359550398e-08, "loss": 0.001, "step": 28160 }, { "epoch": 8.78, "grad_norm": 0.1549730896949768, "learning_rate": 2.869910370568274e-08, "loss": 0.0009, "step": 28165 }, { "epoch": 8.79, "grad_norm": 0.04075611010193825, "learning_rate": 2.82884352016799e-08, "loss": 0.001, "step": 28170 }, { "epoch": 8.79, "grad_norm": 0.14746034145355225, "learning_rate": 2.788072196925118e-08, "loss": 0.0011, "step": 28175 }, { "epoch": 8.79, "grad_norm": 0.03397578001022339, "learning_rate": 2.7475964129227706e-08, "loss": 0.0007, "step": 28180 }, { "epoch": 8.79, "grad_norm": 0.03829203546047211, "learning_rate": 2.7074161801564635e-08, "loss": 0.0009, "step": 28185 }, { "epoch": 8.79, "grad_norm": 0.11898770928382874, "learning_rate": 2.6675315105344492e-08, "loss": 0.001, "step": 28190 }, { "epoch": 8.79, "grad_norm": 0.09407939016819, "learning_rate": 2.6279424158770495e-08, "loss": 0.0012, "step": 28195 }, { "epoch": 8.8, "grad_norm": 0.09977072477340698, "learning_rate": 2.5886489079171016e-08, "loss": 0.0009, "step": 28200 }, { "epoch": 8.8, "grad_norm": 0.05728138983249664, "learning_rate": 2.5496509982998463e-08, "loss": 0.0012, "step": 28205 }, { "epoch": 8.8, "grad_norm": 0.17846325039863586, "learning_rate": 2.510948698583038e-08, "loss": 0.0011, "step": 28210 }, { "epoch": 8.8, "grad_norm": 0.20595858991146088, "learning_rate": 2.4725420202365015e-08, "loss": 0.0011, "step": 28215 }, { "epoch": 8.8, "grad_norm": 0.04241137206554413, "learning_rate": 2.4344309746427984e-08, "loss": 0.0012, "step": 28220 }, { "epoch": 8.8, "grad_norm": 0.06812385469675064, "learning_rate": 2.396615573096783e-08, "loss": 0.0018, "step": 28225 }, { "epoch": 8.8, "grad_norm": 0.05665101483464241, "learning_rate": 2.35909582680538e-08, "loss": 0.0008, "step": 28230 }, { "epoch": 8.81, "grad_norm": 0.0710788443684578, "learning_rate": 2.321871746888471e-08, "loss": 0.001, "step": 28235 }, { "epoch": 8.81, "grad_norm": 0.03182736039161682, "learning_rate": 2.2849433443776768e-08, "loss": 0.001, "step": 28240 }, { "epoch": 8.81, "grad_norm": 0.04013293981552124, "learning_rate": 2.2483106302175762e-08, "loss": 0.0015, "step": 28245 }, { "epoch": 8.81, "grad_norm": 0.22291040420532227, "learning_rate": 2.2119736152647064e-08, "loss": 0.0009, "step": 28250 }, { "epoch": 8.81, "grad_norm": 0.07432419061660767, "learning_rate": 2.17593231028812e-08, "loss": 0.0014, "step": 28255 }, { "epoch": 8.81, "grad_norm": 0.11221190541982651, "learning_rate": 2.1401867259691622e-08, "loss": 0.0015, "step": 28260 }, { "epoch": 8.82, "grad_norm": 0.10827355086803436, "learning_rate": 2.1047368729018025e-08, "loss": 0.0011, "step": 28265 }, { "epoch": 8.82, "grad_norm": 0.09072833508253098, "learning_rate": 2.0695827615918595e-08, "loss": 0.0006, "step": 28270 }, { "epoch": 8.82, "grad_norm": 0.07544603198766708, "learning_rate": 2.034724402457999e-08, "loss": 0.0007, "step": 28275 }, { "epoch": 8.82, "grad_norm": 0.025641784071922302, "learning_rate": 2.000161805830958e-08, "loss": 0.0009, "step": 28280 }, { "epoch": 8.82, "grad_norm": 0.21241925656795502, "learning_rate": 1.9658949819539864e-08, "loss": 0.0014, "step": 28285 }, { "epoch": 8.82, "grad_norm": 0.10191631317138672, "learning_rate": 1.9319239409825165e-08, "loss": 0.0017, "step": 28290 }, { "epoch": 8.83, "grad_norm": 0.0852564126253128, "learning_rate": 1.8982486929843835e-08, "loss": 0.0015, "step": 28295 }, { "epoch": 8.83, "grad_norm": 0.01896956004202366, "learning_rate": 1.8648692479398266e-08, "loss": 0.0011, "step": 28300 }, { "epoch": 8.83, "grad_norm": 0.046149298548698425, "learning_rate": 1.8317856157412662e-08, "loss": 0.0007, "step": 28305 }, { "epoch": 8.83, "grad_norm": 0.07430528849363327, "learning_rate": 1.7989978061936363e-08, "loss": 0.0011, "step": 28310 }, { "epoch": 8.83, "grad_norm": 0.13385100662708282, "learning_rate": 1.7665058290140537e-08, "loss": 0.0011, "step": 28315 }, { "epoch": 8.83, "grad_norm": 0.0955640971660614, "learning_rate": 1.734309693832037e-08, "loss": 0.0013, "step": 28320 }, { "epoch": 8.83, "grad_norm": 0.13716626167297363, "learning_rate": 1.7024094101895093e-08, "loss": 0.0008, "step": 28325 }, { "epoch": 8.84, "grad_norm": 0.06619204580783844, "learning_rate": 1.670804987540575e-08, "loss": 0.0012, "step": 28330 }, { "epoch": 8.84, "grad_norm": 0.07766672223806381, "learning_rate": 1.6394964352515198e-08, "loss": 0.0007, "step": 28335 }, { "epoch": 8.84, "grad_norm": 0.08800486475229263, "learning_rate": 1.6084837626013648e-08, "loss": 0.0012, "step": 28340 }, { "epoch": 8.84, "grad_norm": 0.023620784282684326, "learning_rate": 1.5777669787810923e-08, "loss": 0.001, "step": 28345 }, { "epoch": 8.84, "grad_norm": 0.16947531700134277, "learning_rate": 1.5473460928939753e-08, "loss": 0.0011, "step": 28350 }, { "epoch": 8.84, "grad_norm": 0.08615032583475113, "learning_rate": 1.5172211139559133e-08, "loss": 0.001, "step": 28355 }, { "epoch": 8.85, "grad_norm": 0.05784229934215546, "learning_rate": 1.4873920508948758e-08, "loss": 0.0007, "step": 28360 }, { "epoch": 8.85, "grad_norm": 0.11840718239545822, "learning_rate": 1.4578589125511245e-08, "loss": 0.0019, "step": 28365 }, { "epoch": 8.85, "grad_norm": 0.052449408918619156, "learning_rate": 1.4286217076772136e-08, "loss": 0.0014, "step": 28370 }, { "epoch": 8.85, "grad_norm": 0.029246846213936806, "learning_rate": 1.3996804449381007e-08, "loss": 0.0005, "step": 28375 }, { "epoch": 8.85, "grad_norm": 0.041718367487192154, "learning_rate": 1.3710351329109251e-08, "loss": 0.0011, "step": 28380 }, { "epoch": 8.85, "grad_norm": 0.07578592747449875, "learning_rate": 1.3426857800853398e-08, "loss": 0.001, "step": 28385 }, { "epoch": 8.85, "grad_norm": 0.11366725713014603, "learning_rate": 1.314632394862847e-08, "loss": 0.0013, "step": 28390 }, { "epoch": 8.86, "grad_norm": 0.06276487559080124, "learning_rate": 1.2868749855577955e-08, "loss": 0.0015, "step": 28395 }, { "epoch": 8.86, "grad_norm": 0.06069565564393997, "learning_rate": 1.259413560396272e-08, "loss": 0.0014, "step": 28400 }, { "epoch": 8.86, "grad_norm": 0.3192262351512909, "learning_rate": 1.2322481275169885e-08, "loss": 0.0015, "step": 28405 }, { "epoch": 8.86, "grad_norm": 0.10366208851337433, "learning_rate": 1.205378694970949e-08, "loss": 0.0007, "step": 28410 }, { "epoch": 8.86, "grad_norm": 0.027929887175559998, "learning_rate": 1.1788052707212283e-08, "loss": 0.0011, "step": 28415 }, { "epoch": 8.86, "grad_norm": 0.16714242100715637, "learning_rate": 1.1525278626431935e-08, "loss": 0.0008, "step": 28420 }, { "epoch": 8.87, "grad_norm": 0.08561515808105469, "learning_rate": 1.126546478524726e-08, "loss": 0.0013, "step": 28425 }, { "epoch": 8.87, "grad_norm": 0.08352944254875183, "learning_rate": 1.1008611260656666e-08, "loss": 0.0013, "step": 28430 }, { "epoch": 8.87, "grad_norm": 0.18621785938739777, "learning_rate": 1.0754718128783704e-08, "loss": 0.0015, "step": 28435 }, { "epoch": 8.87, "grad_norm": 0.08013457804918289, "learning_rate": 1.050378546487374e-08, "loss": 0.0008, "step": 28440 }, { "epoch": 8.87, "grad_norm": 0.0238870307803154, "learning_rate": 1.0255813343292843e-08, "loss": 0.0009, "step": 28445 }, { "epoch": 8.87, "grad_norm": 0.04448197782039642, "learning_rate": 1.0010801837533334e-08, "loss": 0.0011, "step": 28450 }, { "epoch": 8.88, "grad_norm": 0.11206652969121933, "learning_rate": 9.768751020208245e-09, "loss": 0.0011, "step": 28455 }, { "epoch": 8.88, "grad_norm": 0.1502913534641266, "learning_rate": 9.529660963051302e-09, "loss": 0.0013, "step": 28460 }, { "epoch": 8.88, "grad_norm": 0.10655543953180313, "learning_rate": 9.29353173692138e-09, "loss": 0.0015, "step": 28465 }, { "epoch": 8.88, "grad_norm": 0.061149418354034424, "learning_rate": 9.060363411799167e-09, "loss": 0.0013, "step": 28470 }, { "epoch": 8.88, "grad_norm": 0.03373811021447182, "learning_rate": 8.830156056788276e-09, "loss": 0.0005, "step": 28475 }, { "epoch": 8.88, "grad_norm": 0.07910972088575363, "learning_rate": 8.602909740114129e-09, "loss": 0.0014, "step": 28480 }, { "epoch": 8.88, "grad_norm": 0.10172421485185623, "learning_rate": 8.378624529123968e-09, "loss": 0.0009, "step": 28485 }, { "epoch": 8.89, "grad_norm": 0.1482880711555481, "learning_rate": 8.157300490287956e-09, "loss": 0.0018, "step": 28490 }, { "epoch": 8.89, "grad_norm": 0.047668956220149994, "learning_rate": 7.938937689199178e-09, "loss": 0.0013, "step": 28495 }, { "epoch": 8.89, "grad_norm": 0.10796379297971725, "learning_rate": 7.723536190573645e-09, "loss": 0.001, "step": 28500 }, { "epoch": 8.89, "grad_norm": 0.085243821144104, "learning_rate": 7.511096058249178e-09, "loss": 0.0008, "step": 28505 }, { "epoch": 8.89, "grad_norm": 0.04042333364486694, "learning_rate": 7.301617355184309e-09, "loss": 0.0013, "step": 28510 }, { "epoch": 8.89, "grad_norm": 0.07351209968328476, "learning_rate": 7.095100143461598e-09, "loss": 0.0006, "step": 28515 }, { "epoch": 8.9, "grad_norm": 0.09849730879068375, "learning_rate": 6.891544484286527e-09, "loss": 0.0009, "step": 28520 }, { "epoch": 8.9, "grad_norm": 0.20490317046642303, "learning_rate": 6.6909504379852885e-09, "loss": 0.0016, "step": 28525 }, { "epoch": 8.9, "grad_norm": 0.02441880851984024, "learning_rate": 6.493318064006993e-09, "loss": 0.0006, "step": 28530 }, { "epoch": 8.9, "grad_norm": 0.07980853319168091, "learning_rate": 6.337344608285945e-09, "loss": 0.0011, "step": 28535 }, { "epoch": 8.9, "grad_norm": 0.10102265328168869, "learning_rate": 6.1450433915000165e-09, "loss": 0.0008, "step": 28540 }, { "epoch": 8.9, "grad_norm": 0.5179263949394226, "learning_rate": 5.955704008824281e-09, "loss": 0.0014, "step": 28545 }, { "epoch": 8.9, "grad_norm": 0.11356153339147568, "learning_rate": 5.769326516373852e-09, "loss": 0.0011, "step": 28550 }, { "epoch": 8.91, "grad_norm": 0.01838083378970623, "learning_rate": 5.585910969384545e-09, "loss": 0.001, "step": 28555 }, { "epoch": 8.91, "grad_norm": 0.0684712678194046, "learning_rate": 5.405457422212879e-09, "loss": 0.0015, "step": 28560 }, { "epoch": 8.91, "grad_norm": 0.10153453797101974, "learning_rate": 5.227965928341627e-09, "loss": 0.0017, "step": 28565 }, { "epoch": 8.91, "grad_norm": 0.09030450135469437, "learning_rate": 5.053436540370938e-09, "loss": 0.0013, "step": 28570 }, { "epoch": 8.91, "grad_norm": 0.04882200434803963, "learning_rate": 4.881869310024989e-09, "loss": 0.0007, "step": 28575 }, { "epoch": 8.91, "grad_norm": 0.11226236820220947, "learning_rate": 4.713264288153107e-09, "loss": 0.0014, "step": 28580 }, { "epoch": 8.92, "grad_norm": 0.22697344422340393, "learning_rate": 4.5476215247219905e-09, "loss": 0.001, "step": 28585 }, { "epoch": 8.92, "grad_norm": 0.0568106546998024, "learning_rate": 4.384941068822368e-09, "loss": 0.0013, "step": 28590 }, { "epoch": 8.92, "grad_norm": 0.12141884118318558, "learning_rate": 4.225222968667897e-09, "loss": 0.001, "step": 28595 }, { "epoch": 8.92, "grad_norm": 0.11962190270423889, "learning_rate": 4.068467271592935e-09, "loss": 0.0014, "step": 28600 }, { "epoch": 8.92, "grad_norm": 0.07136228680610657, "learning_rate": 3.9146740240547655e-09, "loss": 0.0012, "step": 28605 }, { "epoch": 8.92, "grad_norm": 0.04858747869729996, "learning_rate": 3.763843271631373e-09, "loss": 0.0012, "step": 28610 }, { "epoch": 8.92, "grad_norm": 0.0264420323073864, "learning_rate": 3.615975059025889e-09, "loss": 0.001, "step": 28615 }, { "epoch": 8.93, "grad_norm": 0.061483949422836304, "learning_rate": 3.471069430061036e-09, "loss": 0.0017, "step": 28620 }, { "epoch": 8.93, "grad_norm": 0.08160971850156784, "learning_rate": 3.3291264276791302e-09, "loss": 0.0011, "step": 28625 }, { "epoch": 8.93, "grad_norm": 0.04612981900572777, "learning_rate": 3.190146093950963e-09, "loss": 0.0011, "step": 28630 }, { "epoch": 8.93, "grad_norm": 0.012312815524637699, "learning_rate": 3.0541284700624783e-09, "loss": 0.0011, "step": 28635 }, { "epoch": 8.93, "grad_norm": 0.03374966233968735, "learning_rate": 2.921073596325874e-09, "loss": 0.001, "step": 28640 }, { "epoch": 8.93, "grad_norm": 0.15238814055919647, "learning_rate": 2.7909815121740512e-09, "loss": 0.0009, "step": 28645 }, { "epoch": 8.94, "grad_norm": 0.11198405921459198, "learning_rate": 2.6638522561617254e-09, "loss": 0.001, "step": 28650 }, { "epoch": 8.94, "grad_norm": 0.10427577048540115, "learning_rate": 2.5396858659665345e-09, "loss": 0.0006, "step": 28655 }, { "epoch": 8.94, "grad_norm": 0.07450409978628159, "learning_rate": 2.418482378384601e-09, "loss": 0.001, "step": 28660 }, { "epoch": 8.94, "grad_norm": 0.5595073103904724, "learning_rate": 2.3002418293394115e-09, "loss": 0.0016, "step": 28665 }, { "epoch": 8.94, "grad_norm": 0.03920646756887436, "learning_rate": 2.184964253871824e-09, "loss": 0.0013, "step": 28670 }, { "epoch": 8.94, "grad_norm": 0.0426039844751358, "learning_rate": 2.0726496861456224e-09, "loss": 0.0009, "step": 28675 }, { "epoch": 8.95, "grad_norm": 0.050191134214401245, "learning_rate": 1.963298159447513e-09, "loss": 0.0006, "step": 28680 }, { "epoch": 8.95, "grad_norm": 0.0812898799777031, "learning_rate": 1.8569097061871266e-09, "loss": 0.001, "step": 28685 }, { "epoch": 8.95, "grad_norm": 0.04886721447110176, "learning_rate": 1.7534843578914662e-09, "loss": 0.001, "step": 28690 }, { "epoch": 8.95, "grad_norm": 0.08173787593841553, "learning_rate": 1.6530221452148998e-09, "loss": 0.001, "step": 28695 }, { "epoch": 8.95, "grad_norm": 0.04918965697288513, "learning_rate": 1.555523097929168e-09, "loss": 0.0008, "step": 28700 }, { "epoch": 8.95, "grad_norm": 0.0397033616900444, "learning_rate": 1.4609872449300455e-09, "loss": 0.001, "step": 28705 }, { "epoch": 8.95, "grad_norm": 0.08010255545377731, "learning_rate": 1.3694146142351205e-09, "loss": 0.0012, "step": 28710 }, { "epoch": 8.96, "grad_norm": 0.047962285578250885, "learning_rate": 1.2808052329826848e-09, "loss": 0.0007, "step": 28715 }, { "epoch": 8.96, "grad_norm": 0.03405216336250305, "learning_rate": 1.1951591274339535e-09, "loss": 0.0009, "step": 28720 }, { "epoch": 8.96, "grad_norm": 0.21903178095817566, "learning_rate": 1.112476322971956e-09, "loss": 0.001, "step": 28725 }, { "epoch": 8.96, "grad_norm": 0.030231649056077003, "learning_rate": 1.0327568441004243e-09, "loss": 0.0007, "step": 28730 }, { "epoch": 8.96, "grad_norm": 0.02870813198387623, "learning_rate": 9.560007144449045e-10, "loss": 0.0011, "step": 28735 }, { "epoch": 8.96, "grad_norm": 0.05557563900947571, "learning_rate": 8.822079567538666e-10, "loss": 0.001, "step": 28740 }, { "epoch": 8.97, "grad_norm": 0.06193048134446144, "learning_rate": 8.113785928975937e-10, "loss": 0.0012, "step": 28745 }, { "epoch": 8.97, "grad_norm": 0.09998735785484314, "learning_rate": 7.435126438670726e-10, "loss": 0.0011, "step": 28750 }, { "epoch": 8.97, "grad_norm": 0.0637703463435173, "learning_rate": 6.786101297751035e-10, "loss": 0.002, "step": 28755 }, { "epoch": 8.97, "grad_norm": 0.09788890182971954, "learning_rate": 6.166710698563005e-10, "loss": 0.001, "step": 28760 }, { "epoch": 8.97, "grad_norm": 0.043414145708084106, "learning_rate": 5.576954824682012e-10, "loss": 0.0008, "step": 28765 }, { "epoch": 8.97, "grad_norm": 0.07151433825492859, "learning_rate": 5.016833850879366e-10, "loss": 0.0012, "step": 28770 }, { "epoch": 8.97, "grad_norm": 0.04484909400343895, "learning_rate": 4.4863479431667136e-10, "loss": 0.0004, "step": 28775 }, { "epoch": 8.98, "grad_norm": 0.08398403972387314, "learning_rate": 3.9854972587627383e-10, "loss": 0.0011, "step": 28780 }, { "epoch": 8.98, "grad_norm": 0.09407573193311691, "learning_rate": 3.514281946082054e-10, "loss": 0.0007, "step": 28785 }, { "epoch": 8.98, "grad_norm": 0.16476760804653168, "learning_rate": 3.07270214480182e-10, "loss": 0.0013, "step": 28790 }, { "epoch": 8.98, "grad_norm": 0.04877808690071106, "learning_rate": 2.6607579857840236e-10, "loss": 0.001, "step": 28795 }, { "epoch": 8.98, "grad_norm": 0.04684771969914436, "learning_rate": 2.2784495910976866e-10, "loss": 0.0017, "step": 28800 }, { "epoch": 8.98, "grad_norm": 0.06821387261152267, "learning_rate": 1.9257770740743753e-10, "loss": 0.0012, "step": 28805 }, { "epoch": 8.99, "grad_norm": 0.134215846657753, "learning_rate": 1.6027405392082807e-10, "loss": 0.0009, "step": 28810 }, { "epoch": 8.99, "grad_norm": 0.06247800588607788, "learning_rate": 1.3093400822450364e-10, "loss": 0.0012, "step": 28815 }, { "epoch": 8.99, "grad_norm": 0.099249467253685, "learning_rate": 1.045575790148412e-10, "loss": 0.001, "step": 28820 }, { "epoch": 8.99, "grad_norm": 0.045093026012182236, "learning_rate": 8.114477410781085e-11, "loss": 0.0016, "step": 28825 }, { "epoch": 8.99, "grad_norm": 0.0406327061355114, "learning_rate": 6.069560044341671e-11, "loss": 0.001, "step": 28830 }, { "epoch": 8.99, "grad_norm": 0.04194846376776695, "learning_rate": 4.321006408014583e-11, "loss": 0.0006, "step": 28835 }, { "epoch": 9.0, "grad_norm": 0.09402500838041306, "learning_rate": 2.8688170200519282e-11, "loss": 0.0013, "step": 28840 }, { "epoch": 9.0, "grad_norm": 0.02617935836315155, "learning_rate": 1.712992310998196e-11, "loss": 0.0009, "step": 28845 }, { "epoch": 9.0, "grad_norm": 0.06014137715101242, "learning_rate": 8.535326232461671e-12, "loss": 0.0011, "step": 28850 }, { "epoch": 9.0, "step": 28854, "total_flos": 5.673215034528458e+19, "train_loss": 0.032082520662370544, "train_runtime": 172613.4265, "train_samples_per_second": 5.349, "train_steps_per_second": 0.167 } ], "logging_steps": 5, "max_steps": 28854, "num_input_tokens_seen": 0, "num_train_epochs": 9, "save_steps": 13000, "total_flos": 5.673215034528458e+19, "train_batch_size": 2, "trial_name": null, "trial_params": null }