diff --git "a/last-checkpoint/trainer_state.json" "b/last-checkpoint/trainer_state.json" --- "a/last-checkpoint/trainer_state.json" +++ "b/last-checkpoint/trainer_state.json" @@ -1,9 +1,9 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 0.500428728920828, + "epoch": 0.7506430933812421, "eval_steps": 766, - "global_step": 1532, + "global_step": 2298, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, @@ -10755,6 +10755,5376 @@ "eval_samples_per_second": 5.155, "eval_steps_per_second": 2.578, "step": 1532 + }, + { + "epoch": 0.5007553795271732, + "grad_norm": 0.28496748208999634, + "learning_rate": 5.1173685523057366e-05, + "loss": 0.8344, + "step": 1533 + }, + { + "epoch": 0.5010820301335185, + "grad_norm": 0.2947223484516144, + "learning_rate": 5.112153067576988e-05, + "loss": 0.9075, + "step": 1534 + }, + { + "epoch": 0.5014086807398637, + "grad_norm": 0.3117678463459015, + "learning_rate": 5.106937460755636e-05, + "loss": 0.8458, + "step": 1535 + }, + { + "epoch": 0.5017353313462088, + "grad_norm": 0.3499720096588135, + "learning_rate": 5.1017217375195206e-05, + "loss": 0.8336, + "step": 1536 + }, + { + "epoch": 0.502061981952554, + "grad_norm": 0.3558485209941864, + "learning_rate": 5.0965059035466054e-05, + "loss": 0.8773, + "step": 1537 + }, + { + "epoch": 0.5023886325588992, + "grad_norm": 0.3849354386329651, + "learning_rate": 5.091289964514977e-05, + "loss": 0.853, + "step": 1538 + }, + { + "epoch": 0.5027152831652444, + "grad_norm": 0.397390216588974, + "learning_rate": 5.086073926102837e-05, + "loss": 0.9433, + "step": 1539 + }, + { + "epoch": 0.5030419337715896, + "grad_norm": 0.40508022904396057, + "learning_rate": 5.080857793988491e-05, + "loss": 0.9347, + "step": 1540 + }, + { + "epoch": 0.5033685843779347, + "grad_norm": 0.477297842502594, + "learning_rate": 5.0756415738503526e-05, + "loss": 1.0416, + "step": 1541 + }, + { + "epoch": 0.5036952349842799, + "grad_norm": 0.4862549901008606, + "learning_rate": 5.070425271366925e-05, + "loss": 1.0573, + "step": 1542 + }, + { + "epoch": 0.5040218855906251, + "grad_norm": 0.5033325552940369, + "learning_rate": 5.0652088922168086e-05, + "loss": 0.9036, + "step": 1543 + }, + { + "epoch": 0.5043485361969703, + "grad_norm": 0.5513638854026794, + "learning_rate": 5.059992442078677e-05, + "loss": 1.0243, + "step": 1544 + }, + { + "epoch": 0.5046751868033155, + "grad_norm": 0.6637901663780212, + "learning_rate": 5.05477592663129e-05, + "loss": 1.191, + "step": 1545 + }, + { + "epoch": 0.5050018374096606, + "grad_norm": 0.8518434166908264, + "learning_rate": 5.0495593515534744e-05, + "loss": 1.3829, + "step": 1546 + }, + { + "epoch": 0.5053284880160058, + "grad_norm": 0.9972793459892273, + "learning_rate": 5.0443427225241246e-05, + "loss": 1.5652, + "step": 1547 + }, + { + "epoch": 0.5056551386223511, + "grad_norm": 1.058459758758545, + "learning_rate": 5.039126045222189e-05, + "loss": 1.1677, + "step": 1548 + }, + { + "epoch": 0.5059817892286963, + "grad_norm": 1.164061188697815, + "learning_rate": 5.0339093253266735e-05, + "loss": 1.4483, + "step": 1549 + }, + { + "epoch": 0.5063084398350415, + "grad_norm": 2.050952911376953, + "learning_rate": 5.0286925685166275e-05, + "loss": 1.8747, + "step": 1550 + }, + { + "epoch": 0.5066350904413867, + "grad_norm": 0.19119663536548615, + "learning_rate": 5.023475780471143e-05, + "loss": 0.7206, + "step": 1551 + }, + { + "epoch": 0.5069617410477318, + "grad_norm": 0.24785088002681732, + "learning_rate": 5.018258966869343e-05, + "loss": 0.807, + "step": 1552 + }, + { + "epoch": 0.507288391654077, + "grad_norm": 0.23693665862083435, + "learning_rate": 5.01304213339038e-05, + "loss": 0.8574, + "step": 1553 + }, + { + "epoch": 0.5076150422604222, + "grad_norm": 0.28269410133361816, + "learning_rate": 5.007825285713429e-05, + "loss": 0.8504, + "step": 1554 + }, + { + "epoch": 0.5079416928667674, + "grad_norm": 0.27684375643730164, + "learning_rate": 5.002608429517677e-05, + "loss": 0.8391, + "step": 1555 + }, + { + "epoch": 0.5082683434731126, + "grad_norm": 0.28662434220314026, + "learning_rate": 4.9973915704823244e-05, + "loss": 0.7969, + "step": 1556 + }, + { + "epoch": 0.5085949940794577, + "grad_norm": 0.2877679169178009, + "learning_rate": 4.992174714286573e-05, + "loss": 0.8225, + "step": 1557 + }, + { + "epoch": 0.5089216446858029, + "grad_norm": 0.3176509439945221, + "learning_rate": 4.9869578666096214e-05, + "loss": 0.9153, + "step": 1558 + }, + { + "epoch": 0.5092482952921481, + "grad_norm": 0.328567773103714, + "learning_rate": 4.981741033130657e-05, + "loss": 0.8502, + "step": 1559 + }, + { + "epoch": 0.5095749458984933, + "grad_norm": 0.3393353819847107, + "learning_rate": 4.976524219528858e-05, + "loss": 0.8563, + "step": 1560 + }, + { + "epoch": 0.5099015965048385, + "grad_norm": 0.339493989944458, + "learning_rate": 4.9713074314833716e-05, + "loss": 0.8235, + "step": 1561 + }, + { + "epoch": 0.5102282471111838, + "grad_norm": 0.3445899188518524, + "learning_rate": 4.9660906746733276e-05, + "loss": 0.9349, + "step": 1562 + }, + { + "epoch": 0.5105548977175289, + "grad_norm": 0.386586993932724, + "learning_rate": 4.960873954777813e-05, + "loss": 0.9623, + "step": 1563 + }, + { + "epoch": 0.5108815483238741, + "grad_norm": 0.411825567483902, + "learning_rate": 4.955657277475877e-05, + "loss": 0.9959, + "step": 1564 + }, + { + "epoch": 0.5112081989302193, + "grad_norm": 0.4769602119922638, + "learning_rate": 4.950440648446527e-05, + "loss": 1.0555, + "step": 1565 + }, + { + "epoch": 0.5115348495365645, + "grad_norm": 0.4826885163784027, + "learning_rate": 4.945224073368711e-05, + "loss": 1.0665, + "step": 1566 + }, + { + "epoch": 0.5118615001429097, + "grad_norm": 0.5731224417686462, + "learning_rate": 4.9400075579213245e-05, + "loss": 1.1305, + "step": 1567 + }, + { + "epoch": 0.5121881507492548, + "grad_norm": 0.5455636978149414, + "learning_rate": 4.9347911077831946e-05, + "loss": 1.103, + "step": 1568 + }, + { + "epoch": 0.5125148013556, + "grad_norm": 0.6146492958068848, + "learning_rate": 4.9295747286330745e-05, + "loss": 1.184, + "step": 1569 + }, + { + "epoch": 0.5128414519619452, + "grad_norm": 0.7358331680297852, + "learning_rate": 4.9243584261496486e-05, + "loss": 1.3271, + "step": 1570 + }, + { + "epoch": 0.5131681025682904, + "grad_norm": 0.9477192163467407, + "learning_rate": 4.919142206011509e-05, + "loss": 1.3478, + "step": 1571 + }, + { + "epoch": 0.5134947531746356, + "grad_norm": 1.0120941400527954, + "learning_rate": 4.9139260738971644e-05, + "loss": 1.4512, + "step": 1572 + }, + { + "epoch": 0.5138214037809807, + "grad_norm": 1.1496292352676392, + "learning_rate": 4.908710035485024e-05, + "loss": 1.097, + "step": 1573 + }, + { + "epoch": 0.5141480543873259, + "grad_norm": 1.183207631111145, + "learning_rate": 4.903494096453396e-05, + "loss": 1.5095, + "step": 1574 + }, + { + "epoch": 0.5144747049936711, + "grad_norm": 1.803774356842041, + "learning_rate": 4.8982782624804806e-05, + "loss": 2.2187, + "step": 1575 + }, + { + "epoch": 0.5148013556000164, + "grad_norm": 0.2078971266746521, + "learning_rate": 4.8930625392443657e-05, + "loss": 0.8109, + "step": 1576 + }, + { + "epoch": 0.5151280062063616, + "grad_norm": 0.2547096610069275, + "learning_rate": 4.887846932423012e-05, + "loss": 0.8169, + "step": 1577 + }, + { + "epoch": 0.5154546568127067, + "grad_norm": 0.2599959075450897, + "learning_rate": 4.882631447694264e-05, + "loss": 0.8768, + "step": 1578 + }, + { + "epoch": 0.5157813074190519, + "grad_norm": 0.28119954466819763, + "learning_rate": 4.877416090735822e-05, + "loss": 0.8372, + "step": 1579 + }, + { + "epoch": 0.5161079580253971, + "grad_norm": 0.28370240330696106, + "learning_rate": 4.872200867225257e-05, + "loss": 0.8628, + "step": 1580 + }, + { + "epoch": 0.5164346086317423, + "grad_norm": 0.31476640701293945, + "learning_rate": 4.866985782839988e-05, + "loss": 0.9236, + "step": 1581 + }, + { + "epoch": 0.5167612592380875, + "grad_norm": 0.3151565492153168, + "learning_rate": 4.861770843257283e-05, + "loss": 0.8242, + "step": 1582 + }, + { + "epoch": 0.5170879098444326, + "grad_norm": 0.31610119342803955, + "learning_rate": 4.856556054154258e-05, + "loss": 0.7948, + "step": 1583 + }, + { + "epoch": 0.5174145604507778, + "grad_norm": 0.3681997060775757, + "learning_rate": 4.851341421207859e-05, + "loss": 0.9726, + "step": 1584 + }, + { + "epoch": 0.517741211057123, + "grad_norm": 0.3535841107368469, + "learning_rate": 4.846126950094864e-05, + "loss": 0.8712, + "step": 1585 + }, + { + "epoch": 0.5180678616634682, + "grad_norm": 0.378167986869812, + "learning_rate": 4.840912646491877e-05, + "loss": 1.0131, + "step": 1586 + }, + { + "epoch": 0.5183945122698134, + "grad_norm": 0.39976951479911804, + "learning_rate": 4.8356985160753155e-05, + "loss": 0.8891, + "step": 1587 + }, + { + "epoch": 0.5187211628761585, + "grad_norm": 0.39361557364463806, + "learning_rate": 4.830484564521414e-05, + "loss": 0.9145, + "step": 1588 + }, + { + "epoch": 0.5190478134825037, + "grad_norm": 0.44923731684684753, + "learning_rate": 4.8252707975062064e-05, + "loss": 0.9854, + "step": 1589 + }, + { + "epoch": 0.519374464088849, + "grad_norm": 0.4579111337661743, + "learning_rate": 4.820057220705529e-05, + "loss": 1.0045, + "step": 1590 + }, + { + "epoch": 0.5197011146951942, + "grad_norm": 0.48675835132598877, + "learning_rate": 4.814843839795011e-05, + "loss": 1.024, + "step": 1591 + }, + { + "epoch": 0.5200277653015394, + "grad_norm": 0.5352386832237244, + "learning_rate": 4.8096306604500674e-05, + "loss": 1.0537, + "step": 1592 + }, + { + "epoch": 0.5203544159078846, + "grad_norm": 0.578682005405426, + "learning_rate": 4.8044176883458946e-05, + "loss": 1.0153, + "step": 1593 + }, + { + "epoch": 0.5206810665142297, + "grad_norm": 0.6972869634628296, + "learning_rate": 4.799204929157463e-05, + "loss": 1.1377, + "step": 1594 + }, + { + "epoch": 0.5210077171205749, + "grad_norm": 0.7344182133674622, + "learning_rate": 4.793992388559509e-05, + "loss": 1.28, + "step": 1595 + }, + { + "epoch": 0.5213343677269201, + "grad_norm": 0.9843777418136597, + "learning_rate": 4.788780072226537e-05, + "loss": 1.2008, + "step": 1596 + }, + { + "epoch": 0.5216610183332653, + "grad_norm": 1.1680152416229248, + "learning_rate": 4.783567985832799e-05, + "loss": 1.2584, + "step": 1597 + }, + { + "epoch": 0.5219876689396105, + "grad_norm": 1.1478536128997803, + "learning_rate": 4.7783561350523065e-05, + "loss": 1.2884, + "step": 1598 + }, + { + "epoch": 0.5223143195459556, + "grad_norm": 1.6914291381835938, + "learning_rate": 4.7731445255588056e-05, + "loss": 1.4843, + "step": 1599 + }, + { + "epoch": 0.5226409701523008, + "grad_norm": 2.582030773162842, + "learning_rate": 4.767933163025781e-05, + "loss": 2.0824, + "step": 1600 + }, + { + "epoch": 0.522967620758646, + "grad_norm": 0.21092741191387177, + "learning_rate": 4.762722053126457e-05, + "loss": 0.7676, + "step": 1601 + }, + { + "epoch": 0.5232942713649912, + "grad_norm": 0.22836419939994812, + "learning_rate": 4.7575112015337724e-05, + "loss": 0.7556, + "step": 1602 + }, + { + "epoch": 0.5236209219713364, + "grad_norm": 0.2456665188074112, + "learning_rate": 4.7523006139203883e-05, + "loss": 0.7892, + "step": 1603 + }, + { + "epoch": 0.5239475725776817, + "grad_norm": 0.27531933784484863, + "learning_rate": 4.747090295958683e-05, + "loss": 0.8169, + "step": 1604 + }, + { + "epoch": 0.5242742231840268, + "grad_norm": 0.2720637619495392, + "learning_rate": 4.7418802533207335e-05, + "loss": 0.7968, + "step": 1605 + }, + { + "epoch": 0.524600873790372, + "grad_norm": 0.30058741569519043, + "learning_rate": 4.736670491678325e-05, + "loss": 0.872, + "step": 1606 + }, + { + "epoch": 0.5249275243967172, + "grad_norm": 0.2886933982372284, + "learning_rate": 4.7314610167029297e-05, + "loss": 0.8462, + "step": 1607 + }, + { + "epoch": 0.5252541750030624, + "grad_norm": 0.27622488141059875, + "learning_rate": 4.7262518340657115e-05, + "loss": 0.8096, + "step": 1608 + }, + { + "epoch": 0.5255808256094076, + "grad_norm": 0.31346070766448975, + "learning_rate": 4.721042949437516e-05, + "loss": 0.908, + "step": 1609 + }, + { + "epoch": 0.5259074762157527, + "grad_norm": 0.338704913854599, + "learning_rate": 4.7158343684888645e-05, + "loss": 0.9125, + "step": 1610 + }, + { + "epoch": 0.5262341268220979, + "grad_norm": 0.3269166648387909, + "learning_rate": 4.710626096889946e-05, + "loss": 0.8312, + "step": 1611 + }, + { + "epoch": 0.5265607774284431, + "grad_norm": 0.3700317442417145, + "learning_rate": 4.7054181403106146e-05, + "loss": 0.8932, + "step": 1612 + }, + { + "epoch": 0.5268874280347883, + "grad_norm": 0.3964787423610687, + "learning_rate": 4.700210504420378e-05, + "loss": 0.9223, + "step": 1613 + }, + { + "epoch": 0.5272140786411335, + "grad_norm": 0.4625542163848877, + "learning_rate": 4.695003194888402e-05, + "loss": 0.9979, + "step": 1614 + }, + { + "epoch": 0.5275407292474786, + "grad_norm": 0.46109652519226074, + "learning_rate": 4.68979621738349e-05, + "loss": 1.0069, + "step": 1615 + }, + { + "epoch": 0.5278673798538238, + "grad_norm": 0.46963831782341003, + "learning_rate": 4.684589577574083e-05, + "loss": 0.9752, + "step": 1616 + }, + { + "epoch": 0.528194030460169, + "grad_norm": 0.49279284477233887, + "learning_rate": 4.679383281128265e-05, + "loss": 1.0698, + "step": 1617 + }, + { + "epoch": 0.5285206810665142, + "grad_norm": 0.5191306471824646, + "learning_rate": 4.674177333713732e-05, + "loss": 1.0913, + "step": 1618 + }, + { + "epoch": 0.5288473316728595, + "grad_norm": 0.587307333946228, + "learning_rate": 4.668971740997814e-05, + "loss": 1.0703, + "step": 1619 + }, + { + "epoch": 0.5291739822792046, + "grad_norm": 0.6988803148269653, + "learning_rate": 4.663766508647443e-05, + "loss": 1.1106, + "step": 1620 + }, + { + "epoch": 0.5295006328855498, + "grad_norm": 0.763789176940918, + "learning_rate": 4.658561642329163e-05, + "loss": 1.3298, + "step": 1621 + }, + { + "epoch": 0.529827283491895, + "grad_norm": 0.9712387323379517, + "learning_rate": 4.6533571477091245e-05, + "loss": 1.3375, + "step": 1622 + }, + { + "epoch": 0.5301539340982402, + "grad_norm": 1.0462433099746704, + "learning_rate": 4.6481530304530646e-05, + "loss": 1.1795, + "step": 1623 + }, + { + "epoch": 0.5304805847045854, + "grad_norm": 1.2691247463226318, + "learning_rate": 4.6429492962263185e-05, + "loss": 1.7746, + "step": 1624 + }, + { + "epoch": 0.5308072353109305, + "grad_norm": 1.5794727802276611, + "learning_rate": 4.6377459506937966e-05, + "loss": 1.9366, + "step": 1625 + }, + { + "epoch": 0.5311338859172757, + "grad_norm": 0.19865591824054718, + "learning_rate": 4.632542999519989e-05, + "loss": 0.6034, + "step": 1626 + }, + { + "epoch": 0.5314605365236209, + "grad_norm": 0.2409842610359192, + "learning_rate": 4.627340448368961e-05, + "loss": 0.7811, + "step": 1627 + }, + { + "epoch": 0.5317871871299661, + "grad_norm": 0.26515185832977295, + "learning_rate": 4.6221383029043355e-05, + "loss": 0.8836, + "step": 1628 + }, + { + "epoch": 0.5321138377363113, + "grad_norm": 0.2678782045841217, + "learning_rate": 4.616936568789295e-05, + "loss": 0.8236, + "step": 1629 + }, + { + "epoch": 0.5324404883426564, + "grad_norm": 0.27880924940109253, + "learning_rate": 4.611735251686581e-05, + "loss": 0.8432, + "step": 1630 + }, + { + "epoch": 0.5327671389490016, + "grad_norm": 0.2928534150123596, + "learning_rate": 4.60653435725847e-05, + "loss": 0.8542, + "step": 1631 + }, + { + "epoch": 0.5330937895553468, + "grad_norm": 0.3031875789165497, + "learning_rate": 4.6013338911667896e-05, + "loss": 0.8876, + "step": 1632 + }, + { + "epoch": 0.5334204401616921, + "grad_norm": 0.3030424118041992, + "learning_rate": 4.5961338590728937e-05, + "loss": 0.911, + "step": 1633 + }, + { + "epoch": 0.5337470907680373, + "grad_norm": 0.33176282048225403, + "learning_rate": 4.5909342666376635e-05, + "loss": 0.8456, + "step": 1634 + }, + { + "epoch": 0.5340737413743825, + "grad_norm": 0.3273261785507202, + "learning_rate": 4.585735119521508e-05, + "loss": 0.8488, + "step": 1635 + }, + { + "epoch": 0.5344003919807276, + "grad_norm": 0.3342040181159973, + "learning_rate": 4.580536423384342e-05, + "loss": 0.8439, + "step": 1636 + }, + { + "epoch": 0.5347270425870728, + "grad_norm": 0.36872991919517517, + "learning_rate": 4.575338183885601e-05, + "loss": 1.0165, + "step": 1637 + }, + { + "epoch": 0.535053693193418, + "grad_norm": 0.38245999813079834, + "learning_rate": 4.570140406684214e-05, + "loss": 0.9385, + "step": 1638 + }, + { + "epoch": 0.5353803437997632, + "grad_norm": 0.39496734738349915, + "learning_rate": 4.564943097438608e-05, + "loss": 0.7809, + "step": 1639 + }, + { + "epoch": 0.5357069944061084, + "grad_norm": 0.4122662842273712, + "learning_rate": 4.559746261806704e-05, + "loss": 0.8743, + "step": 1640 + }, + { + "epoch": 0.5360336450124535, + "grad_norm": 0.4760144054889679, + "learning_rate": 4.554549905445908e-05, + "loss": 1.0506, + "step": 1641 + }, + { + "epoch": 0.5363602956187987, + "grad_norm": 0.5267740488052368, + "learning_rate": 4.549354034013097e-05, + "loss": 1.0937, + "step": 1642 + }, + { + "epoch": 0.5366869462251439, + "grad_norm": 0.537794291973114, + "learning_rate": 4.5441586531646305e-05, + "loss": 1.0207, + "step": 1643 + }, + { + "epoch": 0.5370135968314891, + "grad_norm": 0.5599216222763062, + "learning_rate": 4.5389637685563244e-05, + "loss": 1.0285, + "step": 1644 + }, + { + "epoch": 0.5373402474378343, + "grad_norm": 0.6215500235557556, + "learning_rate": 4.533769385843462e-05, + "loss": 1.1371, + "step": 1645 + }, + { + "epoch": 0.5376668980441794, + "grad_norm": 0.7406798601150513, + "learning_rate": 4.528575510680775e-05, + "loss": 1.2815, + "step": 1646 + }, + { + "epoch": 0.5379935486505247, + "grad_norm": 0.7674838304519653, + "learning_rate": 4.523382148722443e-05, + "loss": 1.1173, + "step": 1647 + }, + { + "epoch": 0.5383201992568699, + "grad_norm": 0.8467667102813721, + "learning_rate": 4.51818930562209e-05, + "loss": 1.0583, + "step": 1648 + }, + { + "epoch": 0.5386468498632151, + "grad_norm": 1.289118766784668, + "learning_rate": 4.512996987032773e-05, + "loss": 1.6204, + "step": 1649 + }, + { + "epoch": 0.5389735004695603, + "grad_norm": 1.2637457847595215, + "learning_rate": 4.507805198606979e-05, + "loss": 1.373, + "step": 1650 + }, + { + "epoch": 0.5393001510759055, + "grad_norm": 0.22539332509040833, + "learning_rate": 4.502613945996617e-05, + "loss": 0.7505, + "step": 1651 + }, + { + "epoch": 0.5396268016822506, + "grad_norm": 0.23382221162319183, + "learning_rate": 4.497423234853011e-05, + "loss": 0.7676, + "step": 1652 + }, + { + "epoch": 0.5399534522885958, + "grad_norm": 0.24569697678089142, + "learning_rate": 4.4922330708269005e-05, + "loss": 0.8458, + "step": 1653 + }, + { + "epoch": 0.540280102894941, + "grad_norm": 0.2564908564090729, + "learning_rate": 4.487043459568426e-05, + "loss": 0.8189, + "step": 1654 + }, + { + "epoch": 0.5406067535012862, + "grad_norm": 0.27906638383865356, + "learning_rate": 4.4818544067271226e-05, + "loss": 0.8337, + "step": 1655 + }, + { + "epoch": 0.5409334041076314, + "grad_norm": 0.283110648393631, + "learning_rate": 4.476665917951927e-05, + "loss": 0.8424, + "step": 1656 + }, + { + "epoch": 0.5412600547139765, + "grad_norm": 0.2966436445713043, + "learning_rate": 4.4714779988911525e-05, + "loss": 0.8646, + "step": 1657 + }, + { + "epoch": 0.5415867053203217, + "grad_norm": 0.3120681643486023, + "learning_rate": 4.466290655192498e-05, + "loss": 0.8508, + "step": 1658 + }, + { + "epoch": 0.5419133559266669, + "grad_norm": 0.3354993164539337, + "learning_rate": 4.461103892503036e-05, + "loss": 0.8795, + "step": 1659 + }, + { + "epoch": 0.5422400065330121, + "grad_norm": 0.35160109400749207, + "learning_rate": 4.455917716469199e-05, + "loss": 0.8703, + "step": 1660 + }, + { + "epoch": 0.5425666571393574, + "grad_norm": 0.34476974606513977, + "learning_rate": 4.450732132736792e-05, + "loss": 0.9471, + "step": 1661 + }, + { + "epoch": 0.5428933077457025, + "grad_norm": 0.37731653451919556, + "learning_rate": 4.445547146950966e-05, + "loss": 0.8646, + "step": 1662 + }, + { + "epoch": 0.5432199583520477, + "grad_norm": 0.41084280610084534, + "learning_rate": 4.4403627647562275e-05, + "loss": 0.9606, + "step": 1663 + }, + { + "epoch": 0.5435466089583929, + "grad_norm": 0.40829023718833923, + "learning_rate": 4.435178991796422e-05, + "loss": 1.0439, + "step": 1664 + }, + { + "epoch": 0.5438732595647381, + "grad_norm": 0.45158272981643677, + "learning_rate": 4.4299958337147314e-05, + "loss": 1.0997, + "step": 1665 + }, + { + "epoch": 0.5441999101710833, + "grad_norm": 0.4397522509098053, + "learning_rate": 4.4248132961536726e-05, + "loss": 0.9821, + "step": 1666 + }, + { + "epoch": 0.5445265607774284, + "grad_norm": 0.47100958228111267, + "learning_rate": 4.419631384755083e-05, + "loss": 1.0299, + "step": 1667 + }, + { + "epoch": 0.5448532113837736, + "grad_norm": 0.5272468328475952, + "learning_rate": 4.414450105160117e-05, + "loss": 1.045, + "step": 1668 + }, + { + "epoch": 0.5451798619901188, + "grad_norm": 0.5478541851043701, + "learning_rate": 4.409269463009248e-05, + "loss": 1.1235, + "step": 1669 + }, + { + "epoch": 0.545506512596464, + "grad_norm": 0.6685848236083984, + "learning_rate": 4.404089463942247e-05, + "loss": 1.2074, + "step": 1670 + }, + { + "epoch": 0.5458331632028092, + "grad_norm": 0.7062515616416931, + "learning_rate": 4.398910113598193e-05, + "loss": 1.2954, + "step": 1671 + }, + { + "epoch": 0.5461598138091543, + "grad_norm": 0.9928080439567566, + "learning_rate": 4.3937314176154523e-05, + "loss": 1.404, + "step": 1672 + }, + { + "epoch": 0.5464864644154995, + "grad_norm": 0.9474202990531921, + "learning_rate": 4.388553381631682e-05, + "loss": 1.1694, + "step": 1673 + }, + { + "epoch": 0.5468131150218447, + "grad_norm": 1.3454349040985107, + "learning_rate": 4.3833760112838196e-05, + "loss": 1.5713, + "step": 1674 + }, + { + "epoch": 0.54713976562819, + "grad_norm": 1.6091099977493286, + "learning_rate": 4.3781993122080795e-05, + "loss": 1.7261, + "step": 1675 + }, + { + "epoch": 0.5474664162345352, + "grad_norm": 0.20016787946224213, + "learning_rate": 4.3730232900399436e-05, + "loss": 0.8039, + "step": 1676 + }, + { + "epoch": 0.5477930668408804, + "grad_norm": 0.23649336397647858, + "learning_rate": 4.36784795041416e-05, + "loss": 0.802, + "step": 1677 + }, + { + "epoch": 0.5481197174472255, + "grad_norm": 0.2561361491680145, + "learning_rate": 4.3626732989647256e-05, + "loss": 0.825, + "step": 1678 + }, + { + "epoch": 0.5484463680535707, + "grad_norm": 0.2604573965072632, + "learning_rate": 4.357499341324901e-05, + "loss": 0.8709, + "step": 1679 + }, + { + "epoch": 0.5487730186599159, + "grad_norm": 0.270620733499527, + "learning_rate": 4.3523260831271786e-05, + "loss": 0.8286, + "step": 1680 + }, + { + "epoch": 0.5490996692662611, + "grad_norm": 0.2832171320915222, + "learning_rate": 4.3471535300032996e-05, + "loss": 0.8517, + "step": 1681 + }, + { + "epoch": 0.5494263198726063, + "grad_norm": 0.27463555335998535, + "learning_rate": 4.341981687584231e-05, + "loss": 0.7702, + "step": 1682 + }, + { + "epoch": 0.5497529704789514, + "grad_norm": 0.3110555112361908, + "learning_rate": 4.336810561500167e-05, + "loss": 0.8606, + "step": 1683 + }, + { + "epoch": 0.5500796210852966, + "grad_norm": 0.33201920986175537, + "learning_rate": 4.331640157380527e-05, + "loss": 0.9015, + "step": 1684 + }, + { + "epoch": 0.5504062716916418, + "grad_norm": 0.31452447175979614, + "learning_rate": 4.326470480853938e-05, + "loss": 0.8095, + "step": 1685 + }, + { + "epoch": 0.550732922297987, + "grad_norm": 0.35205140709877014, + "learning_rate": 4.3213015375482376e-05, + "loss": 0.9332, + "step": 1686 + }, + { + "epoch": 0.5510595729043322, + "grad_norm": 0.38866329193115234, + "learning_rate": 4.3161333330904676e-05, + "loss": 0.9075, + "step": 1687 + }, + { + "epoch": 0.5513862235106773, + "grad_norm": 0.4142824113368988, + "learning_rate": 4.31096587310686e-05, + "loss": 0.9783, + "step": 1688 + }, + { + "epoch": 0.5517128741170226, + "grad_norm": 0.41084322333335876, + "learning_rate": 4.305799163222843e-05, + "loss": 0.917, + "step": 1689 + }, + { + "epoch": 0.5520395247233678, + "grad_norm": 0.4371437728404999, + "learning_rate": 4.300633209063022e-05, + "loss": 0.99, + "step": 1690 + }, + { + "epoch": 0.552366175329713, + "grad_norm": 0.43960168957710266, + "learning_rate": 4.295468016251183e-05, + "loss": 0.9339, + "step": 1691 + }, + { + "epoch": 0.5526928259360582, + "grad_norm": 0.5299369096755981, + "learning_rate": 4.290303590410283e-05, + "loss": 0.9887, + "step": 1692 + }, + { + "epoch": 0.5530194765424034, + "grad_norm": 0.49330300092697144, + "learning_rate": 4.285139937162445e-05, + "loss": 0.9703, + "step": 1693 + }, + { + "epoch": 0.5533461271487485, + "grad_norm": 0.5797840356826782, + "learning_rate": 4.2799770621289466e-05, + "loss": 1.0255, + "step": 1694 + }, + { + "epoch": 0.5536727777550937, + "grad_norm": 0.7644694447517395, + "learning_rate": 4.274814970930225e-05, + "loss": 1.3008, + "step": 1695 + }, + { + "epoch": 0.5539994283614389, + "grad_norm": 0.8262007236480713, + "learning_rate": 4.2696536691858556e-05, + "loss": 1.4008, + "step": 1696 + }, + { + "epoch": 0.5543260789677841, + "grad_norm": 0.9084567427635193, + "learning_rate": 4.264493162514565e-05, + "loss": 1.3747, + "step": 1697 + }, + { + "epoch": 0.5546527295741293, + "grad_norm": 1.4628161191940308, + "learning_rate": 4.2593334565342034e-05, + "loss": 1.4698, + "step": 1698 + }, + { + "epoch": 0.5549793801804744, + "grad_norm": 1.404968500137329, + "learning_rate": 4.254174556861755e-05, + "loss": 1.9086, + "step": 1699 + }, + { + "epoch": 0.5553060307868196, + "grad_norm": 1.955612301826477, + "learning_rate": 4.249016469113329e-05, + "loss": 1.8205, + "step": 1700 + }, + { + "epoch": 0.5556326813931648, + "grad_norm": 0.21000787615776062, + "learning_rate": 4.243859198904142e-05, + "loss": 0.7214, + "step": 1701 + }, + { + "epoch": 0.55595933199951, + "grad_norm": 0.20873922109603882, + "learning_rate": 4.238702751848531e-05, + "loss": 0.7236, + "step": 1702 + }, + { + "epoch": 0.5562859826058553, + "grad_norm": 0.2349824458360672, + "learning_rate": 4.23354713355993e-05, + "loss": 0.8209, + "step": 1703 + }, + { + "epoch": 0.5566126332122004, + "grad_norm": 0.2445916384458542, + "learning_rate": 4.22839234965087e-05, + "loss": 0.8256, + "step": 1704 + }, + { + "epoch": 0.5569392838185456, + "grad_norm": 0.2502254545688629, + "learning_rate": 4.22323840573298e-05, + "loss": 0.7655, + "step": 1705 + }, + { + "epoch": 0.5572659344248908, + "grad_norm": 0.28509315848350525, + "learning_rate": 4.2180853074169674e-05, + "loss": 0.8841, + "step": 1706 + }, + { + "epoch": 0.557592585031236, + "grad_norm": 0.28722479939460754, + "learning_rate": 4.212933060312626e-05, + "loss": 0.8389, + "step": 1707 + }, + { + "epoch": 0.5579192356375812, + "grad_norm": 0.29638615250587463, + "learning_rate": 4.2077816700288165e-05, + "loss": 0.7169, + "step": 1708 + }, + { + "epoch": 0.5582458862439263, + "grad_norm": 0.32606878876686096, + "learning_rate": 4.202631142173471e-05, + "loss": 0.8639, + "step": 1709 + }, + { + "epoch": 0.5585725368502715, + "grad_norm": 0.31630176305770874, + "learning_rate": 4.1974814823535815e-05, + "loss": 0.8409, + "step": 1710 + }, + { + "epoch": 0.5588991874566167, + "grad_norm": 0.3516806364059448, + "learning_rate": 4.192332696175195e-05, + "loss": 0.9856, + "step": 1711 + }, + { + "epoch": 0.5592258380629619, + "grad_norm": 0.33567309379577637, + "learning_rate": 4.187184789243406e-05, + "loss": 0.8381, + "step": 1712 + }, + { + "epoch": 0.5595524886693071, + "grad_norm": 0.3585726320743561, + "learning_rate": 4.1820377671623565e-05, + "loss": 0.8846, + "step": 1713 + }, + { + "epoch": 0.5598791392756522, + "grad_norm": 0.3721924424171448, + "learning_rate": 4.176891635535218e-05, + "loss": 1.0201, + "step": 1714 + }, + { + "epoch": 0.5602057898819974, + "grad_norm": 0.3866981267929077, + "learning_rate": 4.1717463999642006e-05, + "loss": 0.8751, + "step": 1715 + }, + { + "epoch": 0.5605324404883426, + "grad_norm": 0.43328696489334106, + "learning_rate": 4.1666020660505334e-05, + "loss": 0.9957, + "step": 1716 + }, + { + "epoch": 0.5608590910946878, + "grad_norm": 0.47395583987236023, + "learning_rate": 4.1614586393944624e-05, + "loss": 0.9973, + "step": 1717 + }, + { + "epoch": 0.5611857417010331, + "grad_norm": 0.5074653029441833, + "learning_rate": 4.156316125595254e-05, + "loss": 1.0893, + "step": 1718 + }, + { + "epoch": 0.5615123923073783, + "grad_norm": 0.5391126275062561, + "learning_rate": 4.1511745302511726e-05, + "loss": 0.957, + "step": 1719 + }, + { + "epoch": 0.5618390429137234, + "grad_norm": 0.6273775696754456, + "learning_rate": 4.1460338589594885e-05, + "loss": 1.314, + "step": 1720 + }, + { + "epoch": 0.5621656935200686, + "grad_norm": 0.7783089280128479, + "learning_rate": 4.1408941173164625e-05, + "loss": 1.2052, + "step": 1721 + }, + { + "epoch": 0.5624923441264138, + "grad_norm": 0.9860546588897705, + "learning_rate": 4.1357553109173416e-05, + "loss": 1.4157, + "step": 1722 + }, + { + "epoch": 0.562818994732759, + "grad_norm": 1.039588451385498, + "learning_rate": 4.130617445356363e-05, + "loss": 1.4652, + "step": 1723 + }, + { + "epoch": 0.5631456453391042, + "grad_norm": 1.2351906299591064, + "learning_rate": 4.1254805262267296e-05, + "loss": 1.6218, + "step": 1724 + }, + { + "epoch": 0.5634722959454493, + "grad_norm": 1.5483803749084473, + "learning_rate": 4.1203445591206204e-05, + "loss": 1.6209, + "step": 1725 + }, + { + "epoch": 0.5637989465517945, + "grad_norm": 0.20997747778892517, + "learning_rate": 4.1152095496291764e-05, + "loss": 0.7082, + "step": 1726 + }, + { + "epoch": 0.5641255971581397, + "grad_norm": 0.23678404092788696, + "learning_rate": 4.110075503342495e-05, + "loss": 0.7857, + "step": 1727 + }, + { + "epoch": 0.5644522477644849, + "grad_norm": 0.2515248954296112, + "learning_rate": 4.1049424258496274e-05, + "loss": 0.8146, + "step": 1728 + }, + { + "epoch": 0.5647788983708301, + "grad_norm": 0.27904850244522095, + "learning_rate": 4.099810322738568e-05, + "loss": 0.8893, + "step": 1729 + }, + { + "epoch": 0.5651055489771752, + "grad_norm": 0.2811621427536011, + "learning_rate": 4.094679199596249e-05, + "loss": 0.8255, + "step": 1730 + }, + { + "epoch": 0.5654321995835204, + "grad_norm": 0.2903529703617096, + "learning_rate": 4.0895490620085425e-05, + "loss": 0.8682, + "step": 1731 + }, + { + "epoch": 0.5657588501898657, + "grad_norm": 0.29601550102233887, + "learning_rate": 4.084419915560238e-05, + "loss": 0.8585, + "step": 1732 + }, + { + "epoch": 0.5660855007962109, + "grad_norm": 0.2924976944923401, + "learning_rate": 4.079291765835056e-05, + "loss": 0.8122, + "step": 1733 + }, + { + "epoch": 0.5664121514025561, + "grad_norm": 0.3249928057193756, + "learning_rate": 4.074164618415625e-05, + "loss": 0.8835, + "step": 1734 + }, + { + "epoch": 0.5667388020089013, + "grad_norm": 0.32690802216529846, + "learning_rate": 4.069038478883481e-05, + "loss": 0.893, + "step": 1735 + }, + { + "epoch": 0.5670654526152464, + "grad_norm": 0.33490654826164246, + "learning_rate": 4.063913352819073e-05, + "loss": 0.9048, + "step": 1736 + }, + { + "epoch": 0.5673921032215916, + "grad_norm": 0.36397576332092285, + "learning_rate": 4.058789245801736e-05, + "loss": 0.9796, + "step": 1737 + }, + { + "epoch": 0.5677187538279368, + "grad_norm": 0.3783029615879059, + "learning_rate": 4.0536661634096977e-05, + "loss": 0.9378, + "step": 1738 + }, + { + "epoch": 0.568045404434282, + "grad_norm": 0.3783353567123413, + "learning_rate": 4.048544111220077e-05, + "loss": 0.8839, + "step": 1739 + }, + { + "epoch": 0.5683720550406272, + "grad_norm": 0.4432784616947174, + "learning_rate": 4.043423094808863e-05, + "loss": 0.9956, + "step": 1740 + }, + { + "epoch": 0.5686987056469723, + "grad_norm": 0.48436063528060913, + "learning_rate": 4.0383031197509226e-05, + "loss": 1.0038, + "step": 1741 + }, + { + "epoch": 0.5690253562533175, + "grad_norm": 0.5452389717102051, + "learning_rate": 4.0331841916199867e-05, + "loss": 1.0593, + "step": 1742 + }, + { + "epoch": 0.5693520068596627, + "grad_norm": 0.6084750890731812, + "learning_rate": 4.028066315988646e-05, + "loss": 1.1621, + "step": 1743 + }, + { + "epoch": 0.5696786574660079, + "grad_norm": 0.6114654541015625, + "learning_rate": 4.0229494984283485e-05, + "loss": 1.1292, + "step": 1744 + }, + { + "epoch": 0.5700053080723531, + "grad_norm": 0.7506347298622131, + "learning_rate": 4.017833744509387e-05, + "loss": 1.0845, + "step": 1745 + }, + { + "epoch": 0.5703319586786983, + "grad_norm": 0.6660485863685608, + "learning_rate": 4.012719059800897e-05, + "loss": 1.0965, + "step": 1746 + }, + { + "epoch": 0.5706586092850435, + "grad_norm": 0.8481147289276123, + "learning_rate": 4.0076054498708545e-05, + "loss": 1.34, + "step": 1747 + }, + { + "epoch": 0.5709852598913887, + "grad_norm": 1.0453165769577026, + "learning_rate": 4.002492920286056e-05, + "loss": 1.3245, + "step": 1748 + }, + { + "epoch": 0.5713119104977339, + "grad_norm": 1.374419093132019, + "learning_rate": 3.997381476612132e-05, + "loss": 1.4342, + "step": 1749 + }, + { + "epoch": 0.5716385611040791, + "grad_norm": 1.6070213317871094, + "learning_rate": 3.992271124413527e-05, + "loss": 1.6983, + "step": 1750 + }, + { + "epoch": 0.5719652117104242, + "grad_norm": 0.21011681854724884, + "learning_rate": 3.987161869253491e-05, + "loss": 0.7403, + "step": 1751 + }, + { + "epoch": 0.5722918623167694, + "grad_norm": 0.25537699460983276, + "learning_rate": 3.982053716694092e-05, + "loss": 0.8284, + "step": 1752 + }, + { + "epoch": 0.5726185129231146, + "grad_norm": 0.2665100693702698, + "learning_rate": 3.976946672296187e-05, + "loss": 0.8535, + "step": 1753 + }, + { + "epoch": 0.5729451635294598, + "grad_norm": 0.2679298520088196, + "learning_rate": 3.971840741619435e-05, + "loss": 0.8884, + "step": 1754 + }, + { + "epoch": 0.573271814135805, + "grad_norm": 0.27296388149261475, + "learning_rate": 3.9667359302222745e-05, + "loss": 0.8318, + "step": 1755 + }, + { + "epoch": 0.5735984647421501, + "grad_norm": 0.2760670483112335, + "learning_rate": 3.961632243661929e-05, + "loss": 0.8232, + "step": 1756 + }, + { + "epoch": 0.5739251153484953, + "grad_norm": 0.3013848662376404, + "learning_rate": 3.956529687494401e-05, + "loss": 0.9044, + "step": 1757 + }, + { + "epoch": 0.5742517659548405, + "grad_norm": 0.2987593412399292, + "learning_rate": 3.9514282672744574e-05, + "loss": 0.9345, + "step": 1758 + }, + { + "epoch": 0.5745784165611857, + "grad_norm": 0.3210752606391907, + "learning_rate": 3.946327988555632e-05, + "loss": 0.8186, + "step": 1759 + }, + { + "epoch": 0.574905067167531, + "grad_norm": 0.3270179033279419, + "learning_rate": 3.941228856890212e-05, + "loss": 0.8363, + "step": 1760 + }, + { + "epoch": 0.5752317177738762, + "grad_norm": 0.3446122705936432, + "learning_rate": 3.93613087782924e-05, + "loss": 0.8473, + "step": 1761 + }, + { + "epoch": 0.5755583683802213, + "grad_norm": 0.37168315052986145, + "learning_rate": 3.931034056922501e-05, + "loss": 0.9515, + "step": 1762 + }, + { + "epoch": 0.5758850189865665, + "grad_norm": 0.3891473710536957, + "learning_rate": 3.9259383997185216e-05, + "loss": 0.9106, + "step": 1763 + }, + { + "epoch": 0.5762116695929117, + "grad_norm": 0.43029606342315674, + "learning_rate": 3.920843911764559e-05, + "loss": 1.0014, + "step": 1764 + }, + { + "epoch": 0.5765383201992569, + "grad_norm": 0.46320056915283203, + "learning_rate": 3.9157505986066e-05, + "loss": 1.0457, + "step": 1765 + }, + { + "epoch": 0.5768649708056021, + "grad_norm": 0.47448238730430603, + "learning_rate": 3.9106584657893484e-05, + "loss": 1.0156, + "step": 1766 + }, + { + "epoch": 0.5771916214119472, + "grad_norm": 0.48397818207740784, + "learning_rate": 3.9055675188562305e-05, + "loss": 0.9642, + "step": 1767 + }, + { + "epoch": 0.5775182720182924, + "grad_norm": 0.5289010405540466, + "learning_rate": 3.900477763349374e-05, + "loss": 1.0498, + "step": 1768 + }, + { + "epoch": 0.5778449226246376, + "grad_norm": 0.5463656187057495, + "learning_rate": 3.895389204809611e-05, + "loss": 1.0686, + "step": 1769 + }, + { + "epoch": 0.5781715732309828, + "grad_norm": 0.641441822052002, + "learning_rate": 3.890301848776476e-05, + "loss": 1.1969, + "step": 1770 + }, + { + "epoch": 0.578498223837328, + "grad_norm": 0.7622916102409363, + "learning_rate": 3.885215700788188e-05, + "loss": 1.2451, + "step": 1771 + }, + { + "epoch": 0.5788248744436731, + "grad_norm": 0.8429807424545288, + "learning_rate": 3.880130766381655e-05, + "loss": 1.3678, + "step": 1772 + }, + { + "epoch": 0.5791515250500183, + "grad_norm": 1.130521535873413, + "learning_rate": 3.875047051092462e-05, + "loss": 1.6483, + "step": 1773 + }, + { + "epoch": 0.5794781756563636, + "grad_norm": 1.1499173641204834, + "learning_rate": 3.869964560454865e-05, + "loss": 1.4263, + "step": 1774 + }, + { + "epoch": 0.5798048262627088, + "grad_norm": 1.3551626205444336, + "learning_rate": 3.864883300001793e-05, + "loss": 1.6207, + "step": 1775 + }, + { + "epoch": 0.580131476869054, + "grad_norm": 0.20396538078784943, + "learning_rate": 3.859803275264827e-05, + "loss": 0.8017, + "step": 1776 + }, + { + "epoch": 0.5804581274753992, + "grad_norm": 0.23019051551818848, + "learning_rate": 3.854724491774212e-05, + "loss": 0.8158, + "step": 1777 + }, + { + "epoch": 0.5807847780817443, + "grad_norm": 0.2596058249473572, + "learning_rate": 3.849646955058835e-05, + "loss": 0.7755, + "step": 1778 + }, + { + "epoch": 0.5811114286880895, + "grad_norm": 0.26550063490867615, + "learning_rate": 3.844570670646227e-05, + "loss": 0.7717, + "step": 1779 + }, + { + "epoch": 0.5814380792944347, + "grad_norm": 0.2764573097229004, + "learning_rate": 3.8394956440625605e-05, + "loss": 0.8305, + "step": 1780 + }, + { + "epoch": 0.5817647299007799, + "grad_norm": 0.304564505815506, + "learning_rate": 3.834421880832632e-05, + "loss": 0.9052, + "step": 1781 + }, + { + "epoch": 0.5820913805071251, + "grad_norm": 0.31863516569137573, + "learning_rate": 3.8293493864798646e-05, + "loss": 0.8262, + "step": 1782 + }, + { + "epoch": 0.5824180311134702, + "grad_norm": 0.32435324788093567, + "learning_rate": 3.824278166526305e-05, + "loss": 0.8383, + "step": 1783 + }, + { + "epoch": 0.5827446817198154, + "grad_norm": 0.32743144035339355, + "learning_rate": 3.8192082264926035e-05, + "loss": 0.8747, + "step": 1784 + }, + { + "epoch": 0.5830713323261606, + "grad_norm": 0.3483351469039917, + "learning_rate": 3.8141395718980275e-05, + "loss": 0.8814, + "step": 1785 + }, + { + "epoch": 0.5833979829325058, + "grad_norm": 0.37021541595458984, + "learning_rate": 3.809072208260437e-05, + "loss": 0.9519, + "step": 1786 + }, + { + "epoch": 0.583724633538851, + "grad_norm": 0.37525907158851624, + "learning_rate": 3.804006141096288e-05, + "loss": 0.8737, + "step": 1787 + }, + { + "epoch": 0.5840512841451962, + "grad_norm": 0.43124061822891235, + "learning_rate": 3.79894137592063e-05, + "loss": 1.0225, + "step": 1788 + }, + { + "epoch": 0.5843779347515414, + "grad_norm": 0.42931702733039856, + "learning_rate": 3.793877918247087e-05, + "loss": 0.9199, + "step": 1789 + }, + { + "epoch": 0.5847045853578866, + "grad_norm": 0.4696066081523895, + "learning_rate": 3.78881577358787e-05, + "loss": 1.0533, + "step": 1790 + }, + { + "epoch": 0.5850312359642318, + "grad_norm": 0.5039542317390442, + "learning_rate": 3.783754947453751e-05, + "loss": 1.0553, + "step": 1791 + }, + { + "epoch": 0.585357886570577, + "grad_norm": 0.5651768445968628, + "learning_rate": 3.77869544535407e-05, + "loss": 1.1349, + "step": 1792 + }, + { + "epoch": 0.5856845371769221, + "grad_norm": 0.6366235017776489, + "learning_rate": 3.7736372727967284e-05, + "loss": 1.1709, + "step": 1793 + }, + { + "epoch": 0.5860111877832673, + "grad_norm": 0.623162567615509, + "learning_rate": 3.768580435288177e-05, + "loss": 1.241, + "step": 1794 + }, + { + "epoch": 0.5863378383896125, + "grad_norm": 0.7199271321296692, + "learning_rate": 3.7635249383334114e-05, + "loss": 1.1771, + "step": 1795 + }, + { + "epoch": 0.5866644889959577, + "grad_norm": 0.8708171248435974, + "learning_rate": 3.758470787435977e-05, + "loss": 1.3232, + "step": 1796 + }, + { + "epoch": 0.5869911396023029, + "grad_norm": 1.146207332611084, + "learning_rate": 3.7534179880979405e-05, + "loss": 1.5614, + "step": 1797 + }, + { + "epoch": 0.587317790208648, + "grad_norm": 1.1645411252975464, + "learning_rate": 3.748366545819909e-05, + "loss": 1.3229, + "step": 1798 + }, + { + "epoch": 0.5876444408149932, + "grad_norm": 1.5172971487045288, + "learning_rate": 3.7433164661010066e-05, + "loss": 1.465, + "step": 1799 + }, + { + "epoch": 0.5879710914213384, + "grad_norm": 1.9966884851455688, + "learning_rate": 3.738267754438872e-05, + "loss": 2.1441, + "step": 1800 + }, + { + "epoch": 0.5882977420276836, + "grad_norm": 0.22755567729473114, + "learning_rate": 3.733220416329662e-05, + "loss": 0.8176, + "step": 1801 + }, + { + "epoch": 0.5886243926340289, + "grad_norm": 0.22714000940322876, + "learning_rate": 3.7281744572680297e-05, + "loss": 0.7798, + "step": 1802 + }, + { + "epoch": 0.5889510432403741, + "grad_norm": 0.2419450730085373, + "learning_rate": 3.723129882747135e-05, + "loss": 0.7938, + "step": 1803 + }, + { + "epoch": 0.5892776938467192, + "grad_norm": 0.2522052228450775, + "learning_rate": 3.7180866982586234e-05, + "loss": 0.7725, + "step": 1804 + }, + { + "epoch": 0.5896043444530644, + "grad_norm": 0.27660250663757324, + "learning_rate": 3.713044909292629e-05, + "loss": 0.8315, + "step": 1805 + }, + { + "epoch": 0.5899309950594096, + "grad_norm": 0.283093124628067, + "learning_rate": 3.708004521337772e-05, + "loss": 0.8763, + "step": 1806 + }, + { + "epoch": 0.5902576456657548, + "grad_norm": 0.288693368434906, + "learning_rate": 3.702965539881141e-05, + "loss": 0.9238, + "step": 1807 + }, + { + "epoch": 0.5905842962721, + "grad_norm": 0.3175692558288574, + "learning_rate": 3.697927970408294e-05, + "loss": 0.8364, + "step": 1808 + }, + { + "epoch": 0.5909109468784451, + "grad_norm": 0.31832441687583923, + "learning_rate": 3.6928918184032566e-05, + "loss": 0.8666, + "step": 1809 + }, + { + "epoch": 0.5912375974847903, + "grad_norm": 0.32215848565101624, + "learning_rate": 3.687857089348509e-05, + "loss": 0.8798, + "step": 1810 + }, + { + "epoch": 0.5915642480911355, + "grad_norm": 0.3508221209049225, + "learning_rate": 3.682823788724979e-05, + "loss": 0.9086, + "step": 1811 + }, + { + "epoch": 0.5918908986974807, + "grad_norm": 0.3799271285533905, + "learning_rate": 3.677791922012045e-05, + "loss": 0.9276, + "step": 1812 + }, + { + "epoch": 0.5922175493038259, + "grad_norm": 0.400530606508255, + "learning_rate": 3.6727614946875176e-05, + "loss": 0.9829, + "step": 1813 + }, + { + "epoch": 0.592544199910171, + "grad_norm": 0.4067695438861847, + "learning_rate": 3.667732512227649e-05, + "loss": 0.8977, + "step": 1814 + }, + { + "epoch": 0.5928708505165162, + "grad_norm": 0.46635621786117554, + "learning_rate": 3.662704980107109e-05, + "loss": 1.0206, + "step": 1815 + }, + { + "epoch": 0.5931975011228615, + "grad_norm": 0.47291892766952515, + "learning_rate": 3.657678903798999e-05, + "loss": 0.9792, + "step": 1816 + }, + { + "epoch": 0.5935241517292067, + "grad_norm": 0.5139042735099792, + "learning_rate": 3.652654288774827e-05, + "loss": 0.9158, + "step": 1817 + }, + { + "epoch": 0.5938508023355519, + "grad_norm": 0.5157402157783508, + "learning_rate": 3.64763114050451e-05, + "loss": 1.0019, + "step": 1818 + }, + { + "epoch": 0.594177452941897, + "grad_norm": 0.5718170404434204, + "learning_rate": 3.642609464456377e-05, + "loss": 0.9613, + "step": 1819 + }, + { + "epoch": 0.5945041035482422, + "grad_norm": 0.6026821732521057, + "learning_rate": 3.6375892660971464e-05, + "loss": 1.1645, + "step": 1820 + }, + { + "epoch": 0.5948307541545874, + "grad_norm": 0.7391718626022339, + "learning_rate": 3.6325705508919286e-05, + "loss": 1.264, + "step": 1821 + }, + { + "epoch": 0.5951574047609326, + "grad_norm": 0.936079740524292, + "learning_rate": 3.6275533243042246e-05, + "loss": 1.4989, + "step": 1822 + }, + { + "epoch": 0.5954840553672778, + "grad_norm": 1.090334177017212, + "learning_rate": 3.622537591795907e-05, + "loss": 1.3695, + "step": 1823 + }, + { + "epoch": 0.595810705973623, + "grad_norm": 1.3231812715530396, + "learning_rate": 3.61752335882723e-05, + "loss": 1.6894, + "step": 1824 + }, + { + "epoch": 0.5961373565799681, + "grad_norm": 1.463555097579956, + "learning_rate": 3.61251063085681e-05, + "loss": 1.6237, + "step": 1825 + }, + { + "epoch": 0.5964640071863133, + "grad_norm": 0.21345098316669464, + "learning_rate": 3.607499413341626e-05, + "loss": 0.7963, + "step": 1826 + }, + { + "epoch": 0.5967906577926585, + "grad_norm": 0.2452787309885025, + "learning_rate": 3.602489711737015e-05, + "loss": 0.8304, + "step": 1827 + }, + { + "epoch": 0.5971173083990037, + "grad_norm": 0.2656680941581726, + "learning_rate": 3.59748153149666e-05, + "loss": 0.8257, + "step": 1828 + }, + { + "epoch": 0.5974439590053489, + "grad_norm": 0.25352826714515686, + "learning_rate": 3.592474878072592e-05, + "loss": 0.7629, + "step": 1829 + }, + { + "epoch": 0.597770609611694, + "grad_norm": 0.259874165058136, + "learning_rate": 3.587469756915177e-05, + "loss": 0.7529, + "step": 1830 + }, + { + "epoch": 0.5980972602180393, + "grad_norm": 0.3154599666595459, + "learning_rate": 3.5824661734731126e-05, + "loss": 0.8974, + "step": 1831 + }, + { + "epoch": 0.5984239108243845, + "grad_norm": 0.3104557693004608, + "learning_rate": 3.577464133193425e-05, + "loss": 0.9489, + "step": 1832 + }, + { + "epoch": 0.5987505614307297, + "grad_norm": 0.29206931591033936, + "learning_rate": 3.572463641521456e-05, + "loss": 0.8588, + "step": 1833 + }, + { + "epoch": 0.5990772120370749, + "grad_norm": 0.3030562996864319, + "learning_rate": 3.5674647039008694e-05, + "loss": 0.8096, + "step": 1834 + }, + { + "epoch": 0.59940386264342, + "grad_norm": 0.353023499250412, + "learning_rate": 3.5624673257736294e-05, + "loss": 0.8584, + "step": 1835 + }, + { + "epoch": 0.5997305132497652, + "grad_norm": 0.35337188839912415, + "learning_rate": 3.557471512580004e-05, + "loss": 0.8645, + "step": 1836 + }, + { + "epoch": 0.6000571638561104, + "grad_norm": 0.3872027099132538, + "learning_rate": 3.552477269758562e-05, + "loss": 0.849, + "step": 1837 + }, + { + "epoch": 0.6003838144624556, + "grad_norm": 0.36967232823371887, + "learning_rate": 3.547484602746158e-05, + "loss": 0.8275, + "step": 1838 + }, + { + "epoch": 0.6007104650688008, + "grad_norm": 0.3914700746536255, + "learning_rate": 3.5424935169779314e-05, + "loss": 0.9307, + "step": 1839 + }, + { + "epoch": 0.601037115675146, + "grad_norm": 0.4842604994773865, + "learning_rate": 3.537504017887305e-05, + "loss": 0.9365, + "step": 1840 + }, + { + "epoch": 0.6013637662814911, + "grad_norm": 0.48530450463294983, + "learning_rate": 3.5325161109059665e-05, + "loss": 1.0595, + "step": 1841 + }, + { + "epoch": 0.6016904168878363, + "grad_norm": 0.493562787771225, + "learning_rate": 3.52752980146388e-05, + "loss": 0.9919, + "step": 1842 + }, + { + "epoch": 0.6020170674941815, + "grad_norm": 0.5630847811698914, + "learning_rate": 3.522545094989262e-05, + "loss": 1.1229, + "step": 1843 + }, + { + "epoch": 0.6023437181005267, + "grad_norm": 0.6644653677940369, + "learning_rate": 3.517561996908587e-05, + "loss": 1.1268, + "step": 1844 + }, + { + "epoch": 0.602670368706872, + "grad_norm": 0.7322569489479065, + "learning_rate": 3.5125805126465795e-05, + "loss": 1.3862, + "step": 1845 + }, + { + "epoch": 0.6029970193132171, + "grad_norm": 0.8114511966705322, + "learning_rate": 3.5076006476262066e-05, + "loss": 1.0942, + "step": 1846 + }, + { + "epoch": 0.6033236699195623, + "grad_norm": 1.0138306617736816, + "learning_rate": 3.502622407268673e-05, + "loss": 1.1355, + "step": 1847 + }, + { + "epoch": 0.6036503205259075, + "grad_norm": 1.3908495903015137, + "learning_rate": 3.497645796993414e-05, + "loss": 1.4335, + "step": 1848 + }, + { + "epoch": 0.6039769711322527, + "grad_norm": 1.703803539276123, + "learning_rate": 3.492670822218089e-05, + "loss": 1.9552, + "step": 1849 + }, + { + "epoch": 0.6043036217385979, + "grad_norm": 1.7327630519866943, + "learning_rate": 3.4876974883585814e-05, + "loss": 1.5885, + "step": 1850 + }, + { + "epoch": 0.604630272344943, + "grad_norm": 0.20497965812683105, + "learning_rate": 3.482725800828984e-05, + "loss": 0.6881, + "step": 1851 + }, + { + "epoch": 0.6049569229512882, + "grad_norm": 0.24482519924640656, + "learning_rate": 3.477755765041596e-05, + "loss": 0.7687, + "step": 1852 + }, + { + "epoch": 0.6052835735576334, + "grad_norm": 0.26200076937675476, + "learning_rate": 3.472787386406926e-05, + "loss": 0.8595, + "step": 1853 + }, + { + "epoch": 0.6056102241639786, + "grad_norm": 0.2740213871002197, + "learning_rate": 3.467820670333671e-05, + "loss": 0.8254, + "step": 1854 + }, + { + "epoch": 0.6059368747703238, + "grad_norm": 0.27905282378196716, + "learning_rate": 3.462855622228723e-05, + "loss": 0.8618, + "step": 1855 + }, + { + "epoch": 0.6062635253766689, + "grad_norm": 0.2976115345954895, + "learning_rate": 3.4578922474971545e-05, + "loss": 0.8119, + "step": 1856 + }, + { + "epoch": 0.6065901759830141, + "grad_norm": 0.3185284435749054, + "learning_rate": 3.452930551542216e-05, + "loss": 0.8803, + "step": 1857 + }, + { + "epoch": 0.6069168265893593, + "grad_norm": 0.29103270173072815, + "learning_rate": 3.447970539765338e-05, + "loss": 0.7803, + "step": 1858 + }, + { + "epoch": 0.6072434771957046, + "grad_norm": 0.34088513255119324, + "learning_rate": 3.4430122175661055e-05, + "loss": 0.8361, + "step": 1859 + }, + { + "epoch": 0.6075701278020498, + "grad_norm": 0.33174625039100647, + "learning_rate": 3.438055590342276e-05, + "loss": 0.8778, + "step": 1860 + }, + { + "epoch": 0.607896778408395, + "grad_norm": 0.3289026916027069, + "learning_rate": 3.4331006634897534e-05, + "loss": 0.8508, + "step": 1861 + }, + { + "epoch": 0.6082234290147401, + "grad_norm": 0.3580840229988098, + "learning_rate": 3.428147442402595e-05, + "loss": 0.9233, + "step": 1862 + }, + { + "epoch": 0.6085500796210853, + "grad_norm": 0.38521304726600647, + "learning_rate": 3.423195932472999e-05, + "loss": 0.8952, + "step": 1863 + }, + { + "epoch": 0.6088767302274305, + "grad_norm": 0.40166860818862915, + "learning_rate": 3.4182461390913024e-05, + "loss": 0.9741, + "step": 1864 + }, + { + "epoch": 0.6092033808337757, + "grad_norm": 0.4904230535030365, + "learning_rate": 3.41329806764597e-05, + "loss": 1.0898, + "step": 1865 + }, + { + "epoch": 0.6095300314401209, + "grad_norm": 0.47519782185554504, + "learning_rate": 3.408351723523599e-05, + "loss": 1.0139, + "step": 1866 + }, + { + "epoch": 0.609856682046466, + "grad_norm": 0.4924708902835846, + "learning_rate": 3.403407112108898e-05, + "loss": 0.9999, + "step": 1867 + }, + { + "epoch": 0.6101833326528112, + "grad_norm": 0.5966106057167053, + "learning_rate": 3.398464238784696e-05, + "loss": 1.1533, + "step": 1868 + }, + { + "epoch": 0.6105099832591564, + "grad_norm": 0.6565825939178467, + "learning_rate": 3.393523108931928e-05, + "loss": 1.0461, + "step": 1869 + }, + { + "epoch": 0.6108366338655016, + "grad_norm": 0.7097316980361938, + "learning_rate": 3.388583727929625e-05, + "loss": 1.0233, + "step": 1870 + }, + { + "epoch": 0.6111632844718468, + "grad_norm": 0.814643144607544, + "learning_rate": 3.383646101154924e-05, + "loss": 1.226, + "step": 1871 + }, + { + "epoch": 0.6114899350781919, + "grad_norm": 1.0871217250823975, + "learning_rate": 3.3787102339830444e-05, + "loss": 1.2491, + "step": 1872 + }, + { + "epoch": 0.6118165856845372, + "grad_norm": 1.5111687183380127, + "learning_rate": 3.373776131787298e-05, + "loss": 1.7033, + "step": 1873 + }, + { + "epoch": 0.6121432362908824, + "grad_norm": 1.5856449604034424, + "learning_rate": 3.368843799939066e-05, + "loss": 1.7526, + "step": 1874 + }, + { + "epoch": 0.6124698868972276, + "grad_norm": 1.6733304262161255, + "learning_rate": 3.363913243807808e-05, + "loss": 1.9496, + "step": 1875 + }, + { + "epoch": 0.6127965375035728, + "grad_norm": 0.21695645153522491, + "learning_rate": 3.3589844687610506e-05, + "loss": 0.764, + "step": 1876 + }, + { + "epoch": 0.613123188109918, + "grad_norm": 0.267497181892395, + "learning_rate": 3.3540574801643776e-05, + "loss": 0.9128, + "step": 1877 + }, + { + "epoch": 0.6134498387162631, + "grad_norm": 0.26708516478538513, + "learning_rate": 3.349132283381433e-05, + "loss": 0.9034, + "step": 1878 + }, + { + "epoch": 0.6137764893226083, + "grad_norm": 0.27384889125823975, + "learning_rate": 3.3442088837739075e-05, + "loss": 0.8362, + "step": 1879 + }, + { + "epoch": 0.6141031399289535, + "grad_norm": 0.29018422961235046, + "learning_rate": 3.339287286701534e-05, + "loss": 0.8634, + "step": 1880 + }, + { + "epoch": 0.6144297905352987, + "grad_norm": 0.29773256182670593, + "learning_rate": 3.3343674975220867e-05, + "loss": 0.8958, + "step": 1881 + }, + { + "epoch": 0.6147564411416439, + "grad_norm": 0.30387818813323975, + "learning_rate": 3.32944952159137e-05, + "loss": 0.8797, + "step": 1882 + }, + { + "epoch": 0.615083091747989, + "grad_norm": 0.3108673691749573, + "learning_rate": 3.324533364263211e-05, + "loss": 0.7999, + "step": 1883 + }, + { + "epoch": 0.6154097423543342, + "grad_norm": 0.32888373732566833, + "learning_rate": 3.319619030889464e-05, + "loss": 0.8075, + "step": 1884 + }, + { + "epoch": 0.6157363929606794, + "grad_norm": 0.3333348333835602, + "learning_rate": 3.314706526819991e-05, + "loss": 0.8369, + "step": 1885 + }, + { + "epoch": 0.6160630435670246, + "grad_norm": 0.35576188564300537, + "learning_rate": 3.309795857402667e-05, + "loss": 0.9797, + "step": 1886 + }, + { + "epoch": 0.6163896941733699, + "grad_norm": 0.3945654034614563, + "learning_rate": 3.304887027983369e-05, + "loss": 1.0048, + "step": 1887 + }, + { + "epoch": 0.616716344779715, + "grad_norm": 0.3923867344856262, + "learning_rate": 3.299980043905966e-05, + "loss": 0.9386, + "step": 1888 + }, + { + "epoch": 0.6170429953860602, + "grad_norm": 0.3965389132499695, + "learning_rate": 3.295074910512327e-05, + "loss": 0.9106, + "step": 1889 + }, + { + "epoch": 0.6173696459924054, + "grad_norm": 0.45480525493621826, + "learning_rate": 3.2901716331422994e-05, + "loss": 1.0531, + "step": 1890 + }, + { + "epoch": 0.6176962965987506, + "grad_norm": 0.4836742579936981, + "learning_rate": 3.28527021713371e-05, + "loss": 1.1177, + "step": 1891 + }, + { + "epoch": 0.6180229472050958, + "grad_norm": 0.5407842397689819, + "learning_rate": 3.280370667822367e-05, + "loss": 1.1417, + "step": 1892 + }, + { + "epoch": 0.6183495978114409, + "grad_norm": 0.6356526017189026, + "learning_rate": 3.275472990542034e-05, + "loss": 1.0241, + "step": 1893 + }, + { + "epoch": 0.6186762484177861, + "grad_norm": 0.7225267291069031, + "learning_rate": 3.2705771906244496e-05, + "loss": 1.1239, + "step": 1894 + }, + { + "epoch": 0.6190028990241313, + "grad_norm": 0.7228987216949463, + "learning_rate": 3.2656832733993e-05, + "loss": 1.3345, + "step": 1895 + }, + { + "epoch": 0.6193295496304765, + "grad_norm": 0.7399430871009827, + "learning_rate": 3.260791244194225e-05, + "loss": 1.0652, + "step": 1896 + }, + { + "epoch": 0.6196562002368217, + "grad_norm": 1.0633045434951782, + "learning_rate": 3.255901108334807e-05, + "loss": 1.2763, + "step": 1897 + }, + { + "epoch": 0.6199828508431668, + "grad_norm": 1.297610878944397, + "learning_rate": 3.251012871144571e-05, + "loss": 1.2444, + "step": 1898 + }, + { + "epoch": 0.620309501449512, + "grad_norm": 1.577770471572876, + "learning_rate": 3.2461265379449725e-05, + "loss": 1.6499, + "step": 1899 + }, + { + "epoch": 0.6206361520558572, + "grad_norm": 1.752236247062683, + "learning_rate": 3.241242114055394e-05, + "loss": 1.7408, + "step": 1900 + }, + { + "epoch": 0.6209628026622025, + "grad_norm": 0.22202466428279877, + "learning_rate": 3.236359604793139e-05, + "loss": 0.7555, + "step": 1901 + }, + { + "epoch": 0.6212894532685477, + "grad_norm": 0.24626395106315613, + "learning_rate": 3.231479015473432e-05, + "loss": 0.817, + "step": 1902 + }, + { + "epoch": 0.6216161038748929, + "grad_norm": 0.2575581967830658, + "learning_rate": 3.226600351409399e-05, + "loss": 0.856, + "step": 1903 + }, + { + "epoch": 0.621942754481238, + "grad_norm": 0.29142552614212036, + "learning_rate": 3.221723617912075e-05, + "loss": 0.8771, + "step": 1904 + }, + { + "epoch": 0.6222694050875832, + "grad_norm": 0.27642571926116943, + "learning_rate": 3.2168488202903947e-05, + "loss": 0.8312, + "step": 1905 + }, + { + "epoch": 0.6225960556939284, + "grad_norm": 0.3057843744754791, + "learning_rate": 3.211975963851179e-05, + "loss": 0.9538, + "step": 1906 + }, + { + "epoch": 0.6229227063002736, + "grad_norm": 0.3055606186389923, + "learning_rate": 3.207105053899147e-05, + "loss": 0.8128, + "step": 1907 + }, + { + "epoch": 0.6232493569066188, + "grad_norm": 0.3037729561328888, + "learning_rate": 3.202236095736886e-05, + "loss": 0.8161, + "step": 1908 + }, + { + "epoch": 0.6235760075129639, + "grad_norm": 0.32354530692100525, + "learning_rate": 3.197369094664865e-05, + "loss": 0.8633, + "step": 1909 + }, + { + "epoch": 0.6239026581193091, + "grad_norm": 0.34619325399398804, + "learning_rate": 3.192504055981423e-05, + "loss": 0.8472, + "step": 1910 + }, + { + "epoch": 0.6242293087256543, + "grad_norm": 0.3544459939002991, + "learning_rate": 3.187640984982761e-05, + "loss": 0.8922, + "step": 1911 + }, + { + "epoch": 0.6245559593319995, + "grad_norm": 0.36598286032676697, + "learning_rate": 3.1827798869629396e-05, + "loss": 0.8475, + "step": 1912 + }, + { + "epoch": 0.6248826099383447, + "grad_norm": 0.4017038643360138, + "learning_rate": 3.177920767213868e-05, + "loss": 0.8745, + "step": 1913 + }, + { + "epoch": 0.6252092605446898, + "grad_norm": 0.4290722608566284, + "learning_rate": 3.1730636310253047e-05, + "loss": 0.975, + "step": 1914 + }, + { + "epoch": 0.6255359111510351, + "grad_norm": 0.44876834750175476, + "learning_rate": 3.1682084836848497e-05, + "loss": 0.9702, + "step": 1915 + }, + { + "epoch": 0.6258625617573803, + "grad_norm": 0.48596665263175964, + "learning_rate": 3.1633553304779354e-05, + "loss": 1.0497, + "step": 1916 + }, + { + "epoch": 0.6261892123637255, + "grad_norm": 0.49547937512397766, + "learning_rate": 3.158504176687824e-05, + "loss": 1.0011, + "step": 1917 + }, + { + "epoch": 0.6265158629700707, + "grad_norm": 0.5421122908592224, + "learning_rate": 3.153655027595603e-05, + "loss": 0.9569, + "step": 1918 + }, + { + "epoch": 0.6268425135764158, + "grad_norm": 0.6027096509933472, + "learning_rate": 3.1488078884801734e-05, + "loss": 1.2515, + "step": 1919 + }, + { + "epoch": 0.627169164182761, + "grad_norm": 0.7056169509887695, + "learning_rate": 3.143962764618254e-05, + "loss": 1.1874, + "step": 1920 + }, + { + "epoch": 0.6274958147891062, + "grad_norm": 0.791888415813446, + "learning_rate": 3.1391196612843646e-05, + "loss": 1.2552, + "step": 1921 + }, + { + "epoch": 0.6278224653954514, + "grad_norm": 0.998956024646759, + "learning_rate": 3.134278583750826e-05, + "loss": 1.3186, + "step": 1922 + }, + { + "epoch": 0.6281491160017966, + "grad_norm": 1.0044286251068115, + "learning_rate": 3.129439537287758e-05, + "loss": 1.3368, + "step": 1923 + }, + { + "epoch": 0.6284757666081418, + "grad_norm": 1.431918740272522, + "learning_rate": 3.124602527163062e-05, + "loss": 1.6185, + "step": 1924 + }, + { + "epoch": 0.6288024172144869, + "grad_norm": 1.8370330333709717, + "learning_rate": 3.119767558642432e-05, + "loss": 1.7886, + "step": 1925 + }, + { + "epoch": 0.6291290678208321, + "grad_norm": 0.2085886001586914, + "learning_rate": 3.1149346369893314e-05, + "loss": 0.6986, + "step": 1926 + }, + { + "epoch": 0.6294557184271773, + "grad_norm": 0.23808997869491577, + "learning_rate": 3.1101037674649966e-05, + "loss": 0.8573, + "step": 1927 + }, + { + "epoch": 0.6297823690335225, + "grad_norm": 0.24077343940734863, + "learning_rate": 3.1052749553284356e-05, + "loss": 0.7755, + "step": 1928 + }, + { + "epoch": 0.6301090196398677, + "grad_norm": 0.26138561964035034, + "learning_rate": 3.100448205836408e-05, + "loss": 0.8459, + "step": 1929 + }, + { + "epoch": 0.6304356702462129, + "grad_norm": 0.25806763768196106, + "learning_rate": 3.095623524243437e-05, + "loss": 0.8026, + "step": 1930 + }, + { + "epoch": 0.6307623208525581, + "grad_norm": 0.2821289598941803, + "learning_rate": 3.0908009158017893e-05, + "loss": 0.8433, + "step": 1931 + }, + { + "epoch": 0.6310889714589033, + "grad_norm": 0.2813338339328766, + "learning_rate": 3.0859803857614734e-05, + "loss": 0.8486, + "step": 1932 + }, + { + "epoch": 0.6314156220652485, + "grad_norm": 0.29995447397232056, + "learning_rate": 3.08116193937024e-05, + "loss": 0.8243, + "step": 1933 + }, + { + "epoch": 0.6317422726715937, + "grad_norm": 0.31056755781173706, + "learning_rate": 3.076345581873568e-05, + "loss": 0.8101, + "step": 1934 + }, + { + "epoch": 0.6320689232779388, + "grad_norm": 0.31584444642066956, + "learning_rate": 3.071531318514661e-05, + "loss": 0.8359, + "step": 1935 + }, + { + "epoch": 0.632395573884284, + "grad_norm": 0.33051276206970215, + "learning_rate": 3.066719154534451e-05, + "loss": 0.8212, + "step": 1936 + }, + { + "epoch": 0.6327222244906292, + "grad_norm": 0.3518679141998291, + "learning_rate": 3.061909095171572e-05, + "loss": 1.0019, + "step": 1937 + }, + { + "epoch": 0.6330488750969744, + "grad_norm": 0.3903535008430481, + "learning_rate": 3.0571011456623786e-05, + "loss": 0.9155, + "step": 1938 + }, + { + "epoch": 0.6333755257033196, + "grad_norm": 0.4148496091365814, + "learning_rate": 3.0522953112409224e-05, + "loss": 1.0204, + "step": 1939 + }, + { + "epoch": 0.6337021763096647, + "grad_norm": 0.4300936460494995, + "learning_rate": 3.0474915971389506e-05, + "loss": 0.9795, + "step": 1940 + }, + { + "epoch": 0.6340288269160099, + "grad_norm": 0.5103878974914551, + "learning_rate": 3.042690008585909e-05, + "loss": 1.0279, + "step": 1941 + }, + { + "epoch": 0.6343554775223551, + "grad_norm": 0.46250119805336, + "learning_rate": 3.037890550808923e-05, + "loss": 0.982, + "step": 1942 + }, + { + "epoch": 0.6346821281287003, + "grad_norm": 0.5167394876480103, + "learning_rate": 3.033093229032803e-05, + "loss": 0.9676, + "step": 1943 + }, + { + "epoch": 0.6350087787350456, + "grad_norm": 0.5678005218505859, + "learning_rate": 3.028298048480031e-05, + "loss": 0.9864, + "step": 1944 + }, + { + "epoch": 0.6353354293413908, + "grad_norm": 0.6823439002037048, + "learning_rate": 3.0235050143707577e-05, + "loss": 1.2096, + "step": 1945 + }, + { + "epoch": 0.6356620799477359, + "grad_norm": 0.8533816933631897, + "learning_rate": 3.018714131922802e-05, + "loss": 1.2273, + "step": 1946 + }, + { + "epoch": 0.6359887305540811, + "grad_norm": 0.8832264542579651, + "learning_rate": 3.0139254063516343e-05, + "loss": 1.2583, + "step": 1947 + }, + { + "epoch": 0.6363153811604263, + "grad_norm": 1.1121357679367065, + "learning_rate": 3.0091388428703793e-05, + "loss": 1.4092, + "step": 1948 + }, + { + "epoch": 0.6366420317667715, + "grad_norm": 1.4677468538284302, + "learning_rate": 3.0043544466898115e-05, + "loss": 1.9588, + "step": 1949 + }, + { + "epoch": 0.6369686823731167, + "grad_norm": 1.5762861967086792, + "learning_rate": 2.9995722230183387e-05, + "loss": 1.1872, + "step": 1950 + }, + { + "epoch": 0.6372953329794618, + "grad_norm": 0.217019721865654, + "learning_rate": 2.9947921770620126e-05, + "loss": 0.7654, + "step": 1951 + }, + { + "epoch": 0.637621983585807, + "grad_norm": 0.23723232746124268, + "learning_rate": 2.9900143140245067e-05, + "loss": 0.8003, + "step": 1952 + }, + { + "epoch": 0.6379486341921522, + "grad_norm": 0.26588693261146545, + "learning_rate": 2.9852386391071192e-05, + "loss": 0.856, + "step": 1953 + }, + { + "epoch": 0.6382752847984974, + "grad_norm": 0.25790339708328247, + "learning_rate": 2.9804651575087716e-05, + "loss": 0.8491, + "step": 1954 + }, + { + "epoch": 0.6386019354048426, + "grad_norm": 0.2517349421977997, + "learning_rate": 2.97569387442599e-05, + "loss": 0.8128, + "step": 1955 + }, + { + "epoch": 0.6389285860111877, + "grad_norm": 0.27819013595581055, + "learning_rate": 2.9709247950529157e-05, + "loss": 0.8499, + "step": 1956 + }, + { + "epoch": 0.6392552366175329, + "grad_norm": 0.30752265453338623, + "learning_rate": 2.9661579245812844e-05, + "loss": 0.8082, + "step": 1957 + }, + { + "epoch": 0.6395818872238782, + "grad_norm": 0.31690144538879395, + "learning_rate": 2.961393268200426e-05, + "loss": 0.8634, + "step": 1958 + }, + { + "epoch": 0.6399085378302234, + "grad_norm": 0.33005377650260925, + "learning_rate": 2.956630831097268e-05, + "loss": 0.9546, + "step": 1959 + }, + { + "epoch": 0.6402351884365686, + "grad_norm": 0.3165203630924225, + "learning_rate": 2.9518706184563162e-05, + "loss": 0.8628, + "step": 1960 + }, + { + "epoch": 0.6405618390429137, + "grad_norm": 0.3594956398010254, + "learning_rate": 2.9471126354596517e-05, + "loss": 0.8658, + "step": 1961 + }, + { + "epoch": 0.6408884896492589, + "grad_norm": 0.3517896831035614, + "learning_rate": 2.9423568872869383e-05, + "loss": 0.9052, + "step": 1962 + }, + { + "epoch": 0.6412151402556041, + "grad_norm": 0.42132368683815, + "learning_rate": 2.9376033791153962e-05, + "loss": 0.8795, + "step": 1963 + }, + { + "epoch": 0.6415417908619493, + "grad_norm": 0.41029414534568787, + "learning_rate": 2.9328521161198152e-05, + "loss": 0.967, + "step": 1964 + }, + { + "epoch": 0.6418684414682945, + "grad_norm": 0.444322794675827, + "learning_rate": 2.9281031034725377e-05, + "loss": 0.9924, + "step": 1965 + }, + { + "epoch": 0.6421950920746397, + "grad_norm": 0.5007185339927673, + "learning_rate": 2.9233563463434534e-05, + "loss": 1.0496, + "step": 1966 + }, + { + "epoch": 0.6425217426809848, + "grad_norm": 0.5201416015625, + "learning_rate": 2.9186118499000026e-05, + "loss": 0.9913, + "step": 1967 + }, + { + "epoch": 0.64284839328733, + "grad_norm": 0.5279054641723633, + "learning_rate": 2.913869619307159e-05, + "loss": 0.9855, + "step": 1968 + }, + { + "epoch": 0.6431750438936752, + "grad_norm": 0.603148341178894, + "learning_rate": 2.9091296597274342e-05, + "loss": 1.0872, + "step": 1969 + }, + { + "epoch": 0.6435016945000204, + "grad_norm": 0.7442902326583862, + "learning_rate": 2.904391976320864e-05, + "loss": 1.2015, + "step": 1970 + }, + { + "epoch": 0.6438283451063656, + "grad_norm": 0.9018768668174744, + "learning_rate": 2.8996565742450078e-05, + "loss": 1.2661, + "step": 1971 + }, + { + "epoch": 0.6441549957127108, + "grad_norm": 1.0673716068267822, + "learning_rate": 2.8949234586549418e-05, + "loss": 1.5745, + "step": 1972 + }, + { + "epoch": 0.644481646319056, + "grad_norm": 1.511343002319336, + "learning_rate": 2.890192634703255e-05, + "loss": 1.7574, + "step": 1973 + }, + { + "epoch": 0.6448082969254012, + "grad_norm": 1.3675079345703125, + "learning_rate": 2.8854641075400324e-05, + "loss": 1.4665, + "step": 1974 + }, + { + "epoch": 0.6451349475317464, + "grad_norm": 1.6643435955047607, + "learning_rate": 2.8807378823128744e-05, + "loss": 1.9878, + "step": 1975 + }, + { + "epoch": 0.6454615981380916, + "grad_norm": 0.23247745633125305, + "learning_rate": 2.8760139641668582e-05, + "loss": 0.7989, + "step": 1976 + }, + { + "epoch": 0.6457882487444367, + "grad_norm": 0.2454388439655304, + "learning_rate": 2.8712923582445662e-05, + "loss": 0.7655, + "step": 1977 + }, + { + "epoch": 0.6461148993507819, + "grad_norm": 0.257198691368103, + "learning_rate": 2.866573069686049e-05, + "loss": 0.8634, + "step": 1978 + }, + { + "epoch": 0.6464415499571271, + "grad_norm": 0.2427079677581787, + "learning_rate": 2.8618561036288437e-05, + "loss": 0.7737, + "step": 1979 + }, + { + "epoch": 0.6467682005634723, + "grad_norm": 0.2693164348602295, + "learning_rate": 2.8571414652079553e-05, + "loss": 0.8145, + "step": 1980 + }, + { + "epoch": 0.6470948511698175, + "grad_norm": 0.2872451841831207, + "learning_rate": 2.852429159555856e-05, + "loss": 0.8418, + "step": 1981 + }, + { + "epoch": 0.6474215017761626, + "grad_norm": 0.2881939709186554, + "learning_rate": 2.8477191918024782e-05, + "loss": 0.9102, + "step": 1982 + }, + { + "epoch": 0.6477481523825078, + "grad_norm": 0.2885768711566925, + "learning_rate": 2.8430115670752118e-05, + "loss": 0.8406, + "step": 1983 + }, + { + "epoch": 0.648074802988853, + "grad_norm": 0.3151211738586426, + "learning_rate": 2.8383062904988865e-05, + "loss": 0.8316, + "step": 1984 + }, + { + "epoch": 0.6484014535951982, + "grad_norm": 0.3474157452583313, + "learning_rate": 2.8336033671957916e-05, + "loss": 0.8728, + "step": 1985 + }, + { + "epoch": 0.6487281042015435, + "grad_norm": 0.3520103096961975, + "learning_rate": 2.828902802285639e-05, + "loss": 0.8453, + "step": 1986 + }, + { + "epoch": 0.6490547548078887, + "grad_norm": 0.38095179200172424, + "learning_rate": 2.824204600885583e-05, + "loss": 1.0082, + "step": 1987 + }, + { + "epoch": 0.6493814054142338, + "grad_norm": 0.4117640554904938, + "learning_rate": 2.8195087681102e-05, + "loss": 0.9478, + "step": 1988 + }, + { + "epoch": 0.649708056020579, + "grad_norm": 0.41134095191955566, + "learning_rate": 2.8148153090714912e-05, + "loss": 1.0398, + "step": 1989 + }, + { + "epoch": 0.6500347066269242, + "grad_norm": 0.47175323963165283, + "learning_rate": 2.810124228878872e-05, + "loss": 0.9965, + "step": 1990 + }, + { + "epoch": 0.6503613572332694, + "grad_norm": 0.5010828971862793, + "learning_rate": 2.805435532639168e-05, + "loss": 0.9616, + "step": 1991 + }, + { + "epoch": 0.6506880078396146, + "grad_norm": 0.5081592202186584, + "learning_rate": 2.8007492254566097e-05, + "loss": 0.966, + "step": 1992 + }, + { + "epoch": 0.6510146584459597, + "grad_norm": 0.5605658292770386, + "learning_rate": 2.7960653124328275e-05, + "loss": 1.1327, + "step": 1993 + }, + { + "epoch": 0.6513413090523049, + "grad_norm": 0.5658648610115051, + "learning_rate": 2.791383798666844e-05, + "loss": 0.9966, + "step": 1994 + }, + { + "epoch": 0.6516679596586501, + "grad_norm": 0.6892904043197632, + "learning_rate": 2.7867046892550718e-05, + "loss": 1.2015, + "step": 1995 + }, + { + "epoch": 0.6519946102649953, + "grad_norm": 0.7385665774345398, + "learning_rate": 2.782027989291307e-05, + "loss": 1.3089, + "step": 1996 + }, + { + "epoch": 0.6523212608713405, + "grad_norm": 0.9993577003479004, + "learning_rate": 2.777353703866714e-05, + "loss": 1.3718, + "step": 1997 + }, + { + "epoch": 0.6526479114776856, + "grad_norm": 1.1310564279556274, + "learning_rate": 2.772681838069845e-05, + "loss": 1.5095, + "step": 1998 + }, + { + "epoch": 0.6529745620840308, + "grad_norm": 1.478954553604126, + "learning_rate": 2.7680123969865988e-05, + "loss": 1.7489, + "step": 1999 + }, + { + "epoch": 0.6533012126903761, + "grad_norm": 1.603333592414856, + "learning_rate": 2.7633453857002546e-05, + "loss": 2.1632, + "step": 2000 + }, + { + "epoch": 0.6536278632967213, + "grad_norm": 0.206992968916893, + "learning_rate": 2.7586808092914273e-05, + "loss": 0.7184, + "step": 2001 + }, + { + "epoch": 0.6539545139030665, + "grad_norm": 0.24995890259742737, + "learning_rate": 2.7540186728380958e-05, + "loss": 0.8657, + "step": 2002 + }, + { + "epoch": 0.6542811645094117, + "grad_norm": 0.2497495859861374, + "learning_rate": 2.749358981415574e-05, + "loss": 0.7396, + "step": 2003 + }, + { + "epoch": 0.6546078151157568, + "grad_norm": 0.2611045837402344, + "learning_rate": 2.744701740096518e-05, + "loss": 0.861, + "step": 2004 + }, + { + "epoch": 0.654934465722102, + "grad_norm": 0.279967725276947, + "learning_rate": 2.7400469539509177e-05, + "loss": 0.856, + "step": 2005 + }, + { + "epoch": 0.6552611163284472, + "grad_norm": 0.29646557569503784, + "learning_rate": 2.7353946280460857e-05, + "loss": 0.7972, + "step": 2006 + }, + { + "epoch": 0.6555877669347924, + "grad_norm": 0.2923833131790161, + "learning_rate": 2.7307447674466607e-05, + "loss": 0.8352, + "step": 2007 + }, + { + "epoch": 0.6559144175411376, + "grad_norm": 0.3097902238368988, + "learning_rate": 2.726097377214596e-05, + "loss": 0.7813, + "step": 2008 + }, + { + "epoch": 0.6562410681474827, + "grad_norm": 0.33259230852127075, + "learning_rate": 2.7214524624091566e-05, + "loss": 0.8932, + "step": 2009 + }, + { + "epoch": 0.6565677187538279, + "grad_norm": 0.3300251066684723, + "learning_rate": 2.716810028086906e-05, + "loss": 0.8678, + "step": 2010 + }, + { + "epoch": 0.6568943693601731, + "grad_norm": 0.34975963830947876, + "learning_rate": 2.7121700793017214e-05, + "loss": 0.8711, + "step": 2011 + }, + { + "epoch": 0.6572210199665183, + "grad_norm": 0.3697607219219208, + "learning_rate": 2.707532621104757e-05, + "loss": 0.9565, + "step": 2012 + }, + { + "epoch": 0.6575476705728635, + "grad_norm": 0.41733282804489136, + "learning_rate": 2.702897658544473e-05, + "loss": 0.8783, + "step": 2013 + }, + { + "epoch": 0.6578743211792087, + "grad_norm": 0.4162128269672394, + "learning_rate": 2.6982651966665984e-05, + "loss": 1.0363, + "step": 2014 + }, + { + "epoch": 0.6582009717855539, + "grad_norm": 0.4725761115550995, + "learning_rate": 2.6936352405141473e-05, + "loss": 0.9354, + "step": 2015 + }, + { + "epoch": 0.6585276223918991, + "grad_norm": 0.46689558029174805, + "learning_rate": 2.6890077951274052e-05, + "loss": 1.0844, + "step": 2016 + }, + { + "epoch": 0.6588542729982443, + "grad_norm": 0.526522159576416, + "learning_rate": 2.684382865543924e-05, + "loss": 1.0829, + "step": 2017 + }, + { + "epoch": 0.6591809236045895, + "grad_norm": 0.5232835412025452, + "learning_rate": 2.6797604567985126e-05, + "loss": 0.9937, + "step": 2018 + }, + { + "epoch": 0.6595075742109346, + "grad_norm": 0.5499789118766785, + "learning_rate": 2.675140573923246e-05, + "loss": 0.9148, + "step": 2019 + }, + { + "epoch": 0.6598342248172798, + "grad_norm": 0.6198474764823914, + "learning_rate": 2.6705232219474352e-05, + "loss": 1.1502, + "step": 2020 + }, + { + "epoch": 0.660160875423625, + "grad_norm": 0.7691889405250549, + "learning_rate": 2.6659084058976512e-05, + "loss": 1.2477, + "step": 2021 + }, + { + "epoch": 0.6604875260299702, + "grad_norm": 1.0155854225158691, + "learning_rate": 2.6612961307976925e-05, + "loss": 1.6555, + "step": 2022 + }, + { + "epoch": 0.6608141766363154, + "grad_norm": 1.1848665475845337, + "learning_rate": 2.656686401668596e-05, + "loss": 1.6532, + "step": 2023 + }, + { + "epoch": 0.6611408272426605, + "grad_norm": 1.3584601879119873, + "learning_rate": 2.6520792235286278e-05, + "loss": 1.418, + "step": 2024 + }, + { + "epoch": 0.6614674778490057, + "grad_norm": 1.5498560667037964, + "learning_rate": 2.6474746013932762e-05, + "loss": 1.7049, + "step": 2025 + }, + { + "epoch": 0.6617941284553509, + "grad_norm": 0.20128127932548523, + "learning_rate": 2.642872540275246e-05, + "loss": 0.7196, + "step": 2026 + }, + { + "epoch": 0.6621207790616961, + "grad_norm": 0.2390238493680954, + "learning_rate": 2.638273045184455e-05, + "loss": 0.8654, + "step": 2027 + }, + { + "epoch": 0.6624474296680414, + "grad_norm": 0.2611159384250641, + "learning_rate": 2.633676121128027e-05, + "loss": 0.9451, + "step": 2028 + }, + { + "epoch": 0.6627740802743866, + "grad_norm": 0.2779772877693176, + "learning_rate": 2.629081773110288e-05, + "loss": 0.8268, + "step": 2029 + }, + { + "epoch": 0.6631007308807317, + "grad_norm": 0.27132806181907654, + "learning_rate": 2.6244900061327594e-05, + "loss": 0.9116, + "step": 2030 + }, + { + "epoch": 0.6634273814870769, + "grad_norm": 0.2833230197429657, + "learning_rate": 2.6199008251941483e-05, + "loss": 0.9296, + "step": 2031 + }, + { + "epoch": 0.6637540320934221, + "grad_norm": 0.281867653131485, + "learning_rate": 2.615314235290357e-05, + "loss": 0.8266, + "step": 2032 + }, + { + "epoch": 0.6640806826997673, + "grad_norm": 0.30270928144454956, + "learning_rate": 2.610730241414453e-05, + "loss": 0.8456, + "step": 2033 + }, + { + "epoch": 0.6644073333061125, + "grad_norm": 0.3150334060192108, + "learning_rate": 2.6061488485566944e-05, + "loss": 0.8777, + "step": 2034 + }, + { + "epoch": 0.6647339839124576, + "grad_norm": 0.3402860462665558, + "learning_rate": 2.6015700617044914e-05, + "loss": 0.9275, + "step": 2035 + }, + { + "epoch": 0.6650606345188028, + "grad_norm": 0.359759658575058, + "learning_rate": 2.596993885842427e-05, + "loss": 0.8404, + "step": 2036 + }, + { + "epoch": 0.665387285125148, + "grad_norm": 0.3418113887310028, + "learning_rate": 2.5924203259522394e-05, + "loss": 0.8434, + "step": 2037 + }, + { + "epoch": 0.6657139357314932, + "grad_norm": 0.38156867027282715, + "learning_rate": 2.587849387012819e-05, + "loss": 0.8724, + "step": 2038 + }, + { + "epoch": 0.6660405863378384, + "grad_norm": 0.40769273042678833, + "learning_rate": 2.5832810740002035e-05, + "loss": 0.8907, + "step": 2039 + }, + { + "epoch": 0.6663672369441835, + "grad_norm": 0.446984201669693, + "learning_rate": 2.5787153918875708e-05, + "loss": 1.0177, + "step": 2040 + }, + { + "epoch": 0.6666938875505287, + "grad_norm": 0.47096148133277893, + "learning_rate": 2.5741523456452354e-05, + "loss": 1.0929, + "step": 2041 + }, + { + "epoch": 0.6670205381568739, + "grad_norm": 0.4894920587539673, + "learning_rate": 2.569591940240642e-05, + "loss": 0.9955, + "step": 2042 + }, + { + "epoch": 0.6673471887632192, + "grad_norm": 0.47973906993865967, + "learning_rate": 2.565034180638364e-05, + "loss": 0.9822, + "step": 2043 + }, + { + "epoch": 0.6676738393695644, + "grad_norm": 0.5622726082801819, + "learning_rate": 2.5604790718000836e-05, + "loss": 1.0153, + "step": 2044 + }, + { + "epoch": 0.6680004899759096, + "grad_norm": 0.6539722084999084, + "learning_rate": 2.5559266186846135e-05, + "loss": 1.0542, + "step": 2045 + }, + { + "epoch": 0.6683271405822547, + "grad_norm": 0.7086659073829651, + "learning_rate": 2.5513768262478592e-05, + "loss": 1.1257, + "step": 2046 + }, + { + "epoch": 0.6686537911885999, + "grad_norm": 0.8920603394508362, + "learning_rate": 2.5468296994428454e-05, + "loss": 1.3898, + "step": 2047 + }, + { + "epoch": 0.6689804417949451, + "grad_norm": 1.0379706621170044, + "learning_rate": 2.542285243219679e-05, + "loss": 1.2713, + "step": 2048 + }, + { + "epoch": 0.6693070924012903, + "grad_norm": 1.1866716146469116, + "learning_rate": 2.5377434625255715e-05, + "loss": 1.4265, + "step": 2049 + }, + { + "epoch": 0.6696337430076355, + "grad_norm": 1.54633629322052, + "learning_rate": 2.5332043623048168e-05, + "loss": 1.9121, + "step": 2050 + }, + { + "epoch": 0.6699603936139806, + "grad_norm": 0.19281473755836487, + "learning_rate": 2.5286679474987917e-05, + "loss": 0.6382, + "step": 2051 + }, + { + "epoch": 0.6702870442203258, + "grad_norm": 0.2434716820716858, + "learning_rate": 2.5241342230459498e-05, + "loss": 0.8263, + "step": 2052 + }, + { + "epoch": 0.670613694826671, + "grad_norm": 0.2635152041912079, + "learning_rate": 2.5196031938818165e-05, + "loss": 0.8594, + "step": 2053 + }, + { + "epoch": 0.6709403454330162, + "grad_norm": 0.2647515833377838, + "learning_rate": 2.5150748649389777e-05, + "loss": 0.7964, + "step": 2054 + }, + { + "epoch": 0.6712669960393614, + "grad_norm": 0.28728336095809937, + "learning_rate": 2.510549241147091e-05, + "loss": 0.8066, + "step": 2055 + }, + { + "epoch": 0.6715936466457065, + "grad_norm": 0.3012993633747101, + "learning_rate": 2.5060263274328578e-05, + "loss": 0.9016, + "step": 2056 + }, + { + "epoch": 0.6719202972520518, + "grad_norm": 0.3128151595592499, + "learning_rate": 2.5015061287200348e-05, + "loss": 0.8255, + "step": 2057 + }, + { + "epoch": 0.672246947858397, + "grad_norm": 0.3259854316711426, + "learning_rate": 2.496988649929422e-05, + "loss": 0.9146, + "step": 2058 + }, + { + "epoch": 0.6725735984647422, + "grad_norm": 0.3326560854911804, + "learning_rate": 2.492473895978859e-05, + "loss": 0.8752, + "step": 2059 + }, + { + "epoch": 0.6729002490710874, + "grad_norm": 0.35559192299842834, + "learning_rate": 2.4879618717832187e-05, + "loss": 0.9212, + "step": 2060 + }, + { + "epoch": 0.6732268996774325, + "grad_norm": 0.355741947889328, + "learning_rate": 2.4834525822544018e-05, + "loss": 0.8778, + "step": 2061 + }, + { + "epoch": 0.6735535502837777, + "grad_norm": 0.4042257070541382, + "learning_rate": 2.4789460323013325e-05, + "loss": 0.8685, + "step": 2062 + }, + { + "epoch": 0.6738802008901229, + "grad_norm": 0.44248056411743164, + "learning_rate": 2.4744422268299522e-05, + "loss": 0.9731, + "step": 2063 + }, + { + "epoch": 0.6742068514964681, + "grad_norm": 0.4689655900001526, + "learning_rate": 2.4699411707432156e-05, + "loss": 1.0723, + "step": 2064 + }, + { + "epoch": 0.6745335021028133, + "grad_norm": 0.4722408354282379, + "learning_rate": 2.465442868941083e-05, + "loss": 1.0131, + "step": 2065 + }, + { + "epoch": 0.6748601527091584, + "grad_norm": 0.5208030343055725, + "learning_rate": 2.4609473263205197e-05, + "loss": 1.0655, + "step": 2066 + }, + { + "epoch": 0.6751868033155036, + "grad_norm": 0.5427075028419495, + "learning_rate": 2.456454547775478e-05, + "loss": 1.0494, + "step": 2067 + }, + { + "epoch": 0.6755134539218488, + "grad_norm": 0.5571867227554321, + "learning_rate": 2.4519645381969153e-05, + "loss": 0.954, + "step": 2068 + }, + { + "epoch": 0.675840104528194, + "grad_norm": 0.6750152111053467, + "learning_rate": 2.447477302472762e-05, + "loss": 1.1531, + "step": 2069 + }, + { + "epoch": 0.6761667551345392, + "grad_norm": 0.7053940296173096, + "learning_rate": 2.4429928454879357e-05, + "loss": 1.2184, + "step": 2070 + }, + { + "epoch": 0.6764934057408845, + "grad_norm": 0.9125105142593384, + "learning_rate": 2.4385111721243264e-05, + "loss": 1.2626, + "step": 2071 + }, + { + "epoch": 0.6768200563472296, + "grad_norm": 0.9873994588851929, + "learning_rate": 2.4340322872607957e-05, + "loss": 1.3534, + "step": 2072 + }, + { + "epoch": 0.6771467069535748, + "grad_norm": 1.1085621118545532, + "learning_rate": 2.4295561957731678e-05, + "loss": 1.0356, + "step": 2073 + }, + { + "epoch": 0.67747335755992, + "grad_norm": 1.32113778591156, + "learning_rate": 2.4250829025342258e-05, + "loss": 1.2397, + "step": 2074 + }, + { + "epoch": 0.6778000081662652, + "grad_norm": 1.7643086910247803, + "learning_rate": 2.4206124124137087e-05, + "loss": 1.8698, + "step": 2075 + }, + { + "epoch": 0.6781266587726104, + "grad_norm": 0.22221608459949493, + "learning_rate": 2.416144730278302e-05, + "loss": 0.73, + "step": 2076 + }, + { + "epoch": 0.6784533093789555, + "grad_norm": 0.2354651689529419, + "learning_rate": 2.4116798609916347e-05, + "loss": 0.7509, + "step": 2077 + }, + { + "epoch": 0.6787799599853007, + "grad_norm": 0.26706814765930176, + "learning_rate": 2.4072178094142745e-05, + "loss": 0.8493, + "step": 2078 + }, + { + "epoch": 0.6791066105916459, + "grad_norm": 0.27699151635169983, + "learning_rate": 2.4027585804037222e-05, + "loss": 0.779, + "step": 2079 + }, + { + "epoch": 0.6794332611979911, + "grad_norm": 0.28536492586135864, + "learning_rate": 2.3983021788144e-05, + "loss": 0.8581, + "step": 2080 + }, + { + "epoch": 0.6797599118043363, + "grad_norm": 0.29376420378685, + "learning_rate": 2.3938486094976635e-05, + "loss": 0.8732, + "step": 2081 + }, + { + "epoch": 0.6800865624106814, + "grad_norm": 0.3210821747779846, + "learning_rate": 2.3893978773017716e-05, + "loss": 0.8949, + "step": 2082 + }, + { + "epoch": 0.6804132130170266, + "grad_norm": 0.3225792348384857, + "learning_rate": 2.384949987071909e-05, + "loss": 0.8813, + "step": 2083 + }, + { + "epoch": 0.6807398636233718, + "grad_norm": 0.3447169065475464, + "learning_rate": 2.380504943650152e-05, + "loss": 0.9181, + "step": 2084 + }, + { + "epoch": 0.6810665142297171, + "grad_norm": 0.3425697088241577, + "learning_rate": 2.3760627518754884e-05, + "loss": 0.8702, + "step": 2085 + }, + { + "epoch": 0.6813931648360623, + "grad_norm": 0.3333522379398346, + "learning_rate": 2.371623416583797e-05, + "loss": 0.8651, + "step": 2086 + }, + { + "epoch": 0.6817198154424075, + "grad_norm": 0.36659929156303406, + "learning_rate": 2.3671869426078502e-05, + "loss": 0.8974, + "step": 2087 + }, + { + "epoch": 0.6820464660487526, + "grad_norm": 0.3901701271533966, + "learning_rate": 2.362753334777298e-05, + "loss": 0.9199, + "step": 2088 + }, + { + "epoch": 0.6823731166550978, + "grad_norm": 0.4033861458301544, + "learning_rate": 2.3583225979186817e-05, + "loss": 0.9545, + "step": 2089 + }, + { + "epoch": 0.682699767261443, + "grad_norm": 0.4583320617675781, + "learning_rate": 2.3538947368554055e-05, + "loss": 1.1257, + "step": 2090 + }, + { + "epoch": 0.6830264178677882, + "grad_norm": 0.49666598439216614, + "learning_rate": 2.3494697564077538e-05, + "loss": 0.9947, + "step": 2091 + }, + { + "epoch": 0.6833530684741334, + "grad_norm": 0.4935648441314697, + "learning_rate": 2.3450476613928656e-05, + "loss": 0.9831, + "step": 2092 + }, + { + "epoch": 0.6836797190804785, + "grad_norm": 0.5261000394821167, + "learning_rate": 2.3406284566247444e-05, + "loss": 1.0624, + "step": 2093 + }, + { + "epoch": 0.6840063696868237, + "grad_norm": 0.598313570022583, + "learning_rate": 2.3362121469142458e-05, + "loss": 1.0826, + "step": 2094 + }, + { + "epoch": 0.6843330202931689, + "grad_norm": 0.6607427597045898, + "learning_rate": 2.3317987370690712e-05, + "loss": 1.131, + "step": 2095 + }, + { + "epoch": 0.6846596708995141, + "grad_norm": 0.8065424561500549, + "learning_rate": 2.3273882318937746e-05, + "loss": 1.0647, + "step": 2096 + }, + { + "epoch": 0.6849863215058593, + "grad_norm": 1.110991358757019, + "learning_rate": 2.3229806361897343e-05, + "loss": 1.1449, + "step": 2097 + }, + { + "epoch": 0.6853129721122044, + "grad_norm": 1.2615911960601807, + "learning_rate": 2.31857595475517e-05, + "loss": 1.7161, + "step": 2098 + }, + { + "epoch": 0.6856396227185497, + "grad_norm": 1.3655647039413452, + "learning_rate": 2.3141741923851274e-05, + "loss": 1.693, + "step": 2099 + }, + { + "epoch": 0.6859662733248949, + "grad_norm": 1.5947473049163818, + "learning_rate": 2.3097753538714756e-05, + "loss": 2.1393, + "step": 2100 + }, + { + "epoch": 0.6862929239312401, + "grad_norm": 0.19925794005393982, + "learning_rate": 2.3053794440028927e-05, + "loss": 0.6479, + "step": 2101 + }, + { + "epoch": 0.6866195745375853, + "grad_norm": 0.24255867302417755, + "learning_rate": 2.300986467564883e-05, + "loss": 0.8837, + "step": 2102 + }, + { + "epoch": 0.6869462251439304, + "grad_norm": 0.26114514470100403, + "learning_rate": 2.2965964293397412e-05, + "loss": 0.7501, + "step": 2103 + }, + { + "epoch": 0.6872728757502756, + "grad_norm": 0.2674042880535126, + "learning_rate": 2.2922093341065798e-05, + "loss": 0.8345, + "step": 2104 + }, + { + "epoch": 0.6875995263566208, + "grad_norm": 0.27049896121025085, + "learning_rate": 2.2878251866412932e-05, + "loss": 0.7943, + "step": 2105 + }, + { + "epoch": 0.687926176962966, + "grad_norm": 0.2783470153808594, + "learning_rate": 2.283443991716574e-05, + "loss": 0.8056, + "step": 2106 + }, + { + "epoch": 0.6882528275693112, + "grad_norm": 0.3203645944595337, + "learning_rate": 2.2790657541019e-05, + "loss": 0.8878, + "step": 2107 + }, + { + "epoch": 0.6885794781756563, + "grad_norm": 0.3191487789154053, + "learning_rate": 2.274690478563529e-05, + "loss": 0.8506, + "step": 2108 + }, + { + "epoch": 0.6889061287820015, + "grad_norm": 0.35952916741371155, + "learning_rate": 2.2703181698644936e-05, + "loss": 0.8941, + "step": 2109 + }, + { + "epoch": 0.6892327793883467, + "grad_norm": 0.3359813094139099, + "learning_rate": 2.2659488327645984e-05, + "loss": 0.9174, + "step": 2110 + }, + { + "epoch": 0.6895594299946919, + "grad_norm": 0.33734071254730225, + "learning_rate": 2.26158247202041e-05, + "loss": 0.9184, + "step": 2111 + }, + { + "epoch": 0.6898860806010371, + "grad_norm": 0.34754154086112976, + "learning_rate": 2.2572190923852577e-05, + "loss": 0.8007, + "step": 2112 + }, + { + "epoch": 0.6902127312073824, + "grad_norm": 0.36550602316856384, + "learning_rate": 2.2528586986092265e-05, + "loss": 0.9123, + "step": 2113 + }, + { + "epoch": 0.6905393818137275, + "grad_norm": 0.3971017599105835, + "learning_rate": 2.2485012954391428e-05, + "loss": 0.9644, + "step": 2114 + }, + { + "epoch": 0.6908660324200727, + "grad_norm": 0.42068859934806824, + "learning_rate": 2.2441468876185905e-05, + "loss": 0.8395, + "step": 2115 + }, + { + "epoch": 0.6911926830264179, + "grad_norm": 0.42162227630615234, + "learning_rate": 2.2397954798878794e-05, + "loss": 0.9714, + "step": 2116 + }, + { + "epoch": 0.6915193336327631, + "grad_norm": 0.4547198414802551, + "learning_rate": 2.235447076984067e-05, + "loss": 0.9827, + "step": 2117 + }, + { + "epoch": 0.6918459842391083, + "grad_norm": 0.5274547338485718, + "learning_rate": 2.2311016836409247e-05, + "loss": 1.0212, + "step": 2118 + }, + { + "epoch": 0.6921726348454534, + "grad_norm": 0.604054868221283, + "learning_rate": 2.226759304588959e-05, + "loss": 1.0619, + "step": 2119 + }, + { + "epoch": 0.6924992854517986, + "grad_norm": 0.6196434497833252, + "learning_rate": 2.2224199445553906e-05, + "loss": 1.127, + "step": 2120 + }, + { + "epoch": 0.6928259360581438, + "grad_norm": 0.8015429973602295, + "learning_rate": 2.2180836082641537e-05, + "loss": 1.2795, + "step": 2121 + }, + { + "epoch": 0.693152586664489, + "grad_norm": 0.9391926527023315, + "learning_rate": 2.2137503004358917e-05, + "loss": 1.1737, + "step": 2122 + }, + { + "epoch": 0.6934792372708342, + "grad_norm": 1.2050788402557373, + "learning_rate": 2.2094200257879522e-05, + "loss": 1.6547, + "step": 2123 + }, + { + "epoch": 0.6938058878771793, + "grad_norm": 1.375234842300415, + "learning_rate": 2.2050927890343737e-05, + "loss": 1.4072, + "step": 2124 + }, + { + "epoch": 0.6941325384835245, + "grad_norm": 1.6113313436508179, + "learning_rate": 2.2007685948858988e-05, + "loss": 1.9956, + "step": 2125 + }, + { + "epoch": 0.6944591890898697, + "grad_norm": 0.21915705502033234, + "learning_rate": 2.196447448049952e-05, + "loss": 0.6856, + "step": 2126 + }, + { + "epoch": 0.694785839696215, + "grad_norm": 0.2379796952009201, + "learning_rate": 2.1921293532306354e-05, + "loss": 0.7421, + "step": 2127 + }, + { + "epoch": 0.6951124903025602, + "grad_norm": 0.2671576142311096, + "learning_rate": 2.187814315128741e-05, + "loss": 0.7911, + "step": 2128 + }, + { + "epoch": 0.6954391409089054, + "grad_norm": 0.28185057640075684, + "learning_rate": 2.1835023384417185e-05, + "loss": 0.8545, + "step": 2129 + }, + { + "epoch": 0.6957657915152505, + "grad_norm": 0.28416943550109863, + "learning_rate": 2.1791934278637e-05, + "loss": 0.8578, + "step": 2130 + }, + { + "epoch": 0.6960924421215957, + "grad_norm": 0.3110760748386383, + "learning_rate": 2.1748875880854664e-05, + "loss": 0.9482, + "step": 2131 + }, + { + "epoch": 0.6964190927279409, + "grad_norm": 0.3088090121746063, + "learning_rate": 2.1705848237944632e-05, + "loss": 0.8229, + "step": 2132 + }, + { + "epoch": 0.6967457433342861, + "grad_norm": 0.32084354758262634, + "learning_rate": 2.166285139674786e-05, + "loss": 0.9405, + "step": 2133 + }, + { + "epoch": 0.6970723939406313, + "grad_norm": 0.3231033384799957, + "learning_rate": 2.161988540407177e-05, + "loss": 0.8388, + "step": 2134 + }, + { + "epoch": 0.6973990445469764, + "grad_norm": 0.34001418948173523, + "learning_rate": 2.1576950306690207e-05, + "loss": 0.8904, + "step": 2135 + }, + { + "epoch": 0.6977256951533216, + "grad_norm": 0.36986929178237915, + "learning_rate": 2.1534046151343402e-05, + "loss": 0.9376, + "step": 2136 + }, + { + "epoch": 0.6980523457596668, + "grad_norm": 0.40653765201568604, + "learning_rate": 2.1491172984737816e-05, + "loss": 0.9017, + "step": 2137 + }, + { + "epoch": 0.698378996366012, + "grad_norm": 0.3974648416042328, + "learning_rate": 2.1448330853546317e-05, + "loss": 0.8849, + "step": 2138 + }, + { + "epoch": 0.6987056469723572, + "grad_norm": 0.4531319737434387, + "learning_rate": 2.1405519804407853e-05, + "loss": 0.9345, + "step": 2139 + }, + { + "epoch": 0.6990322975787023, + "grad_norm": 0.46302297711372375, + "learning_rate": 2.1362739883927607e-05, + "loss": 1.1282, + "step": 2140 + }, + { + "epoch": 0.6993589481850475, + "grad_norm": 0.5013945698738098, + "learning_rate": 2.131999113867686e-05, + "loss": 1.0689, + "step": 2141 + }, + { + "epoch": 0.6996855987913928, + "grad_norm": 0.528487503528595, + "learning_rate": 2.1277273615192948e-05, + "loss": 0.9382, + "step": 2142 + }, + { + "epoch": 0.700012249397738, + "grad_norm": 0.572884202003479, + "learning_rate": 2.1234587359979234e-05, + "loss": 1.1927, + "step": 2143 + }, + { + "epoch": 0.7003389000040832, + "grad_norm": 0.6027082204818726, + "learning_rate": 2.1191932419505024e-05, + "loss": 1.0561, + "step": 2144 + }, + { + "epoch": 0.7006655506104283, + "grad_norm": 0.8764929175376892, + "learning_rate": 2.114930884020554e-05, + "loss": 1.3131, + "step": 2145 + }, + { + "epoch": 0.7009922012167735, + "grad_norm": 1.0115678310394287, + "learning_rate": 2.1106716668481867e-05, + "loss": 1.4414, + "step": 2146 + }, + { + "epoch": 0.7013188518231187, + "grad_norm": 1.0085214376449585, + "learning_rate": 2.1064155950700897e-05, + "loss": 1.3486, + "step": 2147 + }, + { + "epoch": 0.7016455024294639, + "grad_norm": 1.156628966331482, + "learning_rate": 2.1021626733195283e-05, + "loss": 1.2661, + "step": 2148 + }, + { + "epoch": 0.7019721530358091, + "grad_norm": 1.412764072418213, + "learning_rate": 2.0979129062263386e-05, + "loss": 1.7921, + "step": 2149 + }, + { + "epoch": 0.7022988036421542, + "grad_norm": 1.8179196119308472, + "learning_rate": 2.0936662984169186e-05, + "loss": 2.0696, + "step": 2150 + }, + { + "epoch": 0.7026254542484994, + "grad_norm": 0.22813886404037476, + "learning_rate": 2.0894228545142362e-05, + "loss": 0.7838, + "step": 2151 + }, + { + "epoch": 0.7029521048548446, + "grad_norm": 0.25454890727996826, + "learning_rate": 2.085182579137804e-05, + "loss": 0.8303, + "step": 2152 + }, + { + "epoch": 0.7032787554611898, + "grad_norm": 0.26593437790870667, + "learning_rate": 2.0809454769036956e-05, + "loss": 0.7726, + "step": 2153 + }, + { + "epoch": 0.703605406067535, + "grad_norm": 0.2822284400463104, + "learning_rate": 2.076711552424522e-05, + "loss": 0.8171, + "step": 2154 + }, + { + "epoch": 0.7039320566738801, + "grad_norm": 0.2791661322116852, + "learning_rate": 2.0724808103094383e-05, + "loss": 0.8399, + "step": 2155 + }, + { + "epoch": 0.7042587072802254, + "grad_norm": 0.2727159857749939, + "learning_rate": 2.068253255164136e-05, + "loss": 0.7566, + "step": 2156 + }, + { + "epoch": 0.7045853578865706, + "grad_norm": 0.30942875146865845, + "learning_rate": 2.064028891590835e-05, + "loss": 0.8765, + "step": 2157 + }, + { + "epoch": 0.7049120084929158, + "grad_norm": 0.3137757480144501, + "learning_rate": 2.0598077241882834e-05, + "loss": 0.8997, + "step": 2158 + }, + { + "epoch": 0.705238659099261, + "grad_norm": 0.33212578296661377, + "learning_rate": 2.0555897575517474e-05, + "loss": 0.8881, + "step": 2159 + }, + { + "epoch": 0.7055653097056062, + "grad_norm": 0.36027947068214417, + "learning_rate": 2.05137499627301e-05, + "loss": 0.9479, + "step": 2160 + }, + { + "epoch": 0.7058919603119513, + "grad_norm": 0.3743271827697754, + "learning_rate": 2.047163444940365e-05, + "loss": 1.0198, + "step": 2161 + }, + { + "epoch": 0.7062186109182965, + "grad_norm": 0.3828783929347992, + "learning_rate": 2.042955108138612e-05, + "loss": 0.9335, + "step": 2162 + }, + { + "epoch": 0.7065452615246417, + "grad_norm": 0.3867970108985901, + "learning_rate": 2.0387499904490463e-05, + "loss": 0.9367, + "step": 2163 + }, + { + "epoch": 0.7068719121309869, + "grad_norm": 0.39452075958251953, + "learning_rate": 2.03454809644947e-05, + "loss": 0.9833, + "step": 2164 + }, + { + "epoch": 0.7071985627373321, + "grad_norm": 0.4327092170715332, + "learning_rate": 2.0303494307141614e-05, + "loss": 0.9356, + "step": 2165 + }, + { + "epoch": 0.7075252133436772, + "grad_norm": 0.469226598739624, + "learning_rate": 2.026153997813899e-05, + "loss": 0.9654, + "step": 2166 + }, + { + "epoch": 0.7078518639500224, + "grad_norm": 0.5176733136177063, + "learning_rate": 2.021961802315929e-05, + "loss": 1.1734, + "step": 2167 + }, + { + "epoch": 0.7081785145563676, + "grad_norm": 0.5132268667221069, + "learning_rate": 2.0177728487839815e-05, + "loss": 1.0919, + "step": 2168 + }, + { + "epoch": 0.7085051651627128, + "grad_norm": 0.5670648217201233, + "learning_rate": 2.0135871417782547e-05, + "loss": 1.0412, + "step": 2169 + }, + { + "epoch": 0.7088318157690581, + "grad_norm": 0.7028777003288269, + "learning_rate": 2.009404685855415e-05, + "loss": 1.1546, + "step": 2170 + }, + { + "epoch": 0.7091584663754033, + "grad_norm": 0.8300199508666992, + "learning_rate": 2.0052254855685808e-05, + "loss": 1.5468, + "step": 2171 + }, + { + "epoch": 0.7094851169817484, + "grad_norm": 0.9810918569564819, + "learning_rate": 2.0010495454673412e-05, + "loss": 1.2278, + "step": 2172 + }, + { + "epoch": 0.7098117675880936, + "grad_norm": 1.0002435445785522, + "learning_rate": 1.9968768700977203e-05, + "loss": 1.1674, + "step": 2173 + }, + { + "epoch": 0.7101384181944388, + "grad_norm": 1.225787878036499, + "learning_rate": 1.9927074640022025e-05, + "loss": 1.4316, + "step": 2174 + }, + { + "epoch": 0.710465068800784, + "grad_norm": 1.3633099794387817, + "learning_rate": 1.988541331719701e-05, + "loss": 1.6954, + "step": 2175 + }, + { + "epoch": 0.7107917194071292, + "grad_norm": 0.2008531093597412, + "learning_rate": 1.984378477785573e-05, + "loss": 0.6916, + "step": 2176 + }, + { + "epoch": 0.7111183700134743, + "grad_norm": 0.25324690341949463, + "learning_rate": 1.9802189067316034e-05, + "loss": 0.7937, + "step": 2177 + }, + { + "epoch": 0.7114450206198195, + "grad_norm": 0.25503653287887573, + "learning_rate": 1.9760626230860046e-05, + "loss": 0.8058, + "step": 2178 + }, + { + "epoch": 0.7117716712261647, + "grad_norm": 0.2745538651943207, + "learning_rate": 1.9719096313734097e-05, + "loss": 0.8408, + "step": 2179 + }, + { + "epoch": 0.7120983218325099, + "grad_norm": 0.2929899990558624, + "learning_rate": 1.967759936114868e-05, + "loss": 0.8549, + "step": 2180 + }, + { + "epoch": 0.712424972438855, + "grad_norm": 0.29745033383369446, + "learning_rate": 1.9636135418278407e-05, + "loss": 0.8669, + "step": 2181 + }, + { + "epoch": 0.7127516230452002, + "grad_norm": 0.3240223824977875, + "learning_rate": 1.959470453026195e-05, + "loss": 0.8879, + "step": 2182 + }, + { + "epoch": 0.7130782736515454, + "grad_norm": 0.3056224286556244, + "learning_rate": 1.955330674220201e-05, + "loss": 0.8368, + "step": 2183 + }, + { + "epoch": 0.7134049242578907, + "grad_norm": 0.3351876437664032, + "learning_rate": 1.9511942099165193e-05, + "loss": 0.7966, + "step": 2184 + }, + { + "epoch": 0.7137315748642359, + "grad_norm": 0.33827874064445496, + "learning_rate": 1.9470610646182147e-05, + "loss": 0.8094, + "step": 2185 + }, + { + "epoch": 0.7140582254705811, + "grad_norm": 0.38939139246940613, + "learning_rate": 1.9429312428247244e-05, + "loss": 0.9772, + "step": 2186 + }, + { + "epoch": 0.7143848760769262, + "grad_norm": 0.39794233441352844, + "learning_rate": 1.9388047490318804e-05, + "loss": 0.8952, + "step": 2187 + }, + { + "epoch": 0.7147115266832714, + "grad_norm": 0.3879866898059845, + "learning_rate": 1.9346815877318824e-05, + "loss": 0.8264, + "step": 2188 + }, + { + "epoch": 0.7150381772896166, + "grad_norm": 0.43497055768966675, + "learning_rate": 1.930561763413306e-05, + "loss": 0.8812, + "step": 2189 + }, + { + "epoch": 0.7153648278959618, + "grad_norm": 0.47120532393455505, + "learning_rate": 1.9264452805610943e-05, + "loss": 1.0343, + "step": 2190 + }, + { + "epoch": 0.715691478502307, + "grad_norm": 0.495140016078949, + "learning_rate": 1.9223321436565528e-05, + "loss": 0.9261, + "step": 2191 + }, + { + "epoch": 0.7160181291086521, + "grad_norm": 0.47505465149879456, + "learning_rate": 1.9182223571773433e-05, + "loss": 0.9779, + "step": 2192 + }, + { + "epoch": 0.7163447797149973, + "grad_norm": 0.5782510638237, + "learning_rate": 1.9141159255974817e-05, + "loss": 1.0439, + "step": 2193 + }, + { + "epoch": 0.7166714303213425, + "grad_norm": 0.6006953120231628, + "learning_rate": 1.9100128533873303e-05, + "loss": 1.0357, + "step": 2194 + }, + { + "epoch": 0.7169980809276877, + "grad_norm": 0.6175481081008911, + "learning_rate": 1.9059131450135948e-05, + "loss": 1.1558, + "step": 2195 + }, + { + "epoch": 0.7173247315340329, + "grad_norm": 0.8074755668640137, + "learning_rate": 1.901816804939322e-05, + "loss": 1.168, + "step": 2196 + }, + { + "epoch": 0.717651382140378, + "grad_norm": 0.9405984282493591, + "learning_rate": 1.8977238376238827e-05, + "loss": 1.2225, + "step": 2197 + }, + { + "epoch": 0.7179780327467233, + "grad_norm": 1.0690345764160156, + "learning_rate": 1.89363424752299e-05, + "loss": 1.5153, + "step": 2198 + }, + { + "epoch": 0.7183046833530685, + "grad_norm": 1.3357555866241455, + "learning_rate": 1.8895480390886655e-05, + "loss": 1.31, + "step": 2199 + }, + { + "epoch": 0.7186313339594137, + "grad_norm": 1.9466137886047363, + "learning_rate": 1.885465216769263e-05, + "loss": 2.1009, + "step": 2200 + }, + { + "epoch": 0.7189579845657589, + "grad_norm": 0.20754139125347137, + "learning_rate": 1.88138578500944e-05, + "loss": 0.7188, + "step": 2201 + }, + { + "epoch": 0.7192846351721041, + "grad_norm": 0.24247871339321136, + "learning_rate": 1.8773097482501683e-05, + "loss": 0.8604, + "step": 2202 + }, + { + "epoch": 0.7196112857784492, + "grad_norm": 0.2585400938987732, + "learning_rate": 1.873237110928721e-05, + "loss": 0.8103, + "step": 2203 + }, + { + "epoch": 0.7199379363847944, + "grad_norm": 0.2706741690635681, + "learning_rate": 1.869167877478673e-05, + "loss": 0.7943, + "step": 2204 + }, + { + "epoch": 0.7202645869911396, + "grad_norm": 0.26471370458602905, + "learning_rate": 1.8651020523298923e-05, + "loss": 0.7537, + "step": 2205 + }, + { + "epoch": 0.7205912375974848, + "grad_norm": 0.28643709421157837, + "learning_rate": 1.8610396399085383e-05, + "loss": 0.829, + "step": 2206 + }, + { + "epoch": 0.72091788820383, + "grad_norm": 0.2852267324924469, + "learning_rate": 1.8569806446370486e-05, + "loss": 0.8191, + "step": 2207 + }, + { + "epoch": 0.7212445388101751, + "grad_norm": 0.3136483430862427, + "learning_rate": 1.8529250709341516e-05, + "loss": 0.8443, + "step": 2208 + }, + { + "epoch": 0.7215711894165203, + "grad_norm": 0.31696316599845886, + "learning_rate": 1.8488729232148418e-05, + "loss": 0.8331, + "step": 2209 + }, + { + "epoch": 0.7218978400228655, + "grad_norm": 0.30480316281318665, + "learning_rate": 1.8448242058903876e-05, + "loss": 0.7433, + "step": 2210 + }, + { + "epoch": 0.7222244906292107, + "grad_norm": 0.3624013364315033, + "learning_rate": 1.8407789233683238e-05, + "loss": 0.9242, + "step": 2211 + }, + { + "epoch": 0.722551141235556, + "grad_norm": 0.37554246187210083, + "learning_rate": 1.8367370800524443e-05, + "loss": 0.9267, + "step": 2212 + }, + { + "epoch": 0.7228777918419012, + "grad_norm": 0.3949258625507355, + "learning_rate": 1.8326986803428004e-05, + "loss": 0.8639, + "step": 2213 + }, + { + "epoch": 0.7232044424482463, + "grad_norm": 0.432748943567276, + "learning_rate": 1.8286637286356926e-05, + "loss": 0.9655, + "step": 2214 + }, + { + "epoch": 0.7235310930545915, + "grad_norm": 0.4370628595352173, + "learning_rate": 1.8246322293236706e-05, + "loss": 0.9346, + "step": 2215 + }, + { + "epoch": 0.7238577436609367, + "grad_norm": 0.463162362575531, + "learning_rate": 1.8206041867955237e-05, + "loss": 1.0436, + "step": 2216 + }, + { + "epoch": 0.7241843942672819, + "grad_norm": 0.48003801703453064, + "learning_rate": 1.8165796054362782e-05, + "loss": 0.9261, + "step": 2217 + }, + { + "epoch": 0.724511044873627, + "grad_norm": 0.5734748244285583, + "learning_rate": 1.812558489627193e-05, + "loss": 1.1375, + "step": 2218 + }, + { + "epoch": 0.7248376954799722, + "grad_norm": 0.6173572540283203, + "learning_rate": 1.8085408437457562e-05, + "loss": 1.044, + "step": 2219 + }, + { + "epoch": 0.7251643460863174, + "grad_norm": 0.7554861903190613, + "learning_rate": 1.804526672165671e-05, + "loss": 1.0502, + "step": 2220 + }, + { + "epoch": 0.7254909966926626, + "grad_norm": 0.9487894177436829, + "learning_rate": 1.8005159792568715e-05, + "loss": 1.4518, + "step": 2221 + }, + { + "epoch": 0.7258176472990078, + "grad_norm": 0.9427092671394348, + "learning_rate": 1.796508769385491e-05, + "loss": 0.9175, + "step": 2222 + }, + { + "epoch": 0.726144297905353, + "grad_norm": 1.1542737483978271, + "learning_rate": 1.7925050469138792e-05, + "loss": 1.48, + "step": 2223 + }, + { + "epoch": 0.7264709485116981, + "grad_norm": 1.3235793113708496, + "learning_rate": 1.7885048162005887e-05, + "loss": 1.3694, + "step": 2224 + }, + { + "epoch": 0.7267975991180433, + "grad_norm": 1.7609529495239258, + "learning_rate": 1.7845080816003672e-05, + "loss": 1.9989, + "step": 2225 + }, + { + "epoch": 0.7271242497243886, + "grad_norm": 0.20694789290428162, + "learning_rate": 1.78051484746416e-05, + "loss": 0.6905, + "step": 2226 + }, + { + "epoch": 0.7274509003307338, + "grad_norm": 0.23232302069664001, + "learning_rate": 1.7765251181391018e-05, + "loss": 0.8029, + "step": 2227 + }, + { + "epoch": 0.727777550937079, + "grad_norm": 0.2653183341026306, + "learning_rate": 1.772538897968509e-05, + "loss": 0.8776, + "step": 2228 + }, + { + "epoch": 0.7281042015434241, + "grad_norm": 0.27905741333961487, + "learning_rate": 1.768556191291879e-05, + "loss": 0.7946, + "step": 2229 + }, + { + "epoch": 0.7284308521497693, + "grad_norm": 0.2898816764354706, + "learning_rate": 1.7645770024448865e-05, + "loss": 0.8578, + "step": 2230 + }, + { + "epoch": 0.7287575027561145, + "grad_norm": 0.298490047454834, + "learning_rate": 1.760601335759374e-05, + "loss": 0.8191, + "step": 2231 + }, + { + "epoch": 0.7290841533624597, + "grad_norm": 0.30617550015449524, + "learning_rate": 1.756629195563352e-05, + "loss": 0.8365, + "step": 2232 + }, + { + "epoch": 0.7294108039688049, + "grad_norm": 0.3224206268787384, + "learning_rate": 1.7526605861809853e-05, + "loss": 0.8298, + "step": 2233 + }, + { + "epoch": 0.72973745457515, + "grad_norm": 0.33401164412498474, + "learning_rate": 1.7486955119326075e-05, + "loss": 0.9214, + "step": 2234 + }, + { + "epoch": 0.7300641051814952, + "grad_norm": 0.34572505950927734, + "learning_rate": 1.74473397713469e-05, + "loss": 0.8883, + "step": 2235 + }, + { + "epoch": 0.7303907557878404, + "grad_norm": 0.35026663541793823, + "learning_rate": 1.740775986099864e-05, + "loss": 0.9183, + "step": 2236 + }, + { + "epoch": 0.7307174063941856, + "grad_norm": 0.38163483142852783, + "learning_rate": 1.7368215431368905e-05, + "loss": 0.9629, + "step": 2237 + }, + { + "epoch": 0.7310440570005308, + "grad_norm": 0.4158123731613159, + "learning_rate": 1.732870652550677e-05, + "loss": 0.9803, + "step": 2238 + }, + { + "epoch": 0.731370707606876, + "grad_norm": 0.4073357880115509, + "learning_rate": 1.7289233186422598e-05, + "loss": 0.8948, + "step": 2239 + }, + { + "epoch": 0.7316973582132212, + "grad_norm": 0.453538715839386, + "learning_rate": 1.7249795457088064e-05, + "loss": 0.939, + "step": 2240 + }, + { + "epoch": 0.7320240088195664, + "grad_norm": 0.5066556334495544, + "learning_rate": 1.7210393380436003e-05, + "loss": 1.1209, + "step": 2241 + }, + { + "epoch": 0.7323506594259116, + "grad_norm": 0.5031617283821106, + "learning_rate": 1.717102699936055e-05, + "loss": 1.0776, + "step": 2242 + }, + { + "epoch": 0.7326773100322568, + "grad_norm": 0.5175497531890869, + "learning_rate": 1.7131696356716863e-05, + "loss": 0.9019, + "step": 2243 + }, + { + "epoch": 0.733003960638602, + "grad_norm": 0.5329145789146423, + "learning_rate": 1.7092401495321315e-05, + "loss": 1.0561, + "step": 2244 + }, + { + "epoch": 0.7333306112449471, + "grad_norm": 0.6562705039978027, + "learning_rate": 1.7053142457951215e-05, + "loss": 1.171, + "step": 2245 + }, + { + "epoch": 0.7336572618512923, + "grad_norm": 0.7525113224983215, + "learning_rate": 1.7013919287344938e-05, + "loss": 1.1676, + "step": 2246 + }, + { + "epoch": 0.7339839124576375, + "grad_norm": 0.8737092018127441, + "learning_rate": 1.69747320262018e-05, + "loss": 1.138, + "step": 2247 + }, + { + "epoch": 0.7343105630639827, + "grad_norm": 1.1151552200317383, + "learning_rate": 1.6935580717182003e-05, + "loss": 1.183, + "step": 2248 + }, + { + "epoch": 0.7346372136703279, + "grad_norm": 1.3022023439407349, + "learning_rate": 1.689646540290668e-05, + "loss": 1.7971, + "step": 2249 + }, + { + "epoch": 0.734963864276673, + "grad_norm": 1.8182604312896729, + "learning_rate": 1.6857386125957676e-05, + "loss": 1.5969, + "step": 2250 + }, + { + "epoch": 0.7352905148830182, + "grad_norm": 0.21018333733081818, + "learning_rate": 1.6818342928877685e-05, + "loss": 0.7596, + "step": 2251 + }, + { + "epoch": 0.7356171654893634, + "grad_norm": 0.22652126848697662, + "learning_rate": 1.6779335854170087e-05, + "loss": 0.8035, + "step": 2252 + }, + { + "epoch": 0.7359438160957086, + "grad_norm": 0.2571515440940857, + "learning_rate": 1.674036494429897e-05, + "loss": 0.815, + "step": 2253 + }, + { + "epoch": 0.7362704667020538, + "grad_norm": 0.2594307065010071, + "learning_rate": 1.6701430241688977e-05, + "loss": 0.8435, + "step": 2254 + }, + { + "epoch": 0.736597117308399, + "grad_norm": 0.2772468328475952, + "learning_rate": 1.6662531788725456e-05, + "loss": 0.8371, + "step": 2255 + }, + { + "epoch": 0.7369237679147442, + "grad_norm": 0.28283077478408813, + "learning_rate": 1.6623669627754162e-05, + "loss": 0.8683, + "step": 2256 + }, + { + "epoch": 0.7372504185210894, + "grad_norm": 0.2877868115901947, + "learning_rate": 1.6584843801081463e-05, + "loss": 0.8402, + "step": 2257 + }, + { + "epoch": 0.7375770691274346, + "grad_norm": 0.29893478751182556, + "learning_rate": 1.6546054350974073e-05, + "loss": 0.854, + "step": 2258 + }, + { + "epoch": 0.7379037197337798, + "grad_norm": 0.3104216158390045, + "learning_rate": 1.650730131965915e-05, + "loss": 0.8575, + "step": 2259 + }, + { + "epoch": 0.738230370340125, + "grad_norm": 0.31806516647338867, + "learning_rate": 1.6468584749324217e-05, + "loss": 0.8733, + "step": 2260 + }, + { + "epoch": 0.7385570209464701, + "grad_norm": 0.3343586027622223, + "learning_rate": 1.6429904682117087e-05, + "loss": 0.8407, + "step": 2261 + }, + { + "epoch": 0.7388836715528153, + "grad_norm": 0.35887300968170166, + "learning_rate": 1.6391261160145833e-05, + "loss": 0.9061, + "step": 2262 + }, + { + "epoch": 0.7392103221591605, + "grad_norm": 0.3796674311161041, + "learning_rate": 1.635265422547875e-05, + "loss": 0.959, + "step": 2263 + }, + { + "epoch": 0.7395369727655057, + "grad_norm": 0.39429622888565063, + "learning_rate": 1.6314083920144295e-05, + "loss": 0.9746, + "step": 2264 + }, + { + "epoch": 0.7398636233718509, + "grad_norm": 0.4258107542991638, + "learning_rate": 1.6275550286131064e-05, + "loss": 0.9826, + "step": 2265 + }, + { + "epoch": 0.740190273978196, + "grad_norm": 0.4779113233089447, + "learning_rate": 1.6237053365387745e-05, + "loss": 1.0346, + "step": 2266 + }, + { + "epoch": 0.7405169245845412, + "grad_norm": 0.48150232434272766, + "learning_rate": 1.6198593199822982e-05, + "loss": 0.9831, + "step": 2267 + }, + { + "epoch": 0.7408435751908864, + "grad_norm": 0.5278285145759583, + "learning_rate": 1.616016983130552e-05, + "loss": 0.9729, + "step": 2268 + }, + { + "epoch": 0.7411702257972317, + "grad_norm": 0.590967059135437, + "learning_rate": 1.612178330166394e-05, + "loss": 1.0734, + "step": 2269 + }, + { + "epoch": 0.7414968764035769, + "grad_norm": 0.6668139696121216, + "learning_rate": 1.6083433652686824e-05, + "loss": 1.2927, + "step": 2270 + }, + { + "epoch": 0.741823527009922, + "grad_norm": 0.7391326427459717, + "learning_rate": 1.6045120926122492e-05, + "loss": 1.1458, + "step": 2271 + }, + { + "epoch": 0.7421501776162672, + "grad_norm": 0.8988819122314453, + "learning_rate": 1.6006845163679156e-05, + "loss": 1.2814, + "step": 2272 + }, + { + "epoch": 0.7424768282226124, + "grad_norm": 1.155909538269043, + "learning_rate": 1.5968606407024743e-05, + "loss": 1.3477, + "step": 2273 + }, + { + "epoch": 0.7428034788289576, + "grad_norm": 1.253177285194397, + "learning_rate": 1.5930404697786928e-05, + "loss": 1.2069, + "step": 2274 + }, + { + "epoch": 0.7431301294353028, + "grad_norm": 1.7872546911239624, + "learning_rate": 1.5892240077553018e-05, + "loss": 2.0905, + "step": 2275 + }, + { + "epoch": 0.743456780041648, + "grad_norm": 0.21168266236782074, + "learning_rate": 1.5854112587869984e-05, + "loss": 0.7841, + "step": 2276 + }, + { + "epoch": 0.7437834306479931, + "grad_norm": 0.24086590111255646, + "learning_rate": 1.5816022270244306e-05, + "loss": 0.7966, + "step": 2277 + }, + { + "epoch": 0.7441100812543383, + "grad_norm": 0.26869410276412964, + "learning_rate": 1.57779691661421e-05, + "loss": 0.8551, + "step": 2278 + }, + { + "epoch": 0.7444367318606835, + "grad_norm": 0.27102309465408325, + "learning_rate": 1.5739953316988904e-05, + "loss": 0.8208, + "step": 2279 + }, + { + "epoch": 0.7447633824670287, + "grad_norm": 0.2761625647544861, + "learning_rate": 1.5701974764169674e-05, + "loss": 0.7825, + "step": 2280 + }, + { + "epoch": 0.7450900330733738, + "grad_norm": 0.2879788279533386, + "learning_rate": 1.5664033549028862e-05, + "loss": 0.8061, + "step": 2281 + }, + { + "epoch": 0.745416683679719, + "grad_norm": 0.3057396709918976, + "learning_rate": 1.562612971287015e-05, + "loss": 0.8626, + "step": 2282 + }, + { + "epoch": 0.7457433342860643, + "grad_norm": 0.30929502844810486, + "learning_rate": 1.5588263296956656e-05, + "loss": 0.8419, + "step": 2283 + }, + { + "epoch": 0.7460699848924095, + "grad_norm": 0.32596832513809204, + "learning_rate": 1.5550434342510646e-05, + "loss": 0.8941, + "step": 2284 + }, + { + "epoch": 0.7463966354987547, + "grad_norm": 0.3355480134487152, + "learning_rate": 1.551264289071367e-05, + "loss": 0.8169, + "step": 2285 + }, + { + "epoch": 0.7467232861050999, + "grad_norm": 0.38037872314453125, + "learning_rate": 1.5474888982706436e-05, + "loss": 0.8659, + "step": 2286 + }, + { + "epoch": 0.747049936711445, + "grad_norm": 0.3562227189540863, + "learning_rate": 1.5437172659588793e-05, + "loss": 0.8644, + "step": 2287 + }, + { + "epoch": 0.7473765873177902, + "grad_norm": 0.38439279794692993, + "learning_rate": 1.5399493962419653e-05, + "loss": 0.7793, + "step": 2288 + }, + { + "epoch": 0.7477032379241354, + "grad_norm": 0.4378209412097931, + "learning_rate": 1.5361852932216992e-05, + "loss": 0.9637, + "step": 2289 + }, + { + "epoch": 0.7480298885304806, + "grad_norm": 0.4555678367614746, + "learning_rate": 1.5324249609957725e-05, + "loss": 0.9449, + "step": 2290 + }, + { + "epoch": 0.7483565391368258, + "grad_norm": 0.5037922263145447, + "learning_rate": 1.528668403657782e-05, + "loss": 0.9338, + "step": 2291 + }, + { + "epoch": 0.7486831897431709, + "grad_norm": 0.499163955450058, + "learning_rate": 1.5249156252972035e-05, + "loss": 0.9826, + "step": 2292 + }, + { + "epoch": 0.7490098403495161, + "grad_norm": 0.6524832844734192, + "learning_rate": 1.5211666299994071e-05, + "loss": 1.0665, + "step": 2293 + }, + { + "epoch": 0.7493364909558613, + "grad_norm": 0.6492843627929688, + "learning_rate": 1.5174214218456407e-05, + "loss": 1.0666, + "step": 2294 + }, + { + "epoch": 0.7496631415622065, + "grad_norm": 0.6795768737792969, + "learning_rate": 1.5136800049130312e-05, + "loss": 1.0677, + "step": 2295 + }, + { + "epoch": 0.7499897921685517, + "grad_norm": 0.876645028591156, + "learning_rate": 1.5099423832745774e-05, + "loss": 1.5102, + "step": 2296 + }, + { + "epoch": 0.750316442774897, + "grad_norm": 1.0358703136444092, + "learning_rate": 1.5062085609991461e-05, + "loss": 1.3524, + "step": 2297 + }, + { + "epoch": 0.7506430933812421, + "grad_norm": 1.2098932266235352, + "learning_rate": 1.5024785421514692e-05, + "loss": 1.4479, + "step": 2298 + }, + { + "epoch": 0.7506430933812421, + "eval_loss": 1.028357982635498, + "eval_runtime": 503.6236, + "eval_samples_per_second": 5.119, + "eval_steps_per_second": 2.559, + "step": 2298 } ], "logging_steps": 1, @@ -10774,7 +16144,7 @@ "attributes": {} } }, - "total_flos": 1.0089220027501773e+18, + "total_flos": 1.5133624248230216e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null