diff --git "a/last-checkpoint/trainer_state.json" "b/last-checkpoint/trainer_state.json" --- "a/last-checkpoint/trainer_state.json" +++ "b/last-checkpoint/trainer_state.json" @@ -1,9 +1,9 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 0.4999794145497962, + "epoch": 0.7499691218246943, "eval_steps": 759, - "global_step": 1518, + "global_step": 2277, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, @@ -10657,6 +10657,5327 @@ "eval_samples_per_second": 3.343, "eval_steps_per_second": 1.672, "step": 1518 + }, + { + "epoch": 0.5003087817530569, + "grad_norm": 3.1498970985412598, + "learning_rate": 5.126239470409546e-05, + "loss": 2.2301, + "step": 1519 + }, + { + "epoch": 0.5006381489563176, + "grad_norm": 2.7907960414886475, + "learning_rate": 5.1209805415304224e-05, + "loss": 2.0078, + "step": 1520 + }, + { + "epoch": 0.5009675161595785, + "grad_norm": 3.4877936840057373, + "learning_rate": 5.115721478734323e-05, + "loss": 2.7673, + "step": 1521 + }, + { + "epoch": 0.5012968833628392, + "grad_norm": 3.1865594387054443, + "learning_rate": 5.1104622878426664e-05, + "loss": 1.9934, + "step": 1522 + }, + { + "epoch": 0.5016262505660999, + "grad_norm": 2.964200258255005, + "learning_rate": 5.105202974677008e-05, + "loss": 1.8623, + "step": 1523 + }, + { + "epoch": 0.5019556177693606, + "grad_norm": 3.1753077507019043, + "learning_rate": 5.099943545059035e-05, + "loss": 1.9103, + "step": 1524 + }, + { + "epoch": 0.5022849849726213, + "grad_norm": 3.5453011989593506, + "learning_rate": 5.094684004810568e-05, + "loss": 2.223, + "step": 1525 + }, + { + "epoch": 0.5026143521758821, + "grad_norm": 1.9202203750610352, + "learning_rate": 5.089424359753553e-05, + "loss": 2.3095, + "step": 1526 + }, + { + "epoch": 0.5029437193791428, + "grad_norm": 2.3142552375793457, + "learning_rate": 5.084164615710042e-05, + "loss": 2.0537, + "step": 1527 + }, + { + "epoch": 0.5032730865824036, + "grad_norm": 2.545691728591919, + "learning_rate": 5.078904778502206e-05, + "loss": 2.222, + "step": 1528 + }, + { + "epoch": 0.5036024537856643, + "grad_norm": 2.6911139488220215, + "learning_rate": 5.0736448539523174e-05, + "loss": 2.2372, + "step": 1529 + }, + { + "epoch": 0.5039318209889251, + "grad_norm": 2.5566017627716064, + "learning_rate": 5.06838484788274e-05, + "loss": 2.2237, + "step": 1530 + }, + { + "epoch": 0.5042611881921858, + "grad_norm": 3.251295804977417, + "learning_rate": 5.063124766115933e-05, + "loss": 2.6693, + "step": 1531 + }, + { + "epoch": 0.5045905553954465, + "grad_norm": 2.5430779457092285, + "learning_rate": 5.057864614474439e-05, + "loss": 2.6509, + "step": 1532 + }, + { + "epoch": 0.5049199225987072, + "grad_norm": 2.509148597717285, + "learning_rate": 5.052604398780877e-05, + "loss": 2.2311, + "step": 1533 + }, + { + "epoch": 0.5052492898019679, + "grad_norm": 2.435283899307251, + "learning_rate": 5.047344124857933e-05, + "loss": 2.2194, + "step": 1534 + }, + { + "epoch": 0.5055786570052287, + "grad_norm": 2.5991721153259277, + "learning_rate": 5.0420837985283664e-05, + "loss": 2.3425, + "step": 1535 + }, + { + "epoch": 0.5059080242084895, + "grad_norm": 2.383134126663208, + "learning_rate": 5.036823425614986e-05, + "loss": 2.3328, + "step": 1536 + }, + { + "epoch": 0.5062373914117502, + "grad_norm": 3.121763229370117, + "learning_rate": 5.0315630119406565e-05, + "loss": 2.4681, + "step": 1537 + }, + { + "epoch": 0.5065667586150109, + "grad_norm": 2.824033260345459, + "learning_rate": 5.0263025633282866e-05, + "loss": 2.3247, + "step": 1538 + }, + { + "epoch": 0.5068961258182716, + "grad_norm": 2.942976713180542, + "learning_rate": 5.021042085600827e-05, + "loss": 2.254, + "step": 1539 + }, + { + "epoch": 0.5072254930215324, + "grad_norm": 2.7724997997283936, + "learning_rate": 5.015781584581252e-05, + "loss": 2.0605, + "step": 1540 + }, + { + "epoch": 0.5075548602247931, + "grad_norm": 2.791670322418213, + "learning_rate": 5.0105210660925704e-05, + "loss": 2.0396, + "step": 1541 + }, + { + "epoch": 0.5078842274280538, + "grad_norm": 2.5132830142974854, + "learning_rate": 5.0052605359578086e-05, + "loss": 1.7545, + "step": 1542 + }, + { + "epoch": 0.5082135946313145, + "grad_norm": 2.5325276851654053, + "learning_rate": 5e-05, + "loss": 1.737, + "step": 1543 + }, + { + "epoch": 0.5085429618345754, + "grad_norm": 3.5093564987182617, + "learning_rate": 4.994739464042193e-05, + "loss": 2.3988, + "step": 1544 + }, + { + "epoch": 0.5088723290378361, + "grad_norm": 3.0075719356536865, + "learning_rate": 4.989478933907431e-05, + "loss": 2.0844, + "step": 1545 + }, + { + "epoch": 0.5092016962410968, + "grad_norm": 2.6146292686462402, + "learning_rate": 4.9842184154187486e-05, + "loss": 2.0929, + "step": 1546 + }, + { + "epoch": 0.5095310634443575, + "grad_norm": 2.921995162963867, + "learning_rate": 4.978957914399174e-05, + "loss": 2.162, + "step": 1547 + }, + { + "epoch": 0.5098604306476182, + "grad_norm": 3.16227650642395, + "learning_rate": 4.973697436671714e-05, + "loss": 2.2389, + "step": 1548 + }, + { + "epoch": 0.510189797850879, + "grad_norm": 2.8888611793518066, + "learning_rate": 4.9684369880593446e-05, + "loss": 1.9225, + "step": 1549 + }, + { + "epoch": 0.5105191650541397, + "grad_norm": 3.1876556873321533, + "learning_rate": 4.963176574385015e-05, + "loss": 1.7554, + "step": 1550 + }, + { + "epoch": 0.5108485322574005, + "grad_norm": 2.1988892555236816, + "learning_rate": 4.957916201471635e-05, + "loss": 2.6424, + "step": 1551 + }, + { + "epoch": 0.5111778994606612, + "grad_norm": 2.707622766494751, + "learning_rate": 4.952655875142068e-05, + "loss": 2.4397, + "step": 1552 + }, + { + "epoch": 0.511507266663922, + "grad_norm": 2.7695624828338623, + "learning_rate": 4.947395601219126e-05, + "loss": 2.2901, + "step": 1553 + }, + { + "epoch": 0.5118366338671827, + "grad_norm": 2.7299704551696777, + "learning_rate": 4.9421353855255624e-05, + "loss": 2.4199, + "step": 1554 + }, + { + "epoch": 0.5121660010704434, + "grad_norm": 2.7451207637786865, + "learning_rate": 4.936875233884069e-05, + "loss": 2.1827, + "step": 1555 + }, + { + "epoch": 0.5124953682737041, + "grad_norm": 2.116290807723999, + "learning_rate": 4.931615152117262e-05, + "loss": 1.9413, + "step": 1556 + }, + { + "epoch": 0.5128247354769648, + "grad_norm": 2.1912240982055664, + "learning_rate": 4.926355146047685e-05, + "loss": 2.3163, + "step": 1557 + }, + { + "epoch": 0.5131541026802257, + "grad_norm": 2.397310495376587, + "learning_rate": 4.9210952214977954e-05, + "loss": 2.022, + "step": 1558 + }, + { + "epoch": 0.5134834698834864, + "grad_norm": 2.5802431106567383, + "learning_rate": 4.915835384289958e-05, + "loss": 2.3069, + "step": 1559 + }, + { + "epoch": 0.5138128370867471, + "grad_norm": 2.7203660011291504, + "learning_rate": 4.9105756402464486e-05, + "loss": 2.6652, + "step": 1560 + }, + { + "epoch": 0.5141422042900078, + "grad_norm": 2.513911247253418, + "learning_rate": 4.905315995189431e-05, + "loss": 2.012, + "step": 1561 + }, + { + "epoch": 0.5144715714932686, + "grad_norm": 3.2579002380371094, + "learning_rate": 4.900056454940965e-05, + "loss": 2.3097, + "step": 1562 + }, + { + "epoch": 0.5148009386965293, + "grad_norm": 3.077726364135742, + "learning_rate": 4.894797025322993e-05, + "loss": 2.1601, + "step": 1563 + }, + { + "epoch": 0.51513030589979, + "grad_norm": 3.066802501678467, + "learning_rate": 4.8895377121573334e-05, + "loss": 2.5118, + "step": 1564 + }, + { + "epoch": 0.5154596731030507, + "grad_norm": 2.064492702484131, + "learning_rate": 4.884278521265677e-05, + "loss": 1.8952, + "step": 1565 + }, + { + "epoch": 0.5157890403063115, + "grad_norm": 2.6084978580474854, + "learning_rate": 4.879019458469579e-05, + "loss": 2.2112, + "step": 1566 + }, + { + "epoch": 0.5161184075095723, + "grad_norm": 2.9014785289764404, + "learning_rate": 4.873760529590455e-05, + "loss": 2.0184, + "step": 1567 + }, + { + "epoch": 0.516447774712833, + "grad_norm": 2.735502243041992, + "learning_rate": 4.8685017404495683e-05, + "loss": 2.0125, + "step": 1568 + }, + { + "epoch": 0.5167771419160937, + "grad_norm": 2.60123348236084, + "learning_rate": 4.863243096868029e-05, + "loss": 2.0574, + "step": 1569 + }, + { + "epoch": 0.5171065091193544, + "grad_norm": 2.9571070671081543, + "learning_rate": 4.85798460466679e-05, + "loss": 2.1061, + "step": 1570 + }, + { + "epoch": 0.5174358763226152, + "grad_norm": 2.7076237201690674, + "learning_rate": 4.85272626966663e-05, + "loss": 2.0312, + "step": 1571 + }, + { + "epoch": 0.5177652435258759, + "grad_norm": 2.76377010345459, + "learning_rate": 4.847468097688157e-05, + "loss": 1.9232, + "step": 1572 + }, + { + "epoch": 0.5180946107291367, + "grad_norm": 2.923283338546753, + "learning_rate": 4.8422100945518015e-05, + "loss": 2.0029, + "step": 1573 + }, + { + "epoch": 0.5184239779323974, + "grad_norm": 2.9562487602233887, + "learning_rate": 4.836952266077801e-05, + "loss": 2.1253, + "step": 1574 + }, + { + "epoch": 0.5187533451356581, + "grad_norm": 3.1904914379119873, + "learning_rate": 4.8316946180862036e-05, + "loss": 2.2682, + "step": 1575 + }, + { + "epoch": 0.5190827123389189, + "grad_norm": 2.154838800430298, + "learning_rate": 4.826437156396858e-05, + "loss": 2.6257, + "step": 1576 + }, + { + "epoch": 0.5194120795421796, + "grad_norm": 2.501347541809082, + "learning_rate": 4.821179886829405e-05, + "loss": 2.0692, + "step": 1577 + }, + { + "epoch": 0.5197414467454403, + "grad_norm": 2.355531692504883, + "learning_rate": 4.815922815203272e-05, + "loss": 2.2775, + "step": 1578 + }, + { + "epoch": 0.520070813948701, + "grad_norm": 2.39996075630188, + "learning_rate": 4.8106659473376695e-05, + "loss": 2.3885, + "step": 1579 + }, + { + "epoch": 0.5204001811519617, + "grad_norm": 2.339895486831665, + "learning_rate": 4.805409289051582e-05, + "loss": 2.2418, + "step": 1580 + }, + { + "epoch": 0.5207295483552226, + "grad_norm": 2.3130595684051514, + "learning_rate": 4.80015284616376e-05, + "loss": 2.0223, + "step": 1581 + }, + { + "epoch": 0.5210589155584833, + "grad_norm": 2.9214603900909424, + "learning_rate": 4.794896624492718e-05, + "loss": 2.4215, + "step": 1582 + }, + { + "epoch": 0.521388282761744, + "grad_norm": 2.5012197494506836, + "learning_rate": 4.789640629856725e-05, + "loss": 2.3697, + "step": 1583 + }, + { + "epoch": 0.5217176499650047, + "grad_norm": 2.809415817260742, + "learning_rate": 4.7843848680737966e-05, + "loss": 2.1699, + "step": 1584 + }, + { + "epoch": 0.5220470171682655, + "grad_norm": 2.9632301330566406, + "learning_rate": 4.779129344961694e-05, + "loss": 2.2572, + "step": 1585 + }, + { + "epoch": 0.5223763843715262, + "grad_norm": 2.583806276321411, + "learning_rate": 4.7738740663379135e-05, + "loss": 2.1202, + "step": 1586 + }, + { + "epoch": 0.5227057515747869, + "grad_norm": 3.0589942932128906, + "learning_rate": 4.768619038019677e-05, + "loss": 2.2599, + "step": 1587 + }, + { + "epoch": 0.5230351187780476, + "grad_norm": 2.770036220550537, + "learning_rate": 4.7633642658239345e-05, + "loss": 1.9833, + "step": 1588 + }, + { + "epoch": 0.5233644859813084, + "grad_norm": 3.0664565563201904, + "learning_rate": 4.758109755567348e-05, + "loss": 2.0921, + "step": 1589 + }, + { + "epoch": 0.5236938531845692, + "grad_norm": 2.592406749725342, + "learning_rate": 4.752855513066293e-05, + "loss": 1.9637, + "step": 1590 + }, + { + "epoch": 0.5240232203878299, + "grad_norm": 2.8975071907043457, + "learning_rate": 4.747601544136849e-05, + "loss": 2.2294, + "step": 1591 + }, + { + "epoch": 0.5243525875910906, + "grad_norm": 2.795776844024658, + "learning_rate": 4.742347854594791e-05, + "loss": 2.2874, + "step": 1592 + }, + { + "epoch": 0.5246819547943513, + "grad_norm": 2.9049065113067627, + "learning_rate": 4.737094450255581e-05, + "loss": 2.2325, + "step": 1593 + }, + { + "epoch": 0.5250113219976121, + "grad_norm": 3.0760581493377686, + "learning_rate": 4.731841336934372e-05, + "loss": 2.2926, + "step": 1594 + }, + { + "epoch": 0.5253406892008728, + "grad_norm": 2.4571144580841064, + "learning_rate": 4.726588520445993e-05, + "loss": 2.0032, + "step": 1595 + }, + { + "epoch": 0.5256700564041336, + "grad_norm": 3.1097042560577393, + "learning_rate": 4.721336006604941e-05, + "loss": 1.9375, + "step": 1596 + }, + { + "epoch": 0.5259994236073943, + "grad_norm": 2.750455617904663, + "learning_rate": 4.71608380122538e-05, + "loss": 2.1235, + "step": 1597 + }, + { + "epoch": 0.526328790810655, + "grad_norm": 2.6923322677612305, + "learning_rate": 4.710831910121135e-05, + "loss": 1.7062, + "step": 1598 + }, + { + "epoch": 0.5266581580139158, + "grad_norm": 3.282520055770874, + "learning_rate": 4.7055803391056795e-05, + "loss": 2.2095, + "step": 1599 + }, + { + "epoch": 0.5269875252171765, + "grad_norm": 2.924677610397339, + "learning_rate": 4.700329093992135e-05, + "loss": 1.6683, + "step": 1600 + }, + { + "epoch": 0.5273168924204372, + "grad_norm": 2.091074228286743, + "learning_rate": 4.695078180593262e-05, + "loss": 2.4635, + "step": 1601 + }, + { + "epoch": 0.5276462596236979, + "grad_norm": 2.103097677230835, + "learning_rate": 4.689827604721451e-05, + "loss": 2.1667, + "step": 1602 + }, + { + "epoch": 0.5279756268269588, + "grad_norm": 2.7929534912109375, + "learning_rate": 4.684577372188723e-05, + "loss": 2.7994, + "step": 1603 + }, + { + "epoch": 0.5283049940302195, + "grad_norm": 2.534282684326172, + "learning_rate": 4.679327488806716e-05, + "loss": 2.3134, + "step": 1604 + }, + { + "epoch": 0.5286343612334802, + "grad_norm": 2.393833875656128, + "learning_rate": 4.674077960386685e-05, + "loss": 2.3605, + "step": 1605 + }, + { + "epoch": 0.5289637284367409, + "grad_norm": 2.5953030586242676, + "learning_rate": 4.668828792739487e-05, + "loss": 2.2843, + "step": 1606 + }, + { + "epoch": 0.5292930956400016, + "grad_norm": 2.9071319103240967, + "learning_rate": 4.6635799916755836e-05, + "loss": 2.35, + "step": 1607 + }, + { + "epoch": 0.5296224628432624, + "grad_norm": 2.4467287063598633, + "learning_rate": 4.65833156300503e-05, + "loss": 2.1182, + "step": 1608 + }, + { + "epoch": 0.5299518300465231, + "grad_norm": 2.2593226432800293, + "learning_rate": 4.653083512537467e-05, + "loss": 2.3638, + "step": 1609 + }, + { + "epoch": 0.5302811972497838, + "grad_norm": 2.5257232189178467, + "learning_rate": 4.6478358460821184e-05, + "loss": 2.2619, + "step": 1610 + }, + { + "epoch": 0.5306105644530446, + "grad_norm": 2.365927219390869, + "learning_rate": 4.642588569447785e-05, + "loss": 2.1333, + "step": 1611 + }, + { + "epoch": 0.5309399316563054, + "grad_norm": 2.346470594406128, + "learning_rate": 4.637341688442831e-05, + "loss": 2.1835, + "step": 1612 + }, + { + "epoch": 0.5312692988595661, + "grad_norm": 2.5479326248168945, + "learning_rate": 4.6320952088751876e-05, + "loss": 2.1302, + "step": 1613 + }, + { + "epoch": 0.5315986660628268, + "grad_norm": 2.7286736965179443, + "learning_rate": 4.626849136552338e-05, + "loss": 2.0868, + "step": 1614 + }, + { + "epoch": 0.5319280332660875, + "grad_norm": 2.7833032608032227, + "learning_rate": 4.6216034772813195e-05, + "loss": 2.5112, + "step": 1615 + }, + { + "epoch": 0.5322574004693482, + "grad_norm": 2.5549001693725586, + "learning_rate": 4.616358236868705e-05, + "loss": 2.0309, + "step": 1616 + }, + { + "epoch": 0.532586767672609, + "grad_norm": 3.951969623565674, + "learning_rate": 4.61111342112061e-05, + "loss": 2.3313, + "step": 1617 + }, + { + "epoch": 0.5329161348758698, + "grad_norm": 2.874086380004883, + "learning_rate": 4.605869035842677e-05, + "loss": 2.2732, + "step": 1618 + }, + { + "epoch": 0.5332455020791305, + "grad_norm": 2.5527641773223877, + "learning_rate": 4.6006250868400726e-05, + "loss": 1.9412, + "step": 1619 + }, + { + "epoch": 0.5335748692823912, + "grad_norm": 2.8807637691497803, + "learning_rate": 4.595381579917478e-05, + "loss": 2.283, + "step": 1620 + }, + { + "epoch": 0.533904236485652, + "grad_norm": 2.820483922958374, + "learning_rate": 4.5901385208790924e-05, + "loss": 2.3101, + "step": 1621 + }, + { + "epoch": 0.5342336036889127, + "grad_norm": 2.790860652923584, + "learning_rate": 4.584895915528611e-05, + "loss": 2.0345, + "step": 1622 + }, + { + "epoch": 0.5345629708921734, + "grad_norm": 2.5604536533355713, + "learning_rate": 4.579653769669233e-05, + "loss": 1.8557, + "step": 1623 + }, + { + "epoch": 0.5348923380954341, + "grad_norm": 3.321770668029785, + "learning_rate": 4.574412089103643e-05, + "loss": 1.9905, + "step": 1624 + }, + { + "epoch": 0.5352217052986948, + "grad_norm": 3.51432204246521, + "learning_rate": 4.569170879634014e-05, + "loss": 1.9512, + "step": 1625 + }, + { + "epoch": 0.5355510725019557, + "grad_norm": 2.5113067626953125, + "learning_rate": 4.563930147062e-05, + "loss": 2.393, + "step": 1626 + }, + { + "epoch": 0.5358804397052164, + "grad_norm": 1.724272608757019, + "learning_rate": 4.558689897188721e-05, + "loss": 2.1496, + "step": 1627 + }, + { + "epoch": 0.5362098069084771, + "grad_norm": 2.2979555130004883, + "learning_rate": 4.553450135814768e-05, + "loss": 2.4137, + "step": 1628 + }, + { + "epoch": 0.5365391741117378, + "grad_norm": 2.4890944957733154, + "learning_rate": 4.548210868740188e-05, + "loss": 2.3685, + "step": 1629 + }, + { + "epoch": 0.5368685413149985, + "grad_norm": 2.3017985820770264, + "learning_rate": 4.5429721017644835e-05, + "loss": 2.4172, + "step": 1630 + }, + { + "epoch": 0.5371979085182593, + "grad_norm": 2.6837596893310547, + "learning_rate": 4.537733840686601e-05, + "loss": 2.424, + "step": 1631 + }, + { + "epoch": 0.53752727572152, + "grad_norm": 2.583272695541382, + "learning_rate": 4.532496091304929e-05, + "loss": 2.5061, + "step": 1632 + }, + { + "epoch": 0.5378566429247807, + "grad_norm": 2.5158188343048096, + "learning_rate": 4.5272588594172875e-05, + "loss": 2.3061, + "step": 1633 + }, + { + "epoch": 0.5381860101280415, + "grad_norm": 2.558367967605591, + "learning_rate": 4.522022150820925e-05, + "loss": 2.0809, + "step": 1634 + }, + { + "epoch": 0.5385153773313023, + "grad_norm": 2.473259687423706, + "learning_rate": 4.516785971312511e-05, + "loss": 2.2443, + "step": 1635 + }, + { + "epoch": 0.538844744534563, + "grad_norm": 2.3816640377044678, + "learning_rate": 4.51155032668813e-05, + "loss": 2.0707, + "step": 1636 + }, + { + "epoch": 0.5391741117378237, + "grad_norm": 2.75467586517334, + "learning_rate": 4.5063152227432705e-05, + "loss": 2.307, + "step": 1637 + }, + { + "epoch": 0.5395034789410844, + "grad_norm": 2.2179129123687744, + "learning_rate": 4.501080665272827e-05, + "loss": 2.1801, + "step": 1638 + }, + { + "epoch": 0.5398328461443451, + "grad_norm": 2.5906922817230225, + "learning_rate": 4.495846660071088e-05, + "loss": 2.136, + "step": 1639 + }, + { + "epoch": 0.540162213347606, + "grad_norm": 3.0321009159088135, + "learning_rate": 4.490613212931729e-05, + "loss": 2.5038, + "step": 1640 + }, + { + "epoch": 0.5404915805508667, + "grad_norm": 2.619051694869995, + "learning_rate": 4.485380329647808e-05, + "loss": 2.2298, + "step": 1641 + }, + { + "epoch": 0.5408209477541274, + "grad_norm": 3.0019948482513428, + "learning_rate": 4.480148016011762e-05, + "loss": 2.0229, + "step": 1642 + }, + { + "epoch": 0.5411503149573881, + "grad_norm": 2.2649595737457275, + "learning_rate": 4.4749162778153954e-05, + "loss": 1.8919, + "step": 1643 + }, + { + "epoch": 0.5414796821606489, + "grad_norm": 2.849843740463257, + "learning_rate": 4.469685120849872e-05, + "loss": 2.0903, + "step": 1644 + }, + { + "epoch": 0.5418090493639096, + "grad_norm": 3.021907091140747, + "learning_rate": 4.4644545509057185e-05, + "loss": 2.3181, + "step": 1645 + }, + { + "epoch": 0.5421384165671703, + "grad_norm": 2.7080013751983643, + "learning_rate": 4.459224573772808e-05, + "loss": 1.8712, + "step": 1646 + }, + { + "epoch": 0.542467783770431, + "grad_norm": 2.6799404621124268, + "learning_rate": 4.453995195240358e-05, + "loss": 1.4717, + "step": 1647 + }, + { + "epoch": 0.5427971509736917, + "grad_norm": 3.217162609100342, + "learning_rate": 4.448766421096924e-05, + "loss": 2.263, + "step": 1648 + }, + { + "epoch": 0.5431265181769526, + "grad_norm": 3.4610049724578857, + "learning_rate": 4.443538257130393e-05, + "loss": 2.0226, + "step": 1649 + }, + { + "epoch": 0.5434558853802133, + "grad_norm": 3.7474148273468018, + "learning_rate": 4.4383107091279724e-05, + "loss": 1.9848, + "step": 1650 + }, + { + "epoch": 0.543785252583474, + "grad_norm": 1.9629192352294922, + "learning_rate": 4.433083782876196e-05, + "loss": 2.4658, + "step": 1651 + }, + { + "epoch": 0.5441146197867347, + "grad_norm": 2.639218807220459, + "learning_rate": 4.427857484160902e-05, + "loss": 2.5598, + "step": 1652 + }, + { + "epoch": 0.5444439869899955, + "grad_norm": 2.2541027069091797, + "learning_rate": 4.4226318187672357e-05, + "loss": 2.4328, + "step": 1653 + }, + { + "epoch": 0.5447733541932562, + "grad_norm": 2.1472740173339844, + "learning_rate": 4.4174067924796444e-05, + "loss": 2.1159, + "step": 1654 + }, + { + "epoch": 0.5451027213965169, + "grad_norm": 2.170586585998535, + "learning_rate": 4.4121824110818636e-05, + "loss": 2.193, + "step": 1655 + }, + { + "epoch": 0.5454320885997777, + "grad_norm": 2.1903934478759766, + "learning_rate": 4.406958680356917e-05, + "loss": 2.0941, + "step": 1656 + }, + { + "epoch": 0.5457614558030384, + "grad_norm": 2.934429883956909, + "learning_rate": 4.4017356060871084e-05, + "loss": 2.7441, + "step": 1657 + }, + { + "epoch": 0.5460908230062992, + "grad_norm": 2.7060625553131104, + "learning_rate": 4.396513194054017e-05, + "loss": 2.2778, + "step": 1658 + }, + { + "epoch": 0.5464201902095599, + "grad_norm": 2.726825714111328, + "learning_rate": 4.3912914500384825e-05, + "loss": 2.4721, + "step": 1659 + }, + { + "epoch": 0.5467495574128206, + "grad_norm": 2.721712589263916, + "learning_rate": 4.386070379820612e-05, + "loss": 2.6526, + "step": 1660 + }, + { + "epoch": 0.5470789246160813, + "grad_norm": 2.7384960651397705, + "learning_rate": 4.380849989179764e-05, + "loss": 2.0193, + "step": 1661 + }, + { + "epoch": 0.5474082918193421, + "grad_norm": 2.4211504459381104, + "learning_rate": 4.375630283894543e-05, + "loss": 1.86, + "step": 1662 + }, + { + "epoch": 0.5477376590226029, + "grad_norm": 2.456686019897461, + "learning_rate": 4.370411269742797e-05, + "loss": 2.2503, + "step": 1663 + }, + { + "epoch": 0.5480670262258636, + "grad_norm": 2.7705039978027344, + "learning_rate": 4.36519295250161e-05, + "loss": 2.1732, + "step": 1664 + }, + { + "epoch": 0.5483963934291243, + "grad_norm": 2.6563832759857178, + "learning_rate": 4.3599753379472916e-05, + "loss": 1.8923, + "step": 1665 + }, + { + "epoch": 0.548725760632385, + "grad_norm": 3.264117479324341, + "learning_rate": 4.354758431855375e-05, + "loss": 2.1551, + "step": 1666 + }, + { + "epoch": 0.5490551278356458, + "grad_norm": 3.068531036376953, + "learning_rate": 4.349542240000612e-05, + "loss": 2.342, + "step": 1667 + }, + { + "epoch": 0.5493844950389065, + "grad_norm": 2.412754535675049, + "learning_rate": 4.344326768156957e-05, + "loss": 1.961, + "step": 1668 + }, + { + "epoch": 0.5497138622421672, + "grad_norm": 2.5070178508758545, + "learning_rate": 4.339112022097574e-05, + "loss": 2.0561, + "step": 1669 + }, + { + "epoch": 0.5500432294454279, + "grad_norm": 3.7027029991149902, + "learning_rate": 4.33389800759482e-05, + "loss": 2.4097, + "step": 1670 + }, + { + "epoch": 0.5503725966486888, + "grad_norm": 2.5179436206817627, + "learning_rate": 4.3286847304202446e-05, + "loss": 2.1429, + "step": 1671 + }, + { + "epoch": 0.5507019638519495, + "grad_norm": 2.777573823928833, + "learning_rate": 4.323472196344579e-05, + "loss": 1.8857, + "step": 1672 + }, + { + "epoch": 0.5510313310552102, + "grad_norm": 2.600545644760132, + "learning_rate": 4.3182604111377324e-05, + "loss": 1.7022, + "step": 1673 + }, + { + "epoch": 0.5513606982584709, + "grad_norm": 3.057023048400879, + "learning_rate": 4.3130493805687864e-05, + "loss": 2.03, + "step": 1674 + }, + { + "epoch": 0.5516900654617316, + "grad_norm": 3.4655754566192627, + "learning_rate": 4.3078391104059854e-05, + "loss": 1.8471, + "step": 1675 + }, + { + "epoch": 0.5520194326649924, + "grad_norm": 2.065298318862915, + "learning_rate": 4.3026296064167334e-05, + "loss": 2.4574, + "step": 1676 + }, + { + "epoch": 0.5523487998682531, + "grad_norm": 2.120046615600586, + "learning_rate": 4.297420874367586e-05, + "loss": 2.1619, + "step": 1677 + }, + { + "epoch": 0.5526781670715138, + "grad_norm": 2.614112615585327, + "learning_rate": 4.292212920024243e-05, + "loss": 2.3831, + "step": 1678 + }, + { + "epoch": 0.5530075342747746, + "grad_norm": 2.4009101390838623, + "learning_rate": 4.287005749151546e-05, + "loss": 2.4063, + "step": 1679 + }, + { + "epoch": 0.5533369014780353, + "grad_norm": 2.5356178283691406, + "learning_rate": 4.281799367513467e-05, + "loss": 2.2704, + "step": 1680 + }, + { + "epoch": 0.5536662686812961, + "grad_norm": 2.489375352859497, + "learning_rate": 4.276593780873105e-05, + "loss": 2.1472, + "step": 1681 + }, + { + "epoch": 0.5539956358845568, + "grad_norm": 2.381366491317749, + "learning_rate": 4.271388994992682e-05, + "loss": 2.0989, + "step": 1682 + }, + { + "epoch": 0.5543250030878175, + "grad_norm": 2.4626338481903076, + "learning_rate": 4.266185015633527e-05, + "loss": 2.2773, + "step": 1683 + }, + { + "epoch": 0.5546543702910782, + "grad_norm": 2.7277164459228516, + "learning_rate": 4.260981848556081e-05, + "loss": 2.3875, + "step": 1684 + }, + { + "epoch": 0.554983737494339, + "grad_norm": 2.389749765396118, + "learning_rate": 4.2557794995198876e-05, + "loss": 2.0901, + "step": 1685 + }, + { + "epoch": 0.5553131046975998, + "grad_norm": 2.2613539695739746, + "learning_rate": 4.25057797428358e-05, + "loss": 2.3888, + "step": 1686 + }, + { + "epoch": 0.5556424719008605, + "grad_norm": 2.8356738090515137, + "learning_rate": 4.245377278604881e-05, + "loss": 2.1503, + "step": 1687 + }, + { + "epoch": 0.5559718391041212, + "grad_norm": 2.471827745437622, + "learning_rate": 4.240177418240597e-05, + "loss": 2.1751, + "step": 1688 + }, + { + "epoch": 0.5563012063073819, + "grad_norm": 2.6936299800872803, + "learning_rate": 4.234978398946611e-05, + "loss": 2.2912, + "step": 1689 + }, + { + "epoch": 0.5566305735106427, + "grad_norm": 2.365645170211792, + "learning_rate": 4.2297802264778676e-05, + "loss": 2.0137, + "step": 1690 + }, + { + "epoch": 0.5569599407139034, + "grad_norm": 2.8537073135375977, + "learning_rate": 4.224582906588382e-05, + "loss": 1.9902, + "step": 1691 + }, + { + "epoch": 0.5572893079171641, + "grad_norm": 2.6542677879333496, + "learning_rate": 4.2193864450312236e-05, + "loss": 2.0595, + "step": 1692 + }, + { + "epoch": 0.5576186751204248, + "grad_norm": 2.9382853507995605, + "learning_rate": 4.214190847558509e-05, + "loss": 2.1753, + "step": 1693 + }, + { + "epoch": 0.5579480423236857, + "grad_norm": 2.9632253646850586, + "learning_rate": 4.2089961199214e-05, + "loss": 2.0176, + "step": 1694 + }, + { + "epoch": 0.5582774095269464, + "grad_norm": 3.0138721466064453, + "learning_rate": 4.203802267870097e-05, + "loss": 1.9451, + "step": 1695 + }, + { + "epoch": 0.5586067767302071, + "grad_norm": 2.569395065307617, + "learning_rate": 4.1986092971538306e-05, + "loss": 1.8728, + "step": 1696 + }, + { + "epoch": 0.5589361439334678, + "grad_norm": 2.5671072006225586, + "learning_rate": 4.193417213520852e-05, + "loss": 1.9411, + "step": 1697 + }, + { + "epoch": 0.5592655111367285, + "grad_norm": 2.9711215496063232, + "learning_rate": 4.1882260227184354e-05, + "loss": 2.2304, + "step": 1698 + }, + { + "epoch": 0.5595948783399893, + "grad_norm": 2.82068133354187, + "learning_rate": 4.1830357304928664e-05, + "loss": 1.9563, + "step": 1699 + }, + { + "epoch": 0.55992424554325, + "grad_norm": 3.187591314315796, + "learning_rate": 4.177846342589431e-05, + "loss": 1.8838, + "step": 1700 + }, + { + "epoch": 0.5602536127465108, + "grad_norm": 2.2826199531555176, + "learning_rate": 4.17265786475242e-05, + "loss": 2.3018, + "step": 1701 + }, + { + "epoch": 0.5605829799497715, + "grad_norm": 2.2217764854431152, + "learning_rate": 4.167470302725114e-05, + "loss": 2.3331, + "step": 1702 + }, + { + "epoch": 0.5609123471530323, + "grad_norm": 2.2242486476898193, + "learning_rate": 4.16228366224978e-05, + "loss": 2.2806, + "step": 1703 + }, + { + "epoch": 0.561241714356293, + "grad_norm": 2.391951560974121, + "learning_rate": 4.157097949067664e-05, + "loss": 2.4716, + "step": 1704 + }, + { + "epoch": 0.5615710815595537, + "grad_norm": 2.481973886489868, + "learning_rate": 4.1519131689189894e-05, + "loss": 2.1806, + "step": 1705 + }, + { + "epoch": 0.5619004487628144, + "grad_norm": 2.7211787700653076, + "learning_rate": 4.146729327542942e-05, + "loss": 2.3138, + "step": 1706 + }, + { + "epoch": 0.5622298159660751, + "grad_norm": 2.246211528778076, + "learning_rate": 4.1415464306776704e-05, + "loss": 2.339, + "step": 1707 + }, + { + "epoch": 0.562559183169336, + "grad_norm": 2.3714375495910645, + "learning_rate": 4.136364484060279e-05, + "loss": 2.3938, + "step": 1708 + }, + { + "epoch": 0.5628885503725967, + "grad_norm": 2.315984010696411, + "learning_rate": 4.131183493426819e-05, + "loss": 2.0192, + "step": 1709 + }, + { + "epoch": 0.5632179175758574, + "grad_norm": 3.121124029159546, + "learning_rate": 4.126003464512283e-05, + "loss": 2.5295, + "step": 1710 + }, + { + "epoch": 0.5635472847791181, + "grad_norm": 2.801370620727539, + "learning_rate": 4.120824403050598e-05, + "loss": 2.0249, + "step": 1711 + }, + { + "epoch": 0.5638766519823789, + "grad_norm": 2.729098320007324, + "learning_rate": 4.115646314774626e-05, + "loss": 2.1085, + "step": 1712 + }, + { + "epoch": 0.5642060191856396, + "grad_norm": 2.5545506477355957, + "learning_rate": 4.110469205416144e-05, + "loss": 2.1962, + "step": 1713 + }, + { + "epoch": 0.5645353863889003, + "grad_norm": 2.6717350482940674, + "learning_rate": 4.105293080705852e-05, + "loss": 2.124, + "step": 1714 + }, + { + "epoch": 0.564864753592161, + "grad_norm": 2.6053621768951416, + "learning_rate": 4.100117946373353e-05, + "loss": 2.4692, + "step": 1715 + }, + { + "epoch": 0.5651941207954218, + "grad_norm": 2.648757219314575, + "learning_rate": 4.09494380814716e-05, + "loss": 2.2406, + "step": 1716 + }, + { + "epoch": 0.5655234879986826, + "grad_norm": 2.497184991836548, + "learning_rate": 4.089770671754683e-05, + "loss": 1.9408, + "step": 1717 + }, + { + "epoch": 0.5658528552019433, + "grad_norm": 3.321964740753174, + "learning_rate": 4.0845985429222156e-05, + "loss": 1.9579, + "step": 1718 + }, + { + "epoch": 0.566182222405204, + "grad_norm": 3.0194036960601807, + "learning_rate": 4.079427427374945e-05, + "loss": 2.1183, + "step": 1719 + }, + { + "epoch": 0.5665115896084647, + "grad_norm": 3.1421256065368652, + "learning_rate": 4.0742573308369356e-05, + "loss": 2.041, + "step": 1720 + }, + { + "epoch": 0.5668409568117255, + "grad_norm": 2.7292587757110596, + "learning_rate": 4.069088259031117e-05, + "loss": 1.9314, + "step": 1721 + }, + { + "epoch": 0.5671703240149862, + "grad_norm": 2.4826438426971436, + "learning_rate": 4.0639202176792914e-05, + "loss": 1.9008, + "step": 1722 + }, + { + "epoch": 0.567499691218247, + "grad_norm": 3.0865981578826904, + "learning_rate": 4.0587532125021173e-05, + "loss": 2.1393, + "step": 1723 + }, + { + "epoch": 0.5678290584215077, + "grad_norm": 3.053807020187378, + "learning_rate": 4.053587249219108e-05, + "loss": 2.0525, + "step": 1724 + }, + { + "epoch": 0.5681584256247684, + "grad_norm": 3.325843572616577, + "learning_rate": 4.048422333548622e-05, + "loss": 1.8684, + "step": 1725 + }, + { + "epoch": 0.5684877928280292, + "grad_norm": 2.406651020050049, + "learning_rate": 4.043258471207858e-05, + "loss": 2.2508, + "step": 1726 + }, + { + "epoch": 0.5688171600312899, + "grad_norm": 2.377819061279297, + "learning_rate": 4.038095667912851e-05, + "loss": 2.4474, + "step": 1727 + }, + { + "epoch": 0.5691465272345506, + "grad_norm": 2.348951816558838, + "learning_rate": 4.03293392937846e-05, + "loss": 2.4961, + "step": 1728 + }, + { + "epoch": 0.5694758944378113, + "grad_norm": 2.3945837020874023, + "learning_rate": 4.027773261318368e-05, + "loss": 2.359, + "step": 1729 + }, + { + "epoch": 0.569805261641072, + "grad_norm": 2.5263919830322266, + "learning_rate": 4.022613669445075e-05, + "loss": 2.2331, + "step": 1730 + }, + { + "epoch": 0.5701346288443329, + "grad_norm": 2.5810630321502686, + "learning_rate": 4.0174551594698836e-05, + "loss": 2.0708, + "step": 1731 + }, + { + "epoch": 0.5704639960475936, + "grad_norm": 2.346571207046509, + "learning_rate": 4.012297737102903e-05, + "loss": 2.222, + "step": 1732 + }, + { + "epoch": 0.5707933632508543, + "grad_norm": 2.419288158416748, + "learning_rate": 4.00714140805304e-05, + "loss": 2.1208, + "step": 1733 + }, + { + "epoch": 0.571122730454115, + "grad_norm": 2.1099019050598145, + "learning_rate": 4.0019861780279886e-05, + "loss": 2.0208, + "step": 1734 + }, + { + "epoch": 0.5714520976573758, + "grad_norm": 2.775758981704712, + "learning_rate": 3.9968320527342265e-05, + "loss": 2.4634, + "step": 1735 + }, + { + "epoch": 0.5717814648606365, + "grad_norm": 2.646881103515625, + "learning_rate": 3.991679037877008e-05, + "loss": 2.1785, + "step": 1736 + }, + { + "epoch": 0.5721108320638972, + "grad_norm": 2.8634984493255615, + "learning_rate": 3.98652713916036e-05, + "loss": 2.4894, + "step": 1737 + }, + { + "epoch": 0.572440199267158, + "grad_norm": 2.8133482933044434, + "learning_rate": 3.981376362287072e-05, + "loss": 2.1222, + "step": 1738 + }, + { + "epoch": 0.5727695664704187, + "grad_norm": 2.9048733711242676, + "learning_rate": 3.9762267129586934e-05, + "loss": 2.2615, + "step": 1739 + }, + { + "epoch": 0.5730989336736795, + "grad_norm": 2.7857484817504883, + "learning_rate": 3.971078196875526e-05, + "loss": 2.0236, + "step": 1740 + }, + { + "epoch": 0.5734283008769402, + "grad_norm": 2.681157350540161, + "learning_rate": 3.965930819736613e-05, + "loss": 1.9672, + "step": 1741 + }, + { + "epoch": 0.5737576680802009, + "grad_norm": 2.8750627040863037, + "learning_rate": 3.960784587239741e-05, + "loss": 2.0623, + "step": 1742 + }, + { + "epoch": 0.5740870352834616, + "grad_norm": 2.7466681003570557, + "learning_rate": 3.95563950508143e-05, + "loss": 2.1302, + "step": 1743 + }, + { + "epoch": 0.5744164024867224, + "grad_norm": 3.013389825820923, + "learning_rate": 3.950495578956923e-05, + "loss": 2.1544, + "step": 1744 + }, + { + "epoch": 0.5747457696899831, + "grad_norm": 2.87448787689209, + "learning_rate": 3.9453528145601875e-05, + "loss": 2.085, + "step": 1745 + }, + { + "epoch": 0.5750751368932439, + "grad_norm": 2.522451162338257, + "learning_rate": 3.9402112175839005e-05, + "loss": 2.0753, + "step": 1746 + }, + { + "epoch": 0.5754045040965046, + "grad_norm": 2.751380681991577, + "learning_rate": 3.9350707937194504e-05, + "loss": 2.3245, + "step": 1747 + }, + { + "epoch": 0.5757338712997653, + "grad_norm": 2.8028810024261475, + "learning_rate": 3.929931548656925e-05, + "loss": 1.9075, + "step": 1748 + }, + { + "epoch": 0.5760632385030261, + "grad_norm": 2.744546413421631, + "learning_rate": 3.924793488085111e-05, + "loss": 1.889, + "step": 1749 + }, + { + "epoch": 0.5763926057062868, + "grad_norm": 2.8225655555725098, + "learning_rate": 3.9196566176914775e-05, + "loss": 1.7777, + "step": 1750 + }, + { + "epoch": 0.5767219729095475, + "grad_norm": 2.291588306427002, + "learning_rate": 3.91452094316218e-05, + "loss": 2.5541, + "step": 1751 + }, + { + "epoch": 0.5770513401128082, + "grad_norm": 2.511906862258911, + "learning_rate": 3.909386470182053e-05, + "loss": 2.2869, + "step": 1752 + }, + { + "epoch": 0.577380707316069, + "grad_norm": 2.5849432945251465, + "learning_rate": 3.9042532044345934e-05, + "loss": 2.3356, + "step": 1753 + }, + { + "epoch": 0.5777100745193298, + "grad_norm": 2.452805280685425, + "learning_rate": 3.899121151601969e-05, + "loss": 2.3691, + "step": 1754 + }, + { + "epoch": 0.5780394417225905, + "grad_norm": 2.3843283653259277, + "learning_rate": 3.893990317365003e-05, + "loss": 2.2706, + "step": 1755 + }, + { + "epoch": 0.5783688089258512, + "grad_norm": 2.5602409839630127, + "learning_rate": 3.888860707403167e-05, + "loss": 2.2975, + "step": 1756 + }, + { + "epoch": 0.5786981761291119, + "grad_norm": 2.4516329765319824, + "learning_rate": 3.88373232739458e-05, + "loss": 2.2493, + "step": 1757 + }, + { + "epoch": 0.5790275433323727, + "grad_norm": 2.7871108055114746, + "learning_rate": 3.878605183016001e-05, + "loss": 2.2575, + "step": 1758 + }, + { + "epoch": 0.5793569105356334, + "grad_norm": 2.2499959468841553, + "learning_rate": 3.873479279942815e-05, + "loss": 2.1826, + "step": 1759 + }, + { + "epoch": 0.5796862777388941, + "grad_norm": 2.571885347366333, + "learning_rate": 3.8683546238490396e-05, + "loss": 2.0972, + "step": 1760 + }, + { + "epoch": 0.5800156449421549, + "grad_norm": 2.976599931716919, + "learning_rate": 3.86323122040731e-05, + "loss": 2.4199, + "step": 1761 + }, + { + "epoch": 0.5803450121454157, + "grad_norm": 2.53981876373291, + "learning_rate": 3.858109075288875e-05, + "loss": 2.0601, + "step": 1762 + }, + { + "epoch": 0.5806743793486764, + "grad_norm": 2.889158248901367, + "learning_rate": 3.852988194163587e-05, + "loss": 2.5461, + "step": 1763 + }, + { + "epoch": 0.5810037465519371, + "grad_norm": 2.411149740219116, + "learning_rate": 3.847868582699904e-05, + "loss": 2.027, + "step": 1764 + }, + { + "epoch": 0.5813331137551978, + "grad_norm": 2.5908608436584473, + "learning_rate": 3.8427502465648776e-05, + "loss": 2.4121, + "step": 1765 + }, + { + "epoch": 0.5816624809584585, + "grad_norm": 2.567713499069214, + "learning_rate": 3.8376331914241446e-05, + "loss": 1.9522, + "step": 1766 + }, + { + "epoch": 0.5819918481617193, + "grad_norm": 2.662276029586792, + "learning_rate": 3.832517422941928e-05, + "loss": 2.1719, + "step": 1767 + }, + { + "epoch": 0.58232121536498, + "grad_norm": 2.723975419998169, + "learning_rate": 3.8274029467810245e-05, + "loss": 2.2837, + "step": 1768 + }, + { + "epoch": 0.5826505825682408, + "grad_norm": 2.5226011276245117, + "learning_rate": 3.822289768602799e-05, + "loss": 1.8823, + "step": 1769 + }, + { + "epoch": 0.5829799497715015, + "grad_norm": 2.713068962097168, + "learning_rate": 3.817177894067182e-05, + "loss": 2.2014, + "step": 1770 + }, + { + "epoch": 0.5833093169747623, + "grad_norm": 2.9934639930725098, + "learning_rate": 3.81206732883266e-05, + "loss": 2.0643, + "step": 1771 + }, + { + "epoch": 0.583638684178023, + "grad_norm": 2.897000312805176, + "learning_rate": 3.8069580785562686e-05, + "loss": 2.0665, + "step": 1772 + }, + { + "epoch": 0.5839680513812837, + "grad_norm": 2.8157401084899902, + "learning_rate": 3.8018501488935936e-05, + "loss": 1.9893, + "step": 1773 + }, + { + "epoch": 0.5842974185845444, + "grad_norm": 3.132835865020752, + "learning_rate": 3.796743545498751e-05, + "loss": 1.8069, + "step": 1774 + }, + { + "epoch": 0.5846267857878051, + "grad_norm": 3.8937413692474365, + "learning_rate": 3.791638274024394e-05, + "loss": 1.9874, + "step": 1775 + }, + { + "epoch": 0.584956152991066, + "grad_norm": 2.0166850090026855, + "learning_rate": 3.7865343401217e-05, + "loss": 2.5567, + "step": 1776 + }, + { + "epoch": 0.5852855201943267, + "grad_norm": 2.2458670139312744, + "learning_rate": 3.781431749440365e-05, + "loss": 2.3101, + "step": 1777 + }, + { + "epoch": 0.5856148873975874, + "grad_norm": 2.270277738571167, + "learning_rate": 3.7763305076286e-05, + "loss": 2.5257, + "step": 1778 + }, + { + "epoch": 0.5859442546008481, + "grad_norm": 2.389111042022705, + "learning_rate": 3.7712306203331205e-05, + "loss": 2.4234, + "step": 1779 + }, + { + "epoch": 0.5862736218041088, + "grad_norm": 2.423489809036255, + "learning_rate": 3.766132093199146e-05, + "loss": 2.3983, + "step": 1780 + }, + { + "epoch": 0.5866029890073696, + "grad_norm": 2.0370967388153076, + "learning_rate": 3.761034931870386e-05, + "loss": 2.3665, + "step": 1781 + }, + { + "epoch": 0.5869323562106303, + "grad_norm": 2.6825292110443115, + "learning_rate": 3.7559391419890414e-05, + "loss": 2.561, + "step": 1782 + }, + { + "epoch": 0.587261723413891, + "grad_norm": 2.5416064262390137, + "learning_rate": 3.7508447291957956e-05, + "loss": 2.5464, + "step": 1783 + }, + { + "epoch": 0.5875910906171518, + "grad_norm": 2.385080099105835, + "learning_rate": 3.7457516991298036e-05, + "loss": 2.143, + "step": 1784 + }, + { + "epoch": 0.5879204578204126, + "grad_norm": 3.0276386737823486, + "learning_rate": 3.740660057428694e-05, + "loss": 2.3312, + "step": 1785 + }, + { + "epoch": 0.5882498250236733, + "grad_norm": 2.4158527851104736, + "learning_rate": 3.735569809728556e-05, + "loss": 2.2548, + "step": 1786 + }, + { + "epoch": 0.588579192226934, + "grad_norm": 2.7139041423797607, + "learning_rate": 3.730480961663939e-05, + "loss": 2.2324, + "step": 1787 + }, + { + "epoch": 0.5889085594301947, + "grad_norm": 2.506220579147339, + "learning_rate": 3.7253935188678386e-05, + "loss": 2.2346, + "step": 1788 + }, + { + "epoch": 0.5892379266334554, + "grad_norm": 2.876122236251831, + "learning_rate": 3.720307486971697e-05, + "loss": 2.5066, + "step": 1789 + }, + { + "epoch": 0.5895672938367162, + "grad_norm": 2.6854052543640137, + "learning_rate": 3.715222871605397e-05, + "loss": 2.2645, + "step": 1790 + }, + { + "epoch": 0.589896661039977, + "grad_norm": 2.4886717796325684, + "learning_rate": 3.710139678397249e-05, + "loss": 2.0398, + "step": 1791 + }, + { + "epoch": 0.5902260282432377, + "grad_norm": 2.610877513885498, + "learning_rate": 3.7050579129739904e-05, + "loss": 2.2531, + "step": 1792 + }, + { + "epoch": 0.5905553954464984, + "grad_norm": 2.7850184440612793, + "learning_rate": 3.699977580960782e-05, + "loss": 2.1468, + "step": 1793 + }, + { + "epoch": 0.5908847626497592, + "grad_norm": 2.5817010402679443, + "learning_rate": 3.694898687981193e-05, + "loss": 2.2793, + "step": 1794 + }, + { + "epoch": 0.5912141298530199, + "grad_norm": 2.5994668006896973, + "learning_rate": 3.689821239657202e-05, + "loss": 2.0205, + "step": 1795 + }, + { + "epoch": 0.5915434970562806, + "grad_norm": 2.431635856628418, + "learning_rate": 3.684745241609189e-05, + "loss": 1.9347, + "step": 1796 + }, + { + "epoch": 0.5918728642595413, + "grad_norm": 2.6019957065582275, + "learning_rate": 3.6796706994559255e-05, + "loss": 1.982, + "step": 1797 + }, + { + "epoch": 0.592202231462802, + "grad_norm": 2.8037219047546387, + "learning_rate": 3.6745976188145755e-05, + "loss": 1.8823, + "step": 1798 + }, + { + "epoch": 0.5925315986660629, + "grad_norm": 2.723085641860962, + "learning_rate": 3.6695260053006825e-05, + "loss": 1.5098, + "step": 1799 + }, + { + "epoch": 0.5928609658693236, + "grad_norm": 2.8144564628601074, + "learning_rate": 3.664455864528169e-05, + "loss": 1.6849, + "step": 1800 + }, + { + "epoch": 0.5931903330725843, + "grad_norm": 2.097994327545166, + "learning_rate": 3.659387202109322e-05, + "loss": 2.33, + "step": 1801 + }, + { + "epoch": 0.593519700275845, + "grad_norm": 2.1393368244171143, + "learning_rate": 3.6543200236547956e-05, + "loss": 2.2933, + "step": 1802 + }, + { + "epoch": 0.5938490674791058, + "grad_norm": 2.3984479904174805, + "learning_rate": 3.649254334773604e-05, + "loss": 2.487, + "step": 1803 + }, + { + "epoch": 0.5941784346823665, + "grad_norm": 2.082158088684082, + "learning_rate": 3.6441901410731064e-05, + "loss": 2.2317, + "step": 1804 + }, + { + "epoch": 0.5945078018856272, + "grad_norm": 2.287839412689209, + "learning_rate": 3.639127448159013e-05, + "loss": 2.2207, + "step": 1805 + }, + { + "epoch": 0.594837169088888, + "grad_norm": 2.7098779678344727, + "learning_rate": 3.634066261635366e-05, + "loss": 2.7025, + "step": 1806 + }, + { + "epoch": 0.5951665362921487, + "grad_norm": 2.2493362426757812, + "learning_rate": 3.629006587104546e-05, + "loss": 2.2232, + "step": 1807 + }, + { + "epoch": 0.5954959034954095, + "grad_norm": 2.7404730319976807, + "learning_rate": 3.623948430167258e-05, + "loss": 2.2783, + "step": 1808 + }, + { + "epoch": 0.5958252706986702, + "grad_norm": 2.522005796432495, + "learning_rate": 3.6188917964225256e-05, + "loss": 2.1626, + "step": 1809 + }, + { + "epoch": 0.5961546379019309, + "grad_norm": 2.4059793949127197, + "learning_rate": 3.613836691467688e-05, + "loss": 2.338, + "step": 1810 + }, + { + "epoch": 0.5964840051051916, + "grad_norm": 2.5347237586975098, + "learning_rate": 3.608783120898392e-05, + "loss": 2.3689, + "step": 1811 + }, + { + "epoch": 0.5968133723084524, + "grad_norm": 2.6831188201904297, + "learning_rate": 3.603731090308586e-05, + "loss": 2.1, + "step": 1812 + }, + { + "epoch": 0.5971427395117131, + "grad_norm": 2.634315013885498, + "learning_rate": 3.598680605290513e-05, + "loss": 2.4177, + "step": 1813 + }, + { + "epoch": 0.5974721067149739, + "grad_norm": 2.7967567443847656, + "learning_rate": 3.593631671434706e-05, + "loss": 2.406, + "step": 1814 + }, + { + "epoch": 0.5978014739182346, + "grad_norm": 2.2884232997894287, + "learning_rate": 3.588584294329981e-05, + "loss": 1.8473, + "step": 1815 + }, + { + "epoch": 0.5981308411214953, + "grad_norm": 2.73101806640625, + "learning_rate": 3.5835384795634285e-05, + "loss": 2.1567, + "step": 1816 + }, + { + "epoch": 0.5984602083247561, + "grad_norm": 2.5716552734375, + "learning_rate": 3.578494232720413e-05, + "loss": 2.1229, + "step": 1817 + }, + { + "epoch": 0.5987895755280168, + "grad_norm": 2.678837537765503, + "learning_rate": 3.573451559384563e-05, + "loss": 2.1249, + "step": 1818 + }, + { + "epoch": 0.5991189427312775, + "grad_norm": 2.7849841117858887, + "learning_rate": 3.568410465137762e-05, + "loss": 1.8091, + "step": 1819 + }, + { + "epoch": 0.5994483099345382, + "grad_norm": 2.5732271671295166, + "learning_rate": 3.563370955560147e-05, + "loss": 2.0009, + "step": 1820 + }, + { + "epoch": 0.5997776771377991, + "grad_norm": 2.8257884979248047, + "learning_rate": 3.558333036230105e-05, + "loss": 2.0211, + "step": 1821 + }, + { + "epoch": 0.6001070443410598, + "grad_norm": 2.7145349979400635, + "learning_rate": 3.553296712724256e-05, + "loss": 2.0294, + "step": 1822 + }, + { + "epoch": 0.6004364115443205, + "grad_norm": 2.5253031253814697, + "learning_rate": 3.548261990617459e-05, + "loss": 1.754, + "step": 1823 + }, + { + "epoch": 0.6007657787475812, + "grad_norm": 3.2652945518493652, + "learning_rate": 3.543228875482796e-05, + "loss": 2.1409, + "step": 1824 + }, + { + "epoch": 0.6010951459508419, + "grad_norm": 4.042289733886719, + "learning_rate": 3.538197372891575e-05, + "loss": 2.3513, + "step": 1825 + }, + { + "epoch": 0.6014245131541027, + "grad_norm": 2.0667660236358643, + "learning_rate": 3.533167488413315e-05, + "loss": 2.6748, + "step": 1826 + }, + { + "epoch": 0.6017538803573634, + "grad_norm": 2.205390691757202, + "learning_rate": 3.528139227615744e-05, + "loss": 2.2744, + "step": 1827 + }, + { + "epoch": 0.6020832475606241, + "grad_norm": 2.3584322929382324, + "learning_rate": 3.5231125960647974e-05, + "loss": 2.0508, + "step": 1828 + }, + { + "epoch": 0.6024126147638849, + "grad_norm": 2.636195659637451, + "learning_rate": 3.5180875993246005e-05, + "loss": 2.3369, + "step": 1829 + }, + { + "epoch": 0.6027419819671456, + "grad_norm": 2.2127506732940674, + "learning_rate": 3.513064242957473e-05, + "loss": 2.2606, + "step": 1830 + }, + { + "epoch": 0.6030713491704064, + "grad_norm": 2.4520390033721924, + "learning_rate": 3.50804253252392e-05, + "loss": 2.2014, + "step": 1831 + }, + { + "epoch": 0.6034007163736671, + "grad_norm": 2.1871538162231445, + "learning_rate": 3.503022473582619e-05, + "loss": 2.0991, + "step": 1832 + }, + { + "epoch": 0.6037300835769278, + "grad_norm": 2.8282971382141113, + "learning_rate": 3.498004071690424e-05, + "loss": 2.5463, + "step": 1833 + }, + { + "epoch": 0.6040594507801885, + "grad_norm": 2.2347660064697266, + "learning_rate": 3.492987332402356e-05, + "loss": 2.189, + "step": 1834 + }, + { + "epoch": 0.6043888179834493, + "grad_norm": 2.6401114463806152, + "learning_rate": 3.487972261271594e-05, + "loss": 2.2381, + "step": 1835 + }, + { + "epoch": 0.60471818518671, + "grad_norm": 2.1282289028167725, + "learning_rate": 3.482958863849467e-05, + "loss": 2.0018, + "step": 1836 + }, + { + "epoch": 0.6050475523899708, + "grad_norm": 2.5486040115356445, + "learning_rate": 3.477947145685456e-05, + "loss": 2.2497, + "step": 1837 + }, + { + "epoch": 0.6053769195932315, + "grad_norm": 2.6828370094299316, + "learning_rate": 3.47293711232718e-05, + "loss": 2.3501, + "step": 1838 + }, + { + "epoch": 0.6057062867964922, + "grad_norm": 3.3055360317230225, + "learning_rate": 3.467928769320397e-05, + "loss": 2.3995, + "step": 1839 + }, + { + "epoch": 0.606035653999753, + "grad_norm": 2.5054500102996826, + "learning_rate": 3.462922122208989e-05, + "loss": 1.9583, + "step": 1840 + }, + { + "epoch": 0.6063650212030137, + "grad_norm": 2.4648640155792236, + "learning_rate": 3.457917176534964e-05, + "loss": 2.1108, + "step": 1841 + }, + { + "epoch": 0.6066943884062744, + "grad_norm": 2.776409149169922, + "learning_rate": 3.452913937838446e-05, + "loss": 2.2656, + "step": 1842 + }, + { + "epoch": 0.6070237556095351, + "grad_norm": 2.3845038414001465, + "learning_rate": 3.447912411657669e-05, + "loss": 1.9668, + "step": 1843 + }, + { + "epoch": 0.607353122812796, + "grad_norm": 2.431183099746704, + "learning_rate": 3.442912603528971e-05, + "loss": 2.0438, + "step": 1844 + }, + { + "epoch": 0.6076824900160567, + "grad_norm": 2.8428471088409424, + "learning_rate": 3.43791451898679e-05, + "loss": 2.3823, + "step": 1845 + }, + { + "epoch": 0.6080118572193174, + "grad_norm": 3.1004912853240967, + "learning_rate": 3.432918163563654e-05, + "loss": 2.1599, + "step": 1846 + }, + { + "epoch": 0.6083412244225781, + "grad_norm": 3.722520112991333, + "learning_rate": 3.4279235427901785e-05, + "loss": 2.3052, + "step": 1847 + }, + { + "epoch": 0.6086705916258388, + "grad_norm": 2.688524007797241, + "learning_rate": 3.422930662195058e-05, + "loss": 1.7832, + "step": 1848 + }, + { + "epoch": 0.6089999588290996, + "grad_norm": 2.663485527038574, + "learning_rate": 3.417939527305062e-05, + "loss": 1.9782, + "step": 1849 + }, + { + "epoch": 0.6093293260323603, + "grad_norm": 3.491044759750366, + "learning_rate": 3.412950143645025e-05, + "loss": 1.9132, + "step": 1850 + }, + { + "epoch": 0.609658693235621, + "grad_norm": 1.7720582485198975, + "learning_rate": 3.407962516737846e-05, + "loss": 2.376, + "step": 1851 + }, + { + "epoch": 0.6099880604388818, + "grad_norm": 2.126549005508423, + "learning_rate": 3.402976652104477e-05, + "loss": 1.8721, + "step": 1852 + }, + { + "epoch": 0.6103174276421426, + "grad_norm": 2.1321988105773926, + "learning_rate": 3.3979925552639224e-05, + "loss": 2.2343, + "step": 1853 + }, + { + "epoch": 0.6106467948454033, + "grad_norm": 2.4410877227783203, + "learning_rate": 3.3930102317332255e-05, + "loss": 2.3504, + "step": 1854 + }, + { + "epoch": 0.610976162048664, + "grad_norm": 2.425161361694336, + "learning_rate": 3.38802968702747e-05, + "loss": 2.1846, + "step": 1855 + }, + { + "epoch": 0.6113055292519247, + "grad_norm": 2.4529266357421875, + "learning_rate": 3.383050926659771e-05, + "loss": 2.3507, + "step": 1856 + }, + { + "epoch": 0.6116348964551854, + "grad_norm": 2.2834224700927734, + "learning_rate": 3.378073956141263e-05, + "loss": 2.2235, + "step": 1857 + }, + { + "epoch": 0.6119642636584463, + "grad_norm": 2.8359508514404297, + "learning_rate": 3.3730987809811064e-05, + "loss": 2.3002, + "step": 1858 + }, + { + "epoch": 0.612293630861707, + "grad_norm": 2.947082042694092, + "learning_rate": 3.368125406686472e-05, + "loss": 2.2171, + "step": 1859 + }, + { + "epoch": 0.6126229980649677, + "grad_norm": 2.474220037460327, + "learning_rate": 3.3631538387625325e-05, + "loss": 2.3359, + "step": 1860 + }, + { + "epoch": 0.6129523652682284, + "grad_norm": 2.4733357429504395, + "learning_rate": 3.3581840827124665e-05, + "loss": 2.3127, + "step": 1861 + }, + { + "epoch": 0.6132817324714892, + "grad_norm": 2.6637370586395264, + "learning_rate": 3.353216144037448e-05, + "loss": 2.2253, + "step": 1862 + }, + { + "epoch": 0.6136110996747499, + "grad_norm": 2.6768898963928223, + "learning_rate": 3.3482500282366304e-05, + "loss": 2.0104, + "step": 1863 + }, + { + "epoch": 0.6139404668780106, + "grad_norm": 2.4703683853149414, + "learning_rate": 3.3432857408071626e-05, + "loss": 2.4931, + "step": 1864 + }, + { + "epoch": 0.6142698340812713, + "grad_norm": 2.216553211212158, + "learning_rate": 3.338323287244158e-05, + "loss": 2.0299, + "step": 1865 + }, + { + "epoch": 0.614599201284532, + "grad_norm": 2.8111774921417236, + "learning_rate": 3.333362673040706e-05, + "loss": 2.5135, + "step": 1866 + }, + { + "epoch": 0.6149285684877929, + "grad_norm": 2.6286466121673584, + "learning_rate": 3.328403903687859e-05, + "loss": 2.1726, + "step": 1867 + }, + { + "epoch": 0.6152579356910536, + "grad_norm": 2.4072277545928955, + "learning_rate": 3.323446984674627e-05, + "loss": 2.1707, + "step": 1868 + }, + { + "epoch": 0.6155873028943143, + "grad_norm": 2.4936254024505615, + "learning_rate": 3.3184919214879696e-05, + "loss": 1.993, + "step": 1869 + }, + { + "epoch": 0.615916670097575, + "grad_norm": 2.5818185806274414, + "learning_rate": 3.313538719612796e-05, + "loss": 2.1305, + "step": 1870 + }, + { + "epoch": 0.6162460373008357, + "grad_norm": 2.4668569564819336, + "learning_rate": 3.308587384531953e-05, + "loss": 1.7286, + "step": 1871 + }, + { + "epoch": 0.6165754045040965, + "grad_norm": 2.7164247035980225, + "learning_rate": 3.30363792172622e-05, + "loss": 1.9667, + "step": 1872 + }, + { + "epoch": 0.6169047717073572, + "grad_norm": 2.8039917945861816, + "learning_rate": 3.2986903366743056e-05, + "loss": 1.9472, + "step": 1873 + }, + { + "epoch": 0.617234138910618, + "grad_norm": 2.910883903503418, + "learning_rate": 3.293744634852841e-05, + "loss": 2.1599, + "step": 1874 + }, + { + "epoch": 0.6175635061138787, + "grad_norm": 2.8668439388275146, + "learning_rate": 3.288800821736369e-05, + "loss": 1.6236, + "step": 1875 + }, + { + "epoch": 0.6178928733171395, + "grad_norm": 1.9064861536026, + "learning_rate": 3.2838589027973444e-05, + "loss": 2.2323, + "step": 1876 + }, + { + "epoch": 0.6182222405204002, + "grad_norm": 2.109698534011841, + "learning_rate": 3.278918883506126e-05, + "loss": 2.2206, + "step": 1877 + }, + { + "epoch": 0.6185516077236609, + "grad_norm": 2.2354044914245605, + "learning_rate": 3.2739807693309675e-05, + "loss": 2.2063, + "step": 1878 + }, + { + "epoch": 0.6188809749269216, + "grad_norm": 2.030697822570801, + "learning_rate": 3.269044565738014e-05, + "loss": 2.0355, + "step": 1879 + }, + { + "epoch": 0.6192103421301823, + "grad_norm": 2.2695322036743164, + "learning_rate": 3.2641102781912994e-05, + "loss": 2.0768, + "step": 1880 + }, + { + "epoch": 0.6195397093334432, + "grad_norm": 2.603289842605591, + "learning_rate": 3.259177912152732e-05, + "loss": 2.2277, + "step": 1881 + }, + { + "epoch": 0.6198690765367039, + "grad_norm": 2.7053635120391846, + "learning_rate": 3.254247473082094e-05, + "loss": 2.565, + "step": 1882 + }, + { + "epoch": 0.6201984437399646, + "grad_norm": 2.517655849456787, + "learning_rate": 3.249318966437037e-05, + "loss": 2.2004, + "step": 1883 + }, + { + "epoch": 0.6205278109432253, + "grad_norm": 2.3863282203674316, + "learning_rate": 3.244392397673073e-05, + "loss": 1.9685, + "step": 1884 + }, + { + "epoch": 0.6208571781464861, + "grad_norm": 2.740966320037842, + "learning_rate": 3.239467772243566e-05, + "loss": 2.052, + "step": 1885 + }, + { + "epoch": 0.6211865453497468, + "grad_norm": 2.55629301071167, + "learning_rate": 3.234545095599732e-05, + "loss": 2.1718, + "step": 1886 + }, + { + "epoch": 0.6215159125530075, + "grad_norm": 2.6892268657684326, + "learning_rate": 3.22962437319063e-05, + "loss": 2.1374, + "step": 1887 + }, + { + "epoch": 0.6218452797562682, + "grad_norm": 2.641112804412842, + "learning_rate": 3.2247056104631505e-05, + "loss": 2.2445, + "step": 1888 + }, + { + "epoch": 0.622174646959529, + "grad_norm": 2.7154104709625244, + "learning_rate": 3.2197888128620224e-05, + "loss": 2.1997, + "step": 1889 + }, + { + "epoch": 0.6225040141627898, + "grad_norm": 2.327333450317383, + "learning_rate": 3.2148739858297936e-05, + "loss": 2.0276, + "step": 1890 + }, + { + "epoch": 0.6228333813660505, + "grad_norm": 2.518122434616089, + "learning_rate": 3.209961134806836e-05, + "loss": 2.1551, + "step": 1891 + }, + { + "epoch": 0.6231627485693112, + "grad_norm": 2.6456973552703857, + "learning_rate": 3.205050265231327e-05, + "loss": 2.1652, + "step": 1892 + }, + { + "epoch": 0.6234921157725719, + "grad_norm": 2.701111316680908, + "learning_rate": 3.2001413825392574e-05, + "loss": 2.2113, + "step": 1893 + }, + { + "epoch": 0.6238214829758327, + "grad_norm": 2.5059456825256348, + "learning_rate": 3.195234492164414e-05, + "loss": 2.0026, + "step": 1894 + }, + { + "epoch": 0.6241508501790934, + "grad_norm": 2.855924367904663, + "learning_rate": 3.190329599538382e-05, + "loss": 2.1053, + "step": 1895 + }, + { + "epoch": 0.6244802173823542, + "grad_norm": 2.850864887237549, + "learning_rate": 3.185426710090534e-05, + "loss": 2.3436, + "step": 1896 + }, + { + "epoch": 0.6248095845856149, + "grad_norm": 2.7715375423431396, + "learning_rate": 3.180525829248023e-05, + "loss": 1.9533, + "step": 1897 + }, + { + "epoch": 0.6251389517888756, + "grad_norm": 3.283682107925415, + "learning_rate": 3.1756269624357806e-05, + "loss": 1.8841, + "step": 1898 + }, + { + "epoch": 0.6254683189921364, + "grad_norm": 2.5927112102508545, + "learning_rate": 3.17073011507651e-05, + "loss": 1.5226, + "step": 1899 + }, + { + "epoch": 0.6257976861953971, + "grad_norm": 3.1886491775512695, + "learning_rate": 3.165835292590675e-05, + "loss": 2.0477, + "step": 1900 + }, + { + "epoch": 0.6261270533986578, + "grad_norm": 2.009676218032837, + "learning_rate": 3.160942500396503e-05, + "loss": 2.5363, + "step": 1901 + }, + { + "epoch": 0.6264564206019185, + "grad_norm": 2.3882784843444824, + "learning_rate": 3.1560517439099715e-05, + "loss": 2.4409, + "step": 1902 + }, + { + "epoch": 0.6267857878051794, + "grad_norm": 2.1249465942382812, + "learning_rate": 3.151163028544804e-05, + "loss": 2.4448, + "step": 1903 + }, + { + "epoch": 0.6271151550084401, + "grad_norm": 2.7128207683563232, + "learning_rate": 3.146276359712466e-05, + "loss": 2.3409, + "step": 1904 + }, + { + "epoch": 0.6274445222117008, + "grad_norm": 2.4485268592834473, + "learning_rate": 3.141391742822156e-05, + "loss": 2.4835, + "step": 1905 + }, + { + "epoch": 0.6277738894149615, + "grad_norm": 2.6135315895080566, + "learning_rate": 3.136509183280805e-05, + "loss": 2.4324, + "step": 1906 + }, + { + "epoch": 0.6281032566182222, + "grad_norm": 2.25262713432312, + "learning_rate": 3.131628686493061e-05, + "loss": 2.3033, + "step": 1907 + }, + { + "epoch": 0.628432623821483, + "grad_norm": 2.3247017860412598, + "learning_rate": 3.1267502578612926e-05, + "loss": 2.1639, + "step": 1908 + }, + { + "epoch": 0.6287619910247437, + "grad_norm": 2.385807514190674, + "learning_rate": 3.121873902785579e-05, + "loss": 2.1444, + "step": 1909 + }, + { + "epoch": 0.6290913582280044, + "grad_norm": 2.7242722511291504, + "learning_rate": 3.116999626663701e-05, + "loss": 2.3064, + "step": 1910 + }, + { + "epoch": 0.6294207254312651, + "grad_norm": 2.3983850479125977, + "learning_rate": 3.112127434891143e-05, + "loss": 2.275, + "step": 1911 + }, + { + "epoch": 0.629750092634526, + "grad_norm": 2.6150460243225098, + "learning_rate": 3.107257332861078e-05, + "loss": 2.321, + "step": 1912 + }, + { + "epoch": 0.6300794598377867, + "grad_norm": 2.5469534397125244, + "learning_rate": 3.1023893259643666e-05, + "loss": 1.8771, + "step": 1913 + }, + { + "epoch": 0.6304088270410474, + "grad_norm": 2.3631060123443604, + "learning_rate": 3.0975234195895526e-05, + "loss": 1.8878, + "step": 1914 + }, + { + "epoch": 0.6307381942443081, + "grad_norm": 2.889796495437622, + "learning_rate": 3.092659619122852e-05, + "loss": 2.1742, + "step": 1915 + }, + { + "epoch": 0.6310675614475688, + "grad_norm": 2.6340792179107666, + "learning_rate": 3.087797929948151e-05, + "loss": 1.9413, + "step": 1916 + }, + { + "epoch": 0.6313969286508296, + "grad_norm": 2.559239149093628, + "learning_rate": 3.0829383574469976e-05, + "loss": 2.0802, + "step": 1917 + }, + { + "epoch": 0.6317262958540903, + "grad_norm": 2.652400016784668, + "learning_rate": 3.078080906998599e-05, + "loss": 2.24, + "step": 1918 + }, + { + "epoch": 0.6320556630573511, + "grad_norm": 2.5503621101379395, + "learning_rate": 3.073225583979812e-05, + "loss": 1.9102, + "step": 1919 + }, + { + "epoch": 0.6323850302606118, + "grad_norm": 2.8662219047546387, + "learning_rate": 3.068372393765137e-05, + "loss": 2.2393, + "step": 1920 + }, + { + "epoch": 0.6327143974638725, + "grad_norm": 3.3057799339294434, + "learning_rate": 3.063521341726717e-05, + "loss": 2.1511, + "step": 1921 + }, + { + "epoch": 0.6330437646671333, + "grad_norm": 2.57186222076416, + "learning_rate": 3.0586724332343266e-05, + "loss": 1.9987, + "step": 1922 + }, + { + "epoch": 0.633373131870394, + "grad_norm": 2.6434319019317627, + "learning_rate": 3.053825673655365e-05, + "loss": 1.7956, + "step": 1923 + }, + { + "epoch": 0.6337024990736547, + "grad_norm": 2.6144514083862305, + "learning_rate": 3.0489810683548546e-05, + "loss": 2.0272, + "step": 1924 + }, + { + "epoch": 0.6340318662769154, + "grad_norm": 3.186408281326294, + "learning_rate": 3.0441386226954372e-05, + "loss": 1.9882, + "step": 1925 + }, + { + "epoch": 0.6343612334801763, + "grad_norm": 1.8592625856399536, + "learning_rate": 3.0392983420373577e-05, + "loss": 2.4248, + "step": 1926 + }, + { + "epoch": 0.634690600683437, + "grad_norm": 2.230592966079712, + "learning_rate": 3.0344602317384695e-05, + "loss": 2.476, + "step": 1927 + }, + { + "epoch": 0.6350199678866977, + "grad_norm": 2.3445613384246826, + "learning_rate": 3.0296242971542194e-05, + "loss": 2.1382, + "step": 1928 + }, + { + "epoch": 0.6353493350899584, + "grad_norm": 2.5234224796295166, + "learning_rate": 3.024790543637648e-05, + "loss": 2.5123, + "step": 1929 + }, + { + "epoch": 0.6356787022932191, + "grad_norm": 2.1975531578063965, + "learning_rate": 3.0199589765393823e-05, + "loss": 2.071, + "step": 1930 + }, + { + "epoch": 0.6360080694964799, + "grad_norm": 2.5757479667663574, + "learning_rate": 3.015129601207627e-05, + "loss": 2.2572, + "step": 1931 + }, + { + "epoch": 0.6363374366997406, + "grad_norm": 2.23809552192688, + "learning_rate": 3.0103024229881617e-05, + "loss": 1.8197, + "step": 1932 + }, + { + "epoch": 0.6366668039030013, + "grad_norm": 2.522535562515259, + "learning_rate": 3.0054774472243346e-05, + "loss": 2.4502, + "step": 1933 + }, + { + "epoch": 0.636996171106262, + "grad_norm": 2.562291145324707, + "learning_rate": 3.0006546792570566e-05, + "loss": 2.3762, + "step": 1934 + }, + { + "epoch": 0.6373255383095229, + "grad_norm": 2.2808642387390137, + "learning_rate": 2.9958341244247913e-05, + "loss": 2.0, + "step": 1935 + }, + { + "epoch": 0.6376549055127836, + "grad_norm": 2.5651497840881348, + "learning_rate": 2.991015788063556e-05, + "loss": 2.2833, + "step": 1936 + }, + { + "epoch": 0.6379842727160443, + "grad_norm": 2.647606372833252, + "learning_rate": 2.9861996755069112e-05, + "loss": 2.1614, + "step": 1937 + }, + { + "epoch": 0.638313639919305, + "grad_norm": 3.004729747772217, + "learning_rate": 2.9813857920859544e-05, + "loss": 2.3966, + "step": 1938 + }, + { + "epoch": 0.6386430071225657, + "grad_norm": 2.419417142868042, + "learning_rate": 2.9765741431293175e-05, + "loss": 2.2321, + "step": 1939 + }, + { + "epoch": 0.6389723743258265, + "grad_norm": 2.413052558898926, + "learning_rate": 2.97176473396316e-05, + "loss": 2.2033, + "step": 1940 + }, + { + "epoch": 0.6393017415290873, + "grad_norm": 2.419487237930298, + "learning_rate": 2.9669575699111575e-05, + "loss": 2.2214, + "step": 1941 + }, + { + "epoch": 0.639631108732348, + "grad_norm": 2.965186834335327, + "learning_rate": 2.9621526562945058e-05, + "loss": 2.3811, + "step": 1942 + }, + { + "epoch": 0.6399604759356087, + "grad_norm": 2.3459784984588623, + "learning_rate": 2.9573499984319053e-05, + "loss": 2.0426, + "step": 1943 + }, + { + "epoch": 0.6402898431388695, + "grad_norm": 2.5563299655914307, + "learning_rate": 2.9525496016395637e-05, + "loss": 2.0472, + "step": 1944 + }, + { + "epoch": 0.6406192103421302, + "grad_norm": 2.555187225341797, + "learning_rate": 2.9477514712311803e-05, + "loss": 1.9391, + "step": 1945 + }, + { + "epoch": 0.6409485775453909, + "grad_norm": 2.444321632385254, + "learning_rate": 2.942955612517952e-05, + "loss": 1.6775, + "step": 1946 + }, + { + "epoch": 0.6412779447486516, + "grad_norm": 2.7038373947143555, + "learning_rate": 2.9381620308085566e-05, + "loss": 1.811, + "step": 1947 + }, + { + "epoch": 0.6416073119519123, + "grad_norm": 3.07734751701355, + "learning_rate": 2.9333707314091525e-05, + "loss": 2.1634, + "step": 1948 + }, + { + "epoch": 0.6419366791551732, + "grad_norm": 3.2388222217559814, + "learning_rate": 2.9285817196233722e-05, + "loss": 1.7891, + "step": 1949 + }, + { + "epoch": 0.6422660463584339, + "grad_norm": 3.074317216873169, + "learning_rate": 2.9237950007523164e-05, + "loss": 1.944, + "step": 1950 + }, + { + "epoch": 0.6425954135616946, + "grad_norm": 2.3127830028533936, + "learning_rate": 2.919010580094546e-05, + "loss": 2.2439, + "step": 1951 + }, + { + "epoch": 0.6429247807649553, + "grad_norm": 2.1176540851593018, + "learning_rate": 2.91422846294608e-05, + "loss": 2.2749, + "step": 1952 + }, + { + "epoch": 0.6432541479682161, + "grad_norm": 2.5770678520202637, + "learning_rate": 2.9094486546003857e-05, + "loss": 2.2961, + "step": 1953 + }, + { + "epoch": 0.6435835151714768, + "grad_norm": 2.203704595565796, + "learning_rate": 2.9046711603483766e-05, + "loss": 2.1581, + "step": 1954 + }, + { + "epoch": 0.6439128823747375, + "grad_norm": 2.189901828765869, + "learning_rate": 2.8998959854784026e-05, + "loss": 2.4594, + "step": 1955 + }, + { + "epoch": 0.6442422495779982, + "grad_norm": 2.659796714782715, + "learning_rate": 2.8951231352762486e-05, + "loss": 2.3197, + "step": 1956 + }, + { + "epoch": 0.644571616781259, + "grad_norm": 2.2193856239318848, + "learning_rate": 2.890352615025124e-05, + "loss": 2.2613, + "step": 1957 + }, + { + "epoch": 0.6449009839845198, + "grad_norm": 2.119441270828247, + "learning_rate": 2.885584430005661e-05, + "loss": 2.2418, + "step": 1958 + }, + { + "epoch": 0.6452303511877805, + "grad_norm": 2.3953168392181396, + "learning_rate": 2.8808185854959047e-05, + "loss": 2.2826, + "step": 1959 + }, + { + "epoch": 0.6455597183910412, + "grad_norm": 2.431501626968384, + "learning_rate": 2.876055086771313e-05, + "loss": 2.21, + "step": 1960 + }, + { + "epoch": 0.6458890855943019, + "grad_norm": 2.9048917293548584, + "learning_rate": 2.871293939104742e-05, + "loss": 2.5832, + "step": 1961 + }, + { + "epoch": 0.6462184527975627, + "grad_norm": 2.4806578159332275, + "learning_rate": 2.8665351477664492e-05, + "loss": 2.4109, + "step": 1962 + }, + { + "epoch": 0.6465478200008234, + "grad_norm": 2.117027521133423, + "learning_rate": 2.861778718024082e-05, + "loss": 1.8737, + "step": 1963 + }, + { + "epoch": 0.6468771872040842, + "grad_norm": 2.629340410232544, + "learning_rate": 2.8570246551426762e-05, + "loss": 2.6089, + "step": 1964 + }, + { + "epoch": 0.6472065544073449, + "grad_norm": 2.8343591690063477, + "learning_rate": 2.852272964384644e-05, + "loss": 2.2693, + "step": 1965 + }, + { + "epoch": 0.6475359216106056, + "grad_norm": 2.3691017627716064, + "learning_rate": 2.8475236510097752e-05, + "loss": 2.2384, + "step": 1966 + }, + { + "epoch": 0.6478652888138664, + "grad_norm": 2.574763536453247, + "learning_rate": 2.842776720275228e-05, + "loss": 2.1394, + "step": 1967 + }, + { + "epoch": 0.6481946560171271, + "grad_norm": 2.5222694873809814, + "learning_rate": 2.838032177435518e-05, + "loss": 1.6567, + "step": 1968 + }, + { + "epoch": 0.6485240232203878, + "grad_norm": 2.61596941947937, + "learning_rate": 2.8332900277425233e-05, + "loss": 2.2399, + "step": 1969 + }, + { + "epoch": 0.6488533904236485, + "grad_norm": 2.659245252609253, + "learning_rate": 2.8285502764454703e-05, + "loss": 1.9834, + "step": 1970 + }, + { + "epoch": 0.6491827576269092, + "grad_norm": 2.8906190395355225, + "learning_rate": 2.8238129287909314e-05, + "loss": 2.1137, + "step": 1971 + }, + { + "epoch": 0.6495121248301701, + "grad_norm": 2.697786331176758, + "learning_rate": 2.8190779900228185e-05, + "loss": 1.913, + "step": 1972 + }, + { + "epoch": 0.6498414920334308, + "grad_norm": 3.1825335025787354, + "learning_rate": 2.8143454653823787e-05, + "loss": 2.0227, + "step": 1973 + }, + { + "epoch": 0.6501708592366915, + "grad_norm": 3.1395328044891357, + "learning_rate": 2.8096153601081805e-05, + "loss": 2.05, + "step": 1974 + }, + { + "epoch": 0.6505002264399522, + "grad_norm": 3.7124152183532715, + "learning_rate": 2.8048876794361206e-05, + "loss": 2.0609, + "step": 1975 + }, + { + "epoch": 0.650829593643213, + "grad_norm": 1.9795833826065063, + "learning_rate": 2.80016242859941e-05, + "loss": 2.3731, + "step": 1976 + }, + { + "epoch": 0.6511589608464737, + "grad_norm": 2.223407745361328, + "learning_rate": 2.7954396128285698e-05, + "loss": 2.5673, + "step": 1977 + }, + { + "epoch": 0.6514883280497344, + "grad_norm": 1.970519781112671, + "learning_rate": 2.7907192373514256e-05, + "loss": 2.3043, + "step": 1978 + }, + { + "epoch": 0.6518176952529952, + "grad_norm": 2.7503855228424072, + "learning_rate": 2.7860013073931024e-05, + "loss": 2.5426, + "step": 1979 + }, + { + "epoch": 0.6521470624562559, + "grad_norm": 2.135634183883667, + "learning_rate": 2.781285828176019e-05, + "loss": 2.0703, + "step": 1980 + }, + { + "epoch": 0.6524764296595167, + "grad_norm": 2.544966220855713, + "learning_rate": 2.7765728049198768e-05, + "loss": 2.2, + "step": 1981 + }, + { + "epoch": 0.6528057968627774, + "grad_norm": 2.2923483848571777, + "learning_rate": 2.771862242841663e-05, + "loss": 2.0587, + "step": 1982 + }, + { + "epoch": 0.6531351640660381, + "grad_norm": 2.587148427963257, + "learning_rate": 2.7671541471556404e-05, + "loss": 2.542, + "step": 1983 + }, + { + "epoch": 0.6534645312692988, + "grad_norm": 2.211090564727783, + "learning_rate": 2.7624485230733403e-05, + "loss": 2.2066, + "step": 1984 + }, + { + "epoch": 0.6537938984725596, + "grad_norm": 2.4215967655181885, + "learning_rate": 2.7577453758035588e-05, + "loss": 2.0838, + "step": 1985 + }, + { + "epoch": 0.6541232656758204, + "grad_norm": 2.3949782848358154, + "learning_rate": 2.753044710552349e-05, + "loss": 2.002, + "step": 1986 + }, + { + "epoch": 0.6544526328790811, + "grad_norm": 2.762983560562134, + "learning_rate": 2.748346532523018e-05, + "loss": 2.3399, + "step": 1987 + }, + { + "epoch": 0.6547820000823418, + "grad_norm": 2.6567108631134033, + "learning_rate": 2.7436508469161195e-05, + "loss": 2.2037, + "step": 1988 + }, + { + "epoch": 0.6551113672856025, + "grad_norm": 2.348445415496826, + "learning_rate": 2.7389576589294486e-05, + "loss": 2.0968, + "step": 1989 + }, + { + "epoch": 0.6554407344888633, + "grad_norm": 2.819063663482666, + "learning_rate": 2.734266973758034e-05, + "loss": 2.4161, + "step": 1990 + }, + { + "epoch": 0.655770101692124, + "grad_norm": 2.5999984741210938, + "learning_rate": 2.7295787965941355e-05, + "loss": 2.1207, + "step": 1991 + }, + { + "epoch": 0.6560994688953847, + "grad_norm": 2.612342357635498, + "learning_rate": 2.7248931326272386e-05, + "loss": 2.018, + "step": 1992 + }, + { + "epoch": 0.6564288360986454, + "grad_norm": 2.783804416656494, + "learning_rate": 2.720209987044041e-05, + "loss": 2.2325, + "step": 1993 + }, + { + "epoch": 0.6567582033019063, + "grad_norm": 2.30511212348938, + "learning_rate": 2.7155293650284573e-05, + "loss": 1.9004, + "step": 1994 + }, + { + "epoch": 0.657087570505167, + "grad_norm": 2.7158255577087402, + "learning_rate": 2.710851271761609e-05, + "loss": 1.8252, + "step": 1995 + }, + { + "epoch": 0.6574169377084277, + "grad_norm": 2.2666115760803223, + "learning_rate": 2.7061757124218162e-05, + "loss": 1.7978, + "step": 1996 + }, + { + "epoch": 0.6577463049116884, + "grad_norm": 2.959491491317749, + "learning_rate": 2.7015026921845952e-05, + "loss": 1.9785, + "step": 1997 + }, + { + "epoch": 0.6580756721149491, + "grad_norm": 3.017894983291626, + "learning_rate": 2.696832216222654e-05, + "loss": 1.828, + "step": 1998 + }, + { + "epoch": 0.6584050393182099, + "grad_norm": 3.1254472732543945, + "learning_rate": 2.6921642897058776e-05, + "loss": 2.0924, + "step": 1999 + }, + { + "epoch": 0.6587344065214706, + "grad_norm": 2.893326997756958, + "learning_rate": 2.6874989178013345e-05, + "loss": 1.7982, + "step": 2000 + }, + { + "epoch": 0.6590637737247313, + "grad_norm": 1.9270685911178589, + "learning_rate": 2.6828361056732653e-05, + "loss": 2.466, + "step": 2001 + }, + { + "epoch": 0.6593931409279921, + "grad_norm": 1.8927141427993774, + "learning_rate": 2.678175858483075e-05, + "loss": 2.4379, + "step": 2002 + }, + { + "epoch": 0.6597225081312529, + "grad_norm": 2.279390811920166, + "learning_rate": 2.6735181813893318e-05, + "loss": 2.3186, + "step": 2003 + }, + { + "epoch": 0.6600518753345136, + "grad_norm": 2.184950828552246, + "learning_rate": 2.6688630795477554e-05, + "loss": 2.1413, + "step": 2004 + }, + { + "epoch": 0.6603812425377743, + "grad_norm": 2.0760080814361572, + "learning_rate": 2.664210558111221e-05, + "loss": 2.1418, + "step": 2005 + }, + { + "epoch": 0.660710609741035, + "grad_norm": 2.2783913612365723, + "learning_rate": 2.6595606222297376e-05, + "loss": 2.1825, + "step": 2006 + }, + { + "epoch": 0.6610399769442957, + "grad_norm": 2.2643139362335205, + "learning_rate": 2.6549132770504615e-05, + "loss": 2.1718, + "step": 2007 + }, + { + "epoch": 0.6613693441475565, + "grad_norm": 2.3673267364501953, + "learning_rate": 2.6502685277176765e-05, + "loss": 2.2925, + "step": 2008 + }, + { + "epoch": 0.6616987113508173, + "grad_norm": 2.5755388736724854, + "learning_rate": 2.6456263793727953e-05, + "loss": 2.3991, + "step": 2009 + }, + { + "epoch": 0.662028078554078, + "grad_norm": 3.254749298095703, + "learning_rate": 2.6409868371543506e-05, + "loss": 2.3747, + "step": 2010 + }, + { + "epoch": 0.6623574457573387, + "grad_norm": 2.4208810329437256, + "learning_rate": 2.636349906197991e-05, + "loss": 2.168, + "step": 2011 + }, + { + "epoch": 0.6626868129605995, + "grad_norm": 2.396467447280884, + "learning_rate": 2.631715591636471e-05, + "loss": 2.1355, + "step": 2012 + }, + { + "epoch": 0.6630161801638602, + "grad_norm": 2.6166560649871826, + "learning_rate": 2.6270838985996525e-05, + "loss": 2.1752, + "step": 2013 + }, + { + "epoch": 0.6633455473671209, + "grad_norm": 2.6235897541046143, + "learning_rate": 2.6224548322144964e-05, + "loss": 2.3333, + "step": 2014 + }, + { + "epoch": 0.6636749145703816, + "grad_norm": 2.3000566959381104, + "learning_rate": 2.6178283976050532e-05, + "loss": 2.1201, + "step": 2015 + }, + { + "epoch": 0.6640042817736423, + "grad_norm": 2.6367340087890625, + "learning_rate": 2.6132045998924616e-05, + "loss": 2.2484, + "step": 2016 + }, + { + "epoch": 0.6643336489769032, + "grad_norm": 2.8426151275634766, + "learning_rate": 2.6085834441949418e-05, + "loss": 2.3913, + "step": 2017 + }, + { + "epoch": 0.6646630161801639, + "grad_norm": 3.0466115474700928, + "learning_rate": 2.6039649356277885e-05, + "loss": 2.3497, + "step": 2018 + }, + { + "epoch": 0.6649923833834246, + "grad_norm": 2.5447306632995605, + "learning_rate": 2.599349079303367e-05, + "loss": 2.0447, + "step": 2019 + }, + { + "epoch": 0.6653217505866853, + "grad_norm": 2.6460211277008057, + "learning_rate": 2.594735880331106e-05, + "loss": 1.9458, + "step": 2020 + }, + { + "epoch": 0.665651117789946, + "grad_norm": 2.6749167442321777, + "learning_rate": 2.5901253438174938e-05, + "loss": 1.8064, + "step": 2021 + }, + { + "epoch": 0.6659804849932068, + "grad_norm": 2.8169589042663574, + "learning_rate": 2.5855174748660704e-05, + "loss": 2.2488, + "step": 2022 + }, + { + "epoch": 0.6663098521964675, + "grad_norm": 3.0470190048217773, + "learning_rate": 2.5809122785774254e-05, + "loss": 1.9864, + "step": 2023 + }, + { + "epoch": 0.6666392193997283, + "grad_norm": 2.856595277786255, + "learning_rate": 2.5763097600491847e-05, + "loss": 1.8953, + "step": 2024 + }, + { + "epoch": 0.666968586602989, + "grad_norm": 3.369244337081909, + "learning_rate": 2.5717099243760147e-05, + "loss": 1.5753, + "step": 2025 + }, + { + "epoch": 0.6672979538062498, + "grad_norm": 1.7922537326812744, + "learning_rate": 2.5671127766496105e-05, + "loss": 2.545, + "step": 2026 + }, + { + "epoch": 0.6676273210095105, + "grad_norm": 2.0254573822021484, + "learning_rate": 2.5625183219586935e-05, + "loss": 2.2592, + "step": 2027 + }, + { + "epoch": 0.6679566882127712, + "grad_norm": 2.4665722846984863, + "learning_rate": 2.5579265653890016e-05, + "loss": 2.4563, + "step": 2028 + }, + { + "epoch": 0.6682860554160319, + "grad_norm": 2.5361671447753906, + "learning_rate": 2.5533375120232885e-05, + "loss": 2.3127, + "step": 2029 + }, + { + "epoch": 0.6686154226192926, + "grad_norm": 2.1621336936950684, + "learning_rate": 2.5487511669413143e-05, + "loss": 2.2734, + "step": 2030 + }, + { + "epoch": 0.6689447898225535, + "grad_norm": 2.921595811843872, + "learning_rate": 2.5441675352198392e-05, + "loss": 2.1833, + "step": 2031 + }, + { + "epoch": 0.6692741570258142, + "grad_norm": 1.9871540069580078, + "learning_rate": 2.5395866219326224e-05, + "loss": 2.2722, + "step": 2032 + }, + { + "epoch": 0.6696035242290749, + "grad_norm": 2.5396902561187744, + "learning_rate": 2.5350084321504148e-05, + "loss": 2.3714, + "step": 2033 + }, + { + "epoch": 0.6699328914323356, + "grad_norm": 2.6981403827667236, + "learning_rate": 2.5304329709409508e-05, + "loss": 2.5758, + "step": 2034 + }, + { + "epoch": 0.6702622586355964, + "grad_norm": 2.476912498474121, + "learning_rate": 2.525860243368945e-05, + "loss": 2.1581, + "step": 2035 + }, + { + "epoch": 0.6705916258388571, + "grad_norm": 2.6519880294799805, + "learning_rate": 2.5212902544960882e-05, + "loss": 2.0924, + "step": 2036 + }, + { + "epoch": 0.6709209930421178, + "grad_norm": 2.709019660949707, + "learning_rate": 2.516723009381033e-05, + "loss": 2.3023, + "step": 2037 + }, + { + "epoch": 0.6712503602453785, + "grad_norm": 2.7613868713378906, + "learning_rate": 2.512158513079402e-05, + "loss": 2.2279, + "step": 2038 + }, + { + "epoch": 0.6715797274486393, + "grad_norm": 2.635972023010254, + "learning_rate": 2.507596770643772e-05, + "loss": 2.2506, + "step": 2039 + }, + { + "epoch": 0.6719090946519001, + "grad_norm": 2.5571701526641846, + "learning_rate": 2.5030377871236714e-05, + "loss": 1.9826, + "step": 2040 + }, + { + "epoch": 0.6722384618551608, + "grad_norm": 2.326291799545288, + "learning_rate": 2.4984815675655766e-05, + "loss": 2.1142, + "step": 2041 + }, + { + "epoch": 0.6725678290584215, + "grad_norm": 2.433256149291992, + "learning_rate": 2.4939281170129015e-05, + "loss": 1.8394, + "step": 2042 + }, + { + "epoch": 0.6728971962616822, + "grad_norm": 2.478874921798706, + "learning_rate": 2.4893774405059993e-05, + "loss": 2.0067, + "step": 2043 + }, + { + "epoch": 0.673226563464943, + "grad_norm": 2.6590943336486816, + "learning_rate": 2.4848295430821455e-05, + "loss": 1.8746, + "step": 2044 + }, + { + "epoch": 0.6735559306682037, + "grad_norm": 2.759122610092163, + "learning_rate": 2.4802844297755455e-05, + "loss": 2.5029, + "step": 2045 + }, + { + "epoch": 0.6738852978714645, + "grad_norm": 2.439481258392334, + "learning_rate": 2.4757421056173184e-05, + "loss": 1.7643, + "step": 2046 + }, + { + "epoch": 0.6742146650747252, + "grad_norm": 2.6136834621429443, + "learning_rate": 2.4712025756355033e-05, + "loss": 2.0499, + "step": 2047 + }, + { + "epoch": 0.6745440322779859, + "grad_norm": 3.705885648727417, + "learning_rate": 2.466665844855041e-05, + "loss": 2.6908, + "step": 2048 + }, + { + "epoch": 0.6748733994812467, + "grad_norm": 3.136033773422241, + "learning_rate": 2.4621319182977697e-05, + "loss": 1.8605, + "step": 2049 + }, + { + "epoch": 0.6752027666845074, + "grad_norm": 3.363560199737549, + "learning_rate": 2.457600800982431e-05, + "loss": 1.5852, + "step": 2050 + }, + { + "epoch": 0.6755321338877681, + "grad_norm": 2.1761481761932373, + "learning_rate": 2.4530724979246535e-05, + "loss": 2.4601, + "step": 2051 + }, + { + "epoch": 0.6758615010910288, + "grad_norm": 2.2820851802825928, + "learning_rate": 2.44854701413695e-05, + "loss": 2.2097, + "step": 2052 + }, + { + "epoch": 0.6761908682942896, + "grad_norm": 2.133443593978882, + "learning_rate": 2.444024354628715e-05, + "loss": 2.2759, + "step": 2053 + }, + { + "epoch": 0.6765202354975504, + "grad_norm": 2.1510062217712402, + "learning_rate": 2.4395045244062172e-05, + "loss": 2.1092, + "step": 2054 + }, + { + "epoch": 0.6768496027008111, + "grad_norm": 2.543733596801758, + "learning_rate": 2.4349875284725863e-05, + "loss": 2.52, + "step": 2055 + }, + { + "epoch": 0.6771789699040718, + "grad_norm": 2.35121488571167, + "learning_rate": 2.430473371827824e-05, + "loss": 2.5922, + "step": 2056 + }, + { + "epoch": 0.6775083371073325, + "grad_norm": 2.656498432159424, + "learning_rate": 2.425962059468783e-05, + "loss": 2.6134, + "step": 2057 + }, + { + "epoch": 0.6778377043105933, + "grad_norm": 2.513655662536621, + "learning_rate": 2.421453596389171e-05, + "loss": 2.4393, + "step": 2058 + }, + { + "epoch": 0.678167071513854, + "grad_norm": 2.9910781383514404, + "learning_rate": 2.4169479875795396e-05, + "loss": 2.5802, + "step": 2059 + }, + { + "epoch": 0.6784964387171147, + "grad_norm": 2.754699230194092, + "learning_rate": 2.4124452380272817e-05, + "loss": 2.0555, + "step": 2060 + }, + { + "epoch": 0.6788258059203754, + "grad_norm": 2.328061819076538, + "learning_rate": 2.4079453527166273e-05, + "loss": 2.4093, + "step": 2061 + }, + { + "epoch": 0.6791551731236363, + "grad_norm": 2.4432404041290283, + "learning_rate": 2.4034483366286305e-05, + "loss": 2.0952, + "step": 2062 + }, + { + "epoch": 0.679484540326897, + "grad_norm": 2.6212449073791504, + "learning_rate": 2.3989541947411735e-05, + "loss": 1.9832, + "step": 2063 + }, + { + "epoch": 0.6798139075301577, + "grad_norm": 2.5926594734191895, + "learning_rate": 2.3944629320289568e-05, + "loss": 1.9186, + "step": 2064 + }, + { + "epoch": 0.6801432747334184, + "grad_norm": 2.6798171997070312, + "learning_rate": 2.3899745534634925e-05, + "loss": 1.7767, + "step": 2065 + }, + { + "epoch": 0.6804726419366791, + "grad_norm": 2.3956661224365234, + "learning_rate": 2.3854890640131018e-05, + "loss": 2.2176, + "step": 2066 + }, + { + "epoch": 0.6808020091399399, + "grad_norm": 2.882429838180542, + "learning_rate": 2.3810064686429062e-05, + "loss": 2.1865, + "step": 2067 + }, + { + "epoch": 0.6811313763432006, + "grad_norm": 2.6350555419921875, + "learning_rate": 2.3765267723148267e-05, + "loss": 2.0534, + "step": 2068 + }, + { + "epoch": 0.6814607435464614, + "grad_norm": 3.9625144004821777, + "learning_rate": 2.3720499799875677e-05, + "loss": 2.4213, + "step": 2069 + }, + { + "epoch": 0.6817901107497221, + "grad_norm": 2.6471993923187256, + "learning_rate": 2.3675760966166276e-05, + "loss": 1.8502, + "step": 2070 + }, + { + "epoch": 0.6821194779529828, + "grad_norm": 2.543469190597534, + "learning_rate": 2.3631051271542816e-05, + "loss": 1.7004, + "step": 2071 + }, + { + "epoch": 0.6824488451562436, + "grad_norm": 3.3397600650787354, + "learning_rate": 2.358637076549578e-05, + "loss": 2.3005, + "step": 2072 + }, + { + "epoch": 0.6827782123595043, + "grad_norm": 3.1723084449768066, + "learning_rate": 2.3541719497483362e-05, + "loss": 2.3147, + "step": 2073 + }, + { + "epoch": 0.683107579562765, + "grad_norm": 2.94470477104187, + "learning_rate": 2.3497097516931398e-05, + "loss": 2.1635, + "step": 2074 + }, + { + "epoch": 0.6834369467660257, + "grad_norm": 3.1503822803497314, + "learning_rate": 2.3452504873233262e-05, + "loss": 1.9506, + "step": 2075 + }, + { + "epoch": 0.6837663139692866, + "grad_norm": 2.039581298828125, + "learning_rate": 2.3407941615749888e-05, + "loss": 2.0947, + "step": 2076 + }, + { + "epoch": 0.6840956811725473, + "grad_norm": 2.2650582790374756, + "learning_rate": 2.3363407793809666e-05, + "loss": 2.3766, + "step": 2077 + }, + { + "epoch": 0.684425048375808, + "grad_norm": 2.530653238296509, + "learning_rate": 2.3318903456708445e-05, + "loss": 2.3375, + "step": 2078 + }, + { + "epoch": 0.6847544155790687, + "grad_norm": 2.3660364151000977, + "learning_rate": 2.3274428653709412e-05, + "loss": 2.625, + "step": 2079 + }, + { + "epoch": 0.6850837827823294, + "grad_norm": 2.313480854034424, + "learning_rate": 2.3229983434043006e-05, + "loss": 2.233, + "step": 2080 + }, + { + "epoch": 0.6854131499855902, + "grad_norm": 2.3391940593719482, + "learning_rate": 2.3185567846906997e-05, + "loss": 2.1382, + "step": 2081 + }, + { + "epoch": 0.6857425171888509, + "grad_norm": 2.3489909172058105, + "learning_rate": 2.3141181941466312e-05, + "loss": 1.8627, + "step": 2082 + }, + { + "epoch": 0.6860718843921116, + "grad_norm": 2.5285491943359375, + "learning_rate": 2.3096825766853043e-05, + "loss": 2.1704, + "step": 2083 + }, + { + "epoch": 0.6864012515953724, + "grad_norm": 2.5529744625091553, + "learning_rate": 2.3052499372166366e-05, + "loss": 2.0689, + "step": 2084 + }, + { + "epoch": 0.6867306187986332, + "grad_norm": 2.355783462524414, + "learning_rate": 2.300820280647248e-05, + "loss": 2.1035, + "step": 2085 + }, + { + "epoch": 0.6870599860018939, + "grad_norm": 2.845242738723755, + "learning_rate": 2.29639361188046e-05, + "loss": 2.3495, + "step": 2086 + }, + { + "epoch": 0.6873893532051546, + "grad_norm": 2.588106632232666, + "learning_rate": 2.2919699358162817e-05, + "loss": 2.4312, + "step": 2087 + }, + { + "epoch": 0.6877187204084153, + "grad_norm": 2.835247755050659, + "learning_rate": 2.2875492573514123e-05, + "loss": 2.4685, + "step": 2088 + }, + { + "epoch": 0.688048087611676, + "grad_norm": 2.293931722640991, + "learning_rate": 2.2831315813792336e-05, + "loss": 1.9804, + "step": 2089 + }, + { + "epoch": 0.6883774548149368, + "grad_norm": 2.318871259689331, + "learning_rate": 2.2787169127898027e-05, + "loss": 1.7806, + "step": 2090 + }, + { + "epoch": 0.6887068220181976, + "grad_norm": 2.732576370239258, + "learning_rate": 2.2743052564698487e-05, + "loss": 2.298, + "step": 2091 + }, + { + "epoch": 0.6890361892214583, + "grad_norm": 2.569328784942627, + "learning_rate": 2.2698966173027663e-05, + "loss": 2.3241, + "step": 2092 + }, + { + "epoch": 0.689365556424719, + "grad_norm": 2.7927603721618652, + "learning_rate": 2.2654910001686076e-05, + "loss": 2.0825, + "step": 2093 + }, + { + "epoch": 0.6896949236279798, + "grad_norm": 2.9026594161987305, + "learning_rate": 2.261088409944082e-05, + "loss": 2.0577, + "step": 2094 + }, + { + "epoch": 0.6900242908312405, + "grad_norm": 3.1268606185913086, + "learning_rate": 2.2566888515025498e-05, + "loss": 2.2409, + "step": 2095 + }, + { + "epoch": 0.6903536580345012, + "grad_norm": 2.6002843379974365, + "learning_rate": 2.252292329714012e-05, + "loss": 1.8221, + "step": 2096 + }, + { + "epoch": 0.6906830252377619, + "grad_norm": 3.102250337600708, + "learning_rate": 2.2478988494451102e-05, + "loss": 1.8374, + "step": 2097 + }, + { + "epoch": 0.6910123924410226, + "grad_norm": 2.739004611968994, + "learning_rate": 2.2435084155591195e-05, + "loss": 1.8381, + "step": 2098 + }, + { + "epoch": 0.6913417596442835, + "grad_norm": 3.433546781539917, + "learning_rate": 2.2391210329159433e-05, + "loss": 2.4603, + "step": 2099 + }, + { + "epoch": 0.6916711268475442, + "grad_norm": 3.0737321376800537, + "learning_rate": 2.234736706372103e-05, + "loss": 1.8963, + "step": 2100 + }, + { + "epoch": 0.6920004940508049, + "grad_norm": 1.8924542665481567, + "learning_rate": 2.2303554407807426e-05, + "loss": 2.3809, + "step": 2101 + }, + { + "epoch": 0.6923298612540656, + "grad_norm": 2.463632583618164, + "learning_rate": 2.2259772409916153e-05, + "loss": 2.5391, + "step": 2102 + }, + { + "epoch": 0.6926592284573264, + "grad_norm": 2.449293375015259, + "learning_rate": 2.2216021118510815e-05, + "loss": 2.3246, + "step": 2103 + }, + { + "epoch": 0.6929885956605871, + "grad_norm": 2.3340203762054443, + "learning_rate": 2.2172300582021022e-05, + "loss": 2.3163, + "step": 2104 + }, + { + "epoch": 0.6933179628638478, + "grad_norm": 2.3864715099334717, + "learning_rate": 2.2128610848842336e-05, + "loss": 2.5004, + "step": 2105 + }, + { + "epoch": 0.6936473300671085, + "grad_norm": 2.2127561569213867, + "learning_rate": 2.208495196733625e-05, + "loss": 2.1619, + "step": 2106 + }, + { + "epoch": 0.6939766972703693, + "grad_norm": 2.7245256900787354, + "learning_rate": 2.2041323985830027e-05, + "loss": 2.3766, + "step": 2107 + }, + { + "epoch": 0.6943060644736301, + "grad_norm": 2.3172929286956787, + "learning_rate": 2.1997726952616836e-05, + "loss": 2.2989, + "step": 2108 + }, + { + "epoch": 0.6946354316768908, + "grad_norm": 2.4857938289642334, + "learning_rate": 2.1954160915955525e-05, + "loss": 2.3859, + "step": 2109 + }, + { + "epoch": 0.6949647988801515, + "grad_norm": 2.176250457763672, + "learning_rate": 2.1910625924070623e-05, + "loss": 2.0642, + "step": 2110 + }, + { + "epoch": 0.6952941660834122, + "grad_norm": 2.457257032394409, + "learning_rate": 2.186712202515234e-05, + "loss": 2.092, + "step": 2111 + }, + { + "epoch": 0.695623533286673, + "grad_norm": 3.2521281242370605, + "learning_rate": 2.1823649267356412e-05, + "loss": 2.2814, + "step": 2112 + }, + { + "epoch": 0.6959529004899337, + "grad_norm": 2.5254688262939453, + "learning_rate": 2.1780207698804134e-05, + "loss": 2.0433, + "step": 2113 + }, + { + "epoch": 0.6962822676931945, + "grad_norm": 2.5245742797851562, + "learning_rate": 2.1736797367582284e-05, + "loss": 2.1671, + "step": 2114 + }, + { + "epoch": 0.6966116348964552, + "grad_norm": 2.781444787979126, + "learning_rate": 2.169341832174306e-05, + "loss": 2.3001, + "step": 2115 + }, + { + "epoch": 0.6969410020997159, + "grad_norm": 2.258967876434326, + "learning_rate": 2.1650070609304002e-05, + "loss": 1.9225, + "step": 2116 + }, + { + "epoch": 0.6972703693029767, + "grad_norm": 2.339409112930298, + "learning_rate": 2.1606754278248025e-05, + "loss": 2.0533, + "step": 2117 + }, + { + "epoch": 0.6975997365062374, + "grad_norm": 2.8763694763183594, + "learning_rate": 2.1563469376523228e-05, + "loss": 1.9008, + "step": 2118 + }, + { + "epoch": 0.6979291037094981, + "grad_norm": 2.7212729454040527, + "learning_rate": 2.152021595204297e-05, + "loss": 2.1429, + "step": 2119 + }, + { + "epoch": 0.6982584709127588, + "grad_norm": 2.8974947929382324, + "learning_rate": 2.1476994052685766e-05, + "loss": 1.9576, + "step": 2120 + }, + { + "epoch": 0.6985878381160195, + "grad_norm": 2.508470058441162, + "learning_rate": 2.1433803726295227e-05, + "loss": 1.931, + "step": 2121 + }, + { + "epoch": 0.6989172053192804, + "grad_norm": 2.526657819747925, + "learning_rate": 2.1390645020680006e-05, + "loss": 1.7588, + "step": 2122 + }, + { + "epoch": 0.6992465725225411, + "grad_norm": 2.8346455097198486, + "learning_rate": 2.1347517983613773e-05, + "loss": 2.1006, + "step": 2123 + }, + { + "epoch": 0.6995759397258018, + "grad_norm": 3.421928882598877, + "learning_rate": 2.1304422662835146e-05, + "loss": 2.4451, + "step": 2124 + }, + { + "epoch": 0.6999053069290625, + "grad_norm": 2.873753547668457, + "learning_rate": 2.126135910604758e-05, + "loss": 1.7832, + "step": 2125 + }, + { + "epoch": 0.7002346741323233, + "grad_norm": 2.716782331466675, + "learning_rate": 2.1218327360919438e-05, + "loss": 2.4388, + "step": 2126 + }, + { + "epoch": 0.700564041335584, + "grad_norm": 2.3297483921051025, + "learning_rate": 2.1175327475083844e-05, + "loss": 2.5349, + "step": 2127 + }, + { + "epoch": 0.7008934085388447, + "grad_norm": 2.592759847640991, + "learning_rate": 2.1132359496138648e-05, + "loss": 2.3805, + "step": 2128 + }, + { + "epoch": 0.7012227757421055, + "grad_norm": 2.579590082168579, + "learning_rate": 2.108942347164639e-05, + "loss": 2.4302, + "step": 2129 + }, + { + "epoch": 0.7015521429453662, + "grad_norm": 2.581580877304077, + "learning_rate": 2.104651944913426e-05, + "loss": 2.2602, + "step": 2130 + }, + { + "epoch": 0.701881510148627, + "grad_norm": 2.675879955291748, + "learning_rate": 2.1003647476093962e-05, + "loss": 2.706, + "step": 2131 + }, + { + "epoch": 0.7022108773518877, + "grad_norm": 2.9450440406799316, + "learning_rate": 2.0960807599981785e-05, + "loss": 2.5483, + "step": 2132 + }, + { + "epoch": 0.7025402445551484, + "grad_norm": 2.4769771099090576, + "learning_rate": 2.0917999868218457e-05, + "loss": 2.2321, + "step": 2133 + }, + { + "epoch": 0.7028696117584091, + "grad_norm": 2.61974835395813, + "learning_rate": 2.087522432818914e-05, + "loss": 2.3689, + "step": 2134 + }, + { + "epoch": 0.7031989789616699, + "grad_norm": 2.3466622829437256, + "learning_rate": 2.0832481027243357e-05, + "loss": 2.2311, + "step": 2135 + }, + { + "epoch": 0.7035283461649307, + "grad_norm": 2.35425066947937, + "learning_rate": 2.0789770012694937e-05, + "loss": 2.0013, + "step": 2136 + }, + { + "epoch": 0.7038577133681914, + "grad_norm": 2.2261617183685303, + "learning_rate": 2.0747091331822005e-05, + "loss": 1.9601, + "step": 2137 + }, + { + "epoch": 0.7041870805714521, + "grad_norm": 2.35845685005188, + "learning_rate": 2.0704445031866803e-05, + "loss": 1.9554, + "step": 2138 + }, + { + "epoch": 0.7045164477747128, + "grad_norm": 2.579216480255127, + "learning_rate": 2.066183116003586e-05, + "loss": 2.0605, + "step": 2139 + }, + { + "epoch": 0.7048458149779736, + "grad_norm": 2.4616470336914062, + "learning_rate": 2.0619249763499708e-05, + "loss": 2.1647, + "step": 2140 + }, + { + "epoch": 0.7051751821812343, + "grad_norm": 2.7063252925872803, + "learning_rate": 2.057670088939298e-05, + "loss": 2.1963, + "step": 2141 + }, + { + "epoch": 0.705504549384495, + "grad_norm": 2.8242859840393066, + "learning_rate": 2.053418458481431e-05, + "loss": 2.2085, + "step": 2142 + }, + { + "epoch": 0.7058339165877557, + "grad_norm": 3.1535305976867676, + "learning_rate": 2.0491700896826222e-05, + "loss": 2.0284, + "step": 2143 + }, + { + "epoch": 0.7061632837910166, + "grad_norm": 2.366270065307617, + "learning_rate": 2.04492498724552e-05, + "loss": 2.1194, + "step": 2144 + }, + { + "epoch": 0.7064926509942773, + "grad_norm": 2.758836269378662, + "learning_rate": 2.0406831558691552e-05, + "loss": 2.2551, + "step": 2145 + }, + { + "epoch": 0.706822018197538, + "grad_norm": 2.47495698928833, + "learning_rate": 2.0364446002489372e-05, + "loss": 1.9995, + "step": 2146 + }, + { + "epoch": 0.7071513854007987, + "grad_norm": 2.167638063430786, + "learning_rate": 2.03220932507665e-05, + "loss": 1.8277, + "step": 2147 + }, + { + "epoch": 0.7074807526040594, + "grad_norm": 2.6053552627563477, + "learning_rate": 2.0279773350404464e-05, + "loss": 1.7099, + "step": 2148 + }, + { + "epoch": 0.7078101198073202, + "grad_norm": 2.8557825088500977, + "learning_rate": 2.0237486348248437e-05, + "loss": 1.6977, + "step": 2149 + }, + { + "epoch": 0.7081394870105809, + "grad_norm": 3.3039448261260986, + "learning_rate": 2.0195232291107125e-05, + "loss": 1.5792, + "step": 2150 + }, + { + "epoch": 0.7084688542138416, + "grad_norm": 1.8703773021697998, + "learning_rate": 2.0153011225752832e-05, + "loss": 2.436, + "step": 2151 + }, + { + "epoch": 0.7087982214171024, + "grad_norm": 1.92439866065979, + "learning_rate": 2.0110823198921314e-05, + "loss": 2.1712, + "step": 2152 + }, + { + "epoch": 0.7091275886203632, + "grad_norm": 2.654240846633911, + "learning_rate": 2.0068668257311752e-05, + "loss": 2.7122, + "step": 2153 + }, + { + "epoch": 0.7094569558236239, + "grad_norm": 2.478987455368042, + "learning_rate": 2.0026546447586715e-05, + "loss": 2.1828, + "step": 2154 + }, + { + "epoch": 0.7097863230268846, + "grad_norm": 2.456247091293335, + "learning_rate": 1.9984457816372103e-05, + "loss": 2.4922, + "step": 2155 + }, + { + "epoch": 0.7101156902301453, + "grad_norm": 2.527040958404541, + "learning_rate": 1.994240241025705e-05, + "loss": 2.265, + "step": 2156 + }, + { + "epoch": 0.710445057433406, + "grad_norm": 2.492377519607544, + "learning_rate": 1.990038027579395e-05, + "loss": 2.2534, + "step": 2157 + }, + { + "epoch": 0.7107744246366668, + "grad_norm": 3.028960943222046, + "learning_rate": 1.9858391459498367e-05, + "loss": 2.4119, + "step": 2158 + }, + { + "epoch": 0.7111037918399276, + "grad_norm": 2.4781980514526367, + "learning_rate": 1.9816436007848964e-05, + "loss": 2.2563, + "step": 2159 + }, + { + "epoch": 0.7114331590431883, + "grad_norm": 2.4208078384399414, + "learning_rate": 1.9774513967287496e-05, + "loss": 2.1364, + "step": 2160 + }, + { + "epoch": 0.711762526246449, + "grad_norm": 2.3853366374969482, + "learning_rate": 1.9732625384218705e-05, + "loss": 2.0286, + "step": 2161 + }, + { + "epoch": 0.7120918934497097, + "grad_norm": 2.075531244277954, + "learning_rate": 1.9690770305010346e-05, + "loss": 2.1623, + "step": 2162 + }, + { + "epoch": 0.7124212606529705, + "grad_norm": 2.8861477375030518, + "learning_rate": 1.9648948775993014e-05, + "loss": 2.0822, + "step": 2163 + }, + { + "epoch": 0.7127506278562312, + "grad_norm": 2.6686360836029053, + "learning_rate": 1.9607160843460225e-05, + "loss": 2.3996, + "step": 2164 + }, + { + "epoch": 0.7130799950594919, + "grad_norm": 2.642855644226074, + "learning_rate": 1.956540655366829e-05, + "loss": 2.1804, + "step": 2165 + }, + { + "epoch": 0.7134093622627526, + "grad_norm": 2.506636142730713, + "learning_rate": 1.952368595283628e-05, + "loss": 1.6528, + "step": 2166 + }, + { + "epoch": 0.7137387294660135, + "grad_norm": 2.5572757720947266, + "learning_rate": 1.9481999087145973e-05, + "loss": 1.9408, + "step": 2167 + }, + { + "epoch": 0.7140680966692742, + "grad_norm": 2.670609474182129, + "learning_rate": 1.9440346002741798e-05, + "loss": 2.2826, + "step": 2168 + }, + { + "epoch": 0.7143974638725349, + "grad_norm": 2.513737916946411, + "learning_rate": 1.939872674573081e-05, + "loss": 2.0831, + "step": 2169 + }, + { + "epoch": 0.7147268310757956, + "grad_norm": 2.594437837600708, + "learning_rate": 1.93571413621826e-05, + "loss": 2.0226, + "step": 2170 + }, + { + "epoch": 0.7150561982790563, + "grad_norm": 3.0486814975738525, + "learning_rate": 1.9315589898129266e-05, + "loss": 1.8639, + "step": 2171 + }, + { + "epoch": 0.7153855654823171, + "grad_norm": 2.8681252002716064, + "learning_rate": 1.9274072399565373e-05, + "loss": 1.9047, + "step": 2172 + }, + { + "epoch": 0.7157149326855778, + "grad_norm": 2.4376771450042725, + "learning_rate": 1.9232588912447875e-05, + "loss": 1.6213, + "step": 2173 + }, + { + "epoch": 0.7160442998888386, + "grad_norm": 2.7798798084259033, + "learning_rate": 1.9191139482696097e-05, + "loss": 1.9778, + "step": 2174 + }, + { + "epoch": 0.7163736670920993, + "grad_norm": 2.8711724281311035, + "learning_rate": 1.9149724156191618e-05, + "loss": 1.7021, + "step": 2175 + }, + { + "epoch": 0.7167030342953601, + "grad_norm": 2.1148149967193604, + "learning_rate": 1.9108342978778317e-05, + "loss": 2.3465, + "step": 2176 + }, + { + "epoch": 0.7170324014986208, + "grad_norm": 2.3499250411987305, + "learning_rate": 1.9066995996262248e-05, + "loss": 2.2567, + "step": 2177 + }, + { + "epoch": 0.7173617687018815, + "grad_norm": 2.2933461666107178, + "learning_rate": 1.902568325441163e-05, + "loss": 2.1633, + "step": 2178 + }, + { + "epoch": 0.7176911359051422, + "grad_norm": 2.383384943008423, + "learning_rate": 1.898440479895677e-05, + "loss": 2.3886, + "step": 2179 + }, + { + "epoch": 0.7180205031084029, + "grad_norm": 2.2387309074401855, + "learning_rate": 1.894316067559003e-05, + "loss": 2.02, + "step": 2180 + }, + { + "epoch": 0.7183498703116638, + "grad_norm": 2.4402246475219727, + "learning_rate": 1.890195092996573e-05, + "loss": 2.1891, + "step": 2181 + }, + { + "epoch": 0.7186792375149245, + "grad_norm": 2.534137010574341, + "learning_rate": 1.886077560770019e-05, + "loss": 2.3046, + "step": 2182 + }, + { + "epoch": 0.7190086047181852, + "grad_norm": 2.543569803237915, + "learning_rate": 1.88196347543716e-05, + "loss": 2.5529, + "step": 2183 + }, + { + "epoch": 0.7193379719214459, + "grad_norm": 2.6904938220977783, + "learning_rate": 1.8778528415519998e-05, + "loss": 1.9117, + "step": 2184 + }, + { + "epoch": 0.7196673391247067, + "grad_norm": 2.512737989425659, + "learning_rate": 1.873745663664722e-05, + "loss": 2.093, + "step": 2185 + }, + { + "epoch": 0.7199967063279674, + "grad_norm": 2.5019516944885254, + "learning_rate": 1.869641946321684e-05, + "loss": 2.3069, + "step": 2186 + }, + { + "epoch": 0.7203260735312281, + "grad_norm": 2.564549446105957, + "learning_rate": 1.8655416940654152e-05, + "loss": 1.94, + "step": 2187 + }, + { + "epoch": 0.7206554407344888, + "grad_norm": 2.498377561569214, + "learning_rate": 1.8614449114346033e-05, + "loss": 2.317, + "step": 2188 + }, + { + "epoch": 0.7209848079377495, + "grad_norm": 2.7334401607513428, + "learning_rate": 1.8573516029641015e-05, + "loss": 2.0478, + "step": 2189 + }, + { + "epoch": 0.7213141751410104, + "grad_norm": 2.2970240116119385, + "learning_rate": 1.8532617731849144e-05, + "loss": 1.6288, + "step": 2190 + }, + { + "epoch": 0.7216435423442711, + "grad_norm": 2.7691264152526855, + "learning_rate": 1.8491754266241973e-05, + "loss": 2.3471, + "step": 2191 + }, + { + "epoch": 0.7219729095475318, + "grad_norm": 2.845181941986084, + "learning_rate": 1.8450925678052495e-05, + "loss": 2.3704, + "step": 2192 + }, + { + "epoch": 0.7223022767507925, + "grad_norm": 2.658377170562744, + "learning_rate": 1.8410132012475094e-05, + "loss": 2.0011, + "step": 2193 + }, + { + "epoch": 0.7226316439540533, + "grad_norm": 2.7496695518493652, + "learning_rate": 1.8369373314665483e-05, + "loss": 1.9333, + "step": 2194 + }, + { + "epoch": 0.722961011157314, + "grad_norm": 3.0554986000061035, + "learning_rate": 1.8328649629740685e-05, + "loss": 2.1955, + "step": 2195 + }, + { + "epoch": 0.7232903783605747, + "grad_norm": 2.52851939201355, + "learning_rate": 1.8287961002778964e-05, + "loss": 2.1551, + "step": 2196 + }, + { + "epoch": 0.7236197455638355, + "grad_norm": 2.8639140129089355, + "learning_rate": 1.824730747881978e-05, + "loss": 2.0154, + "step": 2197 + }, + { + "epoch": 0.7239491127670962, + "grad_norm": 3.166149139404297, + "learning_rate": 1.8206689102863728e-05, + "loss": 2.148, + "step": 2198 + }, + { + "epoch": 0.724278479970357, + "grad_norm": 3.3647382259368896, + "learning_rate": 1.81661059198725e-05, + "loss": 2.2395, + "step": 2199 + }, + { + "epoch": 0.7246078471736177, + "grad_norm": 2.8727433681488037, + "learning_rate": 1.8125557974768837e-05, + "loss": 1.7405, + "step": 2200 + }, + { + "epoch": 0.7249372143768784, + "grad_norm": 2.2409629821777344, + "learning_rate": 1.8085045312436465e-05, + "loss": 2.6778, + "step": 2201 + }, + { + "epoch": 0.7252665815801391, + "grad_norm": 1.9362051486968994, + "learning_rate": 1.804456797772006e-05, + "loss": 2.1103, + "step": 2202 + }, + { + "epoch": 0.7255959487834, + "grad_norm": 2.2799839973449707, + "learning_rate": 1.80041260154252e-05, + "loss": 2.3589, + "step": 2203 + }, + { + "epoch": 0.7259253159866607, + "grad_norm": 2.4302492141723633, + "learning_rate": 1.796371947031829e-05, + "loss": 2.2235, + "step": 2204 + }, + { + "epoch": 0.7262546831899214, + "grad_norm": 2.615129232406616, + "learning_rate": 1.7923348387126566e-05, + "loss": 2.3434, + "step": 2205 + }, + { + "epoch": 0.7265840503931821, + "grad_norm": 2.2985572814941406, + "learning_rate": 1.788301281053794e-05, + "loss": 2.2154, + "step": 2206 + }, + { + "epoch": 0.7269134175964428, + "grad_norm": 2.6856024265289307, + "learning_rate": 1.7842712785201094e-05, + "loss": 2.4275, + "step": 2207 + }, + { + "epoch": 0.7272427847997036, + "grad_norm": 2.337125778198242, + "learning_rate": 1.7802448355725322e-05, + "loss": 2.2931, + "step": 2208 + }, + { + "epoch": 0.7275721520029643, + "grad_norm": 2.6537628173828125, + "learning_rate": 1.7762219566680528e-05, + "loss": 2.2855, + "step": 2209 + }, + { + "epoch": 0.727901519206225, + "grad_norm": 3.049884796142578, + "learning_rate": 1.7722026462597153e-05, + "loss": 2.5902, + "step": 2210 + }, + { + "epoch": 0.7282308864094857, + "grad_norm": 2.5096991062164307, + "learning_rate": 1.768186908796617e-05, + "loss": 2.1373, + "step": 2211 + }, + { + "epoch": 0.7285602536127465, + "grad_norm": 2.4747989177703857, + "learning_rate": 1.764174748723893e-05, + "loss": 2.626, + "step": 2212 + }, + { + "epoch": 0.7288896208160073, + "grad_norm": 2.4736328125, + "learning_rate": 1.7601661704827253e-05, + "loss": 2.303, + "step": 2213 + }, + { + "epoch": 0.729218988019268, + "grad_norm": 2.823103666305542, + "learning_rate": 1.7561611785103294e-05, + "loss": 2.1824, + "step": 2214 + }, + { + "epoch": 0.7295483552225287, + "grad_norm": 2.7458789348602295, + "learning_rate": 1.7521597772399496e-05, + "loss": 1.7698, + "step": 2215 + }, + { + "epoch": 0.7298777224257894, + "grad_norm": 2.636065721511841, + "learning_rate": 1.748161971100856e-05, + "loss": 2.2537, + "step": 2216 + }, + { + "epoch": 0.7302070896290502, + "grad_norm": 2.7386367321014404, + "learning_rate": 1.7441677645183413e-05, + "loss": 2.2158, + "step": 2217 + }, + { + "epoch": 0.7305364568323109, + "grad_norm": 2.3262336254119873, + "learning_rate": 1.740177161913712e-05, + "loss": 1.9395, + "step": 2218 + }, + { + "epoch": 0.7308658240355717, + "grad_norm": 2.5685794353485107, + "learning_rate": 1.736190167704283e-05, + "loss": 2.0057, + "step": 2219 + }, + { + "epoch": 0.7311951912388324, + "grad_norm": 2.8580093383789062, + "learning_rate": 1.732206786303378e-05, + "loss": 2.178, + "step": 2220 + }, + { + "epoch": 0.7315245584420931, + "grad_norm": 2.475217580795288, + "learning_rate": 1.7282270221203213e-05, + "loss": 1.9048, + "step": 2221 + }, + { + "epoch": 0.7318539256453539, + "grad_norm": 2.6721620559692383, + "learning_rate": 1.7242508795604324e-05, + "loss": 1.9798, + "step": 2222 + }, + { + "epoch": 0.7321832928486146, + "grad_norm": 3.082043409347534, + "learning_rate": 1.720278363025022e-05, + "loss": 2.1289, + "step": 2223 + }, + { + "epoch": 0.7325126600518753, + "grad_norm": 3.0245370864868164, + "learning_rate": 1.716309476911388e-05, + "loss": 1.8127, + "step": 2224 + }, + { + "epoch": 0.732842027255136, + "grad_norm": 3.863074779510498, + "learning_rate": 1.7123442256128097e-05, + "loss": 1.9633, + "step": 2225 + }, + { + "epoch": 0.7331713944583969, + "grad_norm": 1.7904800176620483, + "learning_rate": 1.7083826135185393e-05, + "loss": 2.3501, + "step": 2226 + }, + { + "epoch": 0.7335007616616576, + "grad_norm": 2.111262083053589, + "learning_rate": 1.7044246450138053e-05, + "loss": 2.4573, + "step": 2227 + }, + { + "epoch": 0.7338301288649183, + "grad_norm": 2.706902503967285, + "learning_rate": 1.700470324479801e-05, + "loss": 2.0959, + "step": 2228 + }, + { + "epoch": 0.734159496068179, + "grad_norm": 2.2584450244903564, + "learning_rate": 1.6965196562936796e-05, + "loss": 2.2161, + "step": 2229 + }, + { + "epoch": 0.7344888632714397, + "grad_norm": 2.040581226348877, + "learning_rate": 1.6925726448285588e-05, + "loss": 2.0735, + "step": 2230 + }, + { + "epoch": 0.7348182304747005, + "grad_norm": 2.670949697494507, + "learning_rate": 1.6886292944534994e-05, + "loss": 2.2781, + "step": 2231 + }, + { + "epoch": 0.7351475976779612, + "grad_norm": 2.4085488319396973, + "learning_rate": 1.6846896095335146e-05, + "loss": 2.1216, + "step": 2232 + }, + { + "epoch": 0.7354769648812219, + "grad_norm": 2.76834774017334, + "learning_rate": 1.680753594429559e-05, + "loss": 2.4634, + "step": 2233 + }, + { + "epoch": 0.7358063320844827, + "grad_norm": 2.398747444152832, + "learning_rate": 1.6768212534985257e-05, + "loss": 2.1542, + "step": 2234 + }, + { + "epoch": 0.7361356992877435, + "grad_norm": 2.2678730487823486, + "learning_rate": 1.672892591093239e-05, + "loss": 2.2039, + "step": 2235 + }, + { + "epoch": 0.7364650664910042, + "grad_norm": 2.4844462871551514, + "learning_rate": 1.6689676115624563e-05, + "loss": 2.0138, + "step": 2236 + }, + { + "epoch": 0.7367944336942649, + "grad_norm": 2.4144296646118164, + "learning_rate": 1.6650463192508496e-05, + "loss": 1.9737, + "step": 2237 + }, + { + "epoch": 0.7371238008975256, + "grad_norm": 3.1505680084228516, + "learning_rate": 1.6611287184990172e-05, + "loss": 2.4489, + "step": 2238 + }, + { + "epoch": 0.7374531681007863, + "grad_norm": 2.5178029537200928, + "learning_rate": 1.6572148136434678e-05, + "loss": 2.2807, + "step": 2239 + }, + { + "epoch": 0.7377825353040471, + "grad_norm": 2.765470266342163, + "learning_rate": 1.6533046090166195e-05, + "loss": 2.3461, + "step": 2240 + }, + { + "epoch": 0.7381119025073078, + "grad_norm": 2.7750508785247803, + "learning_rate": 1.6493981089467943e-05, + "loss": 2.2532, + "step": 2241 + }, + { + "epoch": 0.7384412697105686, + "grad_norm": 2.7615838050842285, + "learning_rate": 1.645495317758214e-05, + "loss": 2.0369, + "step": 2242 + }, + { + "epoch": 0.7387706369138293, + "grad_norm": 2.5372447967529297, + "learning_rate": 1.641596239770996e-05, + "loss": 2.1105, + "step": 2243 + }, + { + "epoch": 0.7391000041170901, + "grad_norm": 2.304556369781494, + "learning_rate": 1.6377008793011433e-05, + "loss": 1.6536, + "step": 2244 + }, + { + "epoch": 0.7394293713203508, + "grad_norm": 2.7731292247772217, + "learning_rate": 1.633809240660548e-05, + "loss": 1.9395, + "step": 2245 + }, + { + "epoch": 0.7397587385236115, + "grad_norm": 2.9680540561676025, + "learning_rate": 1.6299213281569815e-05, + "loss": 1.7278, + "step": 2246 + }, + { + "epoch": 0.7400881057268722, + "grad_norm": 2.882687568664551, + "learning_rate": 1.626037146094089e-05, + "loss": 1.9297, + "step": 2247 + }, + { + "epoch": 0.7404174729301329, + "grad_norm": 3.0989298820495605, + "learning_rate": 1.6221566987713893e-05, + "loss": 2.0538, + "step": 2248 + }, + { + "epoch": 0.7407468401333938, + "grad_norm": 3.140981435775757, + "learning_rate": 1.618279990484266e-05, + "loss": 2.1415, + "step": 2249 + }, + { + "epoch": 0.7410762073366545, + "grad_norm": 3.206493616104126, + "learning_rate": 1.6144070255239597e-05, + "loss": 2.1208, + "step": 2250 + }, + { + "epoch": 0.7414055745399152, + "grad_norm": 2.1097187995910645, + "learning_rate": 1.6105378081775735e-05, + "loss": 2.5872, + "step": 2251 + }, + { + "epoch": 0.7417349417431759, + "grad_norm": 2.210814952850342, + "learning_rate": 1.6066723427280582e-05, + "loss": 2.6126, + "step": 2252 + }, + { + "epoch": 0.7420643089464367, + "grad_norm": 2.039604902267456, + "learning_rate": 1.6028106334542144e-05, + "loss": 2.2881, + "step": 2253 + }, + { + "epoch": 0.7423936761496974, + "grad_norm": 2.2926268577575684, + "learning_rate": 1.5989526846306824e-05, + "loss": 2.4734, + "step": 2254 + }, + { + "epoch": 0.7427230433529581, + "grad_norm": 2.4341986179351807, + "learning_rate": 1.5950985005279413e-05, + "loss": 2.3901, + "step": 2255 + }, + { + "epoch": 0.7430524105562188, + "grad_norm": 2.212266206741333, + "learning_rate": 1.5912480854123042e-05, + "loss": 2.5487, + "step": 2256 + }, + { + "epoch": 0.7433817777594796, + "grad_norm": 2.5183675289154053, + "learning_rate": 1.5874014435459068e-05, + "loss": 2.3042, + "step": 2257 + }, + { + "epoch": 0.7437111449627404, + "grad_norm": 2.696521759033203, + "learning_rate": 1.5835585791867135e-05, + "loss": 2.2133, + "step": 2258 + }, + { + "epoch": 0.7440405121660011, + "grad_norm": 2.3256208896636963, + "learning_rate": 1.579719496588506e-05, + "loss": 2.2106, + "step": 2259 + }, + { + "epoch": 0.7443698793692618, + "grad_norm": 2.5763232707977295, + "learning_rate": 1.5758842000008772e-05, + "loss": 2.4074, + "step": 2260 + }, + { + "epoch": 0.7446992465725225, + "grad_norm": 2.66152024269104, + "learning_rate": 1.572052693669237e-05, + "loss": 1.9538, + "step": 2261 + }, + { + "epoch": 0.7450286137757832, + "grad_norm": 3.3669986724853516, + "learning_rate": 1.568224981834789e-05, + "loss": 2.4397, + "step": 2262 + }, + { + "epoch": 0.745357980979044, + "grad_norm": 2.784435510635376, + "learning_rate": 1.5644010687345433e-05, + "loss": 2.4693, + "step": 2263 + }, + { + "epoch": 0.7456873481823048, + "grad_norm": 2.511077404022217, + "learning_rate": 1.5605809586013033e-05, + "loss": 2.2116, + "step": 2264 + }, + { + "epoch": 0.7460167153855655, + "grad_norm": 2.888571262359619, + "learning_rate": 1.556764655663662e-05, + "loss": 2.5169, + "step": 2265 + }, + { + "epoch": 0.7463460825888262, + "grad_norm": 2.7775685787200928, + "learning_rate": 1.552952164146001e-05, + "loss": 1.8714, + "step": 2266 + }, + { + "epoch": 0.746675449792087, + "grad_norm": 2.7570314407348633, + "learning_rate": 1.5491434882684796e-05, + "loss": 2.1423, + "step": 2267 + }, + { + "epoch": 0.7470048169953477, + "grad_norm": 3.36289381980896, + "learning_rate": 1.545338632247037e-05, + "loss": 2.1092, + "step": 2268 + }, + { + "epoch": 0.7473341841986084, + "grad_norm": 2.6311471462249756, + "learning_rate": 1.541537600293378e-05, + "loss": 1.8818, + "step": 2269 + }, + { + "epoch": 0.7476635514018691, + "grad_norm": 2.5641837120056152, + "learning_rate": 1.5377403966149806e-05, + "loss": 1.7303, + "step": 2270 + }, + { + "epoch": 0.7479929186051298, + "grad_norm": 2.628680467605591, + "learning_rate": 1.533947025415083e-05, + "loss": 2.2235, + "step": 2271 + }, + { + "epoch": 0.7483222858083907, + "grad_norm": 3.235398530960083, + "learning_rate": 1.53015749089268e-05, + "loss": 1.8414, + "step": 2272 + }, + { + "epoch": 0.7486516530116514, + "grad_norm": 2.6925714015960693, + "learning_rate": 1.5263717972425222e-05, + "loss": 1.6467, + "step": 2273 + }, + { + "epoch": 0.7489810202149121, + "grad_norm": 2.408013343811035, + "learning_rate": 1.5225899486551065e-05, + "loss": 1.8224, + "step": 2274 + }, + { + "epoch": 0.7493103874181728, + "grad_norm": 3.5030288696289062, + "learning_rate": 1.5188119493166726e-05, + "loss": 2.0291, + "step": 2275 + }, + { + "epoch": 0.7496397546214336, + "grad_norm": 2.0207650661468506, + "learning_rate": 1.515037803409201e-05, + "loss": 2.4186, + "step": 2276 + }, + { + "epoch": 0.7499691218246943, + "grad_norm": 2.019650459289551, + "learning_rate": 1.5112675151104066e-05, + "loss": 2.1816, + "step": 2277 + }, + { + "epoch": 0.7499691218246943, + "eval_loss": 2.19006085395813, + "eval_runtime": 765.4555, + "eval_samples_per_second": 3.34, + "eval_steps_per_second": 1.671, + "step": 2277 } ], "logging_steps": 1, @@ -10676,7 +15997,7 @@ "attributes": {} } }, - "total_flos": 2.328291179772248e+18, + "total_flos": 3.492436769658372e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null