fats-fme commited on
Commit
48ecec5
·
verified ·
1 Parent(s): a02ef87

Training in progress, step 408, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c8fcbb49be7893a3ac047c3a09c2e300cbc843f6b72defd8b98454ca6dba9ad9
3
  size 50503544
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5cc49cd2123962dccbe325a2bceb54ce95be21d864954094864c3a779fd90cf9
3
  size 50503544
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b96774311287360aebaca06f4e781428939128e58ed508068bacc973180dc24f
3
  size 101184122
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:773b86455e5f22c5da806f865a8d957c30131f2ceaac157c9d1818f781c9d876
3
  size 101184122
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dd18590bfb7e62654b0957a084b14842e40731607c690baee2df476a987b1bc1
3
  size 14512
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:20768a83e56b6fa769e574419f07c86e79c791425ce3347059f718b93fcccfcd
3
  size 14512
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bad1627fa01b19f0bf5517efdc57830b50689393d14677984fa30f4582c06c01
3
  size 14512
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:844d07a2596435d36c66584eee737f312e21cc8a87613c0830623fb4b3e18b1d
3
  size 14512
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8b8325157143ebc846997ab8494d2a09a51a0f40db80b2396dabb4facfd2ad01
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a830cc09aea0d73a752aadc4db39e884877eb126ff93c26e59113e94cd6ce260
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.2507682851874616,
5
  "eval_steps": 204,
6
- "global_step": 204,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -1451,6 +1451,1442 @@
1451
  "eval_samples_per_second": 10.473,
1452
  "eval_steps_per_second": 5.244,
1453
  "step": 204
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1454
  }
1455
  ],
1456
  "logging_steps": 1,
@@ -1470,7 +2906,7 @@
1470
  "attributes": {}
1471
  }
1472
  },
1473
- "total_flos": 4.204971437116621e+16,
1474
  "train_batch_size": 1,
1475
  "trial_name": null,
1476
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.5015365703749232,
5
  "eval_steps": 204,
6
+ "global_step": 408,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
1451
  "eval_samples_per_second": 10.473,
1452
  "eval_steps_per_second": 5.244,
1453
  "step": 204
1454
+ },
1455
+ {
1456
+ "epoch": 0.2519975414874001,
1457
+ "grad_norm": 0.3105545938014984,
1458
+ "learning_rate": 9.015846028376462e-05,
1459
+ "loss": 1.2827,
1460
+ "step": 205
1461
+ },
1462
+ {
1463
+ "epoch": 0.25322679778733864,
1464
+ "grad_norm": 0.2826372981071472,
1465
+ "learning_rate": 9.00354723283191e-05,
1466
+ "loss": 1.1159,
1467
+ "step": 206
1468
+ },
1469
+ {
1470
+ "epoch": 0.2544560540872772,
1471
+ "grad_norm": 0.2823708951473236,
1472
+ "learning_rate": 8.991180564605086e-05,
1473
+ "loss": 1.0368,
1474
+ "step": 207
1475
+ },
1476
+ {
1477
+ "epoch": 0.25568531038721576,
1478
+ "grad_norm": 0.28265297412872314,
1479
+ "learning_rate": 8.978746233349802e-05,
1480
+ "loss": 1.1583,
1481
+ "step": 208
1482
+ },
1483
+ {
1484
+ "epoch": 0.2569145666871543,
1485
+ "grad_norm": 0.3202212452888489,
1486
+ "learning_rate": 8.966244449866973e-05,
1487
+ "loss": 1.2069,
1488
+ "step": 209
1489
+ },
1490
+ {
1491
+ "epoch": 0.2581438229870928,
1492
+ "grad_norm": 0.30576291680336,
1493
+ "learning_rate": 8.953675426101038e-05,
1494
+ "loss": 1.1588,
1495
+ "step": 210
1496
+ },
1497
+ {
1498
+ "epoch": 0.25937307928703135,
1499
+ "grad_norm": 0.3853960633277893,
1500
+ "learning_rate": 8.941039375136371e-05,
1501
+ "loss": 1.1947,
1502
+ "step": 211
1503
+ },
1504
+ {
1505
+ "epoch": 0.2606023355869699,
1506
+ "grad_norm": 0.4404067099094391,
1507
+ "learning_rate": 8.928336511193669e-05,
1508
+ "loss": 1.0786,
1509
+ "step": 212
1510
+ },
1511
+ {
1512
+ "epoch": 0.2618315918869084,
1513
+ "grad_norm": 0.422333300113678,
1514
+ "learning_rate": 8.915567049626315e-05,
1515
+ "loss": 1.1454,
1516
+ "step": 213
1517
+ },
1518
+ {
1519
+ "epoch": 0.26306084818684694,
1520
+ "grad_norm": 0.5277565121650696,
1521
+ "learning_rate": 8.902731206916734e-05,
1522
+ "loss": 0.7775,
1523
+ "step": 214
1524
+ },
1525
+ {
1526
+ "epoch": 0.26429010448678547,
1527
+ "grad_norm": 0.7032243609428406,
1528
+ "learning_rate": 8.889829200672719e-05,
1529
+ "loss": 0.5771,
1530
+ "step": 215
1531
+ },
1532
+ {
1533
+ "epoch": 0.26551936078672406,
1534
+ "grad_norm": 0.6663339734077454,
1535
+ "learning_rate": 8.876861249623739e-05,
1536
+ "loss": 0.616,
1537
+ "step": 216
1538
+ },
1539
+ {
1540
+ "epoch": 0.2667486170866626,
1541
+ "grad_norm": 0.8129518628120422,
1542
+ "learning_rate": 8.863827573617238e-05,
1543
+ "loss": 1.1483,
1544
+ "step": 217
1545
+ },
1546
+ {
1547
+ "epoch": 0.2679778733866011,
1548
+ "grad_norm": 1.0273211002349854,
1549
+ "learning_rate": 8.850728393614902e-05,
1550
+ "loss": 1.2066,
1551
+ "step": 218
1552
+ },
1553
+ {
1554
+ "epoch": 0.26920712968653965,
1555
+ "grad_norm": 1.5424954891204834,
1556
+ "learning_rate": 8.837563931688919e-05,
1557
+ "loss": 1.247,
1558
+ "step": 219
1559
+ },
1560
+ {
1561
+ "epoch": 0.2704363859864782,
1562
+ "grad_norm": 2.9167752265930176,
1563
+ "learning_rate": 8.824334411018204e-05,
1564
+ "loss": 1.3413,
1565
+ "step": 220
1566
+ },
1567
+ {
1568
+ "epoch": 0.2716656422864167,
1569
+ "grad_norm": 5.498292446136475,
1570
+ "learning_rate": 8.811040055884629e-05,
1571
+ "loss": 1.0072,
1572
+ "step": 221
1573
+ },
1574
+ {
1575
+ "epoch": 0.27289489858635524,
1576
+ "grad_norm": 3.1687686443328857,
1577
+ "learning_rate": 8.797681091669206e-05,
1578
+ "loss": 1.3309,
1579
+ "step": 222
1580
+ },
1581
+ {
1582
+ "epoch": 0.27412415488629377,
1583
+ "grad_norm": 2.760160446166992,
1584
+ "learning_rate": 8.784257744848279e-05,
1585
+ "loss": 1.5268,
1586
+ "step": 223
1587
+ },
1588
+ {
1589
+ "epoch": 0.27535341118623236,
1590
+ "grad_norm": 2.3323326110839844,
1591
+ "learning_rate": 8.770770242989679e-05,
1592
+ "loss": 1.27,
1593
+ "step": 224
1594
+ },
1595
+ {
1596
+ "epoch": 0.2765826674861709,
1597
+ "grad_norm": 2.150510549545288,
1598
+ "learning_rate": 8.75721881474886e-05,
1599
+ "loss": 1.0602,
1600
+ "step": 225
1601
+ },
1602
+ {
1603
+ "epoch": 0.2778119237861094,
1604
+ "grad_norm": 0.23049846291542053,
1605
+ "learning_rate": 8.743603689865039e-05,
1606
+ "loss": 1.0067,
1607
+ "step": 226
1608
+ },
1609
+ {
1610
+ "epoch": 0.27904118008604795,
1611
+ "grad_norm": 0.2650708556175232,
1612
+ "learning_rate": 8.729925099157281e-05,
1613
+ "loss": 1.1932,
1614
+ "step": 227
1615
+ },
1616
+ {
1617
+ "epoch": 0.2802704363859865,
1618
+ "grad_norm": 0.2723963260650635,
1619
+ "learning_rate": 8.7161832745206e-05,
1620
+ "loss": 1.2495,
1621
+ "step": 228
1622
+ },
1623
+ {
1624
+ "epoch": 0.281499692685925,
1625
+ "grad_norm": 0.26627010107040405,
1626
+ "learning_rate": 8.702378448922026e-05,
1627
+ "loss": 1.2837,
1628
+ "step": 229
1629
+ },
1630
+ {
1631
+ "epoch": 0.28272894898586354,
1632
+ "grad_norm": 0.2728361189365387,
1633
+ "learning_rate": 8.688510856396648e-05,
1634
+ "loss": 1.2969,
1635
+ "step": 230
1636
+ },
1637
+ {
1638
+ "epoch": 0.28395820528580207,
1639
+ "grad_norm": 0.26788559556007385,
1640
+ "learning_rate": 8.674580732043656e-05,
1641
+ "loss": 1.0944,
1642
+ "step": 231
1643
+ },
1644
+ {
1645
+ "epoch": 0.28518746158574065,
1646
+ "grad_norm": 0.3129604160785675,
1647
+ "learning_rate": 8.660588312022344e-05,
1648
+ "loss": 1.3591,
1649
+ "step": 232
1650
+ },
1651
+ {
1652
+ "epoch": 0.2864167178856792,
1653
+ "grad_norm": 0.32250627875328064,
1654
+ "learning_rate": 8.646533833548119e-05,
1655
+ "loss": 1.1469,
1656
+ "step": 233
1657
+ },
1658
+ {
1659
+ "epoch": 0.2876459741856177,
1660
+ "grad_norm": 0.32614386081695557,
1661
+ "learning_rate": 8.632417534888473e-05,
1662
+ "loss": 1.3551,
1663
+ "step": 234
1664
+ },
1665
+ {
1666
+ "epoch": 0.28887523048555624,
1667
+ "grad_norm": 0.3620636463165283,
1668
+ "learning_rate": 8.61823965535894e-05,
1669
+ "loss": 1.1427,
1670
+ "step": 235
1671
+ },
1672
+ {
1673
+ "epoch": 0.2901044867854948,
1674
+ "grad_norm": 0.39082473516464233,
1675
+ "learning_rate": 8.604000435319047e-05,
1676
+ "loss": 1.0041,
1677
+ "step": 236
1678
+ },
1679
+ {
1680
+ "epoch": 0.2913337430854333,
1681
+ "grad_norm": 0.3823097050189972,
1682
+ "learning_rate": 8.589700116168232e-05,
1683
+ "loss": 1.1756,
1684
+ "step": 237
1685
+ },
1686
+ {
1687
+ "epoch": 0.29256299938537184,
1688
+ "grad_norm": 0.5359341502189636,
1689
+ "learning_rate": 8.575338940341757e-05,
1690
+ "loss": 1.1814,
1691
+ "step": 238
1692
+ },
1693
+ {
1694
+ "epoch": 0.29379225568531037,
1695
+ "grad_norm": 0.6902546286582947,
1696
+ "learning_rate": 8.560917151306593e-05,
1697
+ "loss": 0.9253,
1698
+ "step": 239
1699
+ },
1700
+ {
1701
+ "epoch": 0.29502151198524895,
1702
+ "grad_norm": 0.7236252427101135,
1703
+ "learning_rate": 8.5464349935573e-05,
1704
+ "loss": 0.6398,
1705
+ "step": 240
1706
+ },
1707
+ {
1708
+ "epoch": 0.2962507682851875,
1709
+ "grad_norm": 0.7172759175300598,
1710
+ "learning_rate": 8.53189271261187e-05,
1711
+ "loss": 0.9061,
1712
+ "step": 241
1713
+ },
1714
+ {
1715
+ "epoch": 0.297480024585126,
1716
+ "grad_norm": 0.7999723553657532,
1717
+ "learning_rate": 8.517290555007578e-05,
1718
+ "loss": 1.0691,
1719
+ "step": 242
1720
+ },
1721
+ {
1722
+ "epoch": 0.29870928088506454,
1723
+ "grad_norm": 1.235872745513916,
1724
+ "learning_rate": 8.502628768296788e-05,
1725
+ "loss": 1.5235,
1726
+ "step": 243
1727
+ },
1728
+ {
1729
+ "epoch": 0.2999385371850031,
1730
+ "grad_norm": 1.9676207304000854,
1731
+ "learning_rate": 8.487907601042777e-05,
1732
+ "loss": 1.5859,
1733
+ "step": 244
1734
+ },
1735
+ {
1736
+ "epoch": 0.3011677934849416,
1737
+ "grad_norm": 3.5035860538482666,
1738
+ "learning_rate": 8.473127302815496e-05,
1739
+ "loss": 1.1743,
1740
+ "step": 245
1741
+ },
1742
+ {
1743
+ "epoch": 0.30239704978488013,
1744
+ "grad_norm": 4.519472599029541,
1745
+ "learning_rate": 8.458288124187359e-05,
1746
+ "loss": 0.7165,
1747
+ "step": 246
1748
+ },
1749
+ {
1750
+ "epoch": 0.30362630608481866,
1751
+ "grad_norm": 2.3718838691711426,
1752
+ "learning_rate": 8.443390316728987e-05,
1753
+ "loss": 1.1449,
1754
+ "step": 247
1755
+ },
1756
+ {
1757
+ "epoch": 0.3048555623847572,
1758
+ "grad_norm": 2.1668829917907715,
1759
+ "learning_rate": 8.428434133004937e-05,
1760
+ "loss": 1.0383,
1761
+ "step": 248
1762
+ },
1763
+ {
1764
+ "epoch": 0.3060848186846958,
1765
+ "grad_norm": 3.2350733280181885,
1766
+ "learning_rate": 8.413419826569435e-05,
1767
+ "loss": 1.2341,
1768
+ "step": 249
1769
+ },
1770
+ {
1771
+ "epoch": 0.3073140749846343,
1772
+ "grad_norm": 2.3541886806488037,
1773
+ "learning_rate": 8.398347651962064e-05,
1774
+ "loss": 1.0355,
1775
+ "step": 250
1776
+ },
1777
+ {
1778
+ "epoch": 0.30854333128457284,
1779
+ "grad_norm": 0.2730487883090973,
1780
+ "learning_rate": 8.383217864703456e-05,
1781
+ "loss": 1.2813,
1782
+ "step": 251
1783
+ },
1784
+ {
1785
+ "epoch": 0.30977258758451137,
1786
+ "grad_norm": 0.2517383098602295,
1787
+ "learning_rate": 8.36803072129096e-05,
1788
+ "loss": 1.1793,
1789
+ "step": 252
1790
+ },
1791
+ {
1792
+ "epoch": 0.3110018438844499,
1793
+ "grad_norm": 0.28486472368240356,
1794
+ "learning_rate": 8.352786479194288e-05,
1795
+ "loss": 1.4065,
1796
+ "step": 253
1797
+ },
1798
+ {
1799
+ "epoch": 0.31223110018438843,
1800
+ "grad_norm": 0.3247184455394745,
1801
+ "learning_rate": 8.337485396851155e-05,
1802
+ "loss": 1.4863,
1803
+ "step": 254
1804
+ },
1805
+ {
1806
+ "epoch": 0.31346035648432696,
1807
+ "grad_norm": 0.26896461844444275,
1808
+ "learning_rate": 8.322127733662897e-05,
1809
+ "loss": 1.1373,
1810
+ "step": 255
1811
+ },
1812
+ {
1813
+ "epoch": 0.3146896127842655,
1814
+ "grad_norm": 0.29333245754241943,
1815
+ "learning_rate": 8.306713749990072e-05,
1816
+ "loss": 1.0615,
1817
+ "step": 256
1818
+ },
1819
+ {
1820
+ "epoch": 0.3159188690842041,
1821
+ "grad_norm": 0.2958793640136719,
1822
+ "learning_rate": 8.291243707148048e-05,
1823
+ "loss": 0.9392,
1824
+ "step": 257
1825
+ },
1826
+ {
1827
+ "epoch": 0.3171481253841426,
1828
+ "grad_norm": 0.3320540487766266,
1829
+ "learning_rate": 8.275717867402575e-05,
1830
+ "loss": 1.2935,
1831
+ "step": 258
1832
+ },
1833
+ {
1834
+ "epoch": 0.31837738168408114,
1835
+ "grad_norm": 0.3567339777946472,
1836
+ "learning_rate": 8.260136493965326e-05,
1837
+ "loss": 1.0954,
1838
+ "step": 259
1839
+ },
1840
+ {
1841
+ "epoch": 0.31960663798401967,
1842
+ "grad_norm": 0.38393881916999817,
1843
+ "learning_rate": 8.244499850989452e-05,
1844
+ "loss": 1.045,
1845
+ "step": 260
1846
+ },
1847
+ {
1848
+ "epoch": 0.3208358942839582,
1849
+ "grad_norm": 0.41993001103401184,
1850
+ "learning_rate": 8.228808203565095e-05,
1851
+ "loss": 1.2225,
1852
+ "step": 261
1853
+ },
1854
+ {
1855
+ "epoch": 0.32206515058389673,
1856
+ "grad_norm": 0.6547941565513611,
1857
+ "learning_rate": 8.213061817714893e-05,
1858
+ "loss": 0.9286,
1859
+ "step": 262
1860
+ },
1861
+ {
1862
+ "epoch": 0.32329440688383526,
1863
+ "grad_norm": 0.7117279767990112,
1864
+ "learning_rate": 8.197260960389474e-05,
1865
+ "loss": 0.5088,
1866
+ "step": 263
1867
+ },
1868
+ {
1869
+ "epoch": 0.3245236631837738,
1870
+ "grad_norm": 0.7041743993759155,
1871
+ "learning_rate": 8.181405899462926e-05,
1872
+ "loss": 0.8899,
1873
+ "step": 264
1874
+ },
1875
+ {
1876
+ "epoch": 0.3257529194837124,
1877
+ "grad_norm": 0.7142787575721741,
1878
+ "learning_rate": 8.16549690372826e-05,
1879
+ "loss": 0.7447,
1880
+ "step": 265
1881
+ },
1882
+ {
1883
+ "epoch": 0.3269821757836509,
1884
+ "grad_norm": 0.8879908323287964,
1885
+ "learning_rate": 8.14953424289285e-05,
1886
+ "loss": 1.2607,
1887
+ "step": 266
1888
+ },
1889
+ {
1890
+ "epoch": 0.32821143208358944,
1891
+ "grad_norm": 0.9387282133102417,
1892
+ "learning_rate": 8.133518187573862e-05,
1893
+ "loss": 1.1611,
1894
+ "step": 267
1895
+ },
1896
+ {
1897
+ "epoch": 0.32944068838352797,
1898
+ "grad_norm": 1.4039078950881958,
1899
+ "learning_rate": 8.117449009293668e-05,
1900
+ "loss": 0.9947,
1901
+ "step": 268
1902
+ },
1903
+ {
1904
+ "epoch": 0.3306699446834665,
1905
+ "grad_norm": 3.3686740398406982,
1906
+ "learning_rate": 8.101326980475237e-05,
1907
+ "loss": 1.0783,
1908
+ "step": 269
1909
+ },
1910
+ {
1911
+ "epoch": 0.33189920098340503,
1912
+ "grad_norm": 2.8384785652160645,
1913
+ "learning_rate": 8.085152374437525e-05,
1914
+ "loss": 0.9008,
1915
+ "step": 270
1916
+ },
1917
+ {
1918
+ "epoch": 0.33312845728334356,
1919
+ "grad_norm": 2.453441619873047,
1920
+ "learning_rate": 8.06892546539083e-05,
1921
+ "loss": 0.5504,
1922
+ "step": 271
1923
+ },
1924
+ {
1925
+ "epoch": 0.3343577135832821,
1926
+ "grad_norm": 2.592667579650879,
1927
+ "learning_rate": 8.052646528432158e-05,
1928
+ "loss": 0.7489,
1929
+ "step": 272
1930
+ },
1931
+ {
1932
+ "epoch": 0.3355869698832207,
1933
+ "grad_norm": 1.9753395318984985,
1934
+ "learning_rate": 8.036315839540545e-05,
1935
+ "loss": 0.9747,
1936
+ "step": 273
1937
+ },
1938
+ {
1939
+ "epoch": 0.3368162261831592,
1940
+ "grad_norm": 3.042698860168457,
1941
+ "learning_rate": 8.019933675572389e-05,
1942
+ "loss": 1.6841,
1943
+ "step": 274
1944
+ },
1945
+ {
1946
+ "epoch": 0.33804548248309774,
1947
+ "grad_norm": 2.4343316555023193,
1948
+ "learning_rate": 8.00350031425675e-05,
1949
+ "loss": 0.869,
1950
+ "step": 275
1951
+ },
1952
+ {
1953
+ "epoch": 0.33927473878303627,
1954
+ "grad_norm": 0.2026144415140152,
1955
+ "learning_rate": 7.98701603419064e-05,
1956
+ "loss": 0.8867,
1957
+ "step": 276
1958
+ },
1959
+ {
1960
+ "epoch": 0.3405039950829748,
1961
+ "grad_norm": 0.24370141327381134,
1962
+ "learning_rate": 7.970481114834312e-05,
1963
+ "loss": 1.3135,
1964
+ "step": 277
1965
+ },
1966
+ {
1967
+ "epoch": 0.34173325138291333,
1968
+ "grad_norm": 0.22894087433815002,
1969
+ "learning_rate": 7.953895836506508e-05,
1970
+ "loss": 1.0986,
1971
+ "step": 278
1972
+ },
1973
+ {
1974
+ "epoch": 0.34296250768285186,
1975
+ "grad_norm": 0.2533970773220062,
1976
+ "learning_rate": 7.937260480379712e-05,
1977
+ "loss": 1.1821,
1978
+ "step": 279
1979
+ },
1980
+ {
1981
+ "epoch": 0.3441917639827904,
1982
+ "grad_norm": 0.25789350271224976,
1983
+ "learning_rate": 7.920575328475385e-05,
1984
+ "loss": 1.1414,
1985
+ "step": 280
1986
+ },
1987
+ {
1988
+ "epoch": 0.345421020282729,
1989
+ "grad_norm": 0.28820541501045227,
1990
+ "learning_rate": 7.903840663659186e-05,
1991
+ "loss": 1.3332,
1992
+ "step": 281
1993
+ },
1994
+ {
1995
+ "epoch": 0.3466502765826675,
1996
+ "grad_norm": 0.28611505031585693,
1997
+ "learning_rate": 7.887056769636165e-05,
1998
+ "loss": 1.0901,
1999
+ "step": 282
2000
+ },
2001
+ {
2002
+ "epoch": 0.34787953288260604,
2003
+ "grad_norm": 0.28022873401641846,
2004
+ "learning_rate": 7.870223930945972e-05,
2005
+ "loss": 0.8461,
2006
+ "step": 283
2007
+ },
2008
+ {
2009
+ "epoch": 0.34910878918254457,
2010
+ "grad_norm": 0.3246136009693146,
2011
+ "learning_rate": 7.853342432958013e-05,
2012
+ "loss": 0.9325,
2013
+ "step": 284
2014
+ },
2015
+ {
2016
+ "epoch": 0.3503380454824831,
2017
+ "grad_norm": 0.3149406611919403,
2018
+ "learning_rate": 7.836412561866629e-05,
2019
+ "loss": 1.013,
2020
+ "step": 285
2021
+ },
2022
+ {
2023
+ "epoch": 0.3515673017824216,
2024
+ "grad_norm": 0.3745490610599518,
2025
+ "learning_rate": 7.819434604686228e-05,
2026
+ "loss": 1.2624,
2027
+ "step": 286
2028
+ },
2029
+ {
2030
+ "epoch": 0.35279655808236016,
2031
+ "grad_norm": 0.4822925329208374,
2032
+ "learning_rate": 7.802408849246442e-05,
2033
+ "loss": 1.2424,
2034
+ "step": 287
2035
+ },
2036
+ {
2037
+ "epoch": 0.3540258143822987,
2038
+ "grad_norm": 0.6210641264915466,
2039
+ "learning_rate": 7.785335584187219e-05,
2040
+ "loss": 1.2527,
2041
+ "step": 288
2042
+ },
2043
+ {
2044
+ "epoch": 0.3552550706822373,
2045
+ "grad_norm": 0.6488444805145264,
2046
+ "learning_rate": 7.768215098953952e-05,
2047
+ "loss": 0.7986,
2048
+ "step": 289
2049
+ },
2050
+ {
2051
+ "epoch": 0.3564843269821758,
2052
+ "grad_norm": 0.760388195514679,
2053
+ "learning_rate": 7.751047683792561e-05,
2054
+ "loss": 1.0136,
2055
+ "step": 290
2056
+ },
2057
+ {
2058
+ "epoch": 0.35771358328211433,
2059
+ "grad_norm": 0.7666548490524292,
2060
+ "learning_rate": 7.73383362974458e-05,
2061
+ "loss": 0.8205,
2062
+ "step": 291
2063
+ },
2064
+ {
2065
+ "epoch": 0.35894283958205286,
2066
+ "grad_norm": 0.7492078542709351,
2067
+ "learning_rate": 7.71657322864221e-05,
2068
+ "loss": 0.9254,
2069
+ "step": 292
2070
+ },
2071
+ {
2072
+ "epoch": 0.3601720958819914,
2073
+ "grad_norm": 0.9061193466186523,
2074
+ "learning_rate": 7.699266773103389e-05,
2075
+ "loss": 1.3013,
2076
+ "step": 293
2077
+ },
2078
+ {
2079
+ "epoch": 0.3614013521819299,
2080
+ "grad_norm": 2.1404013633728027,
2081
+ "learning_rate": 7.681914556526817e-05,
2082
+ "loss": 1.5957,
2083
+ "step": 294
2084
+ },
2085
+ {
2086
+ "epoch": 0.36263060848186845,
2087
+ "grad_norm": 2.647864580154419,
2088
+ "learning_rate": 7.664516873086987e-05,
2089
+ "loss": 1.1658,
2090
+ "step": 295
2091
+ },
2092
+ {
2093
+ "epoch": 0.363859864781807,
2094
+ "grad_norm": 3.0906460285186768,
2095
+ "learning_rate": 7.647074017729202e-05,
2096
+ "loss": 1.1344,
2097
+ "step": 296
2098
+ },
2099
+ {
2100
+ "epoch": 0.36508912108174557,
2101
+ "grad_norm": 2.2348814010620117,
2102
+ "learning_rate": 7.629586286164565e-05,
2103
+ "loss": 0.8813,
2104
+ "step": 297
2105
+ },
2106
+ {
2107
+ "epoch": 0.3663183773816841,
2108
+ "grad_norm": 2.937446117401123,
2109
+ "learning_rate": 7.612053974864976e-05,
2110
+ "loss": 1.0414,
2111
+ "step": 298
2112
+ },
2113
+ {
2114
+ "epoch": 0.36754763368162263,
2115
+ "grad_norm": 2.5343546867370605,
2116
+ "learning_rate": 7.594477381058098e-05,
2117
+ "loss": 1.1847,
2118
+ "step": 299
2119
+ },
2120
+ {
2121
+ "epoch": 0.36877688998156116,
2122
+ "grad_norm": 2.8971638679504395,
2123
+ "learning_rate": 7.576856802722325e-05,
2124
+ "loss": 0.9029,
2125
+ "step": 300
2126
+ },
2127
+ {
2128
+ "epoch": 0.3700061462814997,
2129
+ "grad_norm": 0.1982557773590088,
2130
+ "learning_rate": 7.559192538581722e-05,
2131
+ "loss": 0.9314,
2132
+ "step": 301
2133
+ },
2134
+ {
2135
+ "epoch": 0.3712354025814382,
2136
+ "grad_norm": 0.24721381068229675,
2137
+ "learning_rate": 7.541484888100974e-05,
2138
+ "loss": 1.2432,
2139
+ "step": 302
2140
+ },
2141
+ {
2142
+ "epoch": 0.37246465888137675,
2143
+ "grad_norm": 0.24999506771564484,
2144
+ "learning_rate": 7.523734151480289e-05,
2145
+ "loss": 1.285,
2146
+ "step": 303
2147
+ },
2148
+ {
2149
+ "epoch": 0.3736939151813153,
2150
+ "grad_norm": 0.267764151096344,
2151
+ "learning_rate": 7.505940629650326e-05,
2152
+ "loss": 1.198,
2153
+ "step": 304
2154
+ },
2155
+ {
2156
+ "epoch": 0.3749231714812538,
2157
+ "grad_norm": 0.26003679633140564,
2158
+ "learning_rate": 7.488104624267091e-05,
2159
+ "loss": 1.2001,
2160
+ "step": 305
2161
+ },
2162
+ {
2163
+ "epoch": 0.3761524277811924,
2164
+ "grad_norm": 0.28197526931762695,
2165
+ "learning_rate": 7.470226437706813e-05,
2166
+ "loss": 1.1687,
2167
+ "step": 306
2168
+ },
2169
+ {
2170
+ "epoch": 0.37738168408113093,
2171
+ "grad_norm": 0.29367661476135254,
2172
+ "learning_rate": 7.452306373060829e-05,
2173
+ "loss": 1.211,
2174
+ "step": 307
2175
+ },
2176
+ {
2177
+ "epoch": 0.37861094038106946,
2178
+ "grad_norm": 0.2982727885246277,
2179
+ "learning_rate": 7.434344734130437e-05,
2180
+ "loss": 1.151,
2181
+ "step": 308
2182
+ },
2183
+ {
2184
+ "epoch": 0.379840196681008,
2185
+ "grad_norm": 0.3283758759498596,
2186
+ "learning_rate": 7.416341825421754e-05,
2187
+ "loss": 0.9875,
2188
+ "step": 309
2189
+ },
2190
+ {
2191
+ "epoch": 0.3810694529809465,
2192
+ "grad_norm": 0.32420334219932556,
2193
+ "learning_rate": 7.398297952140544e-05,
2194
+ "loss": 1.0796,
2195
+ "step": 310
2196
+ },
2197
+ {
2198
+ "epoch": 0.38229870928088505,
2199
+ "grad_norm": 0.4046980142593384,
2200
+ "learning_rate": 7.380213420187055e-05,
2201
+ "loss": 1.1158,
2202
+ "step": 311
2203
+ },
2204
+ {
2205
+ "epoch": 0.3835279655808236,
2206
+ "grad_norm": 0.391736775636673,
2207
+ "learning_rate": 7.36208853615082e-05,
2208
+ "loss": 1.1682,
2209
+ "step": 312
2210
+ },
2211
+ {
2212
+ "epoch": 0.3847572218807621,
2213
+ "grad_norm": 0.6027556657791138,
2214
+ "learning_rate": 7.343923607305471e-05,
2215
+ "loss": 1.0696,
2216
+ "step": 313
2217
+ },
2218
+ {
2219
+ "epoch": 0.3859864781807007,
2220
+ "grad_norm": 0.6483603119850159,
2221
+ "learning_rate": 7.325718941603527e-05,
2222
+ "loss": 0.7843,
2223
+ "step": 314
2224
+ },
2225
+ {
2226
+ "epoch": 0.38721573448063923,
2227
+ "grad_norm": 0.6711483001708984,
2228
+ "learning_rate": 7.307474847671168e-05,
2229
+ "loss": 0.7247,
2230
+ "step": 315
2231
+ },
2232
+ {
2233
+ "epoch": 0.38844499078057776,
2234
+ "grad_norm": 0.7372632026672363,
2235
+ "learning_rate": 7.289191634803003e-05,
2236
+ "loss": 1.0535,
2237
+ "step": 316
2238
+ },
2239
+ {
2240
+ "epoch": 0.3896742470805163,
2241
+ "grad_norm": 0.7427420020103455,
2242
+ "learning_rate": 7.270869612956835e-05,
2243
+ "loss": 1.0563,
2244
+ "step": 317
2245
+ },
2246
+ {
2247
+ "epoch": 0.3909035033804548,
2248
+ "grad_norm": 2.6449501514434814,
2249
+ "learning_rate": 7.252509092748401e-05,
2250
+ "loss": 1.3099,
2251
+ "step": 318
2252
+ },
2253
+ {
2254
+ "epoch": 0.39213275968039335,
2255
+ "grad_norm": 3.1938464641571045,
2256
+ "learning_rate": 7.234110385446103e-05,
2257
+ "loss": 1.1728,
2258
+ "step": 319
2259
+ },
2260
+ {
2261
+ "epoch": 0.3933620159803319,
2262
+ "grad_norm": 2.584103584289551,
2263
+ "learning_rate": 7.215673802965734e-05,
2264
+ "loss": 0.792,
2265
+ "step": 320
2266
+ },
2267
+ {
2268
+ "epoch": 0.3945912722802704,
2269
+ "grad_norm": 2.358025074005127,
2270
+ "learning_rate": 7.197199657865195e-05,
2271
+ "loss": 1.0462,
2272
+ "step": 321
2273
+ },
2274
+ {
2275
+ "epoch": 0.395820528580209,
2276
+ "grad_norm": 2.9621617794036865,
2277
+ "learning_rate": 7.178688263339184e-05,
2278
+ "loss": 1.4222,
2279
+ "step": 322
2280
+ },
2281
+ {
2282
+ "epoch": 0.3970497848801475,
2283
+ "grad_norm": 2.5362660884857178,
2284
+ "learning_rate": 7.160139933213898e-05,
2285
+ "loss": 1.1527,
2286
+ "step": 323
2287
+ },
2288
+ {
2289
+ "epoch": 0.39827904118008606,
2290
+ "grad_norm": 2.4901375770568848,
2291
+ "learning_rate": 7.141554981941709e-05,
2292
+ "loss": 1.1712,
2293
+ "step": 324
2294
+ },
2295
+ {
2296
+ "epoch": 0.3995082974800246,
2297
+ "grad_norm": 2.9214236736297607,
2298
+ "learning_rate": 7.12293372459583e-05,
2299
+ "loss": 1.1977,
2300
+ "step": 325
2301
+ },
2302
+ {
2303
+ "epoch": 0.4007375537799631,
2304
+ "grad_norm": 0.24753543734550476,
2305
+ "learning_rate": 7.104276476864974e-05,
2306
+ "loss": 1.2176,
2307
+ "step": 326
2308
+ },
2309
+ {
2310
+ "epoch": 0.40196681007990165,
2311
+ "grad_norm": 0.25986090302467346,
2312
+ "learning_rate": 7.085583555048008e-05,
2313
+ "loss": 1.2854,
2314
+ "step": 327
2315
+ },
2316
+ {
2317
+ "epoch": 0.4031960663798402,
2318
+ "grad_norm": 0.2640175521373749,
2319
+ "learning_rate": 7.066855276048587e-05,
2320
+ "loss": 1.2204,
2321
+ "step": 328
2322
+ },
2323
+ {
2324
+ "epoch": 0.4044253226797787,
2325
+ "grad_norm": 0.2603614330291748,
2326
+ "learning_rate": 7.048091957369776e-05,
2327
+ "loss": 1.2621,
2328
+ "step": 329
2329
+ },
2330
+ {
2331
+ "epoch": 0.4056545789797173,
2332
+ "grad_norm": 0.2921195924282074,
2333
+ "learning_rate": 7.029293917108678e-05,
2334
+ "loss": 1.281,
2335
+ "step": 330
2336
+ },
2337
+ {
2338
+ "epoch": 0.4068838352796558,
2339
+ "grad_norm": 0.2984941899776459,
2340
+ "learning_rate": 7.010461473951033e-05,
2341
+ "loss": 1.071,
2342
+ "step": 331
2343
+ },
2344
+ {
2345
+ "epoch": 0.40811309157959436,
2346
+ "grad_norm": 0.31219175457954407,
2347
+ "learning_rate": 6.991594947165818e-05,
2348
+ "loss": 1.3161,
2349
+ "step": 332
2350
+ },
2351
+ {
2352
+ "epoch": 0.4093423478795329,
2353
+ "grad_norm": 0.31329602003097534,
2354
+ "learning_rate": 6.972694656599834e-05,
2355
+ "loss": 0.9854,
2356
+ "step": 333
2357
+ },
2358
+ {
2359
+ "epoch": 0.4105716041794714,
2360
+ "grad_norm": 0.3356671929359436,
2361
+ "learning_rate": 6.953760922672286e-05,
2362
+ "loss": 1.02,
2363
+ "step": 334
2364
+ },
2365
+ {
2366
+ "epoch": 0.41180086047940995,
2367
+ "grad_norm": 0.3843994438648224,
2368
+ "learning_rate": 6.934794066369348e-05,
2369
+ "loss": 1.2173,
2370
+ "step": 335
2371
+ },
2372
+ {
2373
+ "epoch": 0.4130301167793485,
2374
+ "grad_norm": 0.45338544249534607,
2375
+ "learning_rate": 6.915794409238718e-05,
2376
+ "loss": 1.3614,
2377
+ "step": 336
2378
+ },
2379
+ {
2380
+ "epoch": 0.414259373079287,
2381
+ "grad_norm": 0.4857298731803894,
2382
+ "learning_rate": 6.896762273384178e-05,
2383
+ "loss": 1.0175,
2384
+ "step": 337
2385
+ },
2386
+ {
2387
+ "epoch": 0.4154886293792256,
2388
+ "grad_norm": 0.6512896418571472,
2389
+ "learning_rate": 6.877697981460125e-05,
2390
+ "loss": 0.6555,
2391
+ "step": 338
2392
+ },
2393
+ {
2394
+ "epoch": 0.4167178856791641,
2395
+ "grad_norm": 0.6744720935821533,
2396
+ "learning_rate": 6.858601856666094e-05,
2397
+ "loss": 0.6057,
2398
+ "step": 339
2399
+ },
2400
+ {
2401
+ "epoch": 0.41794714197910265,
2402
+ "grad_norm": 0.6527014374732971,
2403
+ "learning_rate": 6.839474222741299e-05,
2404
+ "loss": 0.9116,
2405
+ "step": 340
2406
+ },
2407
+ {
2408
+ "epoch": 0.4191763982790412,
2409
+ "grad_norm": 0.6935631036758423,
2410
+ "learning_rate": 6.820315403959123e-05,
2411
+ "loss": 0.9876,
2412
+ "step": 341
2413
+ },
2414
+ {
2415
+ "epoch": 0.4204056545789797,
2416
+ "grad_norm": 0.6856899261474609,
2417
+ "learning_rate": 6.801125725121636e-05,
2418
+ "loss": 0.9591,
2419
+ "step": 342
2420
+ },
2421
+ {
2422
+ "epoch": 0.42163491087891825,
2423
+ "grad_norm": 1.2577812671661377,
2424
+ "learning_rate": 6.781905511554079e-05,
2425
+ "loss": 1.3174,
2426
+ "step": 343
2427
+ },
2428
+ {
2429
+ "epoch": 0.4228641671788568,
2430
+ "grad_norm": 2.421950578689575,
2431
+ "learning_rate": 6.762655089099353e-05,
2432
+ "loss": 1.6442,
2433
+ "step": 344
2434
+ },
2435
+ {
2436
+ "epoch": 0.4240934234787953,
2437
+ "grad_norm": 2.6432454586029053,
2438
+ "learning_rate": 6.743374784112501e-05,
2439
+ "loss": 1.0468,
2440
+ "step": 345
2441
+ },
2442
+ {
2443
+ "epoch": 0.4253226797787339,
2444
+ "grad_norm": 2.7061827182769775,
2445
+ "learning_rate": 6.724064923455155e-05,
2446
+ "loss": 1.1526,
2447
+ "step": 346
2448
+ },
2449
+ {
2450
+ "epoch": 0.4265519360786724,
2451
+ "grad_norm": 2.466057777404785,
2452
+ "learning_rate": 6.704725834490024e-05,
2453
+ "loss": 1.1463,
2454
+ "step": 347
2455
+ },
2456
+ {
2457
+ "epoch": 0.42778119237861095,
2458
+ "grad_norm": 2.753512144088745,
2459
+ "learning_rate": 6.685357845075315e-05,
2460
+ "loss": 0.9492,
2461
+ "step": 348
2462
+ },
2463
+ {
2464
+ "epoch": 0.4290104486785495,
2465
+ "grad_norm": 2.76118803024292,
2466
+ "learning_rate": 6.665961283559197e-05,
2467
+ "loss": 0.8543,
2468
+ "step": 349
2469
+ },
2470
+ {
2471
+ "epoch": 0.430239704978488,
2472
+ "grad_norm": 2.295574426651001,
2473
+ "learning_rate": 6.646536478774222e-05,
2474
+ "loss": 0.9564,
2475
+ "step": 350
2476
+ },
2477
+ {
2478
+ "epoch": 0.43146896127842654,
2479
+ "grad_norm": 0.22731368243694305,
2480
+ "learning_rate": 6.627083760031754e-05,
2481
+ "loss": 0.9719,
2482
+ "step": 351
2483
+ },
2484
+ {
2485
+ "epoch": 0.4326982175783651,
2486
+ "grad_norm": 0.20097078382968903,
2487
+ "learning_rate": 6.60760345711639e-05,
2488
+ "loss": 1.0094,
2489
+ "step": 352
2490
+ },
2491
+ {
2492
+ "epoch": 0.4339274738783036,
2493
+ "grad_norm": 0.23321934044361115,
2494
+ "learning_rate": 6.58809590028036e-05,
2495
+ "loss": 1.101,
2496
+ "step": 353
2497
+ },
2498
+ {
2499
+ "epoch": 0.4351567301782422,
2500
+ "grad_norm": 0.27995625138282776,
2501
+ "learning_rate": 6.568561420237935e-05,
2502
+ "loss": 1.3545,
2503
+ "step": 354
2504
+ },
2505
+ {
2506
+ "epoch": 0.4363859864781807,
2507
+ "grad_norm": 0.259082168340683,
2508
+ "learning_rate": 6.54900034815982e-05,
2509
+ "loss": 1.1598,
2510
+ "step": 355
2511
+ },
2512
+ {
2513
+ "epoch": 0.43761524277811925,
2514
+ "grad_norm": 0.2688703238964081,
2515
+ "learning_rate": 6.52941301566754e-05,
2516
+ "loss": 1.1141,
2517
+ "step": 356
2518
+ },
2519
+ {
2520
+ "epoch": 0.4388444990780578,
2521
+ "grad_norm": 0.34018442034721375,
2522
+ "learning_rate": 6.50979975482781e-05,
2523
+ "loss": 1.2811,
2524
+ "step": 357
2525
+ },
2526
+ {
2527
+ "epoch": 0.4400737553779963,
2528
+ "grad_norm": 0.2925175130367279,
2529
+ "learning_rate": 6.490160898146918e-05,
2530
+ "loss": 0.9025,
2531
+ "step": 358
2532
+ },
2533
+ {
2534
+ "epoch": 0.44130301167793484,
2535
+ "grad_norm": 0.30208972096443176,
2536
+ "learning_rate": 6.470496778565082e-05,
2537
+ "loss": 1.0301,
2538
+ "step": 359
2539
+ },
2540
+ {
2541
+ "epoch": 0.4425322679778734,
2542
+ "grad_norm": 0.3110770285129547,
2543
+ "learning_rate": 6.4508077294508e-05,
2544
+ "loss": 1.0911,
2545
+ "step": 360
2546
+ },
2547
+ {
2548
+ "epoch": 0.4437615242778119,
2549
+ "grad_norm": 0.426252543926239,
2550
+ "learning_rate": 6.431094084595209e-05,
2551
+ "loss": 1.1214,
2552
+ "step": 361
2553
+ },
2554
+ {
2555
+ "epoch": 0.44499078057775043,
2556
+ "grad_norm": 0.4019356966018677,
2557
+ "learning_rate": 6.411356178206419e-05,
2558
+ "loss": 1.3063,
2559
+ "step": 362
2560
+ },
2561
+ {
2562
+ "epoch": 0.446220036877689,
2563
+ "grad_norm": 0.4622703194618225,
2564
+ "learning_rate": 6.391594344903848e-05,
2565
+ "loss": 1.1208,
2566
+ "step": 363
2567
+ },
2568
+ {
2569
+ "epoch": 0.44744929317762755,
2570
+ "grad_norm": 0.5752270817756653,
2571
+ "learning_rate": 6.371808919712549e-05,
2572
+ "loss": 0.9653,
2573
+ "step": 364
2574
+ },
2575
+ {
2576
+ "epoch": 0.4486785494775661,
2577
+ "grad_norm": 0.6309720277786255,
2578
+ "learning_rate": 6.35200023805754e-05,
2579
+ "loss": 0.5664,
2580
+ "step": 365
2581
+ },
2582
+ {
2583
+ "epoch": 0.4499078057775046,
2584
+ "grad_norm": 0.612684965133667,
2585
+ "learning_rate": 6.332168635758097e-05,
2586
+ "loss": 1.0443,
2587
+ "step": 366
2588
+ },
2589
+ {
2590
+ "epoch": 0.45113706207744314,
2591
+ "grad_norm": 0.6797056794166565,
2592
+ "learning_rate": 6.31231444902208e-05,
2593
+ "loss": 0.8389,
2594
+ "step": 367
2595
+ },
2596
+ {
2597
+ "epoch": 0.45236631837738167,
2598
+ "grad_norm": 1.222960352897644,
2599
+ "learning_rate": 6.292438014440227e-05,
2600
+ "loss": 1.4688,
2601
+ "step": 368
2602
+ },
2603
+ {
2604
+ "epoch": 0.4535955746773202,
2605
+ "grad_norm": 2.9443516731262207,
2606
+ "learning_rate": 6.272539668980441e-05,
2607
+ "loss": 1.0079,
2608
+ "step": 369
2609
+ },
2610
+ {
2611
+ "epoch": 0.45482483097725873,
2612
+ "grad_norm": 3.0168612003326416,
2613
+ "learning_rate": 6.252619749982089e-05,
2614
+ "loss": 0.9232,
2615
+ "step": 370
2616
+ },
2617
+ {
2618
+ "epoch": 0.4560540872771973,
2619
+ "grad_norm": 1.9470983743667603,
2620
+ "learning_rate": 6.232678595150275e-05,
2621
+ "loss": 0.8126,
2622
+ "step": 371
2623
+ },
2624
+ {
2625
+ "epoch": 0.45728334357713585,
2626
+ "grad_norm": 2.4769911766052246,
2627
+ "learning_rate": 6.212716542550112e-05,
2628
+ "loss": 0.7786,
2629
+ "step": 372
2630
+ },
2631
+ {
2632
+ "epoch": 0.4585125998770744,
2633
+ "grad_norm": 2.849158525466919,
2634
+ "learning_rate": 6.192733930601005e-05,
2635
+ "loss": 1.1914,
2636
+ "step": 373
2637
+ },
2638
+ {
2639
+ "epoch": 0.4597418561770129,
2640
+ "grad_norm": 2.6154119968414307,
2641
+ "learning_rate": 6.172731098070899e-05,
2642
+ "loss": 0.9171,
2643
+ "step": 374
2644
+ },
2645
+ {
2646
+ "epoch": 0.46097111247695144,
2647
+ "grad_norm": 3.5901479721069336,
2648
+ "learning_rate": 6.152708384070541e-05,
2649
+ "loss": 1.1269,
2650
+ "step": 375
2651
+ },
2652
+ {
2653
+ "epoch": 0.46220036877688997,
2654
+ "grad_norm": 0.23536159098148346,
2655
+ "learning_rate": 6.132666128047732e-05,
2656
+ "loss": 0.8768,
2657
+ "step": 376
2658
+ },
2659
+ {
2660
+ "epoch": 0.4634296250768285,
2661
+ "grad_norm": 0.24834086000919342,
2662
+ "learning_rate": 6.112604669781572e-05,
2663
+ "loss": 1.0644,
2664
+ "step": 377
2665
+ },
2666
+ {
2667
+ "epoch": 0.46465888137676703,
2668
+ "grad_norm": 0.3041445016860962,
2669
+ "learning_rate": 6.0925243493767016e-05,
2670
+ "loss": 1.2779,
2671
+ "step": 378
2672
+ },
2673
+ {
2674
+ "epoch": 0.4658881376767056,
2675
+ "grad_norm": 0.3158765137195587,
2676
+ "learning_rate": 6.0724255072575275e-05,
2677
+ "loss": 1.352,
2678
+ "step": 379
2679
+ },
2680
+ {
2681
+ "epoch": 0.46711739397664415,
2682
+ "grad_norm": 0.2845201790332794,
2683
+ "learning_rate": 6.0523084841624635e-05,
2684
+ "loss": 1.2567,
2685
+ "step": 380
2686
+ },
2687
+ {
2688
+ "epoch": 0.4683466502765827,
2689
+ "grad_norm": 0.2909673750400543,
2690
+ "learning_rate": 6.0321736211381464e-05,
2691
+ "loss": 1.1735,
2692
+ "step": 381
2693
+ },
2694
+ {
2695
+ "epoch": 0.4695759065765212,
2696
+ "grad_norm": 0.2946690022945404,
2697
+ "learning_rate": 6.0120212595336545e-05,
2698
+ "loss": 1.1514,
2699
+ "step": 382
2700
+ },
2701
+ {
2702
+ "epoch": 0.47080516287645974,
2703
+ "grad_norm": 0.302846223115921,
2704
+ "learning_rate": 5.9918517409947215e-05,
2705
+ "loss": 1.0621,
2706
+ "step": 383
2707
+ },
2708
+ {
2709
+ "epoch": 0.47203441917639827,
2710
+ "grad_norm": 0.3197241425514221,
2711
+ "learning_rate": 5.971665407457948e-05,
2712
+ "loss": 1.0299,
2713
+ "step": 384
2714
+ },
2715
+ {
2716
+ "epoch": 0.4732636754763368,
2717
+ "grad_norm": 0.342777281999588,
2718
+ "learning_rate": 5.951462601144998e-05,
2719
+ "loss": 1.0858,
2720
+ "step": 385
2721
+ },
2722
+ {
2723
+ "epoch": 0.47449293177627533,
2724
+ "grad_norm": 0.3554008901119232,
2725
+ "learning_rate": 5.931243664556803e-05,
2726
+ "loss": 1.1441,
2727
+ "step": 386
2728
+ },
2729
+ {
2730
+ "epoch": 0.4757221880762139,
2731
+ "grad_norm": 0.36057665944099426,
2732
+ "learning_rate": 5.9110089404677524e-05,
2733
+ "loss": 1.1836,
2734
+ "step": 387
2735
+ },
2736
+ {
2737
+ "epoch": 0.47695144437615244,
2738
+ "grad_norm": 0.5004509091377258,
2739
+ "learning_rate": 5.890758771919884e-05,
2740
+ "loss": 1.4109,
2741
+ "step": 388
2742
+ },
2743
+ {
2744
+ "epoch": 0.478180700676091,
2745
+ "grad_norm": 0.5211744904518127,
2746
+ "learning_rate": 5.8704935022170684e-05,
2747
+ "loss": 1.0097,
2748
+ "step": 389
2749
+ },
2750
+ {
2751
+ "epoch": 0.4794099569760295,
2752
+ "grad_norm": 0.7474620938301086,
2753
+ "learning_rate": 5.8502134749191816e-05,
2754
+ "loss": 0.8777,
2755
+ "step": 390
2756
+ },
2757
+ {
2758
+ "epoch": 0.48063921327596804,
2759
+ "grad_norm": 0.7044636011123657,
2760
+ "learning_rate": 5.8299190338362996e-05,
2761
+ "loss": 0.9007,
2762
+ "step": 391
2763
+ },
2764
+ {
2765
+ "epoch": 0.48186846957590657,
2766
+ "grad_norm": 0.6484948992729187,
2767
+ "learning_rate": 5.8096105230228435e-05,
2768
+ "loss": 0.8261,
2769
+ "step": 392
2770
+ },
2771
+ {
2772
+ "epoch": 0.4830977258758451,
2773
+ "grad_norm": 0.672816812992096,
2774
+ "learning_rate": 5.78928828677177e-05,
2775
+ "loss": 1.0531,
2776
+ "step": 393
2777
+ },
2778
+ {
2779
+ "epoch": 0.4843269821757836,
2780
+ "grad_norm": 1.1637803316116333,
2781
+ "learning_rate": 5.768952669608724e-05,
2782
+ "loss": 1.1586,
2783
+ "step": 394
2784
+ },
2785
+ {
2786
+ "epoch": 0.4855562384757222,
2787
+ "grad_norm": 3.1862003803253174,
2788
+ "learning_rate": 5.748604016286192e-05,
2789
+ "loss": 1.6232,
2790
+ "step": 395
2791
+ },
2792
+ {
2793
+ "epoch": 0.48678549477566074,
2794
+ "grad_norm": 3.3833253383636475,
2795
+ "learning_rate": 5.728242671777672e-05,
2796
+ "loss": 1.0918,
2797
+ "step": 396
2798
+ },
2799
+ {
2800
+ "epoch": 0.4880147510755993,
2801
+ "grad_norm": 3.116319417953491,
2802
+ "learning_rate": 5.707868981271815e-05,
2803
+ "loss": 0.8615,
2804
+ "step": 397
2805
+ },
2806
+ {
2807
+ "epoch": 0.4892440073755378,
2808
+ "grad_norm": 2.5967965126037598,
2809
+ "learning_rate": 5.687483290166573e-05,
2810
+ "loss": 0.8579,
2811
+ "step": 398
2812
+ },
2813
+ {
2814
+ "epoch": 0.49047326367547633,
2815
+ "grad_norm": 3.7683048248291016,
2816
+ "learning_rate": 5.6670859440633486e-05,
2817
+ "loss": 1.0777,
2818
+ "step": 399
2819
+ },
2820
+ {
2821
+ "epoch": 0.49170251997541486,
2822
+ "grad_norm": 3.182317018508911,
2823
+ "learning_rate": 5.646677288761132e-05,
2824
+ "loss": 0.8932,
2825
+ "step": 400
2826
+ },
2827
+ {
2828
+ "epoch": 0.4929317762753534,
2829
+ "grad_norm": 0.19372360408306122,
2830
+ "learning_rate": 5.6262576702506406e-05,
2831
+ "loss": 0.8516,
2832
+ "step": 401
2833
+ },
2834
+ {
2835
+ "epoch": 0.4941610325752919,
2836
+ "grad_norm": 0.223338320851326,
2837
+ "learning_rate": 5.6058274347084504e-05,
2838
+ "loss": 1.1287,
2839
+ "step": 402
2840
+ },
2841
+ {
2842
+ "epoch": 0.4953902888752305,
2843
+ "grad_norm": 0.23367400467395782,
2844
+ "learning_rate": 5.585386928491134e-05,
2845
+ "loss": 1.1128,
2846
+ "step": 403
2847
+ },
2848
+ {
2849
+ "epoch": 0.49661954517516904,
2850
+ "grad_norm": 0.2717371881008148,
2851
+ "learning_rate": 5.5649364981293786e-05,
2852
+ "loss": 1.2813,
2853
+ "step": 404
2854
+ },
2855
+ {
2856
+ "epoch": 0.49784880147510757,
2857
+ "grad_norm": 0.25909724831581116,
2858
+ "learning_rate": 5.54447649032212e-05,
2859
+ "loss": 1.2149,
2860
+ "step": 405
2861
+ },
2862
+ {
2863
+ "epoch": 0.4990780577750461,
2864
+ "grad_norm": 0.25411197543144226,
2865
+ "learning_rate": 5.5240072519306606e-05,
2866
+ "loss": 1.0679,
2867
+ "step": 406
2868
+ },
2869
+ {
2870
+ "epoch": 0.5003073140749846,
2871
+ "grad_norm": 0.2962128520011902,
2872
+ "learning_rate": 5.503529129972792e-05,
2873
+ "loss": 1.1156,
2874
+ "step": 407
2875
+ },
2876
+ {
2877
+ "epoch": 0.5015365703749232,
2878
+ "grad_norm": 0.29252949357032776,
2879
+ "learning_rate": 5.483042471616908e-05,
2880
+ "loss": 1.125,
2881
+ "step": 408
2882
+ },
2883
+ {
2884
+ "epoch": 0.5015365703749232,
2885
+ "eval_loss": 0.9695894122123718,
2886
+ "eval_runtime": 65.3254,
2887
+ "eval_samples_per_second": 10.486,
2888
+ "eval_steps_per_second": 5.251,
2889
+ "step": 408
2890
  }
2891
  ],
2892
  "logging_steps": 1,
 
2906
  "attributes": {}
2907
  }
2908
  },
2909
+ "total_flos": 8.411229584359424e+16,
2910
  "train_batch_size": 1,
2911
  "trial_name": null,
2912
  "trial_params": null