EColi committed
Commit 3f53f30
1 Parent(s): b75fedd

Add 1850000

added_tokens.json CHANGED
@@ -1 +1 @@
- {"PROFANITY_TOKEN": 32110, "NO_SEGMENT_TOKEN": 32111, "[Music]": 32107, "NUMBER_TOKEN": 32104, "URL_TOKEN": 32101, "HYPHENATED_URL_TOKEN": 32102, "BETWEEN_SEGMENTS_TOKEN": 32118, "NUMBER_PERCENTAGE_TOKEN": 32103, "SHORT_HYPHENATED_TOKEN": 32105, "[Laughter]": 32109, "START_INTERACTION_TOKEN": 32116, "END_INTERACTION_TOKEN": 32117, "[Applause]": 32108, "END_SPONSOR_TOKEN": 32113, "END_SELFPROMO_TOKEN": 32115, "LONG_WORD_TOKEN": 32106, "START_SELFPROMO_TOKEN": 32114, "EXTRACT_SEGMENTS: ": 32100, "START_SPONSOR_TOKEN": 32112}
+ {"START_SPONSOR_TOKEN": 32112, "LONG_WORD_TOKEN": 32106, "BETWEEN_SEGMENTS_TOKEN": 32118, "SHORT_HYPHENATED_TOKEN": 32105, "[Laughter]": 32109, "END_SPONSOR_TOKEN": 32113, "NUMBER_PERCENTAGE_TOKEN": 32103, "NO_SEGMENT_TOKEN": 32111, "START_SELFPROMO_TOKEN": 32114, "[Applause]": 32108, "URL_TOKEN": 32101, "EXTRACT_SEGMENTS: ": 32100, "START_INTERACTION_TOKEN": 32116, "PROFANITY_TOKEN": 32110, "HYPHENATED_URL_TOKEN": 32102, "[Music]": 32107, "END_INTERACTION_TOKEN": 32117, "END_SELFPROMO_TOKEN": 32115, "NUMBER_TOKEN": 32104}
config.json CHANGED
@@ -1,5 +1,5 @@
  {
- "_name_or_path": "/1TB_SSD/SB_AI/out_orig2",
+ "_name_or_path": "/1TB_SSD/SB_AI/out_epoch1/out/checkpoint-1115000/",
  "architectures": [
  "T5ForConditionalGeneration"
  ],
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:3fb22fb40259e3ef7d648c85bc99a714855c5d5d75c32dd548bebf38df101aea
+ oid sha256:afdfb877d569756c5d3e589de624b065735445d4431398f5ec538b4f3ee17e99
  size 891703231
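Only the Git LFS pointer changes here: the oid (the SHA-256 of the weights file) is new while the byte size stays 891703231, i.e. retrained weights of identical size were uploaded. A hedged sketch for verifying a downloaded pytorch_model.bin against the new pointer, using only the Python standard library (the local filename is an assumption):

import hashlib, os

path = "pytorch_model.bin"  # assumed local download of this revision
digest = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        digest.update(chunk)
assert os.path.getsize(path) == 891703231
assert digest.hexdigest() == "afdfb877d569756c5d3e589de624b065735445d4431398f5ec538b4f3ee17e99"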
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:587906ddc876c2af39b06c057f327311bae563143c1e8d8b0e4f83d52a3778ec
+ oid sha256:867ce08b496f6dc7dd44b00318be2bd40fa0c1f470100e78a8ccca16dc0eb97f
  size 14439
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:774aaedff89f9ea48d153c07a6d564ddd201d61e5ed5d666b9d6573ead684ccb
+ oid sha256:c476f8e287d72e82b2fe676d2e42ba4bc778d9bb83ac44801844f0c68ef1e65d
  size 623
tokenizer_config.json CHANGED
@@ -1 +1 @@
- {"eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>", "extra_ids": 100, "additional_special_tokens": ["<extra_id_0>", "<extra_id_1>", "<extra_id_2>", "<extra_id_3>", "<extra_id_4>", "<extra_id_5>", "<extra_id_6>", "<extra_id_7>", "<extra_id_8>", "<extra_id_9>", "<extra_id_10>", "<extra_id_11>", "<extra_id_12>", "<extra_id_13>", "<extra_id_14>", "<extra_id_15>", "<extra_id_16>", "<extra_id_17>", "<extra_id_18>", "<extra_id_19>", "<extra_id_20>", "<extra_id_21>", "<extra_id_22>", "<extra_id_23>", "<extra_id_24>", "<extra_id_25>", "<extra_id_26>", "<extra_id_27>", "<extra_id_28>", "<extra_id_29>", "<extra_id_30>", "<extra_id_31>", "<extra_id_32>", "<extra_id_33>", "<extra_id_34>", "<extra_id_35>", "<extra_id_36>", "<extra_id_37>", "<extra_id_38>", "<extra_id_39>", "<extra_id_40>", "<extra_id_41>", "<extra_id_42>", "<extra_id_43>", "<extra_id_44>", "<extra_id_45>", "<extra_id_46>", "<extra_id_47>", "<extra_id_48>", "<extra_id_49>", "<extra_id_50>", "<extra_id_51>", "<extra_id_52>", "<extra_id_53>", "<extra_id_54>", "<extra_id_55>", "<extra_id_56>", "<extra_id_57>", "<extra_id_58>", "<extra_id_59>", "<extra_id_60>", "<extra_id_61>", "<extra_id_62>", "<extra_id_63>", "<extra_id_64>", "<extra_id_65>", "<extra_id_66>", "<extra_id_67>", "<extra_id_68>", "<extra_id_69>", "<extra_id_70>", "<extra_id_71>", "<extra_id_72>", "<extra_id_73>", "<extra_id_74>", "<extra_id_75>", "<extra_id_76>", "<extra_id_77>", "<extra_id_78>", "<extra_id_79>", "<extra_id_80>", "<extra_id_81>", "<extra_id_82>", "<extra_id_83>", "<extra_id_84>", "<extra_id_85>", "<extra_id_86>", "<extra_id_87>", "<extra_id_88>", "<extra_id_89>", "<extra_id_90>", "<extra_id_91>", "<extra_id_92>", "<extra_id_93>", "<extra_id_94>", "<extra_id_95>", "<extra_id_96>", "<extra_id_97>", "<extra_id_98>", "<extra_id_99>"], "model_max_length": 512, "special_tokens_map_file": null, "name_or_path": "/1TB_SSD/SB_AI/out_orig2", "tokenizer_class": "T5Tokenizer"}
+ {"eos_token": "</s>", "unk_token": "<unk>", "pad_token": "<pad>", "extra_ids": 100, "additional_special_tokens": ["<extra_id_0>", "<extra_id_1>", "<extra_id_2>", "<extra_id_3>", "<extra_id_4>", "<extra_id_5>", "<extra_id_6>", "<extra_id_7>", "<extra_id_8>", "<extra_id_9>", "<extra_id_10>", "<extra_id_11>", "<extra_id_12>", "<extra_id_13>", "<extra_id_14>", "<extra_id_15>", "<extra_id_16>", "<extra_id_17>", "<extra_id_18>", "<extra_id_19>", "<extra_id_20>", "<extra_id_21>", "<extra_id_22>", "<extra_id_23>", "<extra_id_24>", "<extra_id_25>", "<extra_id_26>", "<extra_id_27>", "<extra_id_28>", "<extra_id_29>", "<extra_id_30>", "<extra_id_31>", "<extra_id_32>", "<extra_id_33>", "<extra_id_34>", "<extra_id_35>", "<extra_id_36>", "<extra_id_37>", "<extra_id_38>", "<extra_id_39>", "<extra_id_40>", "<extra_id_41>", "<extra_id_42>", "<extra_id_43>", "<extra_id_44>", "<extra_id_45>", "<extra_id_46>", "<extra_id_47>", "<extra_id_48>", "<extra_id_49>", "<extra_id_50>", "<extra_id_51>", "<extra_id_52>", "<extra_id_53>", "<extra_id_54>", "<extra_id_55>", "<extra_id_56>", "<extra_id_57>", "<extra_id_58>", "<extra_id_59>", "<extra_id_60>", "<extra_id_61>", "<extra_id_62>", "<extra_id_63>", "<extra_id_64>", "<extra_id_65>", "<extra_id_66>", "<extra_id_67>", "<extra_id_68>", "<extra_id_69>", "<extra_id_70>", "<extra_id_71>", "<extra_id_72>", "<extra_id_73>", "<extra_id_74>", "<extra_id_75>", "<extra_id_76>", "<extra_id_77>", "<extra_id_78>", "<extra_id_79>", "<extra_id_80>", "<extra_id_81>", "<extra_id_82>", "<extra_id_83>", "<extra_id_84>", "<extra_id_85>", "<extra_id_86>", "<extra_id_87>", "<extra_id_88>", "<extra_id_89>", "<extra_id_90>", "<extra_id_91>", "<extra_id_92>", "<extra_id_93>", "<extra_id_94>", "<extra_id_95>", "<extra_id_96>", "<extra_id_97>", "<extra_id_98>", "<extra_id_99>"], "model_max_length": 512, "special_tokens_map_file": null, "name_or_path": "/1TB_SSD/SB_AI/out_epoch1/out/checkpoint-1115000/", "tokenizer_class": "T5Tokenizer"}
trainer_state.json CHANGED
@@ -1,8 +1,8 @@
  {
  "best_metric": null,
  "best_model_checkpoint": null,
- "epoch": 1.0,
- "global_step": 1116594,
+ "epoch": 1.656824235129331,
+ "global_step": 1850000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
@@ -1459,17 +1459,1130 @@
  },
  {
  "epoch": 1.0,
- "step": 1116594,
- "total_flos": 3.3533589180916224e+17,
- "train_loss": 0.0695511792498969,
- "train_runtime": 176193.0564,
- "train_samples_per_second": 6.337,
- "train_steps_per_second": 6.337
  }
  ],
- "max_steps": 1116594,
- "num_train_epochs": 1,
- "total_flos": 3.3533589180916224e+17,
  "trial_name": null,
  "trial_params": null
  }
 
1459
  },
1460
  {
1461
  "epoch": 1.0,
1462
+ "learning_rate": 2.492374130615067e-05,
1463
+ "loss": 0.0633,
1464
+ "step": 1120000
1465
+ },
1466
+ {
1467
+ "epoch": 1.01,
1468
+ "learning_rate": 2.4811793722695988e-05,
1469
+ "loss": 0.0625,
1470
+ "step": 1125000
1471
+ },
1472
+ {
1473
+ "epoch": 1.01,
1474
+ "eval_loss": 0.06666136533021927,
1475
+ "eval_runtime": 1682.1519,
1476
+ "eval_samples_per_second": 36.877,
1477
+ "eval_steps_per_second": 36.877,
1478
+ "step": 1125000
1479
+ },
1480
+ {
1481
+ "epoch": 1.01,
1482
+ "learning_rate": 2.46998461392413e-05,
1483
+ "loss": 0.0645,
1484
+ "step": 1130000
1485
+ },
1486
+ {
1487
+ "epoch": 1.02,
1488
+ "learning_rate": 2.4587898555786618e-05,
1489
+ "loss": 0.058,
1490
+ "step": 1135000
1491
+ },
1492
+ {
1493
+ "epoch": 1.02,
1494
+ "learning_rate": 2.4475950972331933e-05,
1495
+ "loss": 0.0644,
1496
+ "step": 1140000
1497
+ },
1498
+ {
1499
+ "epoch": 1.03,
1500
+ "learning_rate": 2.4364003388877245e-05,
1501
+ "loss": 0.0599,
1502
+ "step": 1145000
1503
+ },
1504
+ {
1505
+ "epoch": 1.03,
1506
+ "learning_rate": 2.4252055805422564e-05,
1507
+ "loss": 0.0614,
1508
+ "step": 1150000
1509
+ },
1510
+ {
1511
+ "epoch": 1.03,
1512
+ "eval_loss": 0.06576403230428696,
1513
+ "eval_runtime": 1686.8754,
1514
+ "eval_samples_per_second": 36.774,
1515
+ "eval_steps_per_second": 36.774,
1516
+ "step": 1150000
1517
+ },
1518
+ {
1519
+ "epoch": 1.03,
1520
+ "learning_rate": 2.414010822196788e-05,
1521
+ "loss": 0.0637,
1522
+ "step": 1155000
1523
+ },
1524
+ {
1525
+ "epoch": 1.04,
1526
+ "learning_rate": 2.4028160638513194e-05,
1527
+ "loss": 0.058,
1528
+ "step": 1160000
1529
+ },
1530
+ {
1531
+ "epoch": 1.04,
1532
+ "learning_rate": 2.391621305505851e-05,
1533
+ "loss": 0.0623,
1534
+ "step": 1165000
1535
+ },
1536
+ {
1537
+ "epoch": 1.05,
1538
+ "learning_rate": 2.3804265471603828e-05,
1539
+ "loss": 0.0628,
1540
+ "step": 1170000
1541
+ },
1542
+ {
1543
+ "epoch": 1.05,
1544
+ "learning_rate": 2.369231788814914e-05,
1545
+ "loss": 0.0597,
1546
+ "step": 1175000
1547
+ },
1548
+ {
1549
+ "epoch": 1.05,
1550
+ "eval_loss": 0.06830067932605743,
1551
+ "eval_runtime": 1683.3372,
1552
+ "eval_samples_per_second": 36.851,
1553
+ "eval_steps_per_second": 36.851,
1554
+ "step": 1175000
1555
+ },
1556
+ {
1557
+ "epoch": 1.06,
1558
+ "learning_rate": 2.3580370304694455e-05,
1559
+ "loss": 0.0622,
1560
+ "step": 1180000
1561
+ },
1562
+ {
1563
+ "epoch": 1.06,
1564
+ "learning_rate": 2.3468422721239773e-05,
1565
+ "loss": 0.0579,
1566
+ "step": 1185000
1567
+ },
1568
+ {
1569
+ "epoch": 1.07,
1570
+ "learning_rate": 2.3356475137785085e-05,
1571
+ "loss": 0.0644,
1572
+ "step": 1190000
1573
+ },
1574
+ {
1575
+ "epoch": 1.07,
1576
+ "learning_rate": 2.3244527554330404e-05,
1577
+ "loss": 0.063,
1578
+ "step": 1195000
1579
+ },
1580
+ {
1581
+ "epoch": 1.07,
1582
+ "learning_rate": 2.313257997087572e-05,
1583
+ "loss": 0.0629,
1584
+ "step": 1200000
1585
+ },
1586
+ {
1587
+ "epoch": 1.07,
1588
+ "eval_loss": 0.0691303089261055,
1589
+ "eval_runtime": 1683.681,
1590
+ "eval_samples_per_second": 36.844,
1591
+ "eval_steps_per_second": 36.844,
1592
+ "step": 1200000
1593
+ },
1594
+ {
1595
+ "epoch": 1.08,
1596
+ "learning_rate": 2.3020632387421034e-05,
1597
+ "loss": 0.0647,
1598
+ "step": 1205000
1599
+ },
1600
+ {
1601
+ "epoch": 1.08,
1602
+ "learning_rate": 2.290868480396635e-05,
1603
+ "loss": 0.0645,
1604
+ "step": 1210000
1605
+ },
1606
+ {
1607
+ "epoch": 1.09,
1608
+ "learning_rate": 2.2796737220511664e-05,
1609
+ "loss": 0.063,
1610
+ "step": 1215000
1611
+ },
1612
+ {
1613
+ "epoch": 1.09,
1614
+ "learning_rate": 2.268478963705698e-05,
1615
+ "loss": 0.0638,
1616
+ "step": 1220000
1617
+ },
1618
+ {
1619
+ "epoch": 1.1,
1620
+ "learning_rate": 2.2572842053602295e-05,
1621
+ "loss": 0.0603,
1622
+ "step": 1225000
1623
+ },
1624
+ {
1625
+ "epoch": 1.1,
1626
+ "eval_loss": 0.06777703762054443,
1627
+ "eval_runtime": 1680.6968,
1628
+ "eval_samples_per_second": 36.909,
1629
+ "eval_steps_per_second": 36.909,
1630
+ "step": 1225000
1631
+ },
1632
+ {
1633
+ "epoch": 1.1,
1634
+ "learning_rate": 2.246089447014761e-05,
1635
+ "loss": 0.0612,
1636
+ "step": 1230000
1637
+ },
1638
+ {
1639
+ "epoch": 1.11,
1640
+ "learning_rate": 2.2348946886692925e-05,
1641
+ "loss": 0.0586,
1642
+ "step": 1235000
1643
+ },
1644
+ {
1645
+ "epoch": 1.11,
1646
+ "learning_rate": 2.2236999303238244e-05,
1647
+ "loss": 0.0607,
1648
+ "step": 1240000
1649
+ },
1650
+ {
1651
+ "epoch": 1.11,
1652
+ "learning_rate": 2.212505171978356e-05,
1653
+ "loss": 0.0653,
1654
+ "step": 1245000
1655
+ },
1656
+ {
1657
+ "epoch": 1.12,
1658
+ "learning_rate": 2.201310413632887e-05,
1659
+ "loss": 0.0601,
1660
+ "step": 1250000
1661
+ },
1662
+ {
1663
+ "epoch": 1.12,
1664
+ "eval_loss": 0.07459407299757004,
1665
+ "eval_runtime": 1683.3206,
1666
+ "eval_samples_per_second": 36.852,
1667
+ "eval_steps_per_second": 36.852,
1668
+ "step": 1250000
1669
+ },
1670
+ {
1671
+ "epoch": 1.12,
1672
+ "learning_rate": 2.190115655287419e-05,
1673
+ "loss": 0.0638,
1674
+ "step": 1255000
1675
+ },
1676
+ {
1677
+ "epoch": 1.13,
1678
+ "learning_rate": 2.1789208969419504e-05,
1679
+ "loss": 0.0634,
1680
+ "step": 1260000
1681
+ },
1682
+ {
1683
+ "epoch": 1.13,
1684
+ "learning_rate": 2.167726138596482e-05,
1685
+ "loss": 0.0622,
1686
+ "step": 1265000
1687
+ },
1688
+ {
1689
+ "epoch": 1.14,
1690
+ "learning_rate": 2.1565313802510135e-05,
1691
+ "loss": 0.0593,
1692
+ "step": 1270000
1693
+ },
1694
+ {
1695
+ "epoch": 1.14,
1696
+ "learning_rate": 2.145336621905545e-05,
1697
+ "loss": 0.0606,
1698
+ "step": 1275000
1699
+ },
1700
+ {
1701
+ "epoch": 1.14,
1702
+ "eval_loss": 0.06908420473337173,
1703
+ "eval_runtime": 1680.5541,
1704
+ "eval_samples_per_second": 36.912,
1705
+ "eval_steps_per_second": 36.912,
1706
+ "step": 1275000
1707
+ },
1708
+ {
1709
+ "epoch": 1.15,
1710
+ "learning_rate": 2.1341418635600765e-05,
1711
+ "loss": 0.0612,
1712
+ "step": 1280000
1713
+ },
1714
+ {
1715
+ "epoch": 1.15,
1716
+ "learning_rate": 2.122947105214608e-05,
1717
+ "loss": 0.066,
1718
+ "step": 1285000
1719
+ },
1720
+ {
1721
+ "epoch": 1.16,
1722
+ "learning_rate": 2.1117523468691396e-05,
1723
+ "loss": 0.0562,
1724
+ "step": 1290000
1725
+ },
1726
+ {
1727
+ "epoch": 1.16,
1728
+ "learning_rate": 2.100557588523671e-05,
1729
+ "loss": 0.0617,
1730
+ "step": 1295000
1731
+ },
1732
+ {
1733
+ "epoch": 1.16,
1734
+ "learning_rate": 2.089362830178203e-05,
1735
+ "loss": 0.0671,
1736
+ "step": 1300000
1737
+ },
1738
+ {
1739
+ "epoch": 1.16,
1740
+ "eval_loss": 0.07024173438549042,
1741
+ "eval_runtime": 1683.3132,
1742
+ "eval_samples_per_second": 36.852,
1743
+ "eval_steps_per_second": 36.852,
1744
+ "step": 1300000
1745
+ },
1746
+ {
1747
+ "epoch": 1.17,
1748
+ "learning_rate": 2.078168071832734e-05,
1749
+ "loss": 0.0578,
1750
+ "step": 1305000
1751
+ },
1752
+ {
1753
+ "epoch": 1.17,
1754
+ "learning_rate": 2.066973313487266e-05,
1755
+ "loss": 0.0592,
1756
+ "step": 1310000
1757
+ },
1758
+ {
1759
+ "epoch": 1.18,
1760
+ "learning_rate": 2.0557785551417975e-05,
1761
+ "loss": 0.0607,
1762
+ "step": 1315000
1763
+ },
1764
+ {
1765
+ "epoch": 1.18,
1766
+ "learning_rate": 2.0445837967963287e-05,
1767
+ "loss": 0.0645,
1768
+ "step": 1320000
1769
+ },
1770
+ {
1771
+ "epoch": 1.19,
1772
+ "learning_rate": 2.0333890384508605e-05,
1773
+ "loss": 0.0625,
1774
+ "step": 1325000
1775
+ },
1776
+ {
1777
+ "epoch": 1.19,
1778
+ "eval_loss": 0.06607282906770706,
1779
+ "eval_runtime": 1680.5346,
1780
+ "eval_samples_per_second": 36.913,
1781
+ "eval_steps_per_second": 36.913,
1782
+ "step": 1325000
1783
+ },
1784
+ {
1785
+ "epoch": 1.19,
1786
+ "learning_rate": 2.022194280105392e-05,
1787
+ "loss": 0.0625,
1788
+ "step": 1330000
1789
+ },
1790
+ {
1791
+ "epoch": 1.2,
1792
+ "learning_rate": 2.0109995217599236e-05,
1793
+ "loss": 0.0605,
1794
+ "step": 1335000
1795
+ },
1796
+ {
1797
+ "epoch": 1.2,
1798
+ "learning_rate": 1.999804763414455e-05,
1799
+ "loss": 0.0592,
1800
+ "step": 1340000
1801
+ },
1802
+ {
1803
+ "epoch": 1.2,
1804
+ "learning_rate": 1.9886100050689866e-05,
1805
+ "loss": 0.0652,
1806
+ "step": 1345000
1807
+ },
1808
+ {
1809
+ "epoch": 1.21,
1810
+ "learning_rate": 1.977415246723518e-05,
1811
+ "loss": 0.0617,
1812
+ "step": 1350000
1813
+ },
1814
+ {
1815
+ "epoch": 1.21,
1816
+ "eval_loss": 0.0687505304813385,
1817
+ "eval_runtime": 1680.1242,
1818
+ "eval_samples_per_second": 36.922,
1819
+ "eval_steps_per_second": 36.922,
1820
+ "step": 1350000
1821
+ },
1822
+ {
1823
+ "epoch": 1.21,
1824
+ "learning_rate": 1.9662204883780496e-05,
1825
+ "loss": 0.0607,
1826
+ "step": 1355000
1827
+ },
1828
+ {
1829
+ "epoch": 1.22,
1830
+ "learning_rate": 1.9550257300325815e-05,
1831
+ "loss": 0.0619,
1832
+ "step": 1360000
1833
+ },
1834
+ {
1835
+ "epoch": 1.22,
1836
+ "learning_rate": 1.9438309716871127e-05,
1837
+ "loss": 0.0629,
1838
+ "step": 1365000
1839
+ },
1840
+ {
1841
+ "epoch": 1.23,
1842
+ "learning_rate": 1.9326362133416445e-05,
1843
+ "loss": 0.0637,
1844
+ "step": 1370000
1845
+ },
1846
+ {
1847
+ "epoch": 1.23,
1848
+ "learning_rate": 1.921441454996176e-05,
1849
+ "loss": 0.0579,
1850
+ "step": 1375000
1851
+ },
1852
+ {
1853
+ "epoch": 1.23,
1854
+ "eval_loss": 0.06793049722909927,
1855
+ "eval_runtime": 1680.8053,
1856
+ "eval_samples_per_second": 36.907,
1857
+ "eval_steps_per_second": 36.907,
1858
+ "step": 1375000
1859
+ },
1860
+ {
1861
+ "epoch": 1.24,
1862
+ "learning_rate": 1.9102466966507072e-05,
1863
+ "loss": 0.0632,
1864
+ "step": 1380000
1865
+ },
1866
+ {
1867
+ "epoch": 1.24,
1868
+ "learning_rate": 1.899051938305239e-05,
1869
+ "loss": 0.0593,
1870
+ "step": 1385000
1871
+ },
1872
+ {
1873
+ "epoch": 1.24,
1874
+ "learning_rate": 1.8878571799597706e-05,
1875
+ "loss": 0.0622,
1876
+ "step": 1390000
1877
+ },
1878
+ {
1879
+ "epoch": 1.25,
1880
+ "learning_rate": 1.876662421614302e-05,
1881
+ "loss": 0.064,
1882
+ "step": 1395000
1883
+ },
1884
+ {
1885
+ "epoch": 1.25,
1886
+ "learning_rate": 1.8654676632688336e-05,
1887
+ "loss": 0.0663,
1888
+ "step": 1400000
1889
+ },
1890
+ {
1891
+ "epoch": 1.25,
1892
+ "eval_loss": 0.0633900910615921,
1893
+ "eval_runtime": 1680.7989,
1894
+ "eval_samples_per_second": 36.907,
1895
+ "eval_steps_per_second": 36.907,
1896
+ "step": 1400000
1897
+ },
1898
+ {
1899
+ "epoch": 1.26,
1900
+ "learning_rate": 1.8542729049233655e-05,
1901
+ "loss": 0.0606,
1902
+ "step": 1405000
1903
+ },
1904
+ {
1905
+ "epoch": 1.26,
1906
+ "learning_rate": 1.8430781465778967e-05,
1907
+ "loss": 0.0633,
1908
+ "step": 1410000
1909
+ },
1910
+ {
1911
+ "epoch": 1.27,
1912
+ "learning_rate": 1.8318833882324282e-05,
1913
+ "loss": 0.064,
1914
+ "step": 1415000
1915
+ },
1916
+ {
1917
+ "epoch": 1.27,
1918
+ "learning_rate": 1.82068862988696e-05,
1919
+ "loss": 0.0573,
1920
+ "step": 1420000
1921
+ },
1922
+ {
1923
+ "epoch": 1.28,
1924
+ "learning_rate": 1.8094938715414912e-05,
1925
+ "loss": 0.0583,
1926
+ "step": 1425000
1927
+ },
1928
+ {
1929
+ "epoch": 1.28,
1930
+ "eval_loss": 0.063847616314888,
1931
+ "eval_runtime": 1680.6703,
1932
+ "eval_samples_per_second": 36.91,
1933
+ "eval_steps_per_second": 36.91,
1934
+ "step": 1425000
1935
+ },
1936
+ {
1937
+ "epoch": 1.28,
1938
+ "learning_rate": 1.798299113196023e-05,
1939
+ "loss": 0.0669,
1940
+ "step": 1430000
1941
+ },
1942
+ {
1943
+ "epoch": 1.29,
1944
+ "learning_rate": 1.7871043548505546e-05,
1945
+ "loss": 0.0576,
1946
+ "step": 1435000
1947
+ },
1948
+ {
1949
+ "epoch": 1.29,
1950
+ "learning_rate": 1.775909596505086e-05,
1951
+ "loss": 0.0603,
1952
+ "step": 1440000
1953
+ },
1954
+ {
1955
+ "epoch": 1.29,
1956
+ "learning_rate": 1.7647148381596177e-05,
1957
+ "loss": 0.0628,
1958
+ "step": 1445000
1959
+ },
1960
+ {
1961
+ "epoch": 1.3,
1962
+ "learning_rate": 1.7535200798141492e-05,
1963
+ "loss": 0.0623,
1964
+ "step": 1450000
1965
+ },
1966
+ {
1967
+ "epoch": 1.3,
1968
+ "eval_loss": 0.06811905652284622,
1969
+ "eval_runtime": 1680.0308,
1970
+ "eval_samples_per_second": 36.924,
1971
+ "eval_steps_per_second": 36.924,
1972
+ "step": 1450000
1973
+ },
1974
+ {
1975
+ "epoch": 1.3,
1976
+ "learning_rate": 1.7423253214686807e-05,
1977
+ "loss": 0.066,
1978
+ "step": 1455000
1979
+ },
1980
+ {
1981
+ "epoch": 1.31,
1982
+ "learning_rate": 1.7311305631232122e-05,
1983
+ "loss": 0.0645,
1984
+ "step": 1460000
1985
+ },
1986
+ {
1987
+ "epoch": 1.31,
1988
+ "learning_rate": 1.719935804777744e-05,
1989
+ "loss": 0.0673,
1990
+ "step": 1465000
1991
+ },
1992
+ {
1993
+ "epoch": 1.32,
1994
+ "learning_rate": 1.7087410464322753e-05,
1995
+ "loss": 0.0601,
1996
+ "step": 1470000
1997
+ },
1998
+ {
1999
+ "epoch": 1.32,
2000
+ "learning_rate": 1.697546288086807e-05,
2001
+ "loss": 0.0615,
2002
+ "step": 1475000
2003
+ },
2004
+ {
2005
+ "epoch": 1.32,
2006
+ "eval_loss": 0.06700597703456879,
2007
+ "eval_runtime": 1680.6996,
2008
+ "eval_samples_per_second": 36.909,
2009
+ "eval_steps_per_second": 36.909,
2010
+ "step": 1475000
2011
+ },
2012
+ {
2013
+ "epoch": 1.33,
2014
+ "learning_rate": 1.6863515297413386e-05,
2015
+ "loss": 0.0651,
2016
+ "step": 1480000
2017
+ },
2018
+ {
2019
+ "epoch": 1.33,
2020
+ "learning_rate": 1.6751567713958698e-05,
2021
+ "loss": 0.0596,
2022
+ "step": 1485000
2023
+ },
2024
+ {
2025
+ "epoch": 1.33,
2026
+ "learning_rate": 1.6639620130504017e-05,
2027
+ "loss": 0.0616,
2028
+ "step": 1490000
2029
+ },
2030
+ {
2031
+ "epoch": 1.34,
2032
+ "learning_rate": 1.6527672547049332e-05,
2033
+ "loss": 0.0609,
2034
+ "step": 1495000
2035
+ },
2036
+ {
2037
+ "epoch": 1.34,
2038
+ "learning_rate": 1.6415724963594647e-05,
2039
+ "loss": 0.0592,
2040
+ "step": 1500000
2041
+ },
2042
+ {
2043
+ "epoch": 1.34,
2044
+ "eval_loss": 0.06664443016052246,
2045
+ "eval_runtime": 1681.9546,
2046
+ "eval_samples_per_second": 36.881,
2047
+ "eval_steps_per_second": 36.881,
2048
+ "step": 1500000
2049
+ },
2050
+ {
2051
+ "epoch": 1.35,
2052
+ "learning_rate": 1.6303777380139962e-05,
2053
+ "loss": 0.0618,
2054
+ "step": 1505000
2055
+ },
2056
+ {
2057
+ "epoch": 1.35,
2058
+ "learning_rate": 1.6191829796685277e-05,
2059
+ "loss": 0.0586,
2060
+ "step": 1510000
2061
+ },
2062
+ {
2063
+ "epoch": 1.36,
2064
+ "learning_rate": 1.6079882213230593e-05,
2065
+ "loss": 0.0611,
2066
+ "step": 1515000
2067
+ },
2068
+ {
2069
+ "epoch": 1.36,
2070
+ "learning_rate": 1.5967934629775908e-05,
2071
+ "loss": 0.0635,
2072
+ "step": 1520000
2073
+ },
2074
+ {
2075
+ "epoch": 1.37,
2076
+ "learning_rate": 1.5855987046321223e-05,
2077
+ "loss": 0.0626,
2078
+ "step": 1525000
2079
+ },
2080
+ {
2081
+ "epoch": 1.37,
2082
+ "eval_loss": 0.06663180142641068,
2083
+ "eval_runtime": 1679.9392,
2084
+ "eval_samples_per_second": 36.926,
2085
+ "eval_steps_per_second": 36.926,
2086
+ "step": 1525000
2087
+ },
2088
+ {
2089
+ "epoch": 1.37,
2090
+ "learning_rate": 1.5744039462866538e-05,
2091
+ "loss": 0.0637,
2092
+ "step": 1530000
2093
+ },
2094
+ {
2095
+ "epoch": 1.37,
2096
+ "learning_rate": 1.5632091879411857e-05,
2097
+ "loss": 0.0582,
2098
+ "step": 1535000
2099
+ },
2100
+ {
2101
+ "epoch": 1.38,
2102
+ "learning_rate": 1.5520144295957172e-05,
2103
+ "loss": 0.0618,
2104
+ "step": 1540000
2105
+ },
2106
+ {
2107
+ "epoch": 1.38,
2108
+ "learning_rate": 1.5408196712502487e-05,
2109
+ "loss": 0.0594,
2110
+ "step": 1545000
2111
+ },
2112
+ {
2113
+ "epoch": 1.39,
2114
+ "learning_rate": 1.5296249129047802e-05,
2115
+ "loss": 0.063,
2116
+ "step": 1550000
2117
+ },
2118
+ {
2119
+ "epoch": 1.39,
2120
+ "eval_loss": 0.06467917561531067,
2121
+ "eval_runtime": 1681.2275,
2122
+ "eval_samples_per_second": 36.897,
2123
+ "eval_steps_per_second": 36.897,
2124
+ "step": 1550000
2125
+ },
2126
+ {
2127
+ "epoch": 1.39,
2128
+ "learning_rate": 1.5184301545593116e-05,
2129
+ "loss": 0.0637,
2130
+ "step": 1555000
2131
+ },
2132
+ {
2133
+ "epoch": 1.4,
2134
+ "learning_rate": 1.5072353962138433e-05,
2135
+ "loss": 0.0641,
2136
+ "step": 1560000
2137
+ },
2138
+ {
2139
+ "epoch": 1.4,
2140
+ "learning_rate": 1.4960406378683748e-05,
2141
+ "loss": 0.0615,
2142
+ "step": 1565000
2143
+ },
2144
+ {
2145
+ "epoch": 1.41,
2146
+ "learning_rate": 1.4848458795229065e-05,
2147
+ "loss": 0.061,
2148
+ "step": 1570000
2149
+ },
2150
+ {
2151
+ "epoch": 1.41,
2152
+ "learning_rate": 1.4736511211774378e-05,
2153
+ "loss": 0.0648,
2154
+ "step": 1575000
2155
+ },
2156
+ {
2157
+ "epoch": 1.41,
2158
+ "eval_loss": 0.06533137708902359,
2159
+ "eval_runtime": 1677.1597,
2160
+ "eval_samples_per_second": 36.987,
2161
+ "eval_steps_per_second": 36.987,
2162
+ "step": 1575000
2163
+ },
2164
+ {
2165
+ "epoch": 1.42,
2166
+ "learning_rate": 1.4624563628319693e-05,
2167
+ "loss": 0.0623,
2168
+ "step": 1580000
2169
+ },
2170
+ {
2171
+ "epoch": 1.42,
2172
+ "learning_rate": 1.451261604486501e-05,
2173
+ "loss": 0.0631,
2174
+ "step": 1585000
2175
+ },
2176
+ {
2177
+ "epoch": 1.42,
2178
+ "learning_rate": 1.4400668461410324e-05,
2179
+ "loss": 0.0613,
2180
+ "step": 1590000
2181
+ },
2182
+ {
2183
+ "epoch": 1.43,
2184
+ "learning_rate": 1.428872087795564e-05,
2185
+ "loss": 0.0611,
2186
+ "step": 1595000
2187
+ },
2188
+ {
2189
+ "epoch": 1.43,
2190
+ "learning_rate": 1.4176773294500956e-05,
2191
+ "loss": 0.0611,
2192
+ "step": 1600000
2193
+ },
2194
+ {
2195
+ "epoch": 1.43,
2196
+ "eval_loss": 0.06996028870344162,
2197
+ "eval_runtime": 1678.7591,
2198
+ "eval_samples_per_second": 36.952,
2199
+ "eval_steps_per_second": 36.952,
2200
+ "step": 1600000
2201
+ },
2202
+ {
2203
+ "epoch": 1.44,
2204
+ "learning_rate": 1.4064825711046273e-05,
2205
+ "loss": 0.0629,
2206
+ "step": 1605000
2207
+ },
2208
+ {
2209
+ "epoch": 1.44,
2210
+ "learning_rate": 1.3952878127591586e-05,
2211
+ "loss": 0.0607,
2212
+ "step": 1610000
2213
+ },
2214
+ {
2215
+ "epoch": 1.45,
2216
+ "learning_rate": 1.3840930544136901e-05,
2217
+ "loss": 0.0653,
2218
+ "step": 1615000
2219
+ },
2220
+ {
2221
+ "epoch": 1.45,
2222
+ "learning_rate": 1.3728982960682218e-05,
2223
+ "loss": 0.0608,
2224
+ "step": 1620000
2225
+ },
2226
+ {
2227
+ "epoch": 1.46,
2228
+ "learning_rate": 1.3617035377227534e-05,
2229
+ "loss": 0.0622,
2230
+ "step": 1625000
2231
+ },
2232
+ {
2233
+ "epoch": 1.46,
2234
+ "eval_loss": 0.0634424015879631,
2235
+ "eval_runtime": 1677.9992,
2236
+ "eval_samples_per_second": 36.968,
2237
+ "eval_steps_per_second": 36.968,
2238
+ "step": 1625000
2239
+ },
2240
+ {
2241
+ "epoch": 1.46,
2242
+ "learning_rate": 1.350508779377285e-05,
2243
+ "loss": 0.0622,
2244
+ "step": 1630000
2245
+ },
2246
+ {
2247
+ "epoch": 1.46,
2248
+ "learning_rate": 1.3393140210318164e-05,
2249
+ "loss": 0.0634,
2250
+ "step": 1635000
2251
+ },
2252
+ {
2253
+ "epoch": 1.47,
2254
+ "learning_rate": 1.328119262686348e-05,
2255
+ "loss": 0.059,
2256
+ "step": 1640000
2257
+ },
2258
+ {
2259
+ "epoch": 1.47,
2260
+ "learning_rate": 1.3169245043408796e-05,
2261
+ "loss": 0.0608,
2262
+ "step": 1645000
2263
+ },
2264
+ {
2265
+ "epoch": 1.48,
2266
+ "learning_rate": 1.305729745995411e-05,
2267
+ "loss": 0.0617,
2268
+ "step": 1650000
2269
+ },
2270
+ {
2271
+ "epoch": 1.48,
2272
+ "eval_loss": 0.06513845920562744,
2273
+ "eval_runtime": 1682.3756,
2274
+ "eval_samples_per_second": 36.872,
2275
+ "eval_steps_per_second": 36.872,
2276
+ "step": 1650000
2277
+ },
2278
+ {
2279
+ "epoch": 1.48,
2280
+ "learning_rate": 1.2945349876499426e-05,
2281
+ "loss": 0.0622,
2282
+ "step": 1655000
2283
+ },
2284
+ {
2285
+ "epoch": 1.49,
2286
+ "learning_rate": 1.2833402293044742e-05,
2287
+ "loss": 0.0603,
2288
+ "step": 1660000
2289
+ },
2290
+ {
2291
+ "epoch": 1.49,
2292
+ "learning_rate": 1.2721454709590058e-05,
2293
+ "loss": 0.0647,
2294
+ "step": 1665000
2295
+ },
2296
+ {
2297
+ "epoch": 1.5,
2298
+ "learning_rate": 1.2609507126135372e-05,
2299
+ "loss": 0.0579,
2300
+ "step": 1670000
2301
+ },
2302
+ {
2303
+ "epoch": 1.5,
2304
+ "learning_rate": 1.2497559542680689e-05,
2305
+ "loss": 0.0613,
2306
+ "step": 1675000
2307
+ },
2308
+ {
2309
+ "epoch": 1.5,
2310
+ "eval_loss": 0.0634496882557869,
2311
+ "eval_runtime": 1679.6133,
2312
+ "eval_samples_per_second": 36.933,
2313
+ "eval_steps_per_second": 36.933,
2314
+ "step": 1675000
2315
+ },
2316
+ {
2317
+ "epoch": 1.5,
2318
+ "learning_rate": 1.2385611959226004e-05,
2319
+ "loss": 0.0574,
2320
+ "step": 1680000
2321
+ },
2322
+ {
2323
+ "epoch": 1.51,
2324
+ "learning_rate": 1.227366437577132e-05,
2325
+ "loss": 0.0574,
2326
+ "step": 1685000
2327
+ },
2328
+ {
2329
+ "epoch": 1.51,
2330
+ "learning_rate": 1.2161716792316636e-05,
2331
+ "loss": 0.0616,
2332
+ "step": 1690000
2333
+ },
2334
+ {
2335
+ "epoch": 1.52,
2336
+ "learning_rate": 1.2049769208861951e-05,
2337
+ "loss": 0.06,
2338
+ "step": 1695000
2339
+ },
2340
+ {
2341
+ "epoch": 1.52,
2342
+ "learning_rate": 1.1937821625407265e-05,
2343
+ "loss": 0.0639,
2344
+ "step": 1700000
2345
+ },
2346
+ {
2347
+ "epoch": 1.52,
2348
+ "eval_loss": 0.0661427304148674,
2349
+ "eval_runtime": 1685.0546,
2350
+ "eval_samples_per_second": 36.814,
2351
+ "eval_steps_per_second": 36.814,
2352
+ "step": 1700000
2353
+ },
2354
+ {
2355
+ "epoch": 1.53,
2356
+ "learning_rate": 1.1825874041952582e-05,
2357
+ "loss": 0.0638,
2358
+ "step": 1705000
2359
+ },
2360
+ {
2361
+ "epoch": 1.53,
2362
+ "learning_rate": 1.1713926458497897e-05,
2363
+ "loss": 0.0618,
2364
+ "step": 1710000
2365
+ },
2366
+ {
2367
+ "epoch": 1.54,
2368
+ "learning_rate": 1.1601978875043212e-05,
2369
+ "loss": 0.0616,
2370
+ "step": 1715000
2371
+ },
2372
+ {
2373
+ "epoch": 1.54,
2374
+ "learning_rate": 1.1490031291588529e-05,
2375
+ "loss": 0.0597,
2376
+ "step": 1720000
2377
+ },
2378
+ {
2379
+ "epoch": 1.54,
2380
+ "learning_rate": 1.1378083708133844e-05,
2381
+ "loss": 0.0615,
2382
+ "step": 1725000
2383
+ },
2384
+ {
2385
+ "epoch": 1.54,
2386
+ "eval_loss": 0.06442756205797195,
2387
+ "eval_runtime": 1688.2768,
2388
+ "eval_samples_per_second": 36.743,
2389
+ "eval_steps_per_second": 36.743,
2390
+ "step": 1725000
2391
+ },
2392
+ {
2393
+ "epoch": 1.55,
2394
+ "learning_rate": 1.126613612467916e-05,
2395
+ "loss": 0.0627,
2396
+ "step": 1730000
2397
+ },
2398
+ {
2399
+ "epoch": 1.55,
2400
+ "learning_rate": 1.1154188541224474e-05,
2401
+ "loss": 0.0599,
2402
+ "step": 1735000
2403
+ },
2404
+ {
2405
+ "epoch": 1.56,
2406
+ "learning_rate": 1.104224095776979e-05,
2407
+ "loss": 0.0609,
2408
+ "step": 1740000
2409
+ },
2410
+ {
2411
+ "epoch": 1.56,
2412
+ "learning_rate": 1.0930293374315105e-05,
2413
+ "loss": 0.067,
2414
+ "step": 1745000
2415
+ },
2416
+ {
2417
+ "epoch": 1.57,
2418
+ "learning_rate": 1.081834579086042e-05,
2419
+ "loss": 0.0605,
2420
+ "step": 1750000
2421
+ },
2422
+ {
2423
+ "epoch": 1.57,
2424
+ "eval_loss": 0.06615401804447174,
2425
+ "eval_runtime": 1682.0319,
2426
+ "eval_samples_per_second": 36.88,
2427
+ "eval_steps_per_second": 36.88,
2428
+ "step": 1750000
2429
+ },
2430
+ {
2431
+ "epoch": 1.57,
2432
+ "learning_rate": 1.0706398207405737e-05,
2433
+ "loss": 0.0603,
2434
+ "step": 1755000
2435
+ },
2436
+ {
2437
+ "epoch": 1.58,
2438
+ "learning_rate": 1.0594450623951052e-05,
2439
+ "loss": 0.0578,
2440
+ "step": 1760000
2441
+ },
2442
+ {
2443
+ "epoch": 1.58,
2444
+ "learning_rate": 1.0482503040496367e-05,
2445
+ "loss": 0.0658,
2446
+ "step": 1765000
2447
+ },
2448
+ {
2449
+ "epoch": 1.59,
2450
+ "learning_rate": 1.0370555457041682e-05,
2451
+ "loss": 0.0594,
2452
+ "step": 1770000
2453
+ },
2454
+ {
2455
+ "epoch": 1.59,
2456
+ "learning_rate": 1.0258607873586998e-05,
2457
+ "loss": 0.0622,
2458
+ "step": 1775000
2459
+ },
2460
+ {
2461
+ "epoch": 1.59,
2462
+ "eval_loss": 0.06558772176504135,
2463
+ "eval_runtime": 1679.2015,
2464
+ "eval_samples_per_second": 36.942,
2465
+ "eval_steps_per_second": 36.942,
2466
+ "step": 1775000
2467
+ },
2468
+ {
2469
+ "epoch": 1.59,
2470
+ "learning_rate": 1.0146660290132313e-05,
2471
+ "loss": 0.0654,
2472
+ "step": 1780000
2473
+ },
2474
+ {
2475
+ "epoch": 1.6,
2476
+ "learning_rate": 1.003471270667763e-05,
2477
+ "loss": 0.0629,
2478
+ "step": 1785000
2479
+ },
2480
+ {
2481
+ "epoch": 1.6,
2482
+ "learning_rate": 9.922765123222945e-06,
2483
+ "loss": 0.0541,
2484
+ "step": 1790000
2485
+ },
2486
+ {
2487
+ "epoch": 1.61,
2488
+ "learning_rate": 9.81081753976826e-06,
2489
+ "loss": 0.0605,
2490
+ "step": 1795000
2491
+ },
2492
+ {
2493
+ "epoch": 1.61,
2494
+ "learning_rate": 9.698869956313575e-06,
2495
+ "loss": 0.0585,
2496
+ "step": 1800000
2497
+ },
2498
+ {
2499
+ "epoch": 1.61,
2500
+ "eval_loss": 0.0633106529712677,
2501
+ "eval_runtime": 1681.6891,
2502
+ "eval_samples_per_second": 36.887,
2503
+ "eval_steps_per_second": 36.887,
2504
+ "step": 1800000
2505
+ },
2506
+ {
2507
+ "epoch": 1.62,
2508
+ "learning_rate": 9.58692237285889e-06,
2509
+ "loss": 0.0654,
2510
+ "step": 1805000
2511
+ },
2512
+ {
2513
+ "epoch": 1.62,
2514
+ "learning_rate": 9.474974789404206e-06,
2515
+ "loss": 0.0622,
2516
+ "step": 1810000
2517
+ },
2518
+ {
2519
+ "epoch": 1.63,
2520
+ "learning_rate": 9.363027205949523e-06,
2521
+ "loss": 0.0612,
2522
+ "step": 1815000
2523
+ },
2524
+ {
2525
+ "epoch": 1.63,
2526
+ "learning_rate": 9.251079622494838e-06,
2527
+ "loss": 0.056,
2528
+ "step": 1820000
2529
+ },
2530
+ {
2531
+ "epoch": 1.63,
2532
+ "learning_rate": 9.139132039040153e-06,
2533
+ "loss": 0.0628,
2534
+ "step": 1825000
2535
+ },
2536
+ {
2537
+ "epoch": 1.63,
2538
+ "eval_loss": 0.06252375990152359,
2539
+ "eval_runtime": 1681.1292,
2540
+ "eval_samples_per_second": 36.9,
2541
+ "eval_steps_per_second": 36.9,
2542
+ "step": 1825000
2543
+ },
2544
+ {
2545
+ "epoch": 1.64,
2546
+ "learning_rate": 9.02718445558547e-06,
2547
+ "loss": 0.0619,
2548
+ "step": 1830000
2549
+ },
2550
+ {
2551
+ "epoch": 1.64,
2552
+ "learning_rate": 8.915236872130783e-06,
2553
+ "loss": 0.063,
2554
+ "step": 1835000
2555
+ },
2556
+ {
2557
+ "epoch": 1.65,
2558
+ "learning_rate": 8.803289288676098e-06,
2559
+ "loss": 0.0607,
2560
+ "step": 1840000
2561
+ },
2562
+ {
2563
+ "epoch": 1.65,
2564
+ "learning_rate": 8.691341705221415e-06,
2565
+ "loss": 0.0633,
2566
+ "step": 1845000
2567
+ },
2568
+ {
2569
+ "epoch": 1.66,
2570
+ "learning_rate": 8.57939412176673e-06,
2571
+ "loss": 0.0638,
2572
+ "step": 1850000
2573
+ },
2574
+ {
2575
+ "epoch": 1.66,
2576
+ "eval_loss": 0.06624045222997665,
2577
+ "eval_runtime": 1680.5873,
2578
+ "eval_samples_per_second": 36.912,
2579
+ "eval_steps_per_second": 36.912,
2580
+ "step": 1850000
2581
  }
2582
  ],
2583
+ "max_steps": 2233188,
2584
+ "num_train_epochs": 2,
2585
+ "total_flos": 5.556070184057856e+17,
2586
  "trial_name": null,
2587
  "trial_params": null
2588
  }
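The headline numbers in the new trainer_state.json are mutually consistent: with "max_steps" 2233188 over "num_train_epochs" 2, one epoch is 1116594 steps (exactly the global_step of the previous one-epoch state), and 1850000 steps works out to the recorded "epoch" of about 1.6568. A quick arithmetic check using only the values shown in the diff:

steps_per_epoch = 2233188 // 2        # max_steps / num_train_epochs
print(steps_per_epoch)                # 1116594 -> matches the old global_step
print(1850000 / steps_per_epoch)      # 1.6568... -> matches "epoch": 1.656824235129331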
training_args.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:626bad985fb1862f6b32ba4ebd7449b96bad7c0595dcabc271cd02a0adcc193d
+ oid sha256:70b443918dd5518fff4a7556be44b80bcda3039466b050dbf86c95c0ba51c348
  size 3119