flozi00 committed
Commit 3147114
1 Parent(s): 63ffb42

Upload folder using huggingface_hub

model-00001-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:f82520f27b73d7c06440d43e758347c30987ebd9cfbfe70df3ab63e32ce25154
+ oid sha256:0a239bd0e58246448c79b136afab824a8a9a6153ce17d8d6d113a2e318b3de72
  size 4991459544
model-00002-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:7787114bf6023c351dcdc326201064a7935126ab709a5676caa116e882a236b4
+ oid sha256:5e1c690f2f36f7d22d9985346afe5114ce093d3af04d6ba6b3fb22d697c78864
  size 4991757456
model-00003-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:d42a0591755746258506f021e46ca86cc11d1e1587a4cf2efbb2e87ee3fbc22e
+ oid sha256:2759dd1a7d4b026c79bb5df907c7fe108df5d36ceb75f2082f8aa3202ce44bcf
  size 4947691960
model-00004-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:a546f3427e6b6ed67f0d30ed1e68c113468495586ee2ea61e7609dd393ebf64d
+ oid sha256:7ab61af589c217a86e226ecc839b1b369bd32759a3234e18b7cf254d273dabb3
  size 1863387424
trainer_state.json CHANGED
@@ -1,9 +1,9 @@
  {
  "best_metric": null,
  "best_model_checkpoint": null,
- "epoch": 0.2827125681933636,
  "eval_steps": 100000,
- "global_step": 300,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
@@ -556,6 +556,1266 @@
  "memory(GiB)": 75.3,
  "step": 300,
  "train_speed(iter/s)": 0.003347
  }
  ],
  "logging_steps": 5,
@@ -575,7 +1835,7 @@
  "attributes": {}
  }
  },
- "total_flos": 1.6327530207541985e+18,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.942375227311212,
5
  "eval_steps": 100000,
6
+ "global_step": 1000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
556
  "memory(GiB)": 75.3,
557
  "step": 300,
558
  "train_speed(iter/s)": 0.003347
559
+ },
560
+ {
561
+ "epoch": 0.2874244443299197,
562
+ "grad_norm": 0.91015625,
563
+ "learning_rate": 8.104190202580811e-06,
564
+ "loss": 0.05302551,
565
+ "memory(GiB)": 75.3,
566
+ "step": 305,
567
+ "train_speed(iter/s)": 0.003346
568
+ },
569
+ {
570
+ "epoch": 0.29213632046647575,
571
+ "grad_norm": 0.91796875,
572
+ "learning_rate": 8.045766202102358e-06,
573
+ "loss": 0.05804279,
574
+ "memory(GiB)": 75.3,
575
+ "step": 310,
576
+ "train_speed(iter/s)": 0.003346
577
+ },
578
+ {
579
+ "epoch": 0.2968481966030318,
580
+ "grad_norm": 0.9375,
581
+ "learning_rate": 7.986673370246743e-06,
582
+ "loss": 0.05822692,
583
+ "memory(GiB)": 75.3,
584
+ "step": 315,
585
+ "train_speed(iter/s)": 0.003346
586
+ },
587
+ {
588
+ "epoch": 0.30156007273958785,
589
+ "grad_norm": 1.0078125,
590
+ "learning_rate": 7.926924683433523e-06,
591
+ "loss": 0.06007032,
592
+ "memory(GiB)": 75.3,
593
+ "step": 320,
594
+ "train_speed(iter/s)": 0.003346
595
+ },
596
+ {
597
+ "epoch": 0.3062719488761439,
598
+ "grad_norm": 0.921875,
599
+ "learning_rate": 7.866533262103937e-06,
600
+ "loss": 0.06018423,
601
+ "memory(GiB)": 75.3,
602
+ "step": 325,
603
+ "train_speed(iter/s)": 0.003346
604
+ },
605
+ {
606
+ "epoch": 0.3109838250127,
607
+ "grad_norm": 0.9375,
608
+ "learning_rate": 7.805512367839742e-06,
609
+ "loss": 0.05931915,
610
+ "memory(GiB)": 75.3,
611
+ "step": 330,
612
+ "train_speed(iter/s)": 0.003346
613
+ },
614
+ {
615
+ "epoch": 0.31569570114925605,
616
+ "grad_norm": 1.015625,
617
+ "learning_rate": 7.743875400451047e-06,
618
+ "loss": 0.0566447,
619
+ "memory(GiB)": 75.3,
620
+ "step": 335,
621
+ "train_speed(iter/s)": 0.003346
622
+ },
623
+ {
624
+ "epoch": 0.3204075772858121,
625
+ "grad_norm": 0.8203125,
626
+ "learning_rate": 7.681635895033798e-06,
627
+ "loss": 0.05161901,
628
+ "memory(GiB)": 75.3,
629
+ "step": 340,
630
+ "train_speed(iter/s)": 0.003346
631
+ },
632
+ {
633
+ "epoch": 0.32511945342236814,
634
+ "grad_norm": 1.0,
635
+ "learning_rate": 7.6188075189975644e-06,
636
+ "loss": 0.05694907,
637
+ "memory(GiB)": 75.3,
638
+ "step": 345,
639
+ "train_speed(iter/s)": 0.003346
640
+ },
641
+ {
642
+ "epoch": 0.32983132955892425,
643
+ "grad_norm": 1.0390625,
644
+ "learning_rate": 7.555404069064245e-06,
645
+ "loss": 0.05555046,
646
+ "memory(GiB)": 75.3,
647
+ "step": 350,
648
+ "train_speed(iter/s)": 0.003347
649
+ },
650
+ {
651
+ "epoch": 0.3345432056954803,
652
+ "grad_norm": 0.97265625,
653
+ "learning_rate": 7.491439468238404e-06,
654
+ "loss": 0.05587023,
655
+ "memory(GiB)": 75.3,
656
+ "step": 355,
657
+ "train_speed(iter/s)": 0.003347
658
+ },
659
+ {
660
+ "epoch": 0.33925508183203634,
661
+ "grad_norm": 0.96875,
662
+ "learning_rate": 7.426927762749867e-06,
663
+ "loss": 0.05913154,
664
+ "memory(GiB)": 75.3,
665
+ "step": 360,
666
+ "train_speed(iter/s)": 0.003347
667
+ },
668
+ {
669
+ "epoch": 0.3439669579685924,
670
+ "grad_norm": 0.89453125,
671
+ "learning_rate": 7.361883118969248e-06,
672
+ "loss": 0.05830712,
673
+ "memory(GiB)": 75.3,
674
+ "step": 365,
675
+ "train_speed(iter/s)": 0.003348
676
+ },
677
+ {
678
+ "epoch": 0.34867883410514844,
679
+ "grad_norm": 0.89453125,
680
+ "learning_rate": 7.2963198202971055e-06,
681
+ "loss": 0.05937972,
682
+ "memory(GiB)": 75.3,
683
+ "step": 370,
684
+ "train_speed(iter/s)": 0.003348
685
+ },
686
+ {
687
+ "epoch": 0.35339071024170454,
688
+ "grad_norm": 0.9375,
689
+ "learning_rate": 7.230252264027398e-06,
690
+ "loss": 0.0565136,
691
+ "memory(GiB)": 75.3,
692
+ "step": 375,
693
+ "train_speed(iter/s)": 0.003348
694
+ },
695
+ {
696
+ "epoch": 0.3581025863782606,
697
+ "grad_norm": 0.96875,
698
+ "learning_rate": 7.163694958185928e-06,
699
+ "loss": 0.05636386,
700
+ "memory(GiB)": 75.3,
701
+ "step": 380,
702
+ "train_speed(iter/s)": 0.003348
703
+ },
704
+ {
705
+ "epoch": 0.36281446251481664,
706
+ "grad_norm": 0.96875,
707
+ "learning_rate": 7.09666251834447e-06,
708
+ "loss": 0.06038175,
709
+ "memory(GiB)": 75.3,
710
+ "step": 385,
711
+ "train_speed(iter/s)": 0.003348
712
+ },
713
+ {
714
+ "epoch": 0.3675263386513727,
715
+ "grad_norm": 0.92578125,
716
+ "learning_rate": 7.0291696644112705e-06,
717
+ "loss": 0.05833557,
718
+ "memory(GiB)": 75.3,
719
+ "step": 390,
720
+ "train_speed(iter/s)": 0.003349
721
+ },
722
+ {
723
+ "epoch": 0.3722382147879288,
724
+ "grad_norm": 0.8359375,
725
+ "learning_rate": 6.9612312173986675e-06,
726
+ "loss": 0.05632974,
727
+ "memory(GiB)": 75.3,
728
+ "step": 395,
729
+ "train_speed(iter/s)": 0.003348
730
+ },
731
+ {
732
+ "epoch": 0.37695009092448484,
733
+ "grad_norm": 0.921875,
734
+ "learning_rate": 6.892862096168469e-06,
735
+ "loss": 0.05656151,
736
+ "memory(GiB)": 75.3,
737
+ "step": 400,
738
+ "train_speed(iter/s)": 0.003348
739
+ },
740
+ {
741
+ "epoch": 0.3816619670610409,
742
+ "grad_norm": 0.98828125,
743
+ "learning_rate": 6.824077314155877e-06,
744
+ "loss": 0.05432441,
745
+ "memory(GiB)": 75.3,
746
+ "step": 405,
747
+ "train_speed(iter/s)": 0.003347
748
+ },
749
+ {
750
+ "epoch": 0.38637384319759693,
751
+ "grad_norm": 0.9453125,
752
+ "learning_rate": 6.75489197607262e-06,
753
+ "loss": 0.05709869,
754
+ "memory(GiB)": 75.3,
755
+ "step": 410,
756
+ "train_speed(iter/s)": 0.003347
757
+ },
758
+ {
759
+ "epoch": 0.391085719334153,
760
+ "grad_norm": 1.0546875,
761
+ "learning_rate": 6.6853212745900585e-06,
762
+ "loss": 0.05979726,
763
+ "memory(GiB)": 75.3,
764
+ "step": 415,
765
+ "train_speed(iter/s)": 0.003348
766
+ },
767
+ {
768
+ "epoch": 0.3957975954707091,
769
+ "grad_norm": 0.9140625,
770
+ "learning_rate": 6.615380487002969e-06,
771
+ "loss": 0.0600209,
772
+ "memory(GiB)": 75.3,
773
+ "step": 420,
774
+ "train_speed(iter/s)": 0.003348
775
+ },
776
+ {
777
+ "epoch": 0.40050947160726513,
778
+ "grad_norm": 0.94140625,
779
+ "learning_rate": 6.545084971874738e-06,
780
+ "loss": 0.0563777,
781
+ "memory(GiB)": 75.3,
782
+ "step": 425,
783
+ "train_speed(iter/s)": 0.003348
784
+ },
785
+ {
786
+ "epoch": 0.4052213477438212,
787
+ "grad_norm": 0.91796875,
788
+ "learning_rate": 6.474450165664722e-06,
789
+ "loss": 0.05698464,
790
+ "memory(GiB)": 75.3,
791
+ "step": 430,
792
+ "train_speed(iter/s)": 0.003348
793
+ },
794
+ {
795
+ "epoch": 0.40993322388037723,
796
+ "grad_norm": 0.890625,
797
+ "learning_rate": 6.4034915793385e-06,
798
+ "loss": 0.05311573,
799
+ "memory(GiB)": 75.3,
800
+ "step": 435,
801
+ "train_speed(iter/s)": 0.003348
802
+ },
803
+ {
804
+ "epoch": 0.41464510001693333,
805
+ "grad_norm": 0.97265625,
806
+ "learning_rate": 6.332224794961752e-06,
807
+ "loss": 0.05458606,
808
+ "memory(GiB)": 75.3,
809
+ "step": 440,
810
+ "train_speed(iter/s)": 0.003348
811
+ },
812
+ {
813
+ "epoch": 0.4193569761534894,
814
+ "grad_norm": 0.95703125,
815
+ "learning_rate": 6.260665462278544e-06,
816
+ "loss": 0.05579169,
817
+ "memory(GiB)": 75.3,
818
+ "step": 445,
819
+ "train_speed(iter/s)": 0.003348
820
+ },
821
+ {
822
+ "epoch": 0.42406885229004543,
823
+ "grad_norm": 0.99609375,
824
+ "learning_rate": 6.18882929527473e-06,
825
+ "loss": 0.06002288,
826
+ "memory(GiB)": 75.3,
827
+ "step": 450,
828
+ "train_speed(iter/s)": 0.003348
829
+ },
830
+ {
831
+ "epoch": 0.4287807284266015,
832
+ "grad_norm": 0.94140625,
833
+ "learning_rate": 6.116732068727271e-06,
834
+ "loss": 0.05494517,
835
+ "memory(GiB)": 75.3,
836
+ "step": 455,
837
+ "train_speed(iter/s)": 0.003349
838
+ },
839
+ {
840
+ "epoch": 0.4334926045631575,
841
+ "grad_norm": 0.953125,
842
+ "learning_rate": 6.0443896147401856e-06,
843
+ "loss": 0.0547879,
844
+ "memory(GiB)": 75.3,
845
+ "step": 460,
846
+ "train_speed(iter/s)": 0.003349
847
+ },
848
+ {
849
+ "epoch": 0.4382044806997136,
850
+ "grad_norm": 0.82421875,
851
+ "learning_rate": 5.971817819267914e-06,
852
+ "loss": 0.05363967,
853
+ "memory(GiB)": 75.3,
854
+ "step": 465,
855
+ "train_speed(iter/s)": 0.003348
856
+ },
857
+ {
858
+ "epoch": 0.4429163568362697,
859
+ "grad_norm": 0.91796875,
860
+ "learning_rate": 5.8990326186268655e-06,
861
+ "loss": 0.056594,
862
+ "memory(GiB)": 75.3,
863
+ "step": 470,
864
+ "train_speed(iter/s)": 0.003349
865
+ },
866
+ {
867
+ "epoch": 0.4476282329728257,
868
+ "grad_norm": 0.9765625,
869
+ "learning_rate": 5.826049995995905e-06,
870
+ "loss": 0.05898719,
871
+ "memory(GiB)": 75.3,
872
+ "step": 475,
873
+ "train_speed(iter/s)": 0.003349
874
+ },
875
+ {
876
+ "epoch": 0.45234010910938177,
877
+ "grad_norm": 1.3671875,
878
+ "learning_rate": 5.752885977906539e-06,
879
+ "loss": 0.05439388,
880
+ "memory(GiB)": 75.3,
881
+ "step": 480,
882
+ "train_speed(iter/s)": 0.003349
883
+ },
884
+ {
885
+ "epoch": 0.4570519852459379,
886
+ "grad_norm": 1.0390625,
887
+ "learning_rate": 5.679556630723592e-06,
888
+ "loss": 0.05334362,
889
+ "memory(GiB)": 75.3,
890
+ "step": 485,
891
+ "train_speed(iter/s)": 0.003349
892
+ },
893
+ {
894
+ "epoch": 0.4617638613824939,
895
+ "grad_norm": 0.9765625,
896
+ "learning_rate": 5.606078057117136e-06,
897
+ "loss": 0.06019425,
898
+ "memory(GiB)": 75.3,
899
+ "step": 490,
900
+ "train_speed(iter/s)": 0.003349
901
+ },
902
+ {
903
+ "epoch": 0.46647573751904997,
904
+ "grad_norm": 0.95703125,
905
+ "learning_rate": 5.532466392526439e-06,
906
+ "loss": 0.05597678,
907
+ "memory(GiB)": 75.3,
908
+ "step": 495,
909
+ "train_speed(iter/s)": 0.00335
910
+ },
911
+ {
912
+ "epoch": 0.471187613655606,
913
+ "grad_norm": 0.86328125,
914
+ "learning_rate": 5.458737801616721e-06,
915
+ "loss": 0.05094014,
916
+ "memory(GiB)": 75.3,
917
+ "step": 500,
918
+ "train_speed(iter/s)": 0.003349
919
+ },
920
+ {
921
+ "epoch": 0.47589948979216207,
922
+ "grad_norm": 0.875,
923
+ "learning_rate": 5.384908474729501e-06,
924
+ "loss": 0.0548723,
925
+ "memory(GiB)": 75.3,
926
+ "step": 505,
927
+ "train_speed(iter/s)": 0.003348
928
+ },
929
+ {
930
+ "epoch": 0.48061136592871817,
931
+ "grad_norm": 0.8984375,
932
+ "learning_rate": 5.310994624327292e-06,
933
+ "loss": 0.05574841,
934
+ "memory(GiB)": 75.3,
935
+ "step": 510,
936
+ "train_speed(iter/s)": 0.003348
937
+ },
938
+ {
939
+ "epoch": 0.4853232420652742,
940
+ "grad_norm": 0.8671875,
941
+ "learning_rate": 5.23701248143345e-06,
942
+ "loss": 0.05651059,
943
+ "memory(GiB)": 75.3,
944
+ "step": 515,
945
+ "train_speed(iter/s)": 0.003348
946
+ },
947
+ {
948
+ "epoch": 0.49003511820183027,
949
+ "grad_norm": 0.921875,
950
+ "learning_rate": 5.162978292067933e-06,
951
+ "loss": 0.05878415,
952
+ "memory(GiB)": 75.3,
953
+ "step": 520,
954
+ "train_speed(iter/s)": 0.003348
955
+ },
956
+ {
957
+ "epoch": 0.4947469943383863,
958
+ "grad_norm": 1.0234375,
959
+ "learning_rate": 5.088908313679788e-06,
960
+ "loss": 0.05620171,
961
+ "memory(GiB)": 75.3,
962
+ "step": 525,
963
+ "train_speed(iter/s)": 0.003348
964
+ },
965
+ {
966
+ "epoch": 0.49945887047494236,
967
+ "grad_norm": 0.86328125,
968
+ "learning_rate": 5.014818811577104e-06,
969
+ "loss": 0.05407885,
970
+ "memory(GiB)": 75.3,
971
+ "step": 530,
972
+ "train_speed(iter/s)": 0.003348
973
+ },
974
+ {
975
+ "epoch": 0.5041707466114984,
976
+ "grad_norm": 0.84375,
977
+ "learning_rate": 4.940726055355259e-06,
978
+ "loss": 0.05323058,
979
+ "memory(GiB)": 75.3,
980
+ "step": 535,
981
+ "train_speed(iter/s)": 0.003349
982
+ },
983
+ {
984
+ "epoch": 0.5088826227480545,
985
+ "grad_norm": 0.83984375,
986
+ "learning_rate": 4.866646315324217e-06,
987
+ "loss": 0.05346375,
988
+ "memory(GiB)": 75.3,
989
+ "step": 540,
990
+ "train_speed(iter/s)": 0.003349
991
+ },
992
+ {
993
+ "epoch": 0.5135944988846106,
994
+ "grad_norm": 0.828125,
995
+ "learning_rate": 4.792595858935668e-06,
996
+ "loss": 0.05774211,
997
+ "memory(GiB)": 75.3,
998
+ "step": 545,
999
+ "train_speed(iter/s)": 0.003349
1000
+ },
1001
+ {
1002
+ "epoch": 0.5183063750211666,
1003
+ "grad_norm": 0.9921875,
1004
+ "learning_rate": 4.718590947210788e-06,
1005
+ "loss": 0.05547717,
1006
+ "memory(GiB)": 75.3,
1007
+ "step": 550,
1008
+ "train_speed(iter/s)": 0.003349
1009
+ },
1010
+ {
1011
+ "epoch": 0.5230182511577227,
1012
+ "grad_norm": 0.8046875,
1013
+ "learning_rate": 4.644647831169435e-06,
1014
+ "loss": 0.05536319,
1015
+ "memory(GiB)": 75.3,
1016
+ "step": 555,
1017
+ "train_speed(iter/s)": 0.003349
1018
+ },
1019
+ {
1020
+ "epoch": 0.5277301272942787,
1021
+ "grad_norm": 1.015625,
1022
+ "learning_rate": 4.570782748261516e-06,
1023
+ "loss": 0.05369086,
1024
+ "memory(GiB)": 75.3,
1025
+ "step": 560,
1026
+ "train_speed(iter/s)": 0.003349
1027
+ },
1028
+ {
1029
+ "epoch": 0.5324420034308348,
1030
+ "grad_norm": 0.94140625,
1031
+ "learning_rate": 4.497011918801347e-06,
1032
+ "loss": 0.05471834,
1033
+ "memory(GiB)": 75.3,
1034
+ "step": 565,
1035
+ "train_speed(iter/s)": 0.00335
1036
+ },
1037
+ {
1038
+ "epoch": 0.5371538795673909,
1039
+ "grad_norm": 0.9140625,
1040
+ "learning_rate": 4.423351542405764e-06,
1041
+ "loss": 0.05114409,
1042
+ "memory(GiB)": 75.3,
1043
+ "step": 570,
1044
+ "train_speed(iter/s)": 0.00335
1045
+ },
1046
+ {
1047
+ "epoch": 0.5418657557039469,
1048
+ "grad_norm": 0.9765625,
1049
+ "learning_rate": 4.349817794436805e-06,
1050
+ "loss": 0.05673685,
1051
+ "memory(GiB)": 75.3,
1052
+ "step": 575,
1053
+ "train_speed(iter/s)": 0.00335
1054
+ },
1055
+ {
1056
+ "epoch": 0.546577631840503,
1057
+ "grad_norm": 0.88671875,
1058
+ "learning_rate": 4.276426822449682e-06,
1059
+ "loss": 0.05527523,
1060
+ "memory(GiB)": 75.3,
1061
+ "step": 580,
1062
+ "train_speed(iter/s)": 0.00335
1063
+ },
1064
+ {
1065
+ "epoch": 0.551289507977059,
1066
+ "grad_norm": 0.90625,
1067
+ "learning_rate": 4.203194742646893e-06,
1068
+ "loss": 0.05317973,
1069
+ "memory(GiB)": 75.3,
1070
+ "step": 585,
1071
+ "train_speed(iter/s)": 0.00335
1072
+ },
1073
+ {
1074
+ "epoch": 0.5560013841136151,
1075
+ "grad_norm": 1.0078125,
1076
+ "learning_rate": 4.130137636339191e-06,
1077
+ "loss": 0.05449303,
1078
+ "memory(GiB)": 75.3,
1079
+ "step": 590,
1080
+ "train_speed(iter/s)": 0.00335
1081
+ },
1082
+ {
1083
+ "epoch": 0.5607132602501712,
1084
+ "grad_norm": 0.89453125,
1085
+ "learning_rate": 4.057271546414242e-06,
1086
+ "loss": 0.05341119,
1087
+ "memory(GiB)": 75.3,
1088
+ "step": 595,
1089
+ "train_speed(iter/s)": 0.00335
1090
+ },
1091
+ {
1092
+ "epoch": 0.5654251363867272,
1093
+ "grad_norm": 0.8515625,
1094
+ "learning_rate": 3.984612473813689e-06,
1095
+ "loss": 0.05254069,
1096
+ "memory(GiB)": 75.3,
1097
+ "step": 600,
1098
+ "train_speed(iter/s)": 0.00335
1099
+ },
1100
+ {
1101
+ "epoch": 0.5701370125232833,
1102
+ "grad_norm": 0.8984375,
1103
+ "learning_rate": 3.912176374019462e-06,
1104
+ "loss": 0.05324795,
1105
+ "memory(GiB)": 75.3,
1106
+ "step": 605,
1107
+ "train_speed(iter/s)": 0.00335
1108
+ },
1109
+ {
1110
+ "epoch": 0.5748488886598394,
1111
+ "grad_norm": 0.8671875,
1112
+ "learning_rate": 3.839979153550039e-06,
1113
+ "loss": 0.05177047,
1114
+ "memory(GiB)": 75.3,
1115
+ "step": 610,
1116
+ "train_speed(iter/s)": 0.00335
1117
+ },
1118
+ {
1119
+ "epoch": 0.5795607647963954,
1120
+ "grad_norm": 0.82421875,
1121
+ "learning_rate": 3.768036666467486e-06,
1122
+ "loss": 0.05265539,
1123
+ "memory(GiB)": 75.3,
1124
+ "step": 615,
1125
+ "train_speed(iter/s)": 0.00335
1126
+ },
1127
+ {
1128
+ "epoch": 0.5842726409329515,
1129
+ "grad_norm": 0.88671875,
1130
+ "learning_rate": 3.6963647108959868e-06,
1131
+ "loss": 0.05418316,
1132
+ "memory(GiB)": 75.3,
1133
+ "step": 620,
1134
+ "train_speed(iter/s)": 0.00335
1135
+ },
1136
+ {
1137
+ "epoch": 0.5889845170695075,
1138
+ "grad_norm": 0.93359375,
1139
+ "learning_rate": 3.6249790255526916e-06,
1140
+ "loss": 0.05562772,
1141
+ "memory(GiB)": 75.3,
1142
+ "step": 625,
1143
+ "train_speed(iter/s)": 0.00335
1144
+ },
1145
+ {
1146
+ "epoch": 0.5936963932060636,
1147
+ "grad_norm": 0.90234375,
1148
+ "learning_rate": 3.553895286291577e-06,
1149
+ "loss": 0.05445199,
1150
+ "memory(GiB)": 75.3,
1151
+ "step": 630,
1152
+ "train_speed(iter/s)": 0.00335
1153
+ },
1154
+ {
1155
+ "epoch": 0.5984082693426197,
1156
+ "grad_norm": 0.90234375,
1157
+ "learning_rate": 3.483129102661137e-06,
1158
+ "loss": 0.05333483,
1159
+ "memory(GiB)": 75.3,
1160
+ "step": 635,
1161
+ "train_speed(iter/s)": 0.00335
1162
+ },
1163
+ {
1164
+ "epoch": 0.6031201454791757,
1165
+ "grad_norm": 0.8515625,
1166
+ "learning_rate": 3.4126960144766107e-06,
1167
+ "loss": 0.05417204,
1168
+ "memory(GiB)": 75.3,
1169
+ "step": 640,
1170
+ "train_speed(iter/s)": 0.00335
1171
+ },
1172
+ {
1173
+ "epoch": 0.6078320216157318,
1174
+ "grad_norm": 0.91015625,
1175
+ "learning_rate": 3.3426114884075488e-06,
1176
+ "loss": 0.05412987,
1177
+ "memory(GiB)": 75.3,
1178
+ "step": 645,
1179
+ "train_speed(iter/s)": 0.00335
1180
+ },
1181
+ {
1182
+ "epoch": 0.6125438977522878,
1183
+ "grad_norm": 0.87109375,
1184
+ "learning_rate": 3.272890914581417e-06,
1185
+ "loss": 0.05388454,
1186
+ "memory(GiB)": 75.3,
1187
+ "step": 650,
1188
+ "train_speed(iter/s)": 0.003349
1189
+ },
1190
+ {
1191
+ "epoch": 0.6172557738888439,
1192
+ "grad_norm": 0.85546875,
1193
+ "learning_rate": 3.2035496032040303e-06,
1194
+ "loss": 0.05097753,
1195
+ "memory(GiB)": 75.3,
1196
+ "step": 655,
1197
+ "train_speed(iter/s)": 0.003349
1198
+ },
1199
+ {
1200
+ "epoch": 0.6219676500254,
1201
+ "grad_norm": 0.875,
1202
+ "learning_rate": 3.134602781197515e-06,
1203
+ "loss": 0.05341196,
1204
+ "memory(GiB)": 75.3,
1205
+ "step": 660,
1206
+ "train_speed(iter/s)": 0.00335
1207
+ },
1208
+ {
1209
+ "epoch": 0.626679526161956,
1210
+ "grad_norm": 0.90625,
1211
+ "learning_rate": 3.0660655888565827e-06,
1212
+ "loss": 0.05016219,
1213
+ "memory(GiB)": 75.3,
1214
+ "step": 665,
1215
+ "train_speed(iter/s)": 0.00335
1216
+ },
1217
+ {
1218
+ "epoch": 0.6313914022985121,
1219
+ "grad_norm": 0.95703125,
1220
+ "learning_rate": 2.997953076523803e-06,
1221
+ "loss": 0.05216441,
1222
+ "memory(GiB)": 75.3,
1223
+ "step": 670,
1224
+ "train_speed(iter/s)": 0.00335
1225
+ },
1226
+ {
1227
+ "epoch": 0.6361032784350681,
1228
+ "grad_norm": 1.015625,
1229
+ "learning_rate": 2.930280201284654e-06,
1230
+ "loss": 0.05449665,
1231
+ "memory(GiB)": 75.3,
1232
+ "step": 675,
1233
+ "train_speed(iter/s)": 0.00335
1234
+ },
1235
+ {
1236
+ "epoch": 0.6408151545716242,
1237
+ "grad_norm": 0.921875,
1238
+ "learning_rate": 2.863061823683032e-06,
1239
+ "loss": 0.05129569,
1240
+ "memory(GiB)": 75.3,
1241
+ "step": 680,
1242
+ "train_speed(iter/s)": 0.00335
1243
+ },
1244
+ {
1245
+ "epoch": 0.6455270307081803,
1246
+ "grad_norm": 0.87890625,
1247
+ "learning_rate": 2.7963127044579697e-06,
1248
+ "loss": 0.05290835,
1249
+ "memory(GiB)": 75.3,
1250
+ "step": 685,
1251
+ "train_speed(iter/s)": 0.00335
1252
+ },
1253
+ {
1254
+ "epoch": 0.6502389068447363,
1255
+ "grad_norm": 0.87109375,
1256
+ "learning_rate": 2.7300475013022666e-06,
1257
+ "loss": 0.0528672,
1258
+ "memory(GiB)": 75.3,
1259
+ "step": 690,
1260
+ "train_speed(iter/s)": 0.00335
1261
+ },
1262
+ {
1263
+ "epoch": 0.6549507829812924,
1264
+ "grad_norm": 0.984375,
1265
+ "learning_rate": 2.6642807656437565e-06,
1266
+ "loss": 0.05229232,
1267
+ "memory(GiB)": 75.3,
1268
+ "step": 695,
1269
+ "train_speed(iter/s)": 0.00335
1270
+ },
1271
+ {
1272
+ "epoch": 0.6596626591178485,
1273
+ "grad_norm": 0.9609375,
1274
+ "learning_rate": 2.599026939449899e-06,
1275
+ "loss": 0.05371115,
1276
+ "memory(GiB)": 75.3,
1277
+ "step": 700,
1278
+ "train_speed(iter/s)": 0.00335
1279
+ },
1280
+ {
1281
+ "epoch": 0.6643745352544045,
1282
+ "grad_norm": 1.0703125,
1283
+ "learning_rate": 2.534300352056416e-06,
1284
+ "loss": 0.05234203,
1285
+ "memory(GiB)": 75.3,
1286
+ "step": 705,
1287
+ "train_speed(iter/s)": 0.00335
1288
+ },
1289
+ {
1290
+ "epoch": 0.6690864113909606,
1291
+ "grad_norm": 0.98828125,
1292
+ "learning_rate": 2.470115217020654e-06,
1293
+ "loss": 0.05360326,
1294
+ "memory(GiB)": 75.3,
1295
+ "step": 710,
1296
+ "train_speed(iter/s)": 0.00335
1297
+ },
1298
+ {
1299
+ "epoch": 0.6737982875275166,
1300
+ "grad_norm": 0.92578125,
1301
+ "learning_rate": 2.4064856290003863e-06,
1302
+ "loss": 0.05475932,
1303
+ "memory(GiB)": 75.3,
1304
+ "step": 715,
1305
+ "train_speed(iter/s)": 0.00335
1306
+ },
1307
+ {
1308
+ "epoch": 0.6785101636640727,
1309
+ "grad_norm": 1.0703125,
1310
+ "learning_rate": 2.3434255606586925e-06,
1311
+ "loss": 0.05548735,
1312
+ "memory(GiB)": 75.3,
1313
+ "step": 720,
1314
+ "train_speed(iter/s)": 0.00335
1315
+ },
1316
+ {
1317
+ "epoch": 0.6832220398006288,
1318
+ "grad_norm": 0.89453125,
1319
+ "learning_rate": 2.2809488595956746e-06,
1320
+ "loss": 0.05201564,
1321
+ "memory(GiB)": 75.3,
1322
+ "step": 725,
1323
+ "train_speed(iter/s)": 0.00335
1324
+ },
1325
+ {
1326
+ "epoch": 0.6879339159371848,
1327
+ "grad_norm": 0.9140625,
1328
+ "learning_rate": 2.219069245307589e-06,
1329
+ "loss": 0.05408272,
1330
+ "memory(GiB)": 75.3,
1331
+ "step": 730,
1332
+ "train_speed(iter/s)": 0.003351
1333
+ },
1334
+ {
1335
+ "epoch": 0.6926457920737409,
1336
+ "grad_norm": 1.1640625,
1337
+ "learning_rate": 2.157800306174139e-06,
1338
+ "loss": 0.05537663,
1339
+ "memory(GiB)": 75.3,
1340
+ "step": 735,
1341
+ "train_speed(iter/s)": 0.003351
1342
+ },
1343
+ {
1344
+ "epoch": 0.6973576682102969,
1345
+ "grad_norm": 1.125,
1346
+ "learning_rate": 2.0971554964745476e-06,
1347
+ "loss": 0.05455139,
1348
+ "memory(GiB)": 75.3,
1349
+ "step": 740,
1350
+ "train_speed(iter/s)": 0.003351
1351
+ },
1352
+ {
1353
+ "epoch": 0.702069544346853,
1354
+ "grad_norm": 0.87890625,
1355
+ "learning_rate": 2.0371481334330913e-06,
1356
+ "loss": 0.05394316,
1357
+ "memory(GiB)": 75.3,
1358
+ "step": 745,
1359
+ "train_speed(iter/s)": 0.003351
1360
+ },
1361
+ {
1362
+ "epoch": 0.7067814204834091,
1363
+ "grad_norm": 0.828125,
1364
+ "learning_rate": 1.9777913942946987e-06,
1365
+ "loss": 0.05269849,
1366
+ "memory(GiB)": 75.3,
1367
+ "step": 750,
1368
+ "train_speed(iter/s)": 0.003351
1369
+ },
1370
+ {
1371
+ "epoch": 0.7114932966199651,
1372
+ "grad_norm": 0.81640625,
1373
+ "learning_rate": 1.919098313431335e-06,
1374
+ "loss": 0.05057405,
1375
+ "memory(GiB)": 75.3,
1376
+ "step": 755,
1377
+ "train_speed(iter/s)": 0.003351
1378
+ },
1379
+ {
1380
+ "epoch": 0.7162051727565212,
1381
+ "grad_norm": 0.9375,
1382
+ "learning_rate": 1.8610817794797164e-06,
1383
+ "loss": 0.05438253,
1384
+ "memory(GiB)": 75.3,
1385
+ "step": 760,
1386
+ "train_speed(iter/s)": 0.003351
1387
+ },
1388
+ {
1389
+ "epoch": 0.7209170488930772,
1390
+ "grad_norm": 0.84375,
1391
+ "learning_rate": 1.8037545325110506e-06,
1392
+ "loss": 0.05222658,
1393
+ "memory(GiB)": 75.3,
1394
+ "step": 765,
1395
+ "train_speed(iter/s)": 0.003351
1396
+ },
1397
+ {
1398
+ "epoch": 0.7256289250296333,
1399
+ "grad_norm": 0.88671875,
1400
+ "learning_rate": 1.7471291612333997e-06,
1401
+ "loss": 0.05131737,
1402
+ "memory(GiB)": 75.3,
1403
+ "step": 770,
1404
+ "train_speed(iter/s)": 0.003351
1405
+ },
1406
+ {
1407
+ "epoch": 0.7303408011661894,
1408
+ "grad_norm": 0.93359375,
1409
+ "learning_rate": 1.6912181002272714e-06,
1410
+ "loss": 0.05391481,
1411
+ "memory(GiB)": 75.3,
1412
+ "step": 775,
1413
+ "train_speed(iter/s)": 0.003351
1414
+ },
1415
+ {
1416
+ "epoch": 0.7350526773027454,
1417
+ "grad_norm": 0.8984375,
1418
+ "learning_rate": 1.6360336272150684e-06,
1419
+ "loss": 0.05078862,
1420
+ "memory(GiB)": 75.3,
1421
+ "step": 780,
1422
+ "train_speed(iter/s)": 0.003351
1423
+ },
1424
+ {
1425
+ "epoch": 0.7397645534393015,
1426
+ "grad_norm": 0.953125,
1427
+ "learning_rate": 1.581587860364977e-06,
1428
+ "loss": 0.05192038,
1429
+ "memory(GiB)": 75.3,
1430
+ "step": 785,
1431
+ "train_speed(iter/s)": 0.003351
1432
+ },
1433
+ {
1434
+ "epoch": 0.7444764295758576,
1435
+ "grad_norm": 0.96875,
1436
+ "learning_rate": 1.52789275562988e-06,
1437
+ "loss": 0.05364103,
1438
+ "memory(GiB)": 75.3,
1439
+ "step": 790,
1440
+ "train_speed(iter/s)": 0.003351
1441
+ },
1442
+ {
1443
+ "epoch": 0.7491883057124136,
1444
+ "grad_norm": 1.0625,
1445
+ "learning_rate": 1.4749601041219246e-06,
1446
+ "loss": 0.0536845,
1447
+ "memory(GiB)": 75.3,
1448
+ "step": 795,
1449
+ "train_speed(iter/s)": 0.003351
1450
+ },
1451
+ {
1452
+ "epoch": 0.7539001818489697,
1453
+ "grad_norm": 0.90234375,
1454
+ "learning_rate": 1.4228015295232484e-06,
1455
+ "loss": 0.05084696,
1456
+ "memory(GiB)": 75.3,
1457
+ "step": 800,
1458
+ "train_speed(iter/s)": 0.003351
1459
+ },
1460
+ {
1461
+ "epoch": 0.7586120579855257,
1462
+ "grad_norm": 0.875,
1463
+ "learning_rate": 1.371428485533498e-06,
1464
+ "loss": 0.05773014,
1465
+ "memory(GiB)": 75.3,
1466
+ "step": 805,
1467
+ "train_speed(iter/s)": 0.00335
1468
+ },
1469
+ {
1470
+ "epoch": 0.7633239341220818,
1471
+ "grad_norm": 0.91796875,
1472
+ "learning_rate": 1.3208522533546748e-06,
1473
+ "loss": 0.05219783,
1474
+ "memory(GiB)": 75.3,
1475
+ "step": 810,
1476
+ "train_speed(iter/s)": 0.00335
1477
+ },
1478
+ {
1479
+ "epoch": 0.7680358102586379,
1480
+ "grad_norm": 0.96875,
1481
+ "learning_rate": 1.2710839392138386e-06,
1482
+ "loss": 0.05375321,
1483
+ "memory(GiB)": 75.3,
1484
+ "step": 815,
1485
+ "train_speed(iter/s)": 0.00335
1486
+ },
1487
+ {
1488
+ "epoch": 0.7727476863951939,
1489
+ "grad_norm": 0.87890625,
1490
+ "learning_rate": 1.222134471924259e-06,
1491
+ "loss": 0.05204231,
1492
+ "memory(GiB)": 75.3,
1493
+ "step": 820,
1494
+ "train_speed(iter/s)": 0.00335
1495
+ },
1496
+ {
1497
+ "epoch": 0.77745956253175,
1498
+ "grad_norm": 0.91796875,
1499
+ "learning_rate": 1.1740146004855141e-06,
1500
+ "loss": 0.0559127,
1501
+ "memory(GiB)": 75.3,
1502
+ "step": 825,
1503
+ "train_speed(iter/s)": 0.003351
1504
+ },
1505
+ {
1506
+ "epoch": 0.782171438668306,
1507
+ "grad_norm": 0.89453125,
1508
+ "learning_rate": 1.1267348917230737e-06,
1509
+ "loss": 0.05298336,
1510
+ "memory(GiB)": 75.3,
1511
+ "step": 830,
1512
+ "train_speed(iter/s)": 0.00335
1513
+ },
1514
+ {
1515
+ "epoch": 0.7868833148048621,
1516
+ "grad_norm": 0.90625,
1517
+ "learning_rate": 1.080305727967893e-06,
1518
+ "loss": 0.05347639,
1519
+ "memory(GiB)": 75.3,
1520
+ "step": 835,
1521
+ "train_speed(iter/s)": 0.00335
1522
+ },
1523
+ {
1524
+ "epoch": 0.7915951909414182,
1525
+ "grad_norm": 0.84765625,
1526
+ "learning_rate": 1.0347373047765202e-06,
1527
+ "loss": 0.05329442,
1528
+ "memory(GiB)": 75.3,
1529
+ "step": 840,
1530
+ "train_speed(iter/s)": 0.00335
1531
+ },
1532
+ {
1533
+ "epoch": 0.7963070670779742,
1534
+ "grad_norm": 0.8359375,
1535
+ "learning_rate": 9.900396286922025e-07,
1536
+ "loss": 0.0537856,
1537
+ "memory(GiB)": 75.3,
1538
+ "step": 845,
1539
+ "train_speed(iter/s)": 0.00335
1540
+ },
1541
+ {
1542
+ "epoch": 0.8010189432145303,
1543
+ "grad_norm": 0.8125,
1544
+ "learning_rate": 9.462225150475296e-07,
1545
+ "loss": 0.05233877,
1546
+ "memory(GiB)": 75.3,
1547
+ "step": 850,
1548
+ "train_speed(iter/s)": 0.00335
1549
+ },
1550
+ {
1551
+ "epoch": 0.8057308193510863,
1552
+ "grad_norm": 0.88671875,
1553
+ "learning_rate": 9.032955858090319e-07,
1554
+ "loss": 0.0549244,
1555
+ "memory(GiB)": 75.3,
1556
+ "step": 855,
1557
+ "train_speed(iter/s)": 0.003351
1558
+ },
1559
+ {
1560
+ "epoch": 0.8104426954876424,
1561
+ "grad_norm": 0.9140625,
1562
+ "learning_rate": 8.612682674642647e-07,
1563
+ "loss": 0.04935811,
1564
+ "memory(GiB)": 75.3,
1565
+ "step": 860,
1566
+ "train_speed(iter/s)": 0.003351
1567
+ },
1568
+ {
1569
+ "epoch": 0.8151545716241985,
1570
+ "grad_norm": 0.921875,
1571
+ "learning_rate": 8.201497889518073e-07,
1572
+ "loss": 0.05281691,
1573
+ "memory(GiB)": 75.3,
1574
+ "step": 865,
1575
+ "train_speed(iter/s)": 0.003351
1576
+ },
1577
+ {
1578
+ "epoch": 0.8198664477607545,
1579
+ "grad_norm": 0.90625,
1580
+ "learning_rate": 7.799491796346487e-07,
1581
+ "loss": 0.05795277,
1582
+ "memory(GiB)": 75.3,
1583
+ "step": 870,
1584
+ "train_speed(iter/s)": 0.003351
1585
+ },
1586
+ {
1587
+ "epoch": 0.8245783238973106,
1588
+ "grad_norm": 0.8046875,
1589
+ "learning_rate": 7.406752673173851e-07,
1590
+ "loss": 0.05225162,
1591
+ "memory(GiB)": 75.3,
1592
+ "step": 875,
1593
+ "train_speed(iter/s)": 0.003351
1594
+ },
1595
+ {
1596
+ "epoch": 0.8292902000338667,
1597
+ "grad_norm": 0.87890625,
1598
+ "learning_rate": 7.023366763077044e-07,
1599
+ "loss": 0.0509973,
1600
+ "memory(GiB)": 75.3,
1601
+ "step": 880,
1602
+ "train_speed(iter/s)": 0.003351
1603
+ },
1604
+ {
1605
+ "epoch": 0.8340020761704227,
1606
+ "grad_norm": 0.87109375,
1607
+ "learning_rate": 6.649418255225298e-07,
1608
+ "loss": 0.05142277,
1609
+ "memory(GiB)": 75.3,
1610
+ "step": 885,
1611
+ "train_speed(iter/s)": 0.003351
1612
+ },
1613
+ {
1614
+ "epoch": 0.8387139523069788,
1615
+ "grad_norm": 0.95703125,
1616
+ "learning_rate": 6.284989266392805e-07,
1617
+ "loss": 0.05023923,
1618
+ "memory(GiB)": 75.3,
1619
+ "step": 890,
1620
+ "train_speed(iter/s)": 0.003351
1621
+ },
1622
+ {
1623
+ "epoch": 0.8434258284435348,
1624
+ "grad_norm": 0.8828125,
1625
+ "learning_rate": 5.930159822926407e-07,
1626
+ "loss": 0.0534648,
1627
+ "memory(GiB)": 75.3,
1628
+ "step": 895,
1629
+ "train_speed(iter/s)": 0.003351
1630
+ },
1631
+ {
1632
+ "epoch": 0.8481377045800909,
1633
+ "grad_norm": 0.84375,
1634
+ "learning_rate": 5.585007843172286e-07,
1635
+ "loss": 0.05155768,
1636
+ "memory(GiB)": 75.3,
1637
+ "step": 900,
1638
+ "train_speed(iter/s)": 0.003351
1639
+ },
1640
+ {
1641
+ "epoch": 0.852849580716647,
1642
+ "grad_norm": 0.9453125,
1643
+ "learning_rate": 5.249609120365579e-07,
1644
+ "loss": 0.05368913,
1645
+ "memory(GiB)": 75.3,
1646
+ "step": 905,
1647
+ "train_speed(iter/s)": 0.003351
1648
+ },
1649
+ {
1650
+ "epoch": 0.857561456853203,
1651
+ "grad_norm": 0.859375,
1652
+ "learning_rate": 4.924037305986696e-07,
1653
+ "loss": 0.05452033,
1654
+ "memory(GiB)": 75.3,
1655
+ "step": 910,
1656
+ "train_speed(iter/s)": 0.003351
1657
+ },
1658
+ {
1659
+ "epoch": 0.8622733329897591,
1660
+ "grad_norm": 0.8515625,
1661
+ "learning_rate": 4.6083638935878025e-07,
1662
+ "loss": 0.05384221,
1663
+ "memory(GiB)": 75.3,
1664
+ "step": 915,
1665
+ "train_speed(iter/s)": 0.003351
1666
+ },
1667
+ {
1668
+ "epoch": 0.866985209126315,
1669
+ "grad_norm": 0.828125,
1670
+ "learning_rate": 4.302658203093418e-07,
1671
+ "loss": 0.05272598,
1672
+ "memory(GiB)": 75.3,
1673
+ "step": 920,
1674
+ "train_speed(iter/s)": 0.003351
1675
+ },
1676
+ {
1677
+ "epoch": 0.8716970852628712,
1678
+ "grad_norm": 0.8671875,
1679
+ "learning_rate": 4.00698736557808e-07,
1680
+ "loss": 0.05447989,
1681
+ "memory(GiB)": 75.3,
1682
+ "step": 925,
1683
+ "train_speed(iter/s)": 0.003351
1684
+ },
1685
+ {
1686
+ "epoch": 0.8764089613994273,
1687
+ "grad_norm": 0.9453125,
1688
+ "learning_rate": 3.721416308524839e-07,
1689
+ "loss": 0.05123619,
1690
+ "memory(GiB)": 75.3,
1691
+ "step": 930,
1692
+ "train_speed(iter/s)": 0.003351
1693
+ },
1694
+ {
1695
+ "epoch": 0.8811208375359832,
1696
+ "grad_norm": 0.8515625,
1697
+ "learning_rate": 3.4460077415675473e-07,
1698
+ "loss": 0.05347574,
1699
+ "memory(GiB)": 75.3,
1700
+ "step": 935,
1701
+ "train_speed(iter/s)": 0.003351
1702
+ },
1703
+ {
1704
+ "epoch": 0.8858327136725394,
1705
+ "grad_norm": 0.7890625,
1706
+ "learning_rate": 3.1808221427202636e-07,
1707
+ "loss": 0.05334803,
1708
+ "memory(GiB)": 75.3,
1709
+ "step": 940,
1710
+ "train_speed(iter/s)": 0.003351
1711
+ },
1712
+ {
1713
+ "epoch": 0.8905445898090953,
1714
+ "grad_norm": 0.94921875,
1715
+ "learning_rate": 2.925917745096568e-07,
1716
+ "loss": 0.05249671,
1717
+ "memory(GiB)": 75.3,
1718
+ "step": 945,
1719
+ "train_speed(iter/s)": 0.003351
1720
+ },
1721
+ {
1722
+ "epoch": 0.8952564659456514,
1723
+ "grad_norm": 0.91015625,
1724
+ "learning_rate": 2.681350524122045e-07,
1725
+ "loss": 0.05494893,
1726
+ "memory(GiB)": 75.3,
1727
+ "step": 950,
1728
+ "train_speed(iter/s)": 0.003351
1729
+ },
1730
+ {
1731
+ "epoch": 0.8999683420822076,
1732
+ "grad_norm": 0.828125,
1733
+ "learning_rate": 2.447174185242324e-07,
1734
+ "loss": 0.05149726,
1735
+ "memory(GiB)": 75.3,
1736
+ "step": 955,
1737
+ "train_speed(iter/s)": 0.003351
1738
+ },
1739
+ {
1740
+ "epoch": 0.9046802182187635,
1741
+ "grad_norm": 1.0859375,
1742
+ "learning_rate": 2.2234401521297576e-07,
1743
+ "loss": 0.05425293,
1744
+ "memory(GiB)": 75.3,
1745
+ "step": 960,
1746
+ "train_speed(iter/s)": 0.003351
1747
+ },
1748
+ {
1749
+ "epoch": 0.9093920943553196,
1750
+ "grad_norm": 0.87109375,
1751
+ "learning_rate": 2.01019755539108e-07,
1752
+ "loss": 0.0552171,
1753
+ "memory(GiB)": 75.3,
1754
+ "step": 965,
1755
+ "train_speed(iter/s)": 0.003351
1756
+ },
1757
+ {
1758
+ "epoch": 0.9141039704918758,
1759
+ "grad_norm": 0.84375,
1760
+ "learning_rate": 1.8074932217786445e-07,
1761
+ "loss": 0.05237709,
1762
+ "memory(GiB)": 75.3,
1763
+ "step": 970,
1764
+ "train_speed(iter/s)": 0.003351
1765
+ },
1766
+ {
1767
+ "epoch": 0.9188158466284317,
1768
+ "grad_norm": 0.86328125,
1769
+ "learning_rate": 1.6153716639075223e-07,
1770
+ "loss": 0.05221198,
1771
+ "memory(GiB)": 75.3,
1772
+ "step": 975,
1773
+ "train_speed(iter/s)": 0.003351
1774
+ },
1775
+ {
1776
+ "epoch": 0.9235277227649878,
1777
+ "grad_norm": 0.8125,
1778
+ "learning_rate": 1.433875070480878e-07,
1779
+ "loss": 0.05134506,
1780
+ "memory(GiB)": 75.3,
1781
+ "step": 980,
1782
+ "train_speed(iter/s)": 0.003352
1783
+ },
1784
+ {
1785
+ "epoch": 0.9282395989015438,
1786
+ "grad_norm": 0.890625,
1787
+ "learning_rate": 1.2630432970255014e-07,
1788
+ "loss": 0.05436495,
1789
+ "memory(GiB)": 75.3,
1790
+ "step": 985,
1791
+ "train_speed(iter/s)": 0.003352
1792
+ },
1793
+ {
1794
+ "epoch": 0.9329514750380999,
1795
+ "grad_norm": 0.921875,
1796
+ "learning_rate": 1.1029138571398645e-07,
1797
+ "loss": 0.05440986,
1798
+ "memory(GiB)": 75.3,
1799
+ "step": 990,
1800
+ "train_speed(iter/s)": 0.003352
1801
+ },
1802
+ {
1803
+ "epoch": 0.937663351174656,
1804
+ "grad_norm": 0.91796875,
1805
+ "learning_rate": 9.535219142563168e-08,
1806
+ "loss": 0.05418127,
1807
+ "memory(GiB)": 75.3,
1808
+ "step": 995,
1809
+ "train_speed(iter/s)": 0.003352
1810
+ },
1811
+ {
1812
+ "epoch": 0.942375227311212,
1813
+ "grad_norm": 0.8984375,
1814
+ "learning_rate": 8.149002739194222e-08,
1815
+ "loss": 0.05519557,
1816
+ "memory(GiB)": 75.3,
1817
+ "step": 1000,
1818
+ "train_speed(iter/s)": 0.003352
1819
  }
1820
  ],
1821
  "logging_steps": 5,
 
1835
  "attributes": {}
1836
  }
1837
  },
1838
+ "total_flos": 5.440049406181114e+18,
1839
  "train_batch_size": 2,
1840
  "trial_name": null,
1841
  "trial_params": null