lesso03 committed on
Commit d7f1ed8
1 Parent(s): a0ae6dd

Training in progress, step 100, checkpoint

last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:dc7b8a29868bb0447446ccb9650ddc7eda5c03c84a70fba741571b4d237834df
+ oid sha256:107d672c18b8a62f5938a854dbb1b4aef7780b39f2889fe38f142b72df38458a
  size 2269195160
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:b793ba7698e1641fdc1e8ef8a7660057c32ec1357fd24f1cb5a8bb75e74f3b25
+ oid sha256:d9d267eb13fafbb79e310d1608520391a110c5100b89224270b9ae67b19a9e8a
  size 335922386
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:07899f311009253221ecb7c812577e712b3036bf5bf7d033d3f1db6da6056cbf
+ oid sha256:938f3c5b1e74025dea7f7ecdfc1c134ecd27894e7d032a9550d538ee63172b38
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:a89ffc445067fef9d6d02bb3ff9e61d5e3209e6fa67c7259b3b364b90dbaa2cd
+ oid sha256:49d60a69e2379be2053e816cbaff31e6c931b5922dd86c71c9eaf473299cbf62
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
  {
  "best_metric": null,
  "best_model_checkpoint": null,
- "epoch": 0.09082652134423251,
+ "epoch": 0.18165304268846502,
  "eval_steps": 9,
- "global_step": 50,
+ "global_step": 100,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
@@ -405,6 +405,404 @@
  "learning_rate": 5.868240888334653e-05,
  "loss": 0.2474,
  "step": 50
+ },
+ {
+ "epoch": 0.09264305177111716,
+ "grad_norm": 0.31370481848716736,
+ "learning_rate": 5.695865504800327e-05,
+ "loss": 0.2895,
+ "step": 51
+ },
+ {
+ "epoch": 0.09445958219800181,
+ "grad_norm": 0.26035645604133606,
+ "learning_rate": 5.522642316338268e-05,
+ "loss": 0.2501,
+ "step": 52
+ },
+ {
+ "epoch": 0.09627611262488647,
+ "grad_norm": 0.3116852343082428,
+ "learning_rate": 5.348782368720626e-05,
+ "loss": 0.2527,
+ "step": 53
+ },
+ {
+ "epoch": 0.09809264305177112,
+ "grad_norm": 0.2944983243942261,
+ "learning_rate": 5.174497483512506e-05,
+ "loss": 0.2754,
+ "step": 54
+ },
+ {
+ "epoch": 0.09809264305177112,
+ "eval_loss": 0.27958133816719055,
+ "eval_runtime": 108.6945,
+ "eval_samples_per_second": 4.269,
+ "eval_steps_per_second": 0.534,
+ "step": 54
+ },
+ {
+ "epoch": 0.09990917347865577,
+ "grad_norm": 0.350281685590744,
+ "learning_rate": 5e-05,
+ "loss": 0.2596,
+ "step": 55
+ },
+ {
+ "epoch": 0.10172570390554042,
+ "grad_norm": 0.33483561873435974,
+ "learning_rate": 4.825502516487497e-05,
+ "loss": 0.3078,
+ "step": 56
+ },
+ {
+ "epoch": 0.10354223433242507,
+ "grad_norm": 0.28896334767341614,
+ "learning_rate": 4.6512176312793736e-05,
+ "loss": 0.2864,
+ "step": 57
+ },
+ {
+ "epoch": 0.10535876475930972,
+ "grad_norm": 0.27719321846961975,
+ "learning_rate": 4.477357683661734e-05,
+ "loss": 0.2958,
+ "step": 58
+ },
+ {
+ "epoch": 0.10717529518619437,
+ "grad_norm": 0.31106963753700256,
+ "learning_rate": 4.3041344951996746e-05,
+ "loss": 0.2851,
+ "step": 59
+ },
+ {
+ "epoch": 0.10899182561307902,
+ "grad_norm": 0.253791481256485,
+ "learning_rate": 4.131759111665349e-05,
+ "loss": 0.2168,
+ "step": 60
+ },
+ {
+ "epoch": 0.11080835603996367,
+ "grad_norm": 0.37355419993400574,
+ "learning_rate": 3.960441545911204e-05,
+ "loss": 0.3131,
+ "step": 61
+ },
+ {
+ "epoch": 0.11262488646684832,
+ "grad_norm": 0.40072622895240784,
+ "learning_rate": 3.790390522001662e-05,
+ "loss": 0.3205,
+ "step": 62
+ },
+ {
+ "epoch": 0.11444141689373297,
+ "grad_norm": 0.27985572814941406,
+ "learning_rate": 3.6218132209150045e-05,
+ "loss": 0.2427,
+ "step": 63
+ },
+ {
+ "epoch": 0.11444141689373297,
+ "eval_loss": 0.27756959199905396,
+ "eval_runtime": 108.6448,
+ "eval_samples_per_second": 4.271,
+ "eval_steps_per_second": 0.534,
+ "step": 63
+ },
+ {
+ "epoch": 0.11625794732061762,
+ "grad_norm": 0.28662416338920593,
+ "learning_rate": 3.4549150281252636e-05,
+ "loss": 0.3106,
+ "step": 64
+ },
+ {
+ "epoch": 0.11807447774750227,
+ "grad_norm": 0.29447686672210693,
+ "learning_rate": 3.289899283371657e-05,
+ "loss": 0.2368,
+ "step": 65
+ },
+ {
+ "epoch": 0.11989100817438691,
+ "grad_norm": 0.2652858793735504,
+ "learning_rate": 3.12696703292044e-05,
+ "loss": 0.2319,
+ "step": 66
+ },
+ {
+ "epoch": 0.12170753860127158,
+ "grad_norm": 0.29174622893333435,
+ "learning_rate": 2.9663167846209998e-05,
+ "loss": 0.3058,
+ "step": 67
+ },
+ {
+ "epoch": 0.12352406902815623,
+ "grad_norm": 0.30116814374923706,
+ "learning_rate": 2.8081442660546125e-05,
+ "loss": 0.2818,
+ "step": 68
+ },
+ {
+ "epoch": 0.12534059945504086,
+ "grad_norm": 0.30650588870048523,
+ "learning_rate": 2.6526421860705473e-05,
+ "loss": 0.2743,
+ "step": 69
+ },
+ {
+ "epoch": 0.1271571298819255,
+ "grad_norm": 0.27553582191467285,
+ "learning_rate": 2.500000000000001e-05,
+ "loss": 0.2155,
+ "step": 70
+ },
+ {
+ "epoch": 0.12897366030881016,
+ "grad_norm": 0.2803662121295929,
+ "learning_rate": 2.350403678833976e-05,
+ "loss": 0.2782,
+ "step": 71
+ },
+ {
+ "epoch": 0.1307901907356948,
+ "grad_norm": 0.26244497299194336,
+ "learning_rate": 2.2040354826462668e-05,
+ "loss": 0.1971,
+ "step": 72
+ },
+ {
+ "epoch": 0.1307901907356948,
+ "eval_loss": 0.2747555673122406,
+ "eval_runtime": 108.6235,
+ "eval_samples_per_second": 4.272,
+ "eval_steps_per_second": 0.534,
+ "step": 72
+ },
+ {
+ "epoch": 0.13260672116257946,
+ "grad_norm": 0.29561930894851685,
+ "learning_rate": 2.061073738537635e-05,
+ "loss": 0.3123,
+ "step": 73
+ },
+ {
+ "epoch": 0.1344232515894641,
+ "grad_norm": 0.3003482520580292,
+ "learning_rate": 1.9216926233717085e-05,
+ "loss": 0.3104,
+ "step": 74
+ },
+ {
+ "epoch": 0.1362397820163488,
+ "grad_norm": 0.2806616425514221,
+ "learning_rate": 1.7860619515673033e-05,
+ "loss": 0.2908,
+ "step": 75
+ },
+ {
+ "epoch": 0.13805631244323344,
+ "grad_norm": 0.331093430519104,
+ "learning_rate": 1.6543469682057106e-05,
+ "loss": 0.2559,
+ "step": 76
+ },
+ {
+ "epoch": 0.13987284287011809,
+ "grad_norm": 0.2987718880176544,
+ "learning_rate": 1.526708147705013e-05,
+ "loss": 0.2594,
+ "step": 77
+ },
+ {
+ "epoch": 0.14168937329700274,
+ "grad_norm": 0.2768915593624115,
+ "learning_rate": 1.4033009983067452e-05,
+ "loss": 0.274,
+ "step": 78
+ },
+ {
+ "epoch": 0.14350590372388738,
+ "grad_norm": 0.28360041975975037,
+ "learning_rate": 1.2842758726130283e-05,
+ "loss": 0.2638,
+ "step": 79
+ },
+ {
+ "epoch": 0.14532243415077203,
+ "grad_norm": 0.28000327944755554,
+ "learning_rate": 1.1697777844051105e-05,
+ "loss": 0.2937,
+ "step": 80
+ },
+ {
+ "epoch": 0.14713896457765668,
+ "grad_norm": 0.3231543004512787,
+ "learning_rate": 1.0599462319663905e-05,
+ "loss": 0.2612,
+ "step": 81
+ },
+ {
+ "epoch": 0.14713896457765668,
+ "eval_loss": 0.27281635999679565,
+ "eval_runtime": 108.6714,
+ "eval_samples_per_second": 4.27,
+ "eval_steps_per_second": 0.534,
+ "step": 81
+ },
+ {
+ "epoch": 0.14895549500454133,
+ "grad_norm": 0.352360337972641,
+ "learning_rate": 9.549150281252633e-06,
+ "loss": 0.2377,
+ "step": 82
+ },
+ {
+ "epoch": 0.15077202543142598,
+ "grad_norm": 0.29913556575775146,
+ "learning_rate": 8.548121372247918e-06,
+ "loss": 0.297,
+ "step": 83
+ },
+ {
+ "epoch": 0.15258855585831063,
+ "grad_norm": 0.3100139796733856,
+ "learning_rate": 7.597595192178702e-06,
+ "loss": 0.3254,
+ "step": 84
+ },
+ {
+ "epoch": 0.15440508628519528,
+ "grad_norm": 0.2626723051071167,
+ "learning_rate": 6.698729810778065e-06,
+ "loss": 0.2542,
+ "step": 85
+ },
+ {
+ "epoch": 0.15622161671207993,
+ "grad_norm": 0.2944123148918152,
+ "learning_rate": 5.852620357053651e-06,
+ "loss": 0.3606,
+ "step": 86
+ },
+ {
+ "epoch": 0.15803814713896458,
+ "grad_norm": 0.34874227643013,
+ "learning_rate": 5.060297685041659e-06,
+ "loss": 0.3298,
+ "step": 87
+ },
+ {
+ "epoch": 0.15985467756584923,
+ "grad_norm": 0.3021911680698395,
+ "learning_rate": 4.322727117869951e-06,
+ "loss": 0.356,
+ "step": 88
+ },
+ {
+ "epoch": 0.16167120799273388,
+ "grad_norm": 0.26396703720092773,
+ "learning_rate": 3.6408072716606346e-06,
+ "loss": 0.2144,
+ "step": 89
+ },
+ {
+ "epoch": 0.16348773841961853,
+ "grad_norm": 0.29361093044281006,
+ "learning_rate": 3.0153689607045845e-06,
+ "loss": 0.2807,
+ "step": 90
+ },
+ {
+ "epoch": 0.16348773841961853,
+ "eval_loss": 0.2723295986652374,
+ "eval_runtime": 108.6297,
+ "eval_samples_per_second": 4.271,
+ "eval_steps_per_second": 0.534,
+ "step": 90
+ },
+ {
+ "epoch": 0.16530426884650318,
+ "grad_norm": 0.2374623417854309,
+ "learning_rate": 2.4471741852423237e-06,
+ "loss": 0.2156,
+ "step": 91
+ },
+ {
+ "epoch": 0.16712079927338783,
+ "grad_norm": 0.26733702421188354,
+ "learning_rate": 1.9369152030840556e-06,
+ "loss": 0.2058,
+ "step": 92
+ },
+ {
+ "epoch": 0.16893732970027248,
+ "grad_norm": 0.2634991407394409,
+ "learning_rate": 1.4852136862001764e-06,
+ "loss": 0.2776,
+ "step": 93
+ },
+ {
+ "epoch": 0.17075386012715713,
+ "grad_norm": 0.2750228941440582,
+ "learning_rate": 1.0926199633097157e-06,
+ "loss": 0.2635,
+ "step": 94
+ },
+ {
+ "epoch": 0.17257039055404177,
+ "grad_norm": 0.2913649380207062,
+ "learning_rate": 7.596123493895991e-07,
+ "loss": 0.2466,
+ "step": 95
+ },
+ {
+ "epoch": 0.17438692098092642,
+ "grad_norm": 0.30093085765838623,
+ "learning_rate": 4.865965629214819e-07,
+ "loss": 0.2855,
+ "step": 96
+ },
+ {
+ "epoch": 0.17620345140781107,
+ "grad_norm": 0.32675302028656006,
+ "learning_rate": 2.7390523158633554e-07,
+ "loss": 0.2625,
+ "step": 97
+ },
+ {
+ "epoch": 0.17801998183469572,
+ "grad_norm": 0.2559075653553009,
+ "learning_rate": 1.2179748700879012e-07,
+ "loss": 0.26,
+ "step": 98
+ },
+ {
+ "epoch": 0.17983651226158037,
+ "grad_norm": 0.3088376224040985,
+ "learning_rate": 3.04586490452119e-08,
+ "loss": 0.298,
+ "step": 99
+ },
+ {
+ "epoch": 0.17983651226158037,
+ "eval_loss": 0.27232545614242554,
+ "eval_runtime": 108.5508,
+ "eval_samples_per_second": 4.274,
+ "eval_steps_per_second": 0.534,
+ "step": 99
+ },
+ {
+ "epoch": 0.18165304268846502,
+ "grad_norm": 0.28909745812416077,
+ "learning_rate": 0.0,
+ "loss": 0.2986,
+ "step": 100
  }
  ],
  "logging_steps": 1,
@@ -419,12 +817,12 @@
  "should_evaluate": false,
  "should_log": false,
  "should_save": true,
- "should_training_stop": false
+ "should_training_stop": true
  },
  "attributes": {}
  }
  },
- "total_flos": 7.19631055627223e+16,
+ "total_flos": 1.4281337959612416e+17,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null