farmery committed (verified)
Commit 0943054 · 1 Parent(s): 286d253

Training in progress, step 100, checkpoint

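The files changed below are the standard artifacts that a Hugging Face `transformers` `Trainer` writes into a checkpoint directory (here `last-checkpoint/`, refreshed at step 100): the PEFT adapter weights, optimizer and LR-scheduler state, per-process RNG states, and the `trainer_state.json` log. As a minimal sketch of how such a checkpoint is consumed, assuming the repository is cloned locally and that `model`, `training_args`, and `train_dataset` are defined elsewhere (they are not part of this commit), training could be resumed from it like so:

```python
# Minimal sketch: resume a Trainer run from the checkpoint directory in this commit.
# `model`, `training_args`, and `train_dataset` are assumed to be defined elsewhere.
from transformers import Trainer

trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset)

# resume_from_checkpoint reloads optimizer.pt, scheduler.pt, the rng_state_*.pth
# files, and trainer_state.json, so the run continues from global_step 100.
trainer.train(resume_from_checkpoint="last-checkpoint")
```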
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:e1ab32ae0d4399cb012133bb11b6e9faa79b6a3c6703a480647c30a55b6d855e
+ oid sha256:18fc12de5654e0323293b5dd1ac28814fb994a9658fb252a535bc41258c43b46
  size 13587864
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:3cf790de50f1c78d7f431983db6a779b7c0901de7d7a91f1d6726c03b25151f0
+ oid sha256:d024d96a36897f201c8fa7589476419b4311211f4a208ffc2d9a459bb98de426
  size 27273018
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:a67f0c9d18949d6ab4ca34299948b93c64d14bb0937f0d840c45b02ff0c13c29
+ oid sha256:a0e862751a4e0db817da07bf59527ac4af8750f08140f75db069d784ab7ae078
  size 15024
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:572e5528001fbc97395c4945b42d8240949671b65b7cc8db72e1abea7be6841b
+ oid sha256:c5b2961bbcc1830d54960d6ae6bf069daa40b1ae66346ee47b349dc68099d8ee
  size 15024
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:faa85973f54e4029ff03e236c546a15786b9d3895a290dcafb3c27938c85ea80
+ oid sha256:3d9faa3e71f84c7c46366efe1c0a99a95347d103fa5203e0f64b50cc02d8b031
  size 15024
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:080c6565aee455008a75927fca0b75f89d095ce4b02c008c692fdfd4864ad224
+ oid sha256:831873b8ec47d858edb49aabcab94a7eded7500ba341e5a8727949745145665c
  size 15024
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:e443da8b3fe54273fec3c28cdbc4ca0128af804b14bea3913a81333231fc0282
+ oid sha256:f5153517b1f7cd2d77a0f7948024c5ad6d674c6b3cf70e1337dc0d8c2248ef01
  size 1064
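Each binary file above is tracked with Git LFS, so the repository stores only a small pointer (`version` / `oid sha256:` / `size`); the diffs therefore show just the `oid` changing while `size` stays constant, as expected when the tensor shapes do not change between steps. The `oid` is the SHA-256 of the real file contents, which can be verified locally once the objects are fetched (e.g. with `git lfs pull`); a small sketch:

```python
# Sketch: check that a pulled LFS object matches the oid recorded in its pointer.
import hashlib

with open("last-checkpoint/adapter_model.safetensors", "rb") as f:
    digest = hashlib.sha256(f.read()).hexdigest()

# The expected value is the new oid shown in the diff above.
print(digest == "18fc12de5654e0323293b5dd1ac28814fb994a9658fb252a535bc41258c43b46")
```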
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
  {
- "best_metric": 0.3283706307411194,
- "best_model_checkpoint": "miner_id_24/checkpoint-50",
- "epoch": 0.1498618461106168,
+ "best_metric": 0.14205443859100342,
+ "best_model_checkpoint": "miner_id_24/checkpoint-100",
+ "epoch": 0.2997236922212336,
  "eval_steps": 25,
- "global_step": 50,
+ "global_step": 100,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
@@ -381,6 +381,372 @@
  "eval_samples_per_second": 173.489,
  "eval_steps_per_second": 45.107,
  "step": 50
+ },
+ {
+ "epoch": 0.1528590830328291,
+ "grad_norm": 4.228235721588135,
+ "learning_rate": 0.0002984781941052967,
+ "loss": 1.0819,
+ "step": 51
+ },
+ {
+ "epoch": 0.15585631995504146,
+ "grad_norm": 3.424924612045288,
+ "learning_rate": 0.0002983786296793692,
+ "loss": 0.5749,
+ "step": 52
+ },
+ {
+ "epoch": 0.15885355687725378,
+ "grad_norm": 3.9904696941375732,
+ "learning_rate": 0.00029827593027084546,
+ "loss": 0.5831,
+ "step": 53
+ },
+ {
+ "epoch": 0.16185079379946612,
+ "grad_norm": 2.694119930267334,
+ "learning_rate": 0.00029817009829361196,
+ "loss": 0.4457,
+ "step": 54
+ },
+ {
+ "epoch": 0.16484803072167845,
+ "grad_norm": 1.2387803792953491,
+ "learning_rate": 0.00029806113623518407,
+ "loss": 0.3431,
+ "step": 55
+ },
+ {
+ "epoch": 0.1678452676438908,
+ "grad_norm": 1.6821039915084839,
+ "learning_rate": 0.0002979490466566481,
+ "loss": 0.3905,
+ "step": 56
+ },
+ {
+ "epoch": 0.17084250456610311,
+ "grad_norm": 0.6684949994087219,
+ "learning_rate": 0.00029783383219260037,
+ "loss": 0.3056,
+ "step": 57
+ },
+ {
+ "epoch": 0.17383974148831546,
+ "grad_norm": 0.7967026233673096,
+ "learning_rate": 0.0002977154955510861,
+ "loss": 0.3333,
+ "step": 58
+ },
+ {
+ "epoch": 0.17683697841052778,
+ "grad_norm": 0.5187807083129883,
+ "learning_rate": 0.0002975940395135351,
+ "loss": 0.2867,
+ "step": 59
+ },
+ {
+ "epoch": 0.17983421533274013,
+ "grad_norm": 7.2272138595581055,
+ "learning_rate": 0.00029746946693469693,
+ "loss": 1.0056,
+ "step": 60
+ },
+ {
+ "epoch": 0.18283145225495245,
+ "grad_norm": 8.9348726272583,
+ "learning_rate": 0.00029734178074257325,
+ "loss": 0.7786,
+ "step": 61
+ },
+ {
+ "epoch": 0.1858286891771648,
+ "grad_norm": 8.19704532623291,
+ "learning_rate": 0.0002972109839383494,
+ "loss": 0.3928,
+ "step": 62
+ },
+ {
+ "epoch": 0.18882592609937715,
+ "grad_norm": 4.3927483558654785,
+ "learning_rate": 0.00029707707959632386,
+ "loss": 0.7528,
+ "step": 63
+ },
+ {
+ "epoch": 0.19182316302158947,
+ "grad_norm": 2.271043539047241,
+ "learning_rate": 0.0002969400708638358,
+ "loss": 0.4877,
+ "step": 64
+ },
+ {
+ "epoch": 0.19482039994380182,
+ "grad_norm": 1.3864595890045166,
+ "learning_rate": 0.000296799960961191,
+ "loss": 0.2625,
+ "step": 65
+ },
+ {
+ "epoch": 0.19781763686601414,
+ "grad_norm": 2.2087323665618896,
+ "learning_rate": 0.00029665675318158656,
+ "loss": 0.2945,
+ "step": 66
+ },
+ {
+ "epoch": 0.2008148737882265,
+ "grad_norm": 3.766403913497925,
+ "learning_rate": 0.00029651045089103316,
+ "loss": 0.3807,
+ "step": 67
+ },
+ {
+ "epoch": 0.2038121107104388,
+ "grad_norm": 2.598832368850708,
+ "learning_rate": 0.0002963610575282762,
+ "loss": 0.3149,
+ "step": 68
+ },
+ {
+ "epoch": 0.20680934763265116,
+ "grad_norm": 0.677237331867218,
+ "learning_rate": 0.0002962085766047146,
+ "loss": 0.346,
+ "step": 69
+ },
+ {
+ "epoch": 0.20980658455486348,
+ "grad_norm": 0.5142577886581421,
+ "learning_rate": 0.00029605301170431867,
+ "loss": 0.2855,
+ "step": 70
+ },
+ {
+ "epoch": 0.21280382147707583,
+ "grad_norm": 0.5518949031829834,
+ "learning_rate": 0.00029589436648354566,
+ "loss": 0.3163,
+ "step": 71
+ },
+ {
+ "epoch": 0.21580105839928815,
+ "grad_norm": 0.336823046207428,
+ "learning_rate": 0.00029573264467125377,
+ "loss": 0.16,
+ "step": 72
+ },
+ {
+ "epoch": 0.2187982953215005,
+ "grad_norm": 0.2474360167980194,
+ "learning_rate": 0.0002955678500686147,
+ "loss": 0.0297,
+ "step": 73
+ },
+ {
+ "epoch": 0.22179553224371282,
+ "grad_norm": 0.18458165228366852,
+ "learning_rate": 0.0002953999865490242,
+ "loss": 0.0609,
+ "step": 74
+ },
+ {
+ "epoch": 0.22479276916592517,
+ "grad_norm": 0.36120983958244324,
+ "learning_rate": 0.0002952290580580109,
+ "loss": 0.0862,
+ "step": 75
+ },
+ {
+ "epoch": 0.22479276916592517,
+ "eval_loss": 0.16452732682228088,
+ "eval_runtime": 0.2888,
+ "eval_samples_per_second": 173.11,
+ "eval_steps_per_second": 45.009,
+ "step": 75
+ },
+ {
+ "epoch": 0.2277900060881375,
+ "grad_norm": 1.3333512544631958,
+ "learning_rate": 0.0002950550686131438,
+ "loss": 0.6146,
+ "step": 76
+ },
+ {
+ "epoch": 0.23078724301034984,
+ "grad_norm": 1.2993559837341309,
+ "learning_rate": 0.00029487802230393777,
+ "loss": 0.2574,
+ "step": 77
+ },
+ {
+ "epoch": 0.23378447993256216,
+ "grad_norm": 1.2781016826629639,
+ "learning_rate": 0.00029469792329175725,
+ "loss": 0.2978,
+ "step": 78
+ },
+ {
+ "epoch": 0.2367817168547745,
+ "grad_norm": 5.145886421203613,
+ "learning_rate": 0.0002945147758097187,
+ "loss": 0.3251,
+ "step": 79
+ },
+ {
+ "epoch": 0.23977895377698685,
+ "grad_norm": 5.573575019836426,
+ "learning_rate": 0.00029432858416259097,
+ "loss": 0.3483,
+ "step": 80
+ },
+ {
+ "epoch": 0.24277619069919917,
+ "grad_norm": 2.6032469272613525,
+ "learning_rate": 0.0002941393527266941,
+ "loss": 0.306,
+ "step": 81
+ },
+ {
+ "epoch": 0.24577342762141152,
+ "grad_norm": 0.6271111965179443,
+ "learning_rate": 0.00029394708594979657,
+ "loss": 0.318,
+ "step": 82
+ },
+ {
+ "epoch": 0.24877066454362384,
+ "grad_norm": 0.5439050793647766,
+ "learning_rate": 0.0002937517883510106,
+ "loss": 0.2547,
+ "step": 83
+ },
+ {
+ "epoch": 0.25176790146583616,
+ "grad_norm": 0.5188155770301819,
+ "learning_rate": 0.0002935534645206861,
+ "loss": 0.2402,
+ "step": 84
+ },
+ {
+ "epoch": 0.25476513838804854,
+ "grad_norm": 1.3832889795303345,
+ "learning_rate": 0.00029335211912030247,
+ "loss": 0.147,
+ "step": 85
+ },
+ {
+ "epoch": 0.25776237531026086,
+ "grad_norm": 0.20522421598434448,
+ "learning_rate": 0.0002931477568823596,
+ "loss": 0.0365,
+ "step": 86
+ },
+ {
+ "epoch": 0.2607596122324732,
+ "grad_norm": 0.07317493855953217,
+ "learning_rate": 0.00029294038261026595,
+ "loss": 0.0178,
+ "step": 87
+ },
+ {
+ "epoch": 0.2637568491546855,
+ "grad_norm": 5.754029273986816,
+ "learning_rate": 0.0002927300011782263,
+ "loss": 0.5049,
+ "step": 88
+ },
+ {
+ "epoch": 0.2667540860768979,
+ "grad_norm": 1.9069617986679077,
+ "learning_rate": 0.0002925166175311266,
+ "loss": 0.297,
+ "step": 89
+ },
+ {
+ "epoch": 0.2697513229991102,
+ "grad_norm": 1.4803589582443237,
+ "learning_rate": 0.0002923002366844182,
+ "loss": 0.2419,
+ "step": 90
+ },
+ {
+ "epoch": 0.2727485599213225,
+ "grad_norm": 1.1726033687591553,
+ "learning_rate": 0.0002920808637239998,
+ "loss": 0.2449,
+ "step": 91
+ },
+ {
+ "epoch": 0.27574579684353484,
+ "grad_norm": 1.1483900547027588,
+ "learning_rate": 0.00029185850380609757,
+ "loss": 0.2845,
+ "step": 92
+ },
+ {
+ "epoch": 0.2787430337657472,
+ "grad_norm": 0.6498438715934753,
+ "learning_rate": 0.00029163316215714477,
+ "loss": 0.3168,
+ "step": 93
+ },
+ {
+ "epoch": 0.28174027068795954,
+ "grad_norm": 3.8744821548461914,
+ "learning_rate": 0.00029140484407365807,
+ "loss": 0.3098,
+ "step": 94
+ },
+ {
+ "epoch": 0.28473750761017186,
+ "grad_norm": 5.238924026489258,
+ "learning_rate": 0.00029117355492211345,
+ "loss": 0.3747,
+ "step": 95
+ },
+ {
+ "epoch": 0.28773474453238423,
+ "grad_norm": 4.835148334503174,
+ "learning_rate": 0.0002909393001388201,
+ "loss": 0.311,
+ "step": 96
+ },
+ {
+ "epoch": 0.29073198145459656,
+ "grad_norm": 3.1126749515533447,
+ "learning_rate": 0.00029070208522979246,
+ "loss": 0.1933,
+ "step": 97
+ },
+ {
+ "epoch": 0.2937292183768089,
+ "grad_norm": 0.31741341948509216,
+ "learning_rate": 0.000290461915770621,
+ "loss": 0.0311,
+ "step": 98
+ },
+ {
+ "epoch": 0.2967264552990212,
+ "grad_norm": 0.13816803693771362,
+ "learning_rate": 0.00029021879740634106,
+ "loss": 0.0489,
+ "step": 99
+ },
+ {
+ "epoch": 0.2997236922212336,
+ "grad_norm": 0.16050726175308228,
+ "learning_rate": 0.0002899727358513002,
+ "loss": 0.0421,
+ "step": 100
+ },
+ {
+ "epoch": 0.2997236922212336,
+ "eval_loss": 0.14205443859100342,
+ "eval_runtime": 0.2881,
+ "eval_samples_per_second": 173.58,
+ "eval_steps_per_second": 45.131,
+ "step": 100
  }
  ],
  "logging_steps": 1,
@@ -409,7 +775,7 @@
  "attributes": {}
  }
  },
- "total_flos": 2.296200128626688e+16,
+ "total_flos": 4.592226717086515e+16,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null