fats-fme committed on
Commit
a84de21
1 Parent(s): f6e947b

Training in progress, step 100, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c3e5746d8612d58aa433130f0aeb579c3061ae38b1b63c9eb6d2be91ccd8b24e
3
  size 767856
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f60d2e9ae32d1985aa65462862ca4b70c3a8998e596c04a5f721ece25854dbe1
3
  size 767856
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:eca9c1fba4cd082c756afab056990553c73a5bd9c8c2fae920d24d4a5fb412fd
3
  size 1601338
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:82f30fd5719e050daf7167fa83b2a9cab144fb5fbf8a1cecc61e1b04d9d5a001
3
  size 1601338
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c0bb32afa8aaa96706ed9a0dfe3f617053594b0e6c9166e271de0fa133a9172f
3
  size 14512
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:486cd21b30f7f4b647912779f0f99f9cd55f4d5bc4247b3ed077084036f1e10a
3
  size 14512
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e07c0666cbc8d2ed66161e7b2aa4fe92e46d99690618326abce6840e2777020c
3
  size 14512
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:19ca30fccc09c76ea95914a475456a7ee2561fa696312099871f01293924d014
3
  size 14512
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:940d423240b39d966113615a1fba0e170b7aa70deeb57e606975b0bf165e01d8
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ba41b4d003bf4f315ef5ca1f1b9201e78a8c869e239253754ed9ce404d87fd4f
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.2526847757422615,
5
  "eval_steps": 50,
6
- "global_step": 50,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -373,6 +373,364 @@
373
  "eval_samples_per_second": 39.139,
374
  "eval_steps_per_second": 9.843,
375
  "step": 50
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
376
  }
377
  ],
378
  "logging_steps": 1,
@@ -392,7 +750,7 @@
392
  "attributes": {}
393
  }
394
  },
395
- "total_flos": 56810370760704.0,
396
  "train_batch_size": 2,
397
  "trial_name": null,
398
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.505369551484523,
5
  "eval_steps": 50,
6
+ "global_step": 100,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
373
  "eval_samples_per_second": 39.139,
374
  "eval_steps_per_second": 9.843,
375
  "step": 50
376
+ },
377
+ {
378
+ "epoch": 0.25773847125710675,
379
+ "grad_norm": 2.438196897506714,
380
+ "learning_rate": 9.99885820390154e-05,
381
+ "loss": 9.9308,
382
+ "step": 51
383
+ },
384
+ {
385
+ "epoch": 0.262792166771952,
386
+ "grad_norm": 2.2800676822662354,
387
+ "learning_rate": 9.995433337085491e-05,
388
+ "loss": 9.7762,
389
+ "step": 52
390
+ },
391
+ {
392
+ "epoch": 0.26784586228679724,
393
+ "grad_norm": 2.2999722957611084,
394
+ "learning_rate": 9.989726963751682e-05,
395
+ "loss": 9.6314,
396
+ "step": 53
397
+ },
398
+ {
399
+ "epoch": 0.27289955780164243,
400
+ "grad_norm": 2.0864381790161133,
401
+ "learning_rate": 9.981741690106034e-05,
402
+ "loss": 9.5644,
403
+ "step": 54
404
+ },
405
+ {
406
+ "epoch": 0.2779532533164877,
407
+ "grad_norm": 1.9628472328186035,
408
+ "learning_rate": 9.971481163170268e-05,
409
+ "loss": 9.6852,
410
+ "step": 55
411
+ },
412
+ {
413
+ "epoch": 0.2830069488313329,
414
+ "grad_norm": 2.0100128650665283,
415
+ "learning_rate": 9.95895006911623e-05,
416
+ "loss": 9.532,
417
+ "step": 56
418
+ },
419
+ {
420
+ "epoch": 0.2880606443461781,
421
+ "grad_norm": 1.903132438659668,
422
+ "learning_rate": 9.944154131125642e-05,
423
+ "loss": 9.5968,
424
+ "step": 57
425
+ },
426
+ {
427
+ "epoch": 0.29311433986102337,
428
+ "grad_norm": 1.8894416093826294,
429
+ "learning_rate": 9.927100106776212e-05,
430
+ "loss": 9.45,
431
+ "step": 58
432
+ },
433
+ {
434
+ "epoch": 0.2981680353758686,
435
+ "grad_norm": 1.8624144792556763,
436
+ "learning_rate": 9.907795784955327e-05,
437
+ "loss": 9.462,
438
+ "step": 59
439
+ },
440
+ {
441
+ "epoch": 0.30322173089071386,
442
+ "grad_norm": 1.88225519657135,
443
+ "learning_rate": 9.88624998230272e-05,
444
+ "loss": 9.3977,
445
+ "step": 60
446
+ },
447
+ {
448
+ "epoch": 0.30827542640555905,
449
+ "grad_norm": 1.9469467401504517,
450
+ "learning_rate": 9.862472539183756e-05,
451
+ "loss": 9.2517,
452
+ "step": 61
453
+ },
454
+ {
455
+ "epoch": 0.3133291219204043,
456
+ "grad_norm": 1.6683619022369385,
457
+ "learning_rate": 9.836474315195147e-05,
458
+ "loss": 9.4305,
459
+ "step": 62
460
+ },
461
+ {
462
+ "epoch": 0.31838281743524954,
463
+ "grad_norm": 1.663127064704895,
464
+ "learning_rate": 9.808267184205183e-05,
465
+ "loss": 9.2934,
466
+ "step": 63
467
+ },
468
+ {
469
+ "epoch": 0.32343651295009473,
470
+ "grad_norm": 1.6440531015396118,
471
+ "learning_rate": 9.777864028930705e-05,
472
+ "loss": 9.3013,
473
+ "step": 64
474
+ },
475
+ {
476
+ "epoch": 0.32849020846494,
477
+ "grad_norm": 1.6826865673065186,
478
+ "learning_rate": 9.745278735053343e-05,
479
+ "loss": 9.2594,
480
+ "step": 65
481
+ },
482
+ {
483
+ "epoch": 0.33354390397978523,
484
+ "grad_norm": 1.596401572227478,
485
+ "learning_rate": 9.710526184877667e-05,
486
+ "loss": 9.2143,
487
+ "step": 66
488
+ },
489
+ {
490
+ "epoch": 0.3385975994946305,
491
+ "grad_norm": 1.764570951461792,
492
+ "learning_rate": 9.673622250534156e-05,
493
+ "loss": 9.0152,
494
+ "step": 67
495
+ },
496
+ {
497
+ "epoch": 0.34365129500947567,
498
+ "grad_norm": 1.5638481378555298,
499
+ "learning_rate": 9.63458378673011e-05,
500
+ "loss": 9.1172,
501
+ "step": 68
502
+ },
503
+ {
504
+ "epoch": 0.3487049905243209,
505
+ "grad_norm": 1.5338134765625,
506
+ "learning_rate": 9.593428623051792e-05,
507
+ "loss": 9.0363,
508
+ "step": 69
509
+ },
510
+ {
511
+ "epoch": 0.35375868603916616,
512
+ "grad_norm": 1.6992957592010498,
513
+ "learning_rate": 9.550175555821333e-05,
514
+ "loss": 9.0962,
515
+ "step": 70
516
+ },
517
+ {
518
+ "epoch": 0.35881238155401135,
519
+ "grad_norm": 1.6055617332458496,
520
+ "learning_rate": 9.504844339512095e-05,
521
+ "loss": 9.1217,
522
+ "step": 71
523
+ },
524
+ {
525
+ "epoch": 0.3638660770688566,
526
+ "grad_norm": 1.7240350246429443,
527
+ "learning_rate": 9.457455677726448e-05,
528
+ "loss": 9.2245,
529
+ "step": 72
530
+ },
531
+ {
532
+ "epoch": 0.36891977258370184,
533
+ "grad_norm": 1.9936858415603638,
534
+ "learning_rate": 9.408031213740045e-05,
535
+ "loss": 9.3585,
536
+ "step": 73
537
+ },
538
+ {
539
+ "epoch": 0.37397346809854703,
540
+ "grad_norm": 2.1292333602905273,
541
+ "learning_rate": 9.356593520616948e-05,
542
+ "loss": 9.4107,
543
+ "step": 74
544
+ },
545
+ {
546
+ "epoch": 0.3790271636133923,
547
+ "grad_norm": 3.4360055923461914,
548
+ "learning_rate": 9.303166090900082e-05,
549
+ "loss": 9.1299,
550
+ "step": 75
551
+ },
552
+ {
553
+ "epoch": 0.38408085912823753,
554
+ "grad_norm": 1.3017970323562622,
555
+ "learning_rate": 9.24777332588177e-05,
556
+ "loss": 9.224,
557
+ "step": 76
558
+ },
559
+ {
560
+ "epoch": 0.3891345546430828,
561
+ "grad_norm": 1.1683531999588013,
562
+ "learning_rate": 9.190440524459203e-05,
563
+ "loss": 9.2729,
564
+ "step": 77
565
+ },
566
+ {
567
+ "epoch": 0.39418825015792797,
568
+ "grad_norm": 1.235701084136963,
569
+ "learning_rate": 9.131193871579975e-05,
570
+ "loss": 9.0347,
571
+ "step": 78
572
+ },
573
+ {
574
+ "epoch": 0.3992419456727732,
575
+ "grad_norm": 1.1109133958816528,
576
+ "learning_rate": 9.070060426282925e-05,
577
+ "loss": 9.0375,
578
+ "step": 79
579
+ },
580
+ {
581
+ "epoch": 0.40429564118761846,
582
+ "grad_norm": 1.2668910026550293,
583
+ "learning_rate": 9.007068109339784e-05,
584
+ "loss": 8.9948,
585
+ "step": 80
586
+ },
587
+ {
588
+ "epoch": 0.40934933670246365,
589
+ "grad_norm": 1.3104698657989502,
590
+ "learning_rate": 8.942245690503239e-05,
591
+ "loss": 8.9623,
592
+ "step": 81
593
+ },
594
+ {
595
+ "epoch": 0.4144030322173089,
596
+ "grad_norm": 1.198638916015625,
597
+ "learning_rate": 8.87562277536726e-05,
598
+ "loss": 9.0571,
599
+ "step": 82
600
+ },
601
+ {
602
+ "epoch": 0.41945672773215414,
603
+ "grad_norm": 1.2889971733093262,
604
+ "learning_rate": 8.807229791845673e-05,
605
+ "loss": 8.9898,
606
+ "step": 83
607
+ },
608
+ {
609
+ "epoch": 0.4245104232469994,
610
+ "grad_norm": 1.2756685018539429,
611
+ "learning_rate": 8.737097976275178e-05,
612
+ "loss": 8.896,
613
+ "step": 84
614
+ },
615
+ {
616
+ "epoch": 0.4295641187618446,
617
+ "grad_norm": 1.1489578485488892,
618
+ "learning_rate": 8.665259359149132e-05,
619
+ "loss": 9.067,
620
+ "step": 85
621
+ },
622
+ {
623
+ "epoch": 0.43461781427668983,
624
+ "grad_norm": 1.1491981744766235,
625
+ "learning_rate": 8.591746750488639e-05,
626
+ "loss": 8.8872,
627
+ "step": 86
628
+ },
629
+ {
630
+ "epoch": 0.4396715097915351,
631
+ "grad_norm": 1.072772741317749,
632
+ "learning_rate": 8.516593724857598e-05,
633
+ "loss": 8.887,
634
+ "step": 87
635
+ },
636
+ {
637
+ "epoch": 0.44472520530638027,
638
+ "grad_norm": 1.0014466047286987,
639
+ "learning_rate": 8.439834606028594e-05,
640
+ "loss": 8.8939,
641
+ "step": 88
642
+ },
643
+ {
644
+ "epoch": 0.4497789008212255,
645
+ "grad_norm": 1.1188455820083618,
646
+ "learning_rate": 8.361504451306585e-05,
647
+ "loss": 8.7898,
648
+ "step": 89
649
+ },
650
+ {
651
+ "epoch": 0.45483259633607076,
652
+ "grad_norm": 1.039220929145813,
653
+ "learning_rate": 8.28163903551759e-05,
654
+ "loss": 8.8616,
655
+ "step": 90
656
+ },
657
+ {
658
+ "epoch": 0.459886291850916,
659
+ "grad_norm": 1.1389069557189941,
660
+ "learning_rate": 8.200274834669675e-05,
661
+ "loss": 8.7895,
662
+ "step": 91
663
+ },
664
+ {
665
+ "epoch": 0.4649399873657612,
666
+ "grad_norm": 1.1462750434875488,
667
+ "learning_rate": 8.117449009293668e-05,
668
+ "loss": 8.8468,
669
+ "step": 92
670
+ },
671
+ {
672
+ "epoch": 0.46999368288060644,
673
+ "grad_norm": 1.1461669206619263,
674
+ "learning_rate": 8.033199387471277e-05,
675
+ "loss": 8.7652,
676
+ "step": 93
677
+ },
678
+ {
679
+ "epoch": 0.4750473783954517,
680
+ "grad_norm": 1.254766583442688,
681
+ "learning_rate": 7.9475644475583e-05,
682
+ "loss": 8.612,
683
+ "step": 94
684
+ },
685
+ {
686
+ "epoch": 0.4801010739102969,
687
+ "grad_norm": 1.3156293630599976,
688
+ "learning_rate": 7.860583300610849e-05,
689
+ "loss": 8.7556,
690
+ "step": 95
691
+ },
692
+ {
693
+ "epoch": 0.48515476942514213,
694
+ "grad_norm": 1.2289044857025146,
695
+ "learning_rate": 7.772295672522615e-05,
696
+ "loss": 8.8208,
697
+ "step": 96
698
+ },
699
+ {
700
+ "epoch": 0.4902084649399874,
701
+ "grad_norm": 1.7439113855361938,
702
+ "learning_rate": 7.682741885881315e-05,
703
+ "loss": 9.0702,
704
+ "step": 97
705
+ },
706
+ {
707
+ "epoch": 0.4952621604548326,
708
+ "grad_norm": 1.3451626300811768,
709
+ "learning_rate": 7.591962841552627e-05,
710
+ "loss": 8.8565,
711
+ "step": 98
712
+ },
713
+ {
714
+ "epoch": 0.5003158559696779,
715
+ "grad_norm": 1.7298027276992798,
716
+ "learning_rate": 7.500000000000001e-05,
717
+ "loss": 9.0691,
718
+ "step": 99
719
+ },
720
+ {
721
+ "epoch": 0.505369551484523,
722
+ "grad_norm": 2.368654727935791,
723
+ "learning_rate": 7.406895362348916e-05,
724
+ "loss": 8.8959,
725
+ "step": 100
726
+ },
727
+ {
728
+ "epoch": 0.505369551484523,
729
+ "eval_loss": 8.748159408569336,
730
+ "eval_runtime": 8.2562,
731
+ "eval_samples_per_second": 40.455,
732
+ "eval_steps_per_second": 10.174,
733
+ "step": 100
734
  }
735
  ],
736
  "logging_steps": 1,
 
750
  "attributes": {}
751
  }
752
  },
753
+ "total_flos": 113484993921024.0,
754
  "train_batch_size": 2,
755
  "trial_name": null,
756
  "trial_params": null