lapp0 committed
Commit 7f01071
Parent: 27456ed

End of training

README.md CHANGED
@@ -15,14 +15,14 @@ This student model is distilled from the teacher model [roneneldan/TinyStories-3
 The [Distily](https://github.com/lapp0/distily) library was used for this distillation.
 
 It achieves the following results on the evaluation set:
- - eval_enwikippl: 108.1245
- - eval_frwikippl: 11043.4336
- - eval_zhwikippl: 55788.7734
- - eval_tinystoriesppl: 6.7037
- - eval_loss: 0.7047
- - eval_runtime: 13.0964
- - eval_samples_per_second: 76.357
- - eval_steps_per_second: 9.545
+ - eval_enwikippl: 107.6398
+ - eval_frwikippl: 10204.3643
+ - eval_zhwikippl: 49954.8242
+ - eval_tinystoriesppl: 6.6903
+ - eval_loss: 0.7036
+ - eval_runtime: 13.0602
+ - eval_samples_per_second: 76.568
+ - eval_steps_per_second: 9.571
 
 <!-- This model card has been generated automatically according to the information the Trainer had access to. You
 should probably proofread and complete it, then remove this comment.
@@ -62,27 +62,27 @@ Peak GPU Memory: 6.6064 GB
 | step | epoch | enwikippl | frwikippl | loss | runtime | samples_per_second | steps_per_second | tinystoriesppl | zhwikippl |
 | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
 | **teacher eval** | | 169.9865 | 47377.9414 | | | | | 3.9789 | 4998.1294 |
- | 0 | 0 | 50480.5703 | 85684.4844 | 6.8305 | 13.0511 | 76.622 | 9.578 | 33932.0586 | 94692.1562 |
- | 5000 | 0.0505 | 107.6398 | 10878.2363 | 0.7372 | 13.0368 | 76.706 | 9.588 | 6.6259 | 48821.6875 |
- | 10000 | 0.1010 | 103.7832 | 10693.6533 | 0.7210 | 13.0383 | 76.697 | 9.587 | 6.3876 | 52904.0898 |
- | 15000 | 0.1515 | 113.9463 | 10959.7607 | 0.7146 | 13.0455 | 76.655 | 9.582 | 7.3001 | 55833.4297 |
- | 20000 | 0.2020 | 102.8906 | 10842.2969 | 0.7117 | 13.0448 | 76.659 | 9.582 | 6.3362 | 55967.6680 |
- | 25000 | 0.2525 | 107.6648 | 11021.6855 | 0.7063 | 13.0457 | 76.654 | 9.582 | 6.7065 | 55654.9688 |
- | 30000 | 0.3030 | 107.8986 | 11027.8887 | 0.7052 | 13.0423 | 76.673 | 9.584 | 6.6954 | 55122.9922 |
- | 35000 | 0.3535 | 107.8986 | 10953.5859 | 0.7051 | 12.9974 | 76.939 | 9.617 | 6.6910 | 54771.1680 |
- | 40000 | 0.4040 | 107.9989 | 10941.2451 | 0.7053 | 13.0736 | 76.49 | 9.561 | 6.7123 | 55122.9922 |
- | 45000 | 0.4545 | 107.8317 | 10986.0273 | 0.7051 | 13.0495 | 76.632 | 9.579 | 6.7056 | 55064.1953 |
- | 50000 | 0.5051 | 107.9905 | 11037.2217 | 0.7049 | 13.0288 | 76.753 | 9.594 | 6.7202 | 55922.9062 |
- | 55000 | 0.5556 | 108.2753 | 10973.6602 | 0.7051 | 13.0751 | 76.481 | 9.56 | 6.7202 | 54917.4609 |
- | 60000 | 0.6061 | 108.0324 | 11037.2217 | 0.7052 | 13.0104 | 76.861 | 9.608 | 6.7093 | 55358.7930 |
- | 65000 | 0.6566 | 108.3089 | 11043.4336 | 0.7049 | 13.0425 | 76.673 | 9.584 | 6.7123 | 55122.9922 |
- | 70000 | 0.7071 | 108.2418 | 11043.4336 | 0.7047 | 12.9968 | 76.942 | 9.618 | 6.7065 | 55122.9922 |
- | 75000 | 0.7576 | 107.9069 | 11043.4336 | 0.7046 | 13.0103 | 76.862 | 9.608 | 6.7004 | 55506.7109 |
- | 80000 | 0.8081 | 108.1915 | 11043.4336 | 0.7047 | 13.0166 | 76.825 | 9.603 | 6.6979 | 55788.7734 |
- | 85000 | 0.8586 | 108.3089 | 11043.4336 | 0.7045 | 13.0625 | 76.555 | 9.569 | 6.7076 | 55759.0430 |
- | 90000 | 0.9091 | 108.2083 | 11043.4336 | 0.7047 | 13.0397 | 76.689 | 9.586 | 6.7059 | 55788.7734 |
- | 95000 | 0.9596 | 108.1999 | 11043.4336 | 0.7045 | 13.0487 | 76.636 | 9.579 | 6.7062 | 55788.7734 |
- | 99000 | 1.0 | 108.1245 | 11043.4336 | 0.7047 | 13.0964 | 76.357 | 9.545 | 6.7037 | 55788.7734 |
+ | 0 | 0 | 50480.5703 | 85684.4844 | 6.8305 | 13.0304 | 76.744 | 9.593 | 33932.0586 | 94692.1562 |
+ | 5000 | 0.0505 | 110.8554 | 10584.2598 | 0.7523 | 13.0416 | 76.677 | 9.585 | 6.7911 | 42034.9414 |
+ | 10000 | 0.1010 | 104.0690 | 10210.1172 | 0.7242 | 13.0341 | 76.722 | 9.59 | 6.4174 | 44683.2305 |
+ | 15000 | 0.1515 | 113.6466 | 10400.9941 | 0.7156 | 13.0171 | 76.822 | 9.603 | 7.2840 | 46906.4258 |
+ | 20000 | 0.2020 | 111.4970 | 9877.6748 | 0.7117 | 13.0184 | 76.814 | 9.602 | 7.1889 | 47931.1602 |
+ | 25000 | 0.2525 | 107.3317 | 10121.3330 | 0.7051 | 13.088 | 76.406 | 9.551 | 6.6947 | 49516.9375 |
+ | 30000 | 0.3030 | 107.4814 | 10147.0312 | 0.7042 | 13.0664 | 76.532 | 9.567 | 6.6925 | 49728.7578 |
+ | 35000 | 0.3535 | 107.5147 | 10109.9404 | 0.7041 | 13.0324 | 76.732 | 9.591 | 6.6794 | 49279.6914 |
+ | 40000 | 0.4040 | 107.5064 | 10121.3330 | 0.7041 | 13.1335 | 76.141 | 9.518 | 6.6994 | 49835.0078 |
+ | 45000 | 0.4545 | 107.3816 | 10129.8984 | 0.7039 | 13.1075 | 76.292 | 9.537 | 6.6972 | 49464.1211 |
+ | 50000 | 0.5051 | 107.5231 | 10129.8984 | 0.7040 | 13.0137 | 76.842 | 9.605 | 6.7041 | 49808.4492 |
+ | 55000 | 0.5556 | 107.7482 | 10135.5996 | 0.7040 | 13.0084 | 76.874 | 9.609 | 6.7052 | 49464.1211 |
+ | 60000 | 0.6061 | 107.6064 | 10204.3643 | 0.7040 | 13.0291 | 76.751 | 9.594 | 6.6991 | 49914.8711 |
+ | 65000 | 0.6566 | 107.6981 | 10204.3643 | 0.7037 | 13.0479 | 76.641 | 9.58 | 6.6958 | 49543.3398 |
+ | 70000 | 0.7071 | 107.8484 | 10204.3643 | 0.7036 | 13.0612 | 76.563 | 9.57 | 6.6953 | 49848.3164 |
+ | 75000 | 0.7576 | 107.5897 | 10204.3643 | 0.7036 | 13.1821 | 75.86 | 9.483 | 6.6895 | 49888.2188 |
+ | 80000 | 0.8081 | 107.6398 | 10204.3643 | 0.7037 | 13.1572 | 76.004 | 9.5 | 6.6900 | 49835.0078 |
+ | 85000 | 0.8586 | 107.7148 | 10204.3643 | 0.7037 | 12.9936 | 76.961 | 9.62 | 6.6928 | 49928.1523 |
+ | 90000 | 0.9091 | 107.6398 | 10204.3643 | 0.7035 | 13.0225 | 76.79 | 9.599 | 6.6919 | 49954.8242 |
+ | 95000 | 0.9596 | 107.6398 | 10204.3643 | 0.7036 | 13.0696 | 76.514 | 9.564 | 6.6914 | 49954.8242 |
+ | 99000 | 1.0 | 107.6398 | 10204.3643 | 0.7036 | 13.0602 | 76.568 | 9.571 | 6.6903 | 49954.8242 |
 
 ### Framework versions
 - Distily 0.2.0
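The `eval_*ppl` columns updated above are perplexities, i.e. the exponential of the mean token-level cross-entropy on each evaluation corpus (English, French, and Chinese Wikipedia, and TinyStories). As a rough illustration of how one such number is computed with `transformers`, here is a minimal sketch; the repo id and sample text are placeholders, not values taken from this card:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Placeholder repo id -- substitute the actual student model this card describes.
repo_id = "lapp0/tinystories-student"

tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForCausalLM.from_pretrained(repo_id)
model.eval()

# Placeholder sample; the card's metrics average over a full evaluation set.
text = "Once upon a time, there was a little robot."
inputs = tokenizer(text, return_tensors="pt")

with torch.no_grad():
    # Passing labels makes the model return the mean cross-entropy over tokens.
    loss = model(**inputs, labels=inputs["input_ids"]).loss

print(f"perplexity: {torch.exp(loss).item():.4f}")
```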
logs/copy_teacher_modules=_(_lm_head___False)_, learning_rate=1e-05, max_grad_norm=100/events.out.tfevents.1724042225.5f530b1cf724 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7d3e78b7c39fd7d70f8e2a5c6a75a86d9fcd88e839677582d19650529b5b1cfa
+ size 312
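The added file is a Git LFS pointer (the `version`/`oid`/`size` triplet above), not the TensorBoard event data itself. A sketch of resolving it to the real log file with `huggingface_hub`; the repo id is a placeholder, while the filename is the path added in this commit:

```python
from huggingface_hub import hf_hub_download

# Placeholder repo id; hf_hub_download resolves LFS pointers automatically.
path = hf_hub_download(
    repo_id="lapp0/tinystories-student",
    filename=(
        "logs/copy_teacher_modules=_(_lm_head___False)_, "
        "learning_rate=1e-05, max_grad_norm=100/"
        "events.out.tfevents.1724042225.5f530b1cf724"
    ),
)
print(path)  # local cache path to the downloaded event file
```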