lapp0 committed
Commit 1015a66 (1 parent: 861d04c)

End of training

README.md CHANGED
@@ -16,14 +16,14 @@ This student model is distilled from the teacher model [gpt2](https://huggingfac
 The [Distily](https://github.com/lapp0/distily) library was used for this distillation.
 
 It achieves the following results on the evaluation set:
- - eval_enwikippl: 394.4440
- - eval_frwikippl: 376.8607
- - eval_zhwikippl: 132.9248
- - eval_tinystoriesppl: 879.5369
- - eval_loss: 0.6564
- - eval_runtime: 21.3247
- - eval_samples_per_second: 46.894
- - eval_steps_per_second: 11.724
+ - eval_enwikippl: 840.1149
+ - eval_frwikippl: 528.4605
+ - eval_zhwikippl: 126.6330
+ - eval_tinystoriesppl: 1037.4924
+ - eval_loss: 0.5100
+ - eval_runtime: 21.5094
+ - eval_samples_per_second: 46.491
+ - eval_steps_per_second: 11.623
 
 <!-- This model card has been generated automatically according to the information the Trainer had access to. You
 should probably proofread and complete it, then remove this comment.
@@ -63,27 +63,27 @@ Peak GPU Memory: 3.9285 GB
 | step | epoch | enwikippl | frwikippl | loss | runtime | samples_per_second | steps_per_second | tinystoriesppl | zhwikippl |
 | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
 | **teacher eval** | | 270.2348 | 76.8142 | | | | | 671.1238 | 22.8030 |
- | 0 | 0 | 166859.125 | 3884651118592.0 | 20.7305 | 20.9672 | 47.694 | 11.923 | 78.9353 | 8900573134848.0 |
- | 5000 | 0.0505 | 713.4594 | 1011.3455 | 1.2903 | 20.9827 | 47.658 | 11.915 | 1238.3685 | 298.2239 |
- | 10000 | 0.1010 | 601.3743 | 730.4988 | 1.0969 | 21.3361 | 46.869 | 11.717 | 1104.3544 | 265.0505 |
- | 15000 | 0.1515 | 504.4594 | 600.9854 | 0.9815 | 21.1535 | 47.274 | 11.818 | 935.7164 | 249.6911 |
- | 20000 | 0.2020 | 546.4334 | 695.7017 | 0.9179 | 21.1018 | 47.389 | 11.847 | 1108.5160 | 222.1384 |
- | 25000 | 0.2525 | 435.9988 | 478.2172 | 0.7794 | 21.1029 | 47.387 | 11.847 | 909.0684 | 166.1946 |
- | 30000 | 0.3030 | 408.8560 | 428.7766 | 0.7228 | 21.311 | 46.924 | 11.731 | 866.9764 | 155.9177 |
- | 35000 | 0.3535 | 415.0065 | 423.1964 | 0.7142 | 21.3097 | 46.927 | 11.732 | 897.1238 | 147.4223 |
- | 40000 | 0.4040 | 417.0611 | 438.1946 | 0.7138 | 21.0917 | 47.412 | 11.853 | 902.5912 | 137.5701 |
- | 45000 | 0.4545 | 410.7367 | 423.2858 | 0.7031 | 21.1335 | 47.318 | 11.83 | 889.1501 | 144.9844 |
- | 50000 | 0.5051 | 409.4502 | 412.4997 | 0.6987 | 21.3483 | 46.842 | 11.711 | 892.8698 | 159.3132 |
- | 55000 | 0.5556 | 406.3222 | 416.2205 | 0.6960 | 21.3527 | 46.832 | 11.708 | 884.7875 | 141.1298 |
- | 60000 | 0.6061 | 404.2499 | 392.8503 | 0.6944 | 21.3318 | 46.878 | 11.72 | 879.3189 | 131.7685 |
- | 65000 | 0.6566 | 395.6986 | 379.7383 | 0.6676 | 21.3383 | 46.864 | 11.716 | 873.5950 | 134.9347 |
- | 70000 | 0.7071 | 394.2606 | 379.3372 | 0.6649 | 21.3522 | 46.834 | 11.708 | 872.5484 | 138.0020 |
- | 75000 | 0.7576 | 392.1590 | 377.4717 | 0.6629 | 21.3581 | 46.821 | 11.705 | 870.7470 | 126.5486 |
- | 80000 | 0.8081 | 394.0469 | 381.8300 | 0.6625 | 21.0719 | 47.457 | 11.864 | 876.5246 | 132.6237 |
- | 85000 | 0.8586 | 393.9400 | 373.6366 | 0.6581 | 21.2823 | 46.987 | 11.747 | 877.9388 | 132.0940 |
- | 90000 | 0.9091 | 395.3693 | 377.8174 | 0.6574 | 21.391 | 46.749 | 11.687 | 883.5812 | 134.0020 |
- | 95000 | 0.9596 | 394.5662 | 377.2059 | 0.6567 | 21.0529 | 47.499 | 11.875 | 880.1915 | 132.9958 |
- | 99000 | 1.0 | 394.4440 | 376.8607 | 0.6564 | 21.3247 | 46.894 | 11.724 | 879.5369 | 132.9248 |
+ | 0 | 0 | 120078.375 | 1867851235328.0 | 19.4492 | 21.0652 | 47.472 | 11.868 | 72.8770 | 4013754155008.0 |
+ | 5000 | 0.0505 | 1216.0441 | 888.1107 | 0.7144 | 21.4135 | 46.7 | 11.675 | 1267.6812 | 332.8297 |
+ | 10000 | 0.1010 | 1162.2788 | 799.4963 | 0.6619 | 21.4269 | 46.67 | 11.668 | 1249.7319 | 438.5025 |
+ | 15000 | 0.1515 | 980.3101 | 668.6794 | 0.6395 | 21.4739 | 46.568 | 11.642 | 1056.4025 | 425.3380 |
+ | 20000 | 0.2020 | 1064.2865 | 759.8051 | 0.6318 | 21.4643 | 46.589 | 11.647 | 1151.2905 | 311.5830 |
+ | 25000 | 0.2525 | 916.0289 | 621.8902 | 0.5662 | 21.1368 | 47.311 | 11.828 | 1071.6635 | 190.3806 |
+ | 30000 | 0.3030 | 891.1293 | 582.2575 | 0.5445 | 21.4338 | 46.655 | 11.664 | 1072.1951 | 208.7082 |
+ | 35000 | 0.3535 | 886.6196 | 544.0957 | 0.5381 | 21.5335 | 46.439 | 11.61 | 1057.8008 | 142.8915 |
+ | 40000 | 0.4040 | 880.1868 | 549.4098 | 0.5349 | 21.4687 | 46.58 | 11.645 | 1076.1021 | 142.8439 |
+ | 45000 | 0.4545 | 868.9573 | 564.4311 | 0.5323 | 21.4349 | 46.653 | 11.663 | 1042.4788 | 161.4311 |
+ | 50000 | 0.5051 | 877.1919 | 541.3246 | 0.5320 | 21.548 | 46.408 | 11.602 | 1058.0631 | 167.7873 |
+ | 55000 | 0.5556 | 869.4625 | 543.6743 | 0.5313 | 21.4821 | 46.55 | 11.638 | 1043.7725 | 163.6863 |
+ | 60000 | 0.6061 | 872.2788 | 553.3121 | 0.5305 | 21.4316 | 46.66 | 11.665 | 1068.5228 | 141.9700 |
+ | 65000 | 0.6566 | 833.5512 | 524.0497 | 0.5156 | 21.1637 | 47.251 | 11.813 | 1028.6963 | 137.2677 |
+ | 70000 | 0.7071 | 837.5645 | 523.4596 | 0.5133 | 21.4101 | 46.707 | 11.677 | 1031.1652 | 124.3812 |
+ | 75000 | 0.7576 | 847.7309 | 523.0175 | 0.5129 | 21.1745 | 47.227 | 11.807 | 1047.8357 | 130.6221 |
+ | 80000 | 0.8081 | 843.6693 | 534.2609 | 0.5125 | 21.388 | 46.755 | 11.689 | 1040.4556 | 125.4979 |
+ | 85000 | 0.8586 | 843.2120 | 524.1607 | 0.5106 | 21.4851 | 46.544 | 11.636 | 1042.5220 | 126.1609 |
+ | 90000 | 0.9091 | 842.1672 | 529.2425 | 0.5101 | 21.4494 | 46.621 | 11.655 | 1040.6277 | 126.7345 |
+ | 95000 | 0.9596 | 838.0835 | 528.3859 | 0.5099 | 21.1216 | 47.345 | 11.836 | 1034.5377 | 126.5655 |
+ | 99000 | 1.0 | 840.1149 | 528.4605 | 0.5100 | 21.5094 | 46.491 | 11.623 | 1037.4924 | 126.6330 |
 
 ### Framework versions
 - Distily 0.2.0
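
Note: the `*ppl` columns above are perplexities (lower is better), computed on held-out text from each corpus. As a minimal sketch of how such a figure is obtained with the `transformers` API, the snippet below loads the public `gpt2` teacher as a stand-in; the distilled student checkpoint is not named in this diff, but it would load the same way once its repo id is substituted.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# "gpt2" is the teacher named in the card; swap in the student's
# repo id (not shown in this diff) to evaluate the distilled model.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")
model.eval()

def perplexity(text: str) -> float:
    """Perplexity = exp of the mean token-level cross-entropy loss."""
    enc = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        # Passing labels == input_ids makes the model return the
        # (internally shifted) causal language-modeling loss.
        out = model(**enc, labels=enc["input_ids"])
    return float(torch.exp(out.loss))

print(perplexity("The quick brown fox jumps over the lazy dog."))
```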
logs/dataset_subset=default, dataset_uri=distily_c4_multilingual_1M/events.out.tfevents.1724081704.f383272e719b ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e5f907fe5fb388f490a0d79ee021d2d12c100b380f1129ea265f3b85e108828f
+ size 312