hunterhector
commited on
Commit
•
9a127b5
1
Parent(s):
15626cf
add eval results
Browse files- .gitignore +1 -0
- common.py +1 -1
- data/txt360_eval/CKPT Eval - BoolQ.csv +68 -0
- data/txt360_eval/CKPT Eval - GSM8K.csv +68 -0
- data/txt360_eval/CKPT Eval - HellaSwag.csv +69 -0
- data/txt360_eval/CKPT Eval - MATH.csv +68 -0
- data/txt360_eval/CKPT Eval - MMLU.csv +68 -0
- data/txt360_eval/CKPT Eval - MedQA.csv +68 -0
- data/txt360_eval/CKPT Eval - NQ.csv +68 -0
- data/txt360_eval/CKPT Eval - PIQA.csv +69 -0
- data/txt360_eval/CKPT Eval - TriviaQA.csv +68 -0
- data/txt360_eval/CKPT Eval - WinoGrande.csv +69 -0
- main.py +9 -10
- overview.py +1 -1
- results.py +122 -14
.gitignore
CHANGED
@@ -1,2 +1,3 @@
|
|
1 |
.sesskey
|
2 |
*.pyc
|
|
|
|
1 |
.sesskey
|
2 |
*.pyc
|
3 |
+
.DS_Store
|
common.py
CHANGED
@@ -282,7 +282,7 @@ table_div_pii = Div(NotStr(table_html_pii), style="display: flex; justify-conten
|
|
282 |
|
283 |
global_div = Div(
|
284 |
Section(
|
285 |
-
H2("Overview of
|
286 |
H3("What This Section Contains"),
|
287 |
P(
|
288 |
"This section discusses all details related to deduplication and filterings steps that were uniformly applied to all data. The section is split into the following topic areas: "
|
|
|
282 |
|
283 |
global_div = Div(
|
284 |
Section(
|
285 |
+
H2("Overview of Shared Processing Steps"),
|
286 |
H3("What This Section Contains"),
|
287 |
P(
|
288 |
"This section discusses all details related to deduplication and filterings steps that were uniformly applied to all data. The section is split into the following topic areas: "
|
data/txt360_eval/CKPT Eval - BoolQ.csv
ADDED
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
0-shot,Slim-Pajama 600B (bsz=4K x 1024),,,FineWeb-1.5T,Ours-Base,Ours-Upsampling1,Ours-Upsampling2,Ours-Code-Upsampling2,All-Upsampling1,All-Upsampling1,All-Upsampling1,All-Upsampling1,DCLM-Base
|
2 |
+
hf-time: 4 min,Llama-8x8B-baseline,Llama-8x8B-seq8192,Llama-8x8B-mup,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-1x8B-seq8192,Llama_extend-1x8B-seq8192,Jais-1x8B-seq8192,Llama-1x8B-seq8192
|
3 |
+
5k,0.5761,0.5624,,0.6116,0.5514,0.5945,0.5446,0.5336,0.5902,0.5908,0.5394,0.5865,0.5284
|
4 |
+
10k,0.6242,0.5853,,0.6131,,0.5358,0.6122,0.6080,0.5471,0.5511,0.6138,0.5902,0.5780
|
5 |
+
15k,0.6480,0.6291,,0.6061,0.6217,0.5468,0.6205,0.6242,0.6248,0.5917,0.6211,0.5933,0.5713
|
6 |
+
20k,0.6541,0.6474,,0.5865,0.6187,0.6122,0.6199,0.6116,0.6119,0.5636,0.6239,0.5988,0.5850
|
7 |
+
25k,0.6670,0.6012,,0.6398,0.6251,0.6162,0.6349,0.6239,0.6291,0.5630,0.6336,0.6232,0.6312
|
8 |
+
30k,0.6777,0.6523,,0.6379,0.6083,0.6260,0.6437,0.6263,0.6107,0.5835,0.5865,0.6391,0.6425
|
9 |
+
35k,0.6495,0.6584,,0.6388,,0.6333,0.6346,0.6343,0.6144,0.4933,0.6043,0.6278,0.6480
|
10 |
+
40k,0.6771,0.6930,,0.6489,0.6410,0.6596,0.6330,0.6214,0.6520,0.5685,0.5768,0.6343,0.6505
|
11 |
+
45k,0.6624,0.6887,,0.6590,0.6422,0.6223,0.6401,0.6131,0.6153,0.5578,0.6058,0.6336,0.6529
|
12 |
+
50k,0.6761,0.6951,,0.6575,0.6566,0.6593,0.6557,0.6058,0.6541,0.5972,0.6018,0.6177,0.6563
|
13 |
+
55k,0.6847,0.6725,,0.6752,0.6321,0.6688,0.6523,0.6520,0.6679,0.5908,0.5343,0.6214,0.6618
|
14 |
+
60k,0.6920,0.6697,,0.6566,0.6226,0.6642,0.6401,0.6162,0.6361,0.5908,0.5972,0.6226,0.6645
|
15 |
+
65k,0.6979,0.6905,,0.6865,0.6352,0.6758,0.6688,0.6691,0.6942,0.6315,0.5682,0.6196,0.6352
|
16 |
+
70k,0.7104,0.6966,,0.6795,0.6456,0.6746,0.6651,0.6624,0.6575,0.5997,0.5324,0.6358,0.6526
|
17 |
+
75k,0.7269,0.6850,,0.6862,0.6514,,0.6621,0.6774,0.6817,0.6217,0.6009,0.6453,0.6535
|
18 |
+
80k,0.6997,0.6817,,0.6945,0.6327,0.6664,0.6667,0.6709,0.6703,0.6275,0.5896,0.6502,0.6612
|
19 |
+
85k,0.7346,0.6939,,0.6853,0.6746,0.6902,0.6602,0.6330,0.6737,0.6272,0.5239,0.6489,0.6703
|
20 |
+
90k,0.7254,0.6908,,0.6936,0.6612,0.6713,0.6755,0.6835,0.6315,0.6275,0.5428,0.6128,0.6807
|
21 |
+
95k,0.7165,0.7229,,0.7003,0.6587,,0.6823,0.6404,0.6670,0.6089,0.6138,0.6456,0.6612
|
22 |
+
100k,0.7153,0.7073,,0.6869,,0.6676,0.6746,0.6618,0.6587,0.6006,0.5584,0.6566,0.6810
|
23 |
+
105k,0.7333,0.7147,,0.6682,,0.6899,0.6609,0.6853,0.6853,0.6544,0.5740,0.6520,0.6755
|
24 |
+
110k,0.7376,0.7095,,0.6954,0.6664,0.6703,0.6810,0.6612,0.6798,0.6618,,0.6346,0.6434
|
25 |
+
115k,0.7168,0.7095,,0.7156,0.6645,0.6746,0.6997,0.6829,0.6813,0.6523,,0.6596,0.6920
|
26 |
+
120k,0.7370,0.7226,,0.7177,0.6648,0.6752,0.7015,,0.6841,0.6633,,0.6587,0.6890
|
27 |
+
125k,0.7361,0.7144,,0.7034,0.6636,0.6826,0.6869,0.6657,,0.6593,,0.6593,0.6795
|
28 |
+
130k,0.7284,0.7269,,0.6939,0.6786,0.6554,0.6988,0.6719,0.6777,0.6260,,,0.7018
|
29 |
+
135k,0.7483,0.7141,,0.7128,,0.6847,0.7028,0.6838,0.6933,0.6602,,,0.6966
|
30 |
+
140k,,0.7312,,0.7080,,0.6777,0.6997,0.6957,0.7040,0.6624,,,0.6884
|
31 |
+
145k,,,,0.7281,,0.6844,0.6908,0.6743,0.6914,0.6657,,,0.7061
|
32 |
+
150k,,,,0.7297,,0.6795,,0.6807,0.6991,0.6526,,,0.7024
|
33 |
+
155k,,,,0.7162,,0.7021,0.6976,0.6792,0.6927,0.6587,,,0.7028
|
34 |
+
160k,,,,0.6902,,0.6810,0.6985,0.6930,0.6893,0.6434,,,0.7098
|
35 |
+
165k,,,,0.7239,,0.6896,0.7037,,0.7021,0.6581,,,0.7080
|
36 |
+
170k,,,,0.7471,,0.6780,0.7141,,0.6911,0.6761,,,0.7058
|
37 |
+
175k,,,,0.7486,,0.6817,0.6942,,0.7095,0.6557,,,0.7021
|
38 |
+
180k,,,,0.6985,,0.6979,0.7162,,0.7067,0.6468,,,0.6523
|
39 |
+
185k,,,,0.7187,,0.6887,0.7031,,0.6917,0.6642,,,0.6914
|
40 |
+
190k,,,,0.7333,,0.6963,,,0.7113,0.6563,,,0.718
|
41 |
+
195k,,,,0.7269,,0.7021,,,0.7199,0.6817,,,0.7165
|
42 |
+
200k,,,,0.7135,,0.7080,,,0.707,0.6709,,,0.7015
|
43 |
+
205k,,,,0.7388,,0.7015,,,0.7168,0.6722,,,0.722
|
44 |
+
210k,,,,0.7489,,0.7089,,,,0.6765,,,0.6948
|
45 |
+
215k,,,,0.7538,,0.7183,,,0.7309,0.6869,,,0.6835
|
46 |
+
220k,,,,0.7474,,0.7171,,,0.7398,0.6893,,,
|
47 |
+
225k,,,,0.7251,,0.7131,,,0.7061,0.6801,,,
|
48 |
+
230k,,,,0.7083,,,,,0.7232,0.6765,,,
|
49 |
+
235k,,,,0.6930,,,,,0.6884,0.6434,,,
|
50 |
+
240k,,,,0.7541,,,,,,0.6875,,,
|
51 |
+
245k,,,,0.7541,,,,,,0.6713,,,
|
52 |
+
250k,,,,0.7498,,,,,,0.6798,,,
|
53 |
+
255k,,,,0.7749,,,,,,0.6578,,,
|
54 |
+
260k,,,,0.7615,,,,,,0.6954,,,
|
55 |
+
265k,,,,0.7486,,,,,,0.6807,,,
|
56 |
+
270k,,,,0.7226,,,,,,0.6869,,,
|
57 |
+
275k,,,,0.7269,,,,,,0.6841,,,
|
58 |
+
280k,,,,0.7517,,,,,,0.6804,,,
|
59 |
+
285k,,,,0.7150,,,,,,0.7006,,,
|
60 |
+
290k,,,,,,,,,,0.6826,,,
|
61 |
+
300k,,,,,,,,,,0.6706,,,
|
62 |
+
305k,,,,,,,,,,0.7006,,,
|
63 |
+
310k,,,,,,,,,,0.6777,,,
|
64 |
+
315k,,,,,,,,,,0.6859,,,
|
65 |
+
320k,,,,,,,,,,0.6939,,,
|
66 |
+
325k,,,,,,,,,,,,,
|
67 |
+
330k,,,,,,,,,,,,,
|
68 |
+
335k,,,,,,,,,,,,,
|
data/txt360_eval/CKPT Eval - GSM8K.csv
ADDED
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
5-shot,Slim-Pajama 600B (bsz=4K x 1024),,,FineWeb-1.5T,Ours-Base,Ours-Upsampling1,Ours-Upsampling2,Ours-Code-Upsampling2,All-Upsampling1,All-Upsampling1,All-Upsampling1,All-Upsampling1,DCLM-Base
|
2 |
+
hf-time: 115 min,Llama-8x8B-baseline,Llama-8x8B-seq8192,Llama-8x8B-mup,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-1x8B-seq8192,Llama_extend-1x8B-seq8192,Jais-1x8B-seq8192,Llama-1x8B-seq8192
|
3 |
+
5k,0.0152,0.0099,,,0.0076,0.0015,0.0045,0.0030,,0.0152,0.0106,0.0197,0.0197
|
4 |
+
10k,0.0152,0.0190,,0.0015,,0.0091,0.0000,0.0212,0.0144,0.0159,0.0136,0.0174,0.0243
|
5 |
+
15k,0.0182,0.0167,,0.0053,0.0068,0.0045,0.0083,0.0212,0.0068,0.0174,0.0190,0.0174,0.0136
|
6 |
+
20k,0.0250,0.0212,,,,,0.0030,0.0159,0.0220,0.0167,0.0190,0.0220,0.0174
|
7 |
+
25k,0.0288,0.0114,,,,0.0129,0.0053,0.0258,0.0144,0.0152,0.0144,0.0144,0.0144
|
8 |
+
30k,0.0220,0.0265,,0.0197,0.0038,0.0152,0.0167,0.0227,0.0220,0.0205,0.0129,0.0167,0.0038
|
9 |
+
35k,0.0296,0.0212,,0.0136,0.0045,0.0190,0.0045,0.0227,0.0220,0.0174,0.0174,0.0243,0.0182
|
10 |
+
40k,0.0235,0.0288,,0.0068,0.0121,0.0220,0.0015,0.0243,0.0265,0.0152,0.0212,0.0190,0.0182
|
11 |
+
45k,0.0387,0.0250,,0.0258,0.0038,0.0273,0.0106,0.0296,0.0273,0.0182,0.0152,0.0174,0.0129
|
12 |
+
50k,0.0318,0.0303,,0.0015,0.0243,0.0227,0.0121,0.0190,0.0220,0.0197,0.0205,0.0182,0.0068
|
13 |
+
55k,0.0296,0.0311,,0.0023,0.0235,0.0235,0.0250,0.0326,0.0197,0.0182,0.0174,0.0250,0.0091
|
14 |
+
60k,0.0432,0.0326,,0.0167,0.0212,0.0212,0.0182,0.0349,0.0220,0.0182,0.0099,0.0190,0.0197
|
15 |
+
65k,0.0470,0.0379,,0.0015,0.0159,0.0281,0.0136,0.0296,0.0212,0.0212,0.0129,0.0205,0.0114
|
16 |
+
70k,0.0432,0.0417,,0.0136,0.0197,0.0174,0.0114,0.0341,0.0243,0.0205,0.0136,0.0250,0.0091
|
17 |
+
75k,0.0508,0.0470,,0.0174,0.0121,0.0250,0.0182,0.0356,0.0288,0.0281,0.0174,0.0190,0.0106
|
18 |
+
80k,0.0561,0.0417,,0.0068,0.0000,0.0190,0.0083,0.0318,0.0356,0.0273,0.0167,0.0265,0.0182
|
19 |
+
85k,0.0728,0.0341,,0.0341,0.0190,0.0296,0.0205,0.0265,0.0250,0.0220,0.0129,0.0235,0.0083
|
20 |
+
90k,0.0690,0.0425,,0.0197,0.0190,0.0281,0.0061,0.0417,0.0265,0.0273,0.0167,0.0190,0.0182
|
21 |
+
95k,0.0735,0.0447,,0.0167,0.0250,0.0281,0.0136,0.0349,0.0281,0.0174,0.0106,0.0288,0.0159
|
22 |
+
100k,0.0637,0.0470,,0.0159,,0.0227,0.0045,0.0409,0.0311,0.0265,0.0205,0.0190,0.0190
|
23 |
+
105k,0.0637,0.0447,,0.0341,,0.0303,0.0129,0.0371,0.0311,0.0273,0.0205,0.0311,0.0129
|
24 |
+
110k,0.0872,0.0576,,0.0038,0.0273,0.0129,0.0205,0.0478,0.0296,0.0212,,0.0281,0.0182
|
25 |
+
115k,0.0788,0.0576,,0.0091,0.0167,0.0311,0.0167,0.0508,0.0349,0.0220,,0.0220,0.0174
|
26 |
+
120k,0.0834,0.0455,,0.0227,0.0265,0.0167,0.0212,0.0371,0.0318,0.0167,,0.0220,0.0152
|
27 |
+
125k,0.1001,0.0493,,0.0288,0.0250,0.0205,0.0387,0.0402,0.0318,0.0182,,0.0235,0.0144
|
28 |
+
130k,0.0766,0.0470,,0.0068,0.0258,0.0288,0.0174,,0.0341,0.0243,,,0.0205
|
29 |
+
135k,0.0879,0.0607,,0.0190,,0.0349,0.0258,0.0409,0.0288,0.0212,,,0.0281
|
30 |
+
140k,,0.0569,,0.0379,,0.0356,0.0227,0.0440,0.0341,0.0144,,,0.0144
|
31 |
+
145k,,,,0.0341,,0.0379,0.0015,0.0387,,0.0174,,,0.0273
|
32 |
+
150k,,,,,,0.0281,,0.0470,0.0265,0.0220,,,0.0258
|
33 |
+
155k,,,,0.0318,,0.0303,0.0121,0.0561,0.0523,0.0227,,,0.0243
|
34 |
+
160k,,,,0.0356,,0.0243,0.0061,0.0425,0.0432,0.0220,,,0.0303
|
35 |
+
165k,,,,0.0167,,0.0409,0.0015,,0.0470,0.0281,,,
|
36 |
+
170k,,,,0.0334,,0.0281,0.0129,,0.0455,0.0273,,,0.0235
|
37 |
+
175k,,,,0.0371,,0.0326,0.0190,,0.0409,0.0190,,,0.0273
|
38 |
+
180k,,,,0.0425,,0.0364,0.0227,,0.0356,0.0243,,,0.0288
|
39 |
+
185k,,,,0.0341,,0.0318,0.0341,,0.0546,0.0235,,,0.0364
|
40 |
+
190k,,,,0.0296,,0.0364,,,0.0425,0.0220,,,0.0349
|
41 |
+
195k,,,,0.0250,,0.0303,,,0.0493,0.0258,,,
|
42 |
+
200k,,,,0.0250,,0.0371,,,0.0493,0.0273,,,0.0205
|
43 |
+
205k,,,,0.0455,,0.0409,,,0.0553,0.0220,,,0.0258
|
44 |
+
210k,,,,0.0462,,0.0371,,,0.0523,0.0281,,,
|
45 |
+
215k,,,,0.0349,,0.0265,,,0.0500,0.0235,,,0.0281
|
46 |
+
220k,,,,0.0432,,0.0167,,,0.0462,0.0326,,,
|
47 |
+
225k,,,,0.0447,,0.0212,,,,0.0265,,,
|
48 |
+
230k,,,,0.0440,,,,,0.0493,0.0273,,,
|
49 |
+
235k,,,,0.0402,,,,,0.0508,0.0220,,,
|
50 |
+
240k,,,,0.0341,,,,,,0.0281,,,
|
51 |
+
245k,,,,0.0462,,,,,,0.0356,,,
|
52 |
+
250k,,,,0.0500,,,,,,,,,
|
53 |
+
255k,,,,0.0569,,,,,,0.0303,,,
|
54 |
+
260k,,,,0.0500,,,,,,0.0334,,,
|
55 |
+
265k,,,,0.0455,,,,,,0.0318,,,
|
56 |
+
270k,,,,0.0538,,,,,,0.0273,,,
|
57 |
+
275k,,,,0.0470,,,,,,,,,
|
58 |
+
280k,,,,0.0553,,,,,,0.0364,,,
|
59 |
+
285k,,,,0.0531,,,,,,0.0349,,,
|
60 |
+
290k,,,,,,,,,,0.0311,,,
|
61 |
+
300k,,,,,,,,,,,,,
|
62 |
+
305k,,,,,,,,,,0.0311,,,
|
63 |
+
310k,,,,,,,,,,0.0273,,,
|
64 |
+
315k,,,,,,,,,,,,,
|
65 |
+
320k,,,,,,,,,,,,,
|
66 |
+
325k,,,,,,,,,,,,,
|
67 |
+
330k,,,,,,,,,,,,,
|
68 |
+
335k,,,,,,,,,,,,,
|
data/txt360_eval/CKPT Eval - HellaSwag.csv
ADDED
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
ga,Slim-Pajama 600B (bsz=4K x 1024),,,,,,FineWeb-1.5T,,Ours-Base,,Ours-Upsampling1,,Ours-Upsampling2,,Ours-Code-Upsampling2,,All-Upsampling1,,All-Upsampling1,,All-Upsampling1,,All-Upsampling1,,DCLM-Base,
|
2 |
+
0-shot: 5 min,Llama-8x8B-baseline,,Llama-8x8B-seq8192,,Llama-8x8B-mup,,Llama-8x8B-seq8192,,Llama-8x8B-seq8192,,Llama-8x8B-seq8192,,Llama-8x8B-seq8192,,Llama-8x8B-seq8192,,Llama-8x8B-seq8192,,Llama-1x8B-seq8192,,Llama_extend-1x8B-seq8192,,Jais-1x8B-seq8192,,Llama-1x8B-seq8192,
|
3 |
+
10-shot: 36 min,0-shot,10-shot,0-shot,10-shot,0-shot,10-shot,0-shot,10-shot,0-shot,10-shot,0-shot,10-shot,0-shot,10-shot,0-shot,10-shot,0-shot,10-shot,0-shot,10-shot,0-shot,10-shot,0-shot,10-shot,0-shot,10-shot
|
4 |
+
5k,0.5315,0.5301,0.5165,0.5693,,,0.5622,0.5376,0.5254,0.5119,0.5356,0.5291,0.5324,0.5210,0.5377,0.5339,0.5366,0.5278,0.4507,0.4300,0.4413,0.4262,0.4497,0.4397,0.4624,0.4469
|
5 |
+
10k,0.6076,0.6008,0.5949,0.5693,,,0.6433,0.6202,0.5836,0.5827,,0.5975,0.6046,0.5886,0.6036,0.5987,0.6139,0.5901,0.5279,0.4889,0.5141,0.4872,0.5219,0.5028,0.5454,0.5150
|
6 |
+
15k,0.6422,0.6278,0.6314,0.5998,,,0.6716,0.6367,0.6114,0.6002,0.6281,0.6079,0.6336,0.6118,0.6399,0.6266,0.6388,0.6172,0.5495,0.5211,0.5444,0.5142,0.5469,0.5096,0.5785,0.5484
|
7 |
+
20k,0.6616,0.6424,0.6496,0.6244,,,0.6855,,0.6271,0.6223,0.6461,0.6230,0.6492,0.6329,0.6511,0.6475,0.6548,0.6382,0.5685,0.5310,0.5579,0.5270,0.5813,0.5377,0.5946,0.5649
|
8 |
+
25k,0.6738,0.6577,0.6683,0.6390,,,0.6945,0.6662,0.6413,,0.6612,0.6404,0.6665,0.6417,0.6652,0.6629,0.6683,0.6499,0.5759,0.5369,0.5787,0.5486,0.5864,0.5598,0.6105,0.5796
|
9 |
+
30k,0.6863,0.6656,0.6758,0.6368,,,0.7059,0.6639,,0.6387,0.6692,0.6425,0.6746,0.6485,0.6708,0.6584,0.6741,0.6587,0.5891,0.5490,0.5915,0.5437,0.5990,0.5625,0.6197,0.5897
|
10 |
+
35k,0.6956,0.6762,0.6850,0.6430,,,0.7158,0.6602,0.6547,0.6420,,0.6348,0.6832,0.6572,0.6816,0.6705,0.6864,0.6682,0.5985,0.5590,0.5954,0.5553,0.6090,0.5667,0.6343,0.6046
|
11 |
+
40k,0.7022,0.6812,0.6966,0.6524,,,0.7184,0.6814,0.6642,0.6452,0.6751,0.6347,0.6821,0.6533,0.6865,0.6717,0.6917,0.6646,0.6015,0.5595,0.6033,0.5592,0.6112,0.5704,0.6429,0.6100
|
12 |
+
45k,0.7048,0.6954,0.6991,0.6583,,,0.7220,0.6921,0.6698,0.6479,0.6802,,0.6905,0.6616,0.6919,0.6812,0.6933,0.6704,0.6103,0.5663,0.6040,0.5623,0.6175,0.5773,0.6473,0.6212
|
13 |
+
50k,0.7171,0.6998,0.7041,0.6574,,,0.7250,0.6785,0.6689,0.6611,0.6931,0.6726,0.6964,0.6720,0.6905,0.6715,0.7018,0.6902,0.6106,0.5510,0.6138,0.5676,0.6230,0.5934,0.6477,0.6109
|
14 |
+
55k,0.7187,0.7012,0.7080,0.6768,,,0.7305,0.6967,0.6697,0.6571,0.6899,0.6614,0.6959,0.6764,0.7047,0.6816,0.7052,0.6799,0.6182,0.5759,0.6200,0.5753,0.6260,0.5857,0.6518,0.6145
|
15 |
+
60k,0.7240,0.7037,0.7129,0.6748,,,0.7236,0.6955,0.6748,0.6573,0.6941,0.6584,0.6904,0.6850,0.6982,0.6731,0.7040,0.6767,0.6207,0.5849,0.6217,0.5711,0.6318,0.5744,0.6566,0.6204
|
16 |
+
65k,0.7297,0.7130,0.7142,0.6700,,,0.7355,0.6994,0.6752,0.6590,0.6907,0.6598,0.7061,0.6772,0.6963,0.6824,0.7074,0.6857,0.6235,0.5766,0.6299,0.5750,0.6381,0.5973,0.6544,0.6264
|
17 |
+
70k,0.7298,0.7148,0.7224,0.6796,,,0.7399,0.7034,0.6773,0.6631,0.6968,0.6735,0.7054,0.6789,0.7043,0.6936,0.7074,0.6883,0.6294,0.5982,0.6341,0.5872,0.6403,0.5928,0.6617,0.6220
|
18 |
+
75k,0.7329,0.7144,0.7261,0.6972,,,0.7374,0.6934,0.6854,0.6661,0.7014,0.6622,0.7065,0.6843,0.7029,0.6837,0.7027,0.6853,0.6285,0.5932,0.6336,0.5830,0.6376,0.5907,0.6706,0.6237
|
19 |
+
80k,0.7414,0.7271,0.7316,0.6937,,,0.7422,0.6989,0.6862,0.6717,0.7051,0.6762,0.7118,0.6954,0.7178,0.6909,0.7139,0.6908,0.6315,0.5898,0.6363,0.5877,0.6491,0.5968,0.6710,0.6032
|
20 |
+
85k,0.7449,0.7278,0.7334,0.7011,,,0.7444,0.7101,0.6887,0.6635,0.7086,0.6739,0.7126,0.6872,0.7052,0.6927,0.7178,0.7048,0.6359,0.5970,0.6375,0.5941,0.6380,0.5897,0.6789,0.6203
|
21 |
+
90k,0.7483,0.7379,0.7379,0.6949,,,0.7443,0.7064,0.6917,0.6818,0.7079,0.6804,0.7148,0.6926,0.7106,0.6976,0.7146,0.6818,0.6400,0.6052,0.6327,0.5846,0.6521,0.6120,0.6781,0.6271
|
22 |
+
95k,0.7510,0.7411,0.7427,0.6987,,,0.7376,0.6943,0.6901,0.6719,0.7097,0.6616,0.7115,0.6946,0.7221,0.6979,0.7240,0.6953,0.6388,0.5870,0.6373,0.5899,0.6460,0.5970,0.6798,0.6320
|
23 |
+
100k,0.7550,0.7419,0.7437,0.7070,,,0.7457,0.7153,,,0.7060,0.6902,0.7117,0.6955,0.7167,0.7002,0.7241,0.7013,0.6447,0.6079,0.6431,0.5916,0.6490,0.6095,0.6854,0.6467
|
24 |
+
105k,0.7547,0.7424,0.7445,0.7042,,,0.7476,0.7158,,,0.7141,0.6804,0.7132,0.6953,0.7222,0.6980,0.7263,0.6912,0.6470,0.6060,0.6473,0.5908,0.6588,0.6023,0.6809,0.6144
|
25 |
+
110k,0.7605,0.7491,0.7540,0.7070,,,0.7486,0.7210,0.6942,0.6850,0.7107,0.6696,0.7166,0.6883,0.7221,0.7020,0.7284,0.7013,0.6482,0.6196,,,0.6620,0.6166,0.6888,0.6269
|
26 |
+
115k,0.7626,0.7491,0.7540,0.7070,,,0.7522,0.7213,0.6957,0.6832,0.7120,0.6698,0.7179,0.6955,0.7284,0.7101,0.7274,0.7045,0.6511,0.6004,,,0.6636,0.5998,0.6882,0.6250
|
27 |
+
120k,0.7641,0.7545,0.7532,0.7110,,,0.7520,0.7217,0.7022,0.6911,0.7139,0.6855,0.7224,0.7017,0.7132,0.6866,0.7329,0.7089,0.6532,0.6145,,,0.6611,0.6085,0.6874,0.6250
|
28 |
+
125k,0.7636,0.7552,0.7538,0.7126,,,0.7533,0.7195,0.7029,0.6946,0.7211,0.6944,0.7221,0.6937,0.7250,0.7155,0.7285,0.7194,0.6571,0.6184,,,0.6624,0.6071,0.6896,0.6294
|
29 |
+
130k,0.7619,0.7547,0.7539,0.7168,,,0.7573,0.7178,0.7032,0.6929,0.7195,0.6969,0.7261,0.7103,0.7320,0.7221,0.7337,0.7096,0.6593,0.6174,,,,,0.6929,0.6273
|
30 |
+
135k,0.7641,0.7570,0.7543,0.7162,,,0.7580,0.7324,,,0.7177,0.6978,0.7198,0.6969,0.7249,0.7162,0.7324,0.7107,0.6584,0.6316,,,,,0.6941,0.6392
|
31 |
+
140k,,,0.7615,0.7250,,,0.7596,0.7329,,,0.7236,0.7106,0.7245,0.7140,0.7306,0.7228,0.7338,0.7099,0.6577,0.6142,,,,,0.6925,0.6334
|
32 |
+
145k,,,,,,,0.7573,0.7207,,,0.7194,0.7040,0.7247,0.7077,0.7347,0.7231,0.7431,0.7195,0.6628,0.6295,,,,,0.6984,0.6543
|
33 |
+
150k,,,,,,,0.7614,0.7352,,,0.7170,0.7029,,,0.7304,0.7116,0.7386,0.7233,0.6592,0.6212,,,,,0.6978,0.6291
|
34 |
+
155k,,,,,,,0.7579,0.7360,,,0.7245,0.7127,0.7294,0.7058,0.7378,0.7162,0.7448,0.7139,0.6662,0.6246,,,,,0.6929,0.6396
|
35 |
+
160k,,,,,,,0.7606,0.7356,,,0.7199,0.6983,0.7279,0.7109,0.7343,0.7230,0.7385,0.7172,0.6666,0.6169,,,,,0.7009,0.6266
|
36 |
+
165k,,,,,,,,0.7403,,,0.7249,0.7058,0.7297,0.7119,,,0.7493,0.7234,0.6680,0.6268,,,,,0.7003,0.6104
|
37 |
+
170k,,,,,,,0.7696,0.7422,,,0.7262,0.7070,0.7323,0.7031,,,0.7499,0.7260,0.6710,0.6319,,,,,0.7010,0.6514
|
38 |
+
175k,,,,,,,0.7745,0.7450,,,0.7303,0.7180,0.7338,0.7206,,,0.7502,0.7257,0.6707,0.6205,,,,,0.7047,0.6401
|
39 |
+
180k,,,,,,,0.7676,0.7384,,,0.7299,0.7249,0.7316,0.7250,,,0.7457,0.7270,0.6721,0.6327,,,,,0.7079,0.6421
|
40 |
+
185k,,,,,,,0.7678,0.7441,,,0.7319,0.7232,0.7354,0.7340,,,0.7519,0.7309,0.6732,0.6275,,,,,0.7050,0.6130
|
41 |
+
190k,,,,,,,0.7701,0.7505,,,0.7336,0.7193,,,,,0.7493,0.7305,0.6729,0.6343,,,,,0.7097,0.6568
|
42 |
+
195k,,,,,,,0.7730,0.7504,,,0.7293,0.7137,,,,,0.7579,0.7376,0.6774,0.6251,,,,,0.7074,0.6363
|
43 |
+
200k,,,,,,,0.7753,0.7521,,,0.7366,0.7138,,,,,0.7567,0.7372,0.6795,0.6279,,,,,0.7122,0.6430
|
44 |
+
205k,,,,,,,0.7744,0.7537,,,0.7360,0.7312,,,,,0.7560,0.7453,,0.6293,,,,,0.7175,0.6647
|
45 |
+
210k,,,,,,,0.7729,0.7539,,,0.7368,0.7284,,,,,0.7658,0.7465,,0.6431,,,,,0.7179,0.6109
|
46 |
+
215k,,,,,,,0.7804,0.7596,,,0.7359,0.7295,,,,,0.7621,0.7357,0.6819,0.6370,,,,,0.7136,0.6287
|
47 |
+
220k,,,,,,,0.7752,0.7633,,,0.7384,0.7436,,,,,0.7678,0.7457,0.6860,0.6384,,,,,,
|
48 |
+
225k,,,,,,,0.7808,0.7607,,,0.7340,0.7366,,,,,0.7649,0.7427,0.6805,0.6354,,,,,,
|
49 |
+
230k,,,,,,,0.7786,0.7614,,,,,,,,,0.7662,0.7561,0.6855,0.6483,,,,,,
|
50 |
+
235k,,,,,,,0.7844,0.7619,,,,,,,,,0.7676,0.7532,0.6880,0.6471,,,,,,
|
51 |
+
240k,,,,,,,0.7866,0.7677,,,,,,,,,,,0.6841,0.6509,,,,,,
|
52 |
+
245k,,,,,,,0.7857,0.7684,,,,,,,,,,,0.6850,0.6487,,,,,,
|
53 |
+
250k,,,,,,,0.7851,0.7738,,,,,,,,,,,0.6892,0.6541,,,,,,
|
54 |
+
255k,,,,,,,0.7845,0.7716,,,,,,,,,,,0.6875,0.6448,,,,,,
|
55 |
+
260k,,,,,,,0.7893,0.7705,,,,,,,,,,,0.6945,0.6480,,,,,,
|
56 |
+
265k,,,,,,,0.7918,0.7727,,,,,,,,,,,0.6933,0.6552,,,,,,
|
57 |
+
270k,,,,,,,0.7917,0.7725,,,,,,,,,,,0.6980,0.6548,,,,,,
|
58 |
+
275k,,,,,,,0.7925,0.7741,,,,,,,,,,,0.6950,0.6604,,,,,,
|
59 |
+
280k,,,,,,,0.7943,0.7769,,,,,,,,,,,,0.6574,,,,,,
|
60 |
+
285k,,,,,,,0.7946,0.7781,,,,,,,,,,,0.6970,0.6644,,,,,,
|
61 |
+
290k,,,,,,,,,,,,,,,,,,,0.6970,0.6674,,,,,,
|
62 |
+
300k,,,,,,,,,,,,,,,,,,,0.6969,0.6592,,,,,,
|
63 |
+
305k,,,,,,,,,,,,,,,,,,,0.6997,0.6655,,,,,,
|
64 |
+
310k,,,,,,,,,,,,,,,,,,,0.6988,0.6639,,,,,,
|
65 |
+
315k,,,,,,,,,,,,,,,,,,,0.7023,0.6749,,,,,,
|
66 |
+
320k,,,,,,,,,,,,,,,,,,,0.7012,0.6706,,,,,,
|
67 |
+
325k,,,,,,,,,,,,,,,,,,,,,,,,,,
|
68 |
+
330k,,,,,,,,,,,,,,,,,,,,,,,,,,
|
69 |
+
335k,,,,,,,,,,,,,,,,,,,,,,,,,,
|
data/txt360_eval/CKPT Eval - MATH.csv
ADDED
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
5-shot,Slim-Pajama 600B (bsz=4K x 1024),,,FineWeb-1.5T,Ours-Base,Ours-Upsampling1,Ours-Upsampling2,Ours-Code-Upsampling2,All-Upsampling1,All-Upsampling1,All-Upsampling1,All-Upsampling1,DCLM-Base
|
2 |
+
time: 5 min,Llama-8x8B-baseline,Llama-8x8B-seq8192,Llama-8x8B-mup,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-1x8B-seq8192,Llama_extend-1x8B-seq8192,Jais-1x8B-seq8192,Llama-1x8B-seq8192
|
3 |
+
5k,0.2335,0.2308,,0.2251,,0.2157,0.2221,0.2231,0.2211,0.2251,0.2191,0.2271,0.2238
|
4 |
+
10k,0.2489,0.2519,,0.2379,0.2211,0.2332,0.2415,0.2342,0.2399,0.2285,0.2342,0.2402,0.2224
|
5 |
+
15k,0.2626,0.2469,,0.2526,,0.2389,0.2322,0.2479,0.2580,0.2375,0.2271,0.2355,0.2375
|
6 |
+
20k,0.2737,0.2606,,0.2469,0.2399,,0.2419,0.2526,0.2663,0.2469,0.2499,0.2439,0.2322
|
7 |
+
25k,0.2700,0.2653,,0.2523,0.2395,0.2600,0.2526,0.2616,0.2559,0.2369,0.2476,0.2462,0.2355
|
8 |
+
30k,0.2687,0.2556,,0.2402,,0.2452,0.2533,0.2606,0.2503,0.2456,0.2452,0.2446,0.2372
|
9 |
+
35k,0.2765,0.2533,,0.2683,0.2596,0.2590,0.2509,0.2630,0.2737,0.2392,0.2405,0.2536,0.2402
|
10 |
+
40k,0.2667,0.2683,,0.2496,0.2496,0.2593,0.2529,0.2697,0.2663,0.2379,0.2486,0.2526,0.2422
|
11 |
+
45k,0.2750,0.2620,,0.2616,0.2586,0.2563,0.2503,0.2683,0.2673,0.2479,0.2496,0.2513,0.2472
|
12 |
+
50k,0.2861,0.2697,,0.2693,0.2553,0.2596,0.2553,0.2700,0.2771,0.2442,0.2425,0.2546,0.2395
|
13 |
+
55k,0.2848,0.2693,,0.2640,0.2630,0.2566,0.2479,0.2630,0.2757,0.2526,0.2506,0.2586,0.2509
|
14 |
+
60k,0.2945,0.2784,,0.2727,0.2596,0.2633,0.2590,0.2690,0.2714,0.2519,0.2563,0.2553,0.2479
|
15 |
+
65k,0.3008,0.2767,,0.2680,0.2623,0.2704,0.2610,0.2492,0.2727,0.2529,0.2559,0.2647,0.2462
|
16 |
+
70k,0.2891,0.2824,,0.2730,0.2596,0.2710,0.2700,0.2677,0.2807,0.2469,0.2459,0.2626,0.2576
|
17 |
+
75k,0.2982,0.2938,,0.2784,0.2647,0.2630,0.2697,0.2777,0.2620,0.2626,0.2499,0.2583,0.2549
|
18 |
+
80k,0.2948,0.2801,,0.2737,0.2727,0.2643,0.2553,0.2657,0.2704,0.2509,0.2590,0.2549,0.2563
|
19 |
+
85k,0.2992,0.2938,,0.2754,0.2620,0.2704,0.2677,0.2600,0.2771,0.2496,0.2385,0.2620,0.2529
|
20 |
+
90k,0.3002,0.2888,,0.2764,0.2714,0.2737,0.2573,0.2693,0.2918,0.2616,0.2492,0.2566,0.2516
|
21 |
+
95k,0.3025,0.2817,,0.2616,0.2690,0.2737,0.2523,0.2690,0.2791,0.2492,0.2576,0.2576,0.2549
|
22 |
+
100k,0.2951,0.2894,,0.2616,,0.2817,0.2660,0.2757,0.2861,0.2546,0.2479,0.2667,0.2559
|
23 |
+
105k,0.3052,0.2928,,0.2653,,0.2710,0.2707,0.2771,0.2868,0.2529,0.2482,0.2640,0.2633
|
24 |
+
110k,0.3052,0.2985,,0.2600,0.2764,0.2781,0.2600,0.2764,0.2824,0.2536,,0.2727,0.2606
|
25 |
+
115k,0.3025,0.2985,,0.2690,0.2791,0.2720,0.2704,0.2744,0.2918,0.2623,,0.2807,0.2496
|
26 |
+
120k,0.3042,0.2985,,0.2750,0.2647,0.2650,0.2814,0.2754,0.2955,0.2677,,0.2626,0.2586
|
27 |
+
125k,0.3149,0.3018,,0.2683,0.2707,0.2647,0.2757,0.2760,0.2804,0.2509,,0.2704,0.2496
|
28 |
+
130k,0.3179,0.2978,,0.2781,0.2747,0.2653,0.2760,0.2774,0.2767,0.2593,,,0.2513
|
29 |
+
135k,0.3226,0.2945,,0.2747,,0.2717,0.2673,0.2784,0.2884,0.2606,,,0.2533
|
30 |
+
140k,,0.3018,,0.2771,,0.2757,0.2794,0.2787,0.2821,0.2459,,,0.2596
|
31 |
+
145k,,,,0.2724,,0.2650,0.2720,0.2888,0.2801,0.2543,,,0.2633
|
32 |
+
150k,,,,0.2720,,0.2814,,0.2864,0.2901,0.2590,,,0.2543
|
33 |
+
155k,,,,,,0.2784,0.2720,0.2874,0.2938,0.2580,,,0.2566
|
34 |
+
160k,,,,0.2817,,0.2834,0.2653,0.2807,0.2814,0.2563,,,0.2549
|
35 |
+
165k,,,,0.2834,,0.2821,0.2804,,0.2955,0.2559,,,0.2536
|
36 |
+
170k,,,,0.2854,,0.2824,0.2804,,0.3119,0.2536,,,0.2626
|
37 |
+
175k,,,,0.2804,,0.2915,0.2750,,0.2988,0.2489,,,0.2657
|
38 |
+
180k,,,,0.2767,,0.2901,0.2958,,0.3099,0.2623,,,0.2643
|
39 |
+
185k,,,,0.2767,,0.2948,0.2804,,0.3055,0.2570,,,0.2643
|
40 |
+
190k,,,,0.2787,,0.2925,,,0.3065,0.2573,,,0.2760
|
41 |
+
195k,,,,0.2858,,0.2898,,,0.3119,0.2640,,,0.2657
|
42 |
+
200k,,,,0.2771,,0.3028,,,0.3112,0.2610,,,0.2687
|
43 |
+
205k,,,,0.2851,,0.2921,,,0.3002,0.2680,,,0.2667
|
44 |
+
210k,,,,0.2838,,0.2817,,,0.3022,0.2650,,,0.2714
|
45 |
+
215k,,,,0.2838,,0.2851,,,0.3069,0.2653,,,0.2600
|
46 |
+
220k,,,,0.2938,,0.2814,,,0.3002,0.2549,,,
|
47 |
+
225k,,,,0.2935,,0.2898,,,0.3049,0.2633,,,
|
48 |
+
230k,,,,0.2888,,,,,0.3132,0.2653,,,
|
49 |
+
235k,,,,0.3055,,,,,0.2951,0.2717,,,
|
50 |
+
240k,,,,0.2995,,,,,,0.2667,,,
|
51 |
+
245k,,,,0.2928,,,,,,0.2610,,,
|
52 |
+
250k,,,,0.3092,,,,,,0.2650,,,
|
53 |
+
255k,,,,0.3152,,,,,,0.2643,,,
|
54 |
+
260k,,,,0.2951,,,,,,0.2616,,,
|
55 |
+
265k,,,,0.3045,,,,,,0.2610,,,
|
56 |
+
270k,,,,0.3018,,,,,,,,,
|
57 |
+
275k,,,,0.3065,,,,,,,,,
|
58 |
+
280k,,,,0.3015,,,,,,,,,
|
59 |
+
285k,,,,0.2965,,,,,,0.2586,,,
|
60 |
+
290k,,,,,,,,,,0.2623,,,
|
61 |
+
300k,,,,,,,,,,0.2603,,,
|
62 |
+
305k,,,,,,,,,,0.2630,,,
|
63 |
+
310k,,,,,,,,,,0.2710,,,
|
64 |
+
315k,,,,,,,,,,0.2677,,,
|
65 |
+
320k,,,,,,,,,,0.2650,,,
|
66 |
+
325k,,,,,,,,,,,,,
|
67 |
+
330k,,,,,,,,,,,,,
|
68 |
+
335k,,,,,,,,,,,,,
|
data/txt360_eval/CKPT Eval - MMLU.csv
ADDED
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
5-shot,Slim-Pajama 600B (bsz=4K x 1024),,,FineWeb-1.5T,Ours-Base,Ours-Upsampling1,Ours-Upsampling2,Ours-Code-Upsampling2,All-Upsampling1,All-Upsampling1,All-Upsampling1,All-Upsampling1,DCLM-Base,Comments
|
2 |
+
time: 20 min,Llama-8x8B-baseline,Llama-8x8B-seq8192,Llama-8x8B-mup,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-1x8B-seq8192,Llama_extend-1x8B-seq8192,Jais-1x8B-seq8192,Llama-1x8B-seq8192,"1. Comparing with upsample2-155k, the social science acc of dclm-195k is much higher"
|
3 |
+
5k,0.2398,0.2671,,,0.2579,0.2418,0.2482,0.2690,0.2456,0.2512,0.2532,0.2428,0.2530,
|
4 |
+
10k,0.2535,0.2520,,0.2594,0.2612,,0.2628,0.2319,0.2525,0.2462,0.2582,0.2713,0.2529,
|
5 |
+
15k,0.2527,0.2347,,,,0.2489,0.2334,0.2483,0.2503,0.2549,0.2380,0.2653,0.2507,
|
6 |
+
20k,0.2530,0.2478,,0.2495,0.2467,0.2677,0.2449,0.2507,0.2540,0.2416,0.2612,0.2482,0.2553,
|
7 |
+
25k,0.2503,0.2488,,,0.2431,0.2597,0.2571,0.2506,0.2534,0.2505,0.2464,0.2577,0.2552,
|
8 |
+
30k,0.2297,0.2539,,,,0.2592,0.2678,0.2389,0.2557,0.2468,0.2517,0.2485,0.2556,
|
9 |
+
35k,0.2356,0.2374,,0.2426,0.2591,0.2550,0.2562,0.2594,0.2494,0.2403,0.2451,0.2547,0.2443,
|
10 |
+
40k,0.2406,0.2462,,0.2467,0.2485,0.2344,0.2408,0.2555,0.2686,0.2552,0.2500,0.2553,0.2775,
|
11 |
+
45k,0.2470,0.2428,,0.2418,0.2296,0.2512,0.2712,0.2630,0.2503,0.2368,0.2536,0.2557,0.2393,
|
12 |
+
50k,0.2421,0.2368,,0.2382,0.2441,0.2727,0.2558,0.2558,0.2322,0.2499,0.2563,0.2305,0.2485,
|
13 |
+
55k,0.2460,0.2551,,0.2408,0.2536,0.2389,0.2440,0.2444,0.2747,0.2552,0.2516,0.2339,0.2595,
|
14 |
+
60k,0.2415,0.2397,,0.2718,0.2539,0.2518,0.2339,0.2551,0.2432,0.2517,0.2589,0.2379,0.2589,
|
15 |
+
65k,0.2490,0.2641,,0.2637,0.2423,0.2589,0.2342,0.2303,0.2478,0.2485,0.2643,0.2485,0.2798,
|
16 |
+
70k,0.2578,0.2641,,0.2534,0.2359,0.2716,0.2673,0.2307,0.2478,0.2483,0.2426,0.2499,0.2583,
|
17 |
+
75k,0.2587,0.2599,,0.2529,0.2372,0.2514,0.2579,0.2519,0.2478,0.2742,0.2594,0.2371,0.2653,
|
18 |
+
80k,0.2493,0.2519,,0.2504,0.2344,0.2582,0.2535,0.2433,0.2718,0.2596,0.2536,0.2553,0.2573,
|
19 |
+
85k,0.2527,0.2789,,0.2547,0.2496,0.2564,0.2418,0.2572,0.2465,0.2663,0.2552,0.2485,0.2584,
|
20 |
+
90k,0.2679,0.2668,,0.2595,0.2464,0.2608,0.2359,0.2777,0.2475,0.2543,0.2514,0.2411,0.2499,
|
21 |
+
95k,0.2551,0.2763,,0.2621,0.2469,0.2505,0.2534,0.2584,0.2424,0.2607,0.2742,0.2385,0.2521,
|
22 |
+
100k,0.2594,0.2564,,0.2550,,0.2614,0.2461,0.2611,0.2497,0.2675,0.2545,0.2540,0.2574,
|
23 |
+
105k,0.2787,0.2473,,0.2659,,0.2542,0.2729,0.2666,0.2468,0.2610,0.2726,0.2465,0.2798,
|
24 |
+
110k,0.3079,0.2458,,0.2551,0.2629,0.2512,0.2604,0.3027,0.2522,0.2673,,0.2410,0.2540,
|
25 |
+
115k,0.3185,0.2458,,0.2624,0.2324,0.2569,0.2590,0.2863,0.2584,0.2624,,0.2396,0.2771,
|
26 |
+
120k,0.3139,0.2832,,0.2626,0.2663,0.2718,0.2629,0.3190,0.2748,0.2419,,0.2544,0.2772,
|
27 |
+
125k,0.2960,0.2928,,0.2712,0.2733,0.2663,0.2768,0.2788,0.2570,0.2616,,0.2466,0.2856,
|
28 |
+
130k,0.3033,0.2844,,0.2404,0.2635,0.2767,0.2676,0.3191,0.2812,0.2538,,,0.2973,
|
29 |
+
135k,0.2934,0.2895,,0.2641,,0.2713,0.2735,0.3119,0.2882,0.2661,,,0.3203,
|
30 |
+
140k,,0.3045,,0.2553,,0.2811,0.2765,0.2866,0.3019,0.2730,,,0.2772,
|
31 |
+
145k,,,,0.2492,,0.2850,0.2708,0.3107,0.3090,0.2582,,,0.3435,
|
32 |
+
150k,,,,0.2595,,0.2780,,0.3225,0.3199,0.2541,,,0.3112,
|
33 |
+
155k,,,,0.2681,,0.2664,0.2463,0.3618,0.3116,0.2594,,,0.3361,
|
34 |
+
160k,,,,0.2605,,0.2793,0.2821,0.3047,0.3240,0.2688,,,0.3392,
|
35 |
+
165k,,,,0.2725,,0.2933,0.2816,,0.3478,0.2653,,,0.3485,
|
36 |
+
170k,,,,0.2514,,0.2656,0.2893,,0.3423,0.2537,,,0.3355,
|
37 |
+
175k,,,,0.2535,,0.3007,0.3317,,0.3156,0.2621,,,0.3162,
|
38 |
+
180k,,,,0.2561,,0.2785,0.2624,,0.2893,0.2555,,,0.3398,
|
39 |
+
185k,,,,0.2523,,0.3131,0.3026,,0.3876,0.2461,,,0.3631,
|
40 |
+
190k,,,,0.2653,,0.3226,,,0.3131,0.2540,,,0.3930,
|
41 |
+
195k,,,,0.2681,,0.3136,,,0.3473,0.2550,,,0.3972,
|
42 |
+
200k,,,,0.2515,,0.2811,,,0.3257,0.2481,,,0.3660,
|
43 |
+
205k,,,,0.2619,,0.3004,,,0.3836,,,,0.3748,
|
44 |
+
210k,,,,0.2687,,0.2996,,,0.3063,0.2646,,,0.3668,
|
45 |
+
215k,,,,0.2653,,0.3329,,,0.3947,0.2626,,,0.3641,
|
46 |
+
220k,,,,0.2631,,0.3590,,,0.3621,0.2600,,,,
|
47 |
+
225k,,,,0.2737,,0.3453,,,0.4151,0.2589,,,,
|
48 |
+
230k,,,,0.2833,,,,,0.3825,0.2587,,,,
|
49 |
+
235k,,,,0.2703,,,,,0.3897,,,,,
|
50 |
+
240k,,,,0.2572,,,,,,0.2610,,,,
|
51 |
+
245k,,,,0.2700,,,,,,0.2612,,,,
|
52 |
+
250k,,,,0.2639,,,,,,0.2583,,,,
|
53 |
+
255k,,,,0.2680,,,,,,0.2564,,,,
|
54 |
+
260k,,,,0.2897,,,,,,0.2631,,,,
|
55 |
+
265k,,,,0.2815,,,,,,0.2635,,,,
|
56 |
+
270k,,,,0.2693,,,,,,,,,,
|
57 |
+
275k,,,,0.2789,,,,,,0.2643,,,,
|
58 |
+
280k,,,,0.3052,,,,,,0.2687,,,,
|
59 |
+
285k,,,,0.2850,,,,,,0.2605,,,,
|
60 |
+
290k,,,,,,,,,,0.2779,,,,
|
61 |
+
300k,,,,,,,,,,0.2755,,,,
|
62 |
+
305k,,,,,,,,,,,,,,
|
63 |
+
310k,,,,,,,,,,0.2614,,,,
|
64 |
+
315k,,,,,,,,,,0.2646,,,,
|
65 |
+
320k,,,,,,,,,,0.2745,,,,
|
66 |
+
325k,,,,,,,,,,,,,,
|
67 |
+
330k,,,,,,,,,,,,,,
|
68 |
+
335k,,,,,,,,,,,,,,
|
data/txt360_eval/CKPT Eval - MedQA.csv
ADDED
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
0-shot,Slim-Pajama 600B (bsz=4K x 1024),,,FineWeb-1.5T,Ours-Base,Ours-Upsampling1,Ours-Upsampling2,Ours-Code-Upsampling2,All-Upsampling1,All-Upsampling1,All-Upsampling1,All-Upsampling1,DCLM-Base
|
2 |
+
time: 3 min,Llama-8x8B-baseline,Llama-8x8B-seq8192,Llama-8x8B-mup,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-1x8B-seq8192,Llama_extend-1x8B-seq8192,Jais-1x8B-seq8192,Llama-1x8B-seq8192
|
3 |
+
5k,0.2679,0.2946,,0.2152,,0.2781,0.2482,0.2160,0.2687,0.2639,0.2215,0.2584,0.2396
|
4 |
+
10k,0.2749,0.2506,,0.2380,0.2372,0.2624,0.2616,0.2734,0.2718,0.2647,0.2435,0.2789,0.2435
|
5 |
+
15k,0.2388,0.2773,,0.2270,,0.2749,0.2797,0.2545,0.2639,0.2726,0.2773,0.2647,0.2412
|
6 |
+
20k,0.2412,0.2773,,0.2419,,,0.2317,0.2789,0.2757,0.2372,0.2506,0.2742,0.2577
|
7 |
+
25k,0.2302,0.2624,,0.2184,,0.2561,0.2569,0.2804,0.2474,0.2616,0.2498,0.2624,0.2419
|
8 |
+
30k,0.2545,0.2553,,0.2679,0.2522,0.2239,0.2097,0.2742,0.2608,0.2592,0.2694,0.2655,0.2765
|
9 |
+
35k,0.2757,0.2577,,0.2647,0.2655,0.2671,0.2467,0.2907,0.2694,0.2797,0.2498,0.2749,0.2514
|
10 |
+
40k,0.2710,0.2608,,0.2671,0.2396,0.2490,0.2569,0.2734,0.2482,0.2647,0.2742,0.2608,0.2671
|
11 |
+
45k,0.2765,0.2506,,0.2742,0.2734,0.2600,0.2255,0.2859,0.2333,0.2671,0.2435,0.2474,0.2749
|
12 |
+
50k,0.2742,0.2624,,0.2749,0.2537,0.2482,0.2372,0.2412,0.2655,0.2789,0.2412,0.2702,0.2577
|
13 |
+
55k,0.2679,0.2545,,0.2797,0.2561,0.2632,0.2294,0.2718,0.2537,0.2647,0.2647,0.2757,0.2632
|
14 |
+
60k,0.2773,0.2427,,0.2294,0.2325,0.2789,,0.2419,0.2639,0.2679,0.2506,0.2687,0.2419
|
15 |
+
65k,0.2404,0.2687,,0.2663,0.2757,0.2310,0.2749,0.2836,0.2726,0.2734,0.2537,0.2608,0.2459
|
16 |
+
70k,0.2600,0.2592,,0.2592,0.2757,0.2797,0.2632,0.2569,0.2435,0.2773,0.2765,0.2702,0.2584
|
17 |
+
75k,0.2679,0.2584,,0.2490,0.2679,0.2789,0.2616,0.2710,0.2765,0.2742,0.2710,0.2687,0.2789
|
18 |
+
80k,0.2632,0.2702,,0.2797,0.2419,0.2757,0.2522,0.2616,0.2789,0.2655,0.2694,0.2435,0.2757
|
19 |
+
85k,0.2734,0.2451,,0.2655,0.2844,0.2608,0.2687,0.2742,0.2553,0.2663,0.2749,0.2639,0.2773
|
20 |
+
90k,0.2797,0.2506,,0.2310,0.2364,0.2679,0.2624,,0.2679,0.2608,0.2561,0.2765,0.2820
|
21 |
+
95k,0.2529,0.2545,,0.2742,0.2820,0.2797,0.2647,0.2757,0.2749,0.2663,0.2105,0.2655,0.2749
|
22 |
+
100k,0.2694,0.2459,,0.2679,,0.2168,0.2702,0.2459,0.2663,0.2655,0.2537,0.2655,0.2781
|
23 |
+
105k,0.2537,0.2529,,0.2655,,0.2773,0.2632,0.2592,0.2726,0.2687,0.2671,0.2749,0.2812
|
24 |
+
110k,0.2663,0.2419,,0.2718,0.2474,0.2584,0.2537,0.2569,0.2537,0.2349,,0.2537,0.2765
|
25 |
+
115k,0.2459,0.2419,,0.2655,0.2718,0.2773,0.2247,0.2852,0.2867,0.2490,,0.2561,0.2364
|
26 |
+
120k,0.2624,0.2561,,0.2930,0.2537,0.2671,,0.2718,0.2844,0.2545,,0.2608,0.2443
|
27 |
+
125k,0.2451,0.2742,,0.2624,0.2364,0.2451,0.2145,0.2985,0.2883,0.2726,,0.2498,0.2867
|
28 |
+
130k,0.2655,0.2797,,0.2828,0.2412,0.2836,0.2891,0.2930,0.2922,0.2522,,,0.2765
|
29 |
+
135k,0.2749,0.2655,,,,0.2443,0.2765,0.2883,0.2702,0.2679,,,0.2679
|
30 |
+
140k,,0.2781,,0.2529,,0.2427,0.2545,0.2962,0.2930,0.2569,,,0.2820
|
31 |
+
145k,,,,0.2490,,0.2427,0.2718,0.3048,0.3024,0.2639,,,0.2632
|
32 |
+
150k,,,,,,0.2694,,0.2482,0.3244,0.2655,,,0.3150
|
33 |
+
155k,,,,0.2608,,0.2789,0.2624,0.3134,,0.2490,,,0.3009
|
34 |
+
160k,,,,0.2529,,0.2765,0.2726,0.3079,0.2852,0.2577,,,0.2757
|
35 |
+
165k,,,,0.2388,,0.2592,0.2742,,0.2561,0.2380,,,0.3009
|
36 |
+
170k,,,,0.2435,,0.2852,0.2506,,0.3056,0.2380,,,0.2836
|
37 |
+
175k,,,,0.2632,,0.2757,0.2647,,0.3126,0.2671,,,0.2993
|
38 |
+
180k,,,,0.2608,,0.2592,0.2899,,0.3166,0.2396,,,0.3071
|
39 |
+
185k,,,,0.2710,,0.2859,0.2561,,0.3268,0.2537,,,0.2490
|
40 |
+
190k,,,,0.2812,,0.2914,,,0.3040,0.2577,,,0.2828
|
41 |
+
195k,,,,0.2482,,0.2797,,,0.3472,0.2694,,,0.2883
|
42 |
+
200k,,,,0.2639,,0.2584,,,0.3339,0.2639,,,0.3126
|
43 |
+
205k,,,,0.2514,,0.3158,,,0.3409,,,,0.2710
|
44 |
+
210k,,,,0.2742,,0.3016,,,0.3378,0.2624,,,0.2962
|
45 |
+
215k,,,,0.2592,,0.2859,,,0.3362,,,,0.2859
|
46 |
+
220k,,,,0.2262,,0.3001,,,0.3559,0.2781,,,
|
47 |
+
225k,,,,0.2490,,0.3134,,,0.3213,0.2608,,,
|
48 |
+
230k,,,,0.2357,,,,,0.3472,0.2828,,,
|
49 |
+
235k,,,,0.2514,,,,,0.3614,0.2639,,,
|
50 |
+
240k,,,,0.2624,,,,,,0.2867,,,
|
51 |
+
245k,,,,0.2482,,,,,,0.2718,,,
|
52 |
+
250k,,,,0.2592,,,,,,0.2624,,,
|
53 |
+
255k,,,,0.2537,,,,,,0.2781,,,
|
54 |
+
260k,,,,0.2639,,,,,,0.2679,,,
|
55 |
+
265k,,,,0.2844,,,,,,0.2616,,,
|
56 |
+
270k,,,,0.2624,,,,,,,,,
|
57 |
+
275k,,,,0.2757,,,,,,,,,
|
58 |
+
280k,,,,0.2852,,,,,,0.2592,,,
|
59 |
+
285k,,,,0.2726,,,,,,0.2781,,,
|
60 |
+
290k,,,,,,,,,,0.2671,,,
|
61 |
+
300k,,,,,,,,,,0.2742,,,
|
62 |
+
305k,,,,,,,,,,0.2624,,,
|
63 |
+
310k,,,,,,,,,,0.2718,,,
|
64 |
+
315k,,,,,,,,,,0.2694,,,
|
65 |
+
320k,,,,,,,,,,0.2749,,,
|
66 |
+
325k,,,,,,,,,,,,,
|
67 |
+
330k,,,,,,,,,,,,,
|
68 |
+
335k,,,,,,,,,,,,,
|
data/txt360_eval/CKPT Eval - NQ.csv
ADDED
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
5-shot,Slim-Pajama 600B (bsz=4K x 1024),,,FineWeb-1.5T,Ours-Base,Ours-Upsampling1,Ours-Upsampling2,Ours-Code-Upsampling2,All-Upsampling1,All-Upsampling1,All-Upsampling1,All-Upsampling1,DCLM-Base
|
2 |
+
time: 22 min,Llama-8x8B-baseline,Llama-8x8B-seq8192,Llama-8x8B-mup,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-1x8B-seq8192,Llama_extend-1x8B-seq8192,Jais-1x8B-seq8192,Llama-1x8B-seq8192
|
3 |
+
5k,0.0615,0.0537,,0.0341,0.0416,0.0634,0.0565,0.0579,0.0526,0.0219,0.0213,0.0205,0.0274
|
4 |
+
10k,0.1075,0.1053,,0.0715,,0.0906,0.0931,0.0828,0.0767,0.0391,0.0418,0.0529,0.0554
|
5 |
+
15k,0.1382,0.1136,,0.0765,,0.1147,0.1061,0.1152,0.1127,0.0607,0.0560,0.0629,0.0587
|
6 |
+
20k,0.1490,0.1393,,0.0787,,0.1161,0.1183,0.1285,0.1247,0.0529,0.0623,0.0668,0.0709
|
7 |
+
25k,0.1687,0.1416,,0.0892,0.1150,0.1402,0.1352,0.1380,0.1343,0.0584,0.0687,0.0762,0.0828
|
8 |
+
30k,0.1767,0.1557,,0.0911,0.1366,0.1454,0.1271,0.1501,0.1421,0.0723,0.0687,0.0723,0.0839
|
9 |
+
35k,0.1706,0.1756,,0.0970,0.1488,0.1573,0.1485,0.1565,0.1524,0.0803,0.0798,0.0803,0.0778
|
10 |
+
40k,0.1942,0.1759,,0.1028,0.1355,0.1560,0.1488,0.1554,0.1562,0.0759,0.0848,0.0845,0.0886
|
11 |
+
45k,0.1798,0.1820,,0.1078,0.1488,0.1715,0.1620,0.1684,0.1598,0.0881,0.0911,0.0867,0.0848
|
12 |
+
50k,0.1972,0.1809,,0.1050,0.1540,,0.1590,0.1657,0.1698,0.0864,0.0909,0.0909,0.0884
|
13 |
+
55k,0.2158,0.1956,,0.1097,0.1607,0.1659,0.1662,0.1751,0.1704,0.0892,0.0898,0.0745,0.0931
|
14 |
+
60k,0.2039,0.2036,,0.1211,0.1654,0.1734,0.1612,0.1745,0.1801,0.0817,0.0850,0.0922,0.0986
|
15 |
+
65k,0.2244,0.2044,,0.1089,0.1573,0.1765,0.1693,0.1776,0.1823,0.0920,0.0967,0.1025,0.1066
|
16 |
+
70k,0.2233,0.2233,,0.1222,0.1634,0.1845,0.1679,0.1859,0.1767,0.1022,0.0925,0.1039,0.1177
|
17 |
+
75k,0.2305,0.2277,,0.1097,0.1709,0.1825,0.1881,0.1737,0.1762,0.1069,0.0936,0.1116,0.1199
|
18 |
+
80k,0.2457,0.2252,,0.1277,0.1573,0.1900,0.1776,0.1787,0.1964,0.1047,0.0981,0.1033,0.1097
|
19 |
+
85k,0.2501,0.2285,,0.1280,0.1776,0.1914,0.1889,0.1870,0.1889,0.0942,0.0964,0.1144,0.1213
|
20 |
+
90k,0.2504,0.2521,,0.1158,0.1598,0.1911,0.1806,0.1898,0.1773,0.1058,0.0964,0.1186,0.1163
|
21 |
+
95k,0.2579,0.2443,,0.1235,0.1762,0.1911,0.1781,0.1989,0.1917,0.1097,0.0928,0.1213,0.1169
|
22 |
+
100k,0.2526,0.2446,,0.1258,,0.2097,0.1928,0.1903,0.1947,0.1125,0.1025,0.1127,0.1188
|
23 |
+
105k,0.2679,0.2482,,0.1366,,0.2028,0.1814,0.1922,0.2094,0.1199,0.1069,0.1186,0.1269
|
24 |
+
110k,0.2717,0.2562,,0.1377,0.1756,0.2019,0.1859,0.1975,,0.1152,,0.1252,0.1252
|
25 |
+
115k,0.2745,0.2562,,0.1346,0.1831,0.1956,0.1947,0.1903,0.2119,0.1127,,0.1285,0.1111
|
26 |
+
120k,0.2801,0.2612,,0.1402,0.2014,0.2000,,0.2044,0.2119,0.1188,,0.1166,0.1219
|
27 |
+
125k,0.2751,0.2657,,0.1307,0.2030,0.2014,0.1992,0.2053,0.1787,0.1230,,0.1274,0.1418
|
28 |
+
130k,0.2884,0.2673,,0.1368,0.1997,0.2125,0.1994,0.2011,0.2086,0.1127,,,0.1335
|
29 |
+
135k,0.2842,0.2673,,0.1363,,0.2069,0.2014,0.2036,0.2069,0.1255,,,0.1299
|
30 |
+
140k,,0.2679,,0.1435,,0.2039,0.1986,0.2042,0.2058,0.1263,,,0.1299
|
31 |
+
145k,,,,0.1532,,0.2172,0.1953,0.2078,0.2102,0.1274,,,0.1443
|
32 |
+
150k,,,,0.1404,,0.2125,,0.2127,0.2075,0.1263,,,0.1410
|
33 |
+
155k,,,,0.1418,,0.2235,0.1931,0.2066,0.2205,0.1418,,,0.1460
|
34 |
+
160k,,,,0.1346,,0.2183,0.2116,0.2069,0.2208,0.1319,,,0.1413
|
35 |
+
165k,,,,0.1524,,0.2219,0.2139,,0.2213,0.1296,,,0.1424
|
36 |
+
170k,,,,0.1388,,0.2175,,,0.2169,0.1366,,,0.1454
|
37 |
+
175k,,,,0.1438,,0.2235,0.2222,,0.2321,0.1349,,,0.1399
|
38 |
+
180k,,,,0.1471,,0.2260,0.2249,,0.236,0.1465,,,0.1421
|
39 |
+
185k,,,,0.1499,,0.2341,0.2222,,0.2366,0.1449,,,0.1421
|
40 |
+
190k,,,,0.1504,,0.2233,,,0.2274,0.1413,,,0.1471
|
41 |
+
195k,,,,0.1554,,0.2330,,,0.2454,0.1440,,,0.1407
|
42 |
+
200k,,,,0.1565,,0.2238,,,0.2346,0.1407,,,0.1449
|
43 |
+
205k,,,,0.1726,,0.2271,,,0.2316,0.1382,,,0.1501
|
44 |
+
210k,,,,0.1623,,0.2305,,,0.2493,0.1526,,,0.1424
|
45 |
+
215k,,,,0.1576,,0.2299,,,0.2355,0.1518,,,0.1535
|
46 |
+
220k,,,,0.1693,,0.2330,,,0.2427,0.1529,,,
|
47 |
+
225k,,,,0.1596,,0.2366,,,0.2440,0.1479,,,
|
48 |
+
230k,,,,0.1693,,,,,0.2554,0.1560,,,
|
49 |
+
235k,,,,0.1720,,,,,0.2535,0.1540,,,
|
50 |
+
240k,,,,0.1712,,,,,,0.1554,,,
|
51 |
+
245k,,,,0.1704,,,,,,0.1532,,,
|
52 |
+
250k,,,,0.1784,,,,,,0.1551,,,
|
53 |
+
255k,,,,0.1740,,,,,,0.1623,,,
|
54 |
+
260k,,,,0.1756,,,,,,0.1618,,,
|
55 |
+
265k,,,,0.1886,,,,,,0.1604,,,
|
56 |
+
270k,,,,0.1820,,,,,,0.1612,,,
|
57 |
+
275k,,,,0.1870,,,,,,0.1629,,,
|
58 |
+
280k,,,,0.1704,,,,,,0.1645,,,
|
59 |
+
285k,,,,0.1903,,,,,,0.1665,,,
|
60 |
+
290k,,,,,,,,,,0.1648,,,
|
61 |
+
300k,,,,,,,,,,0.1712,,,
|
62 |
+
305k,,,,,,,,,,0.1690,,,
|
63 |
+
310k,,,,,,,,,,0.1712,,,
|
64 |
+
315k,,,,,,,,,,,,,
|
65 |
+
320k,,,,,,,,,,,,,
|
66 |
+
325k,,,,,,,,,,,,,
|
67 |
+
330k,,,,,,,,,,,,,
|
68 |
+
335k,,,,,,,,,,,,,
|
data/txt360_eval/CKPT Eval - PIQA.csv
ADDED
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
,Slim-Pajama 600B (bsz=4K x 1024),,,,,,FineWeb-1.5T,,Ours-Base,,Ours-Upsampling1,,Ours-Upsampling2,,Ours-Code-Upsampling2,,All-Upsampling1,,All-Upsampling1,,All-Upsampling1,,All-Upsampling1,,DCLM-Base,
|
2 |
+
0-shot: 3 min,Llama-8x8B-baseline,,Llama-8x8B-seq8192,,Llama-8x8B-mup,,Llama-8x8B-seq8192,,Llama-8x8B-seq8192,,Llama-8x8B-seq8192,,Llama-8x8B-seq8192,,Llama-8x8B-seq8192,,Llama-8x8B-seq8192,,Llama-1x8B-seq8192,,Llama_extend-1x8B-seq8192,,Jais-1x8B-seq8192,,Llama-1x8B-seq8192,
|
3 |
+
5-shot: 4 min,0-shot,5-shot,0-shot,5-shot,0-shot,5-shot,0-shot,5-shot,0-shot,5-shot,0-shot,5-shot,0-shot,5-shot,0-shot,5-shot,0-shot,5-shot,0-shot,5-shot,0-shot,5-shot,0-shot,5-shot,0-shot,5-shot
|
4 |
+
5k,0.7236,0.7073,0.7176,0.7133,,,0.7470,0.7263,,0.7106,0.7280,0.7242,0.7378,0.7296,0.7356,0.7323,0.7318,0.7263,0.7078,0.7002,0.7057,0.6844,0.7116,0.6975,0.7046,0.6942
|
5 |
+
10k,0.7492,0.7421,0.7427,0.7318,,,0.7650,0.7524,0.7573,0.7454,0.7568,0.7486,0.7557,0.7492,0.7726,0.7568,0.7612,0.7486,0.7198,0.7089,0.7209,0.7127,0.7280,0.7236,0.7367,0.7144
|
6 |
+
15k,0.7688,0.7541,0.7639,0.7481,,,0.7775,0.7655,0.7628,,0.7748,0.7622,0.7769,0.7622,0.7786,0.7737,0.7655,0.7661,0.7367,0.7329,0.7378,0.7312,0.7350,0.7318,0.7443,0.7133
|
7 |
+
20k,0.7639,0.7655,0.7682,0.7579,,,0.7807,0.7612,0.7671,0.7590,0.7845,,0.7709,0.7693,0.7813,0.7650,0.7840,0.7758,0.7465,0.7378,0.7470,0.7312,0.7486,0.7394,0.7514,0.7323
|
8 |
+
25k,0.7639,0.7677,0.7682,0.7671,,,0.7878,0.7748,,0.7590,,0.7693,0.7913,0.7671,0.7818,0.7715,0.7791,0.7715,0.7470,0.7378,0.7503,0.7437,0.7470,0.7492,0.7497,0.7345
|
9 |
+
30k,0.7764,0.7677,0.7797,0.7563,,,0.7862,0.7704,0.7780,0.7617,0.7802,0.7737,0.7829,0.7655,0.7813,0.7661,0.7889,0.7731,0.7524,0.7388,0.7497,0.7497,0.7546,0.7437,0.7563,0.7427
|
10 |
+
35k,0.7802,0.7677,0.7769,0.7622,,,0.7933,0.7726,0.7769,0.7699,0.7878,0.7682,0.7824,0.7737,0.7813,0.7797,0.7987,0.7780,0.7573,0.7361,0.7508,0.7421,0.7612,0.7579,0.7655,0.7486
|
11 |
+
40k,0.7873,0.7802,0.7802,0.7682,,,0.7905,0.7758,,0.7731,0.7889,0.7731,0.7943,0.7704,0.7835,0.7775,0.7878,0.7769,0.7573,0.7383,0.7579,0.7481,0.7606,0.7492,0.7650,0.7519
|
12 |
+
45k,0.7813,0.7786,0.7764,0.7699,,,0.7982,0.7824,0.7786,0.7661,0.7911,0.7775,0.7829,0.7737,0.7894,0.7835,0.7949,0.7780,0.7579,0.7465,0.7639,0.7465,0.7612,0.7514,0.7677,0.7497
|
13 |
+
50k,0.7818,0.7797,0.7878,0.7753,,,0.7992,0.7780,0.7775,0.7748,0.7856,0.7775,0.7943,0.7699,0.7998,0.7851,0.7933,0.7786,0.7557,0.7437,0.7524,0.7443,0.7677,0.7579,0.7802,0.7601
|
14 |
+
55k,0.7900,0.7780,0.7905,0.7829,,,0.8079,0.7737,0.7786,0.7775,0.7878,0.7731,0.7884,0.7780,0.7976,0.7905,0.7943,0.7824,0.7661,0.7546,0.7655,0.7541,0.7704,0.7606,0.7704,0.7606
|
15 |
+
60k,0.7916,0.7851,0.7911,0.7797,,,0.7922,0.7797,0.7818,0.7813,0.7900,0.7699,0.7905,0.7661,0.7943,0.7878,0.8003,0.7818,0.7650,0.7530,0.7628,0.7557,0.7661,0.7628,0.7661,0.7579
|
16 |
+
65k,0.7938,0.7840,0.7927,0.7840,,,0.7976,0.7769,0.7900,0.7780,0.7933,0.7731,0.7835,0.7671,0.7960,0.7845,0.7943,0.7780,0.7720,0.7492,0.7606,0.7535,0.7748,0.7639,0.7704,0.7584
|
17 |
+
70k,0.7922,0.7835,0.7922,0.7845,,,0.8052,0.7916,0.7916,0.7818,0.7949,0.7807,0.7900,0.7726,0.7889,0.7845,0.7976,0.7900,0.7633,0.7524,0.7612,0.7552,0.7644,0.7622,0.7699,0.7568
|
18 |
+
75k,0.7938,0.7927,0.7949,0.7840,,,0.8030,0.7873,0.7878,0.7715,0.7938,0.7807,0.8079,0.7922,0.7927,0.7905,0.8020,0.7933,0.7655,0.7541,0.7682,0.7508,0.7737,0.7568,0.7737,0.7573
|
19 |
+
80k,0.7911,0.7878,0.7873,0.7894,,,0.7971,0.7742,0.7829,0.7797,0.7987,0.7824,0.7992,0.7894,0.8003,0.7900,0.7933,0.7884,0.7671,0.7497,0.7682,0.7524,0.7748,0.7563,0.7742,0.7628
|
20 |
+
85k,0.7949,0.7894,0.7900,0.7889,,,0.8003,0.7840,0.8014,0.7786,0.8025,0.7894,0.7949,0.7818,0.7992,0.7894,0.7965,0.7851,0.7682,0.7530,0.7731,0.7563,0.7829,0.7622,0.7780,0.7704
|
21 |
+
90k,0.7982,0.7894,0.7916,0.7943,,,0.7976,0.7797,0.7873,0.7720,0.7971,0.7862,0.7856,0.7845,0.7960,0.7976,0.7998,0.7878,0.7731,0.7535,0.7650,0.7552,0.7737,0.7622,0.7742,0.7617
|
22 |
+
95k,0.8058,0.7992,0.8020,0.7873,,,0.8041,0.7742,0.7905,0.7840,0.8014,0.7807,0.7954,0.7829,0.8025,0.7911,0.8003,0.7884,0.7709,0.7535,0.7699,0.7519,0.7731,0.7612,0.7753,0.7704
|
23 |
+
100k,0.8069,0.7992,0.8052,0.7873,,,0.8069,0.7856,,,0.8041,0.7851,0.7998,0.7824,0.8014,0.7927,0.8009,0.7905,0.7628,0.7508,0.7715,0.7628,0.7748,0.7584,0.7758,0.7720
|
24 |
+
105k,0.8058,0.7965,0.8025,0.7943,,,0.8074,0.7916,,,0.8030,0.7900,0.8063,0.7927,0.8036,0.7949,0.7960,0.7905,0.7688,0.7568,0.7644,0.7601,0.7753,0.7682,0.7797,0.7639
|
25 |
+
110k,0.8041,0.7987,0.8069,0.7982,,,0.8085,0.7797,0.7856,0.7856,0.8009,0.7922,0.7938,0.7856,0.8020,0.7911,0.7998,0.7916,0.7682,0.7563,,,0.7791,0.7699,0.7845,0.7633
|
26 |
+
115k,0.8090,0.8009,0.8069,0.7982,,,0.8118,0.7867,0.7911,0.7802,0.8020,0.7867,0.8041,0.7922,0.8052,0.7916,0.8052,0.7938,0.7612,0.7541,,,0.7780,0.7633,0.7709,0.7639
|
27 |
+
120k,0.8145,0.7949,0.8041,0.7911,,,0.8074,0.7878,0.7982,0.7851,0.7976,0.7922,0.8025,0.7905,0.7938,0.7927,0.7949,0.7905,0.7704,0.7715,,,0.7813,0.7720,0.7867,0.7758
|
28 |
+
125k,0.8079,0.8009,0.8058,0.7900,,,0.8107,0.7829,0.8009,0.7900,0.8020,0.7894,0.8047,0.7916,0.8047,0.7976,0.8003,0.7922,0.7677,0.7671,,,0.7824,0.7737,0.7764,0.7699
|
29 |
+
130k,0.8069,0.8058,0.8041,0.7982,,,0.8079,0.7845,0.7916,0.7797,0.8036,0.7916,0.8014,0.7949,0.8058,0.8014,0.7922,0.7943,0.7835,0.7622,,,,,0.7748,0.7720
|
30 |
+
135k,0.8063,0.8047,0.8090,0.8020,,,0.8074,0.7878,,,0.8009,0.7878,0.8052,0.7835,0.8014,0.8030,0.8014,0.7927,0.7764,0.7682,,,,,0.7867,0.7813
|
31 |
+
140k,,,0.8090,0.7992,,,0.8123,0.7911,,,0.8047,0.7916,0.8063,0.7971,0.8079,0.8036,0.7987,0.7976,0.7764,0.7628,,,,,0.7862,0.7720
|
32 |
+
145k,,,,,,,0.8069,0.7807,,,0.8047,0.7922,0.8052,0.7845,0.7982,0.8025,0.8030,0.8085,0.7748,0.7688,,,,,0.7791,0.7699
|
33 |
+
150k,,,,,,,0.8058,0.7949,,,0.8058,0.7878,,,0.8090,0.7998,0.7987,0.8025,0.7693,0.7579,,,,,0.7916,0.7769
|
34 |
+
155k,,,,,,,0.8096,0.8041,,,0.8096,0.7922,0.7954,0.7775,0.8101,0.8041,0.8107,0.7965,0.7769,0.7639,,,,,0.7933,0.7731
|
35 |
+
160k,,,,,,,0.8101,0.7900,,,0.8014,0.7976,0.8020,0.7894,0.8128,0.8036,0.8079,0.8009,0.7753,0.7715,,,,,0.7987,0.7709
|
36 |
+
165k,,,,,,,0.8112,0.7933,,,0.8030,0.7971,0.8058,0.7878,,,0.8101,0.8009,0.7824,0.7709,,,,,0.7873,0.7682
|
37 |
+
170k,,,,,,,,0.7916,,,0.8047,0.7954,0.8041,0.7922,,,0.8036,0.8041,0.7797,0.7720,,,,,0.7884,0.7715
|
38 |
+
175k,,,,,,,0.8194,0.7965,,,0.8030,0.7911,0.7982,0.7927,,,0.8118,0.8096,0.7709,0.7666,,,,,0.7911,0.7802
|
39 |
+
180k,,,,,,,0.8118,0.7845,,,0.8041,0.7954,0.8025,0.7987,,,0.8172,0.7976,0.7775,0.7677,,,,,0.7884,0.7851
|
40 |
+
185k,,,,,,,0.8259,0.7982,,,0.8025,0.7960,0.8036,0.7905,,,0.8096,0.7987,0.7851,0.7737,,,,,0.7927,0.7813
|
41 |
+
190k,,,,,,,0.8139,0.8025,,,0.7998,0.7987,,,,,0.8128,0.7998,0.7840,0.7758,,,,,0.7922,0.7867
|
42 |
+
195k,,,,,,,0.8188,0.7965,,,0.8090,0.7878,,,,,0.8161,0.8052,0.7748,0.7677,,,,,0.7884,0.7769
|
43 |
+
200k,,,,,,,0.8112,0.8025,,,0.8079,0.8009,,,,,0.8128,0.8041,0.7802,0.7726,,,,,0.7916,0.7802
|
44 |
+
205k,,,,,,,0.8188,0.8009,,,0.8003,0.7938,,,,,0.8177,0.8145,0.7813,0.7726,,,,,0.7949,0.7748
|
45 |
+
210k,,,,,,,0.8188,0.7971,,,0.8047,0.7889,,,,,0.8161,0.8101,0.7818,0.7786,,,,,0.7894,0.7867
|
46 |
+
215k,,,,,,,0.8188,0.7992,,,0.8030,0.7922,,,,,0.8085,0.8085,0.7813,0.7748,,,,,0.7845,0.7802
|
47 |
+
220k,,,,,,,0.8199,0.8030,,,0.8085,0.7976,,,,,0.8096,0.8074,0.7769,0.7704,,,,,,
|
48 |
+
225k,,,,,,,0.8199,0.8041,,,0.8052,0.8014,,,,,0.8134,0.8101,0.7829,0.7731,,,,,,
|
49 |
+
230k,,,,,,,0.8172,0.8041,,,,,,,,,0.8134,0.8107,0.7824,0.7802,,,,,,
|
50 |
+
235k,,,,,,,0.8199,0.8085,,,,,,,,,0.8205,0.8118,0.7813,0.7764,,,,,,
|
51 |
+
240k,,,,,,,0.8166,0.8101,,,,,,,,,,,0.7829,0.7824,,,,,,
|
52 |
+
245k,,,,,,,0.8215,0.8090,,,,,,,,,,,0.7873,0.7753,,,,,,
|
53 |
+
250k,,,,,,,0.8172,0.8107,,,,,,,,,,,0.7807,0.7797,,,,,,
|
54 |
+
255k,,,,,,,0.8254,0.8128,,,,,,,,,,,0.7824,0.7737,,,,,,
|
55 |
+
260k,,,,,,,0.8215,0.809,,,,,,,,,,,0.7807,0.7797,,,,,,
|
56 |
+
265k,,,,,,,0.8210,0.8139,,,,,,,,,,,0.7775,0.7753,,,,,,
|
57 |
+
270k,,,,,,,0.8145,0.8079,,,,,,,,,,,0.7824,,,,,,,
|
58 |
+
275k,,,,,,,0.8161,0.8139,,,,,,,,,,,0.7889,0.7769,,,,,,
|
59 |
+
280k,,,,,,,0.8248,0.8150,,,,,,,,,,,0.7807,0.7726,,,,,,
|
60 |
+
285k,,,,,,,0.8210,0.8101,,,,,,,,,,,0.7916,0.7818,,,,,,
|
61 |
+
290k,,,,,,,,,,,,,,,,,,,0.7851,0.7758,,,,,,
|
62 |
+
300k,,,,,,,,,,,,,,,,,,,0.7840,0.7780,,,,,,
|
63 |
+
305k,,,,,,,,,,,,,,,,,,,0.7873,0.7829,,,,,,
|
64 |
+
310k,,,,,,,,,,,,,,,,,,,0.7813,0.7829,,,,,,
|
65 |
+
315k,,,,,,,,,,,,,,,,,,,0.7851,0.7791,,,,,,
|
66 |
+
320k,,,,,,,,,,,,,,,,,,,0.7873,0.7813,,,,,,
|
67 |
+
325k,,,,,,,,,,,,,,,,,,,,,,,,,,
|
68 |
+
330k,,,,,,,,,,,,,,,,,,,,,,,,,,
|
69 |
+
335k,,,,,,,,,,,,,,,,,,,,,,,,,,
|
data/txt360_eval/CKPT Eval - TriviaQA.csv
ADDED
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
5-shot,Slim-Pajama 600B (bsz=4K x 1024),,,FineWeb-1.5T,Ours-Base,Ours-Upsampling1,Ours-Upsampling2,Ours-Code-Upsampling2,All-Upsampling1,All-Upsampling1,All-Upsampling1,All-Upsampling1,DCLM-Base,Comments
|
2 |
+
time: 76 min,Llama-8x8B-baseline,Llama-8x8B-seq8192,Llama-8x8B-mup,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-8x8B-seq8192,Llama-1x8B-seq8192,Llama_extend-1x8B-seq8192,Jais-1x8B-seq8192,Llama-1x8B-seq8192,1. Takes 25min to load checkpoints.
|
3 |
+
5k,0.1944,0.1764,,0.1025,0.1232,0.1086,0.1260,0.1647,,0.0841,0.0762,0.0824,0.1066,2. GPU utility is only 20%
|
4 |
+
10k,0.3372,0.3292,,0.2073,,0.2636,0.1150,0.2659,0.2604,0.1348,0.1343,0.1585,0.1850,
|
5 |
+
15k,0.4050,0.3909,,0.3005,,0.3250,0.1872,0.3445,0.3244,0.1821,0.1930,0.1968,0.2443,
|
6 |
+
20k,0.4451,0.4497,,0.3506,0.2795,,0.2719,0.3802,0.3637,0.2086,0.2196,0.2231,0.2772,
|
7 |
+
25k,0.4899,0.4601,,0.3070,,0.3975,0.4093,0.4105,0.4120,0.2261,0.2375,0.2574,0.3146,
|
8 |
+
30k,0.5125,0.4824,,0.2461,0.2974,0.0303,0.4195,0.4330,0.4294,0.2352,0.2484,0.2675,0.3328,
|
9 |
+
35k,0.5249,0.5091,,0.3639,0.3572,0.1983,0.3587,0.4434,0.4428,0.2433,,0.2863,0.3507,
|
10 |
+
40k,0.5555,0.5166,,0.3537,0.0346,0.4571,0.4434,0.4618,0.4623,0.2708,0.2828,0.3020,0.3606,
|
11 |
+
45k,0.5664,0.5403,,0.3602,0.2674,0.2654,0.4366,0.4746,0.4792,0.2668,0.3018,0.3065,0.3726,
|
12 |
+
50k,0.5690,0.5217,,0.2407,0.3689,0.4355,0.4051,0.4885,0.4795,0.2906,0.2952,0.3187,0.3807,
|
13 |
+
55k,0.5843,0.5680,,0.2081,0.4101,0.4341,0.3230,0.4931,0.4940,0.2940,0.3117,0.3242,0.3984,
|
14 |
+
60k,0.5916,0.5814,,0.4068,0.4107,0.4861,0.4469,0.4955,0.5130,0.3137,0.3090,0.3422,0.4081,
|
15 |
+
65k,0.6032,0.5774,,0.3145,0.4477,0.4858,0.4907,0.5039,0.5087,0.3097,0.3184,0.3397,0.4156,
|
16 |
+
70k,0.6030,0.5920,,0.4102,0.4736,0.5080,0.4920,0.5164,0.5129,0.3236,0.3360,0.3375,0.4242,
|
17 |
+
75k,0.6216,0.6187,,0.2820,0.4226,0.4777,0.2245,0.5190,0.5042,0.3265,0.3341,0.3483,0.4220,
|
18 |
+
80k,0.6397,0.6127,,0.0975,0.4217,0.3698,,0.5185,0.5301,0.3352,0.3412,0.3532,0.4306,
|
19 |
+
85k,0.6416,0.6254,,0.0722,0.4763,0.3700,0.5029,0.5249,0.5350,0.3448,0.3423,0.3530,0.4340,
|
20 |
+
90k,0.6510,0.6317,,0.3388,0.1472,0.4793,0.0317,0.5337,0.5220,0.3440,0.3559,0.3644,0.4418,
|
21 |
+
95k,0.6655,0.6479,,0.5283,0.4938,0.5144,0.5180,0.5432,0.5446,0.3331,0.3393,0.3683,0.4454,
|
22 |
+
100k,0.6723,0.6486,,0.4317,0.1100,0.5121,0.5358,0.5383,0.5514,0.3520,0.3544,0.3698,0.4378,
|
23 |
+
105k,0.6755,0.6582,,0.1886,,0.5280,0.5153,0.5499,0.5562,0.3626,0.3642,0.3683,0.4525,
|
24 |
+
110k,0.6798,0.6668,,0.3510,,0.5468,0.5182,0.5541,0.5654,0.3694,,0.3903,0.4566,
|
25 |
+
115k,0.6796,0.6668,,0.3692,0.4759,0.5347,0.5132,0.5508,0.5577,0.3741,,0.3908,0.4482,
|
26 |
+
120k,0.6822,0.6688,,0.3690,0.4352,0.5376,0.5483,0.5567,0.5658,0.3881,,0.3950,0.4524,
|
27 |
+
125k,0.6894,0.6743,,0.3365,0.5206,0.4855,0.5211,0.5617,0.5658,0.3725,,0.3880,0.4592,
|
28 |
+
130k,0.6914,0.6709,,0.3550,0.0088,0.5238,0.5245,0.5597,0.5609,0.3698,,,0.4594,
|
29 |
+
135k,0.6915,0.6721,,0.3892,,0.5467,0.3977,0.5541,0.5774,0.3782,,,0.4636,
|
30 |
+
140k,,0.6773,,0.3930,,0.3110,0.4991,0.5572,0.5675,0.3906,,,0.4741,
|
31 |
+
145k,,,,0.4538,,0.5720,0.4872,0.5642,0.5639,,,,0.4720,
|
32 |
+
150k,,,,0.2883,,0.5612,,0.5701,0.5844,0.3899,,,0.4651,
|
33 |
+
155k,,,,0.4185,,0.5030,0.1586,0.5790,0.5755,0.4044,,,0.4784,
|
34 |
+
160k,,,,0.2720,,0.5701,0.5630,0.5819,0.5864,0.4049,,,0.4665,
|
35 |
+
165k,,,,0.4252,,0.5388,0.5642,,0.5853,0.4007,,,0.4793,
|
36 |
+
170k,,,,0.1507,,0.5951,0.5739,,,0.4150,,,0.4846,
|
37 |
+
175k,,,,0.3242,,0.5437,0.5640,,0.5979,0.4092,,,0.4908,
|
38 |
+
180k,,,,0.2653,,0.5580,0.5912,,0.6054,0.4189,,,,
|
39 |
+
185k,,,,0.2651,,0.5709,0.5852,,0.6064,,,,0.5030,
|
40 |
+
190k,,,,0.2380,,0.5142,,,0.5996,0.4193,,,0.5115,
|
41 |
+
195k,,,,0.4048,,0.5964,,,0.6243,0.4265,,,,
|
42 |
+
200k,,,,0.5058,,0.5684,,,0.6248,0.4256,,,,
|
43 |
+
205k,,,,0.0945,,0.5878,,,0.6224,0.4190,,,0.5105,
|
44 |
+
210k,,,,0.1557,,0.6020,,,0.6311,0.4415,,,0.5164,
|
45 |
+
215k,,,,0.2483,,0.5995,,,0.6293,0.4353,,,0.5163,
|
46 |
+
220k,,,,0.1725,,0.5924,,,0.6375,,,,,
|
47 |
+
225k,,,,0.2467,,0.4832,,,0.6340,0.4556,,,,
|
48 |
+
230k,,,,0.1653,,,,,0.6436,0.4622,,,,
|
49 |
+
235k,,,,0.1884,,,,,0.6411,0.4608,,,,
|
50 |
+
240k,,,,0.0719,,,,,,0.4536,,,,
|
51 |
+
245k,,,,0.3757,,,,,,0.4641,,,,
|
52 |
+
250k,,,,0.5859,,,,,,,,,,
|
53 |
+
255k,,,,0.4987,,,,,,0.4741,,,,
|
54 |
+
260k,,,,0.3940,,,,,,0.4712,,,,
|
55 |
+
265k,,,,0.3607,,,,,,0.4767,,,,
|
56 |
+
270k,,,,0.3898,,,,,,0.4795,,,,
|
57 |
+
275k,,,,0.4123,,,,,,,,,,
|
58 |
+
280k,,,,0.2413,,,,,,0.4787,,,,
|
59 |
+
285k,,,,0.3665,,,,,,0.4843,,,,
|
60 |
+
290k,,,,,,,,,,0.4818,,,,
|
61 |
+
300k,,,,,,,,,,0.4969,,,,
|
62 |
+
305k,,,,,,,,,,0.4941,,,,
|
63 |
+
310k,,,,,,,,,,0.4963,,,,
|
64 |
+
315k,,,,,,,,,,,,,,
|
65 |
+
320k,,,,,,,,,,,,,,
|
66 |
+
325k,,,,,,,,,,,,,,
|
67 |
+
330k,,,,,,,,,,,,,,
|
68 |
+
335k,,,,,,,,,,,,,,
|
data/txt360_eval/CKPT Eval - WinoGrande.csv
ADDED
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
,Slim-Pajama 600B (bsz=4K x 1024),,,,,,FineWeb-1.5T,,Ours-Base,,Ours-Upsampling1,,Ours-Upsampling2,,Ours-Code-Upsampling2,,All-Upsampling1,,All-Upsampling1,,All-Upsampling1,,All-Upsampling1,,DCLM-Base,
|
2 |
+
0-shot: 3 min,Llama-8x8B-baseline,,Llama-8x8B-seq8192,,Llama-8x8B-mup,,Llama-8x8B-seq8192,,Llama-8x8B-seq8192,,Llama-8x8B-seq8192,,Llama-8x8B-seq8192,,Llama-8x8B-seq8192,,Llama-8x8B-seq8192,,Llama-1x8B-seq8192,,Llama_extend-1x8B-seq8192,,Jais-1x8B-seq8192,,Llama-1x8B-seq8192,
|
3 |
+
5-shot: 3 min,0-shot,5-shot,0-shot,5-shot,0-shot,5-shot,0-shot,5-shot,0-shot,5-shot,0-shot,5-shot,0-shot,5-shot,0-shot,5-shot,0-shot,5-shot,0-shot,5-shot,0-shot,5-shot,0-shot,5-shot,0-shot,5-shot
|
4 |
+
5k,0.5454,0.5367,0.5572,0.5367,,,0.5691,0.5335,0.5351,0.5233,0.5241,,0.5367,0.5391,0.5470,0.5414,0.5383,0.5320,0.5067,0.5201,0.5257,0.5122,0.5020,0.5217,0.5217,0.5312
|
5 |
+
10k,0.5801,0.6054,0.5730,0.5525,,,0.5904,0.5620,0.5604,0.5738,0.5825,0.5691,0.5817,0.5809,0.5777,0.5706,0.5667,0.5620,0.5383,0.5375,0.5241,0.5430,0.5343,0.5241,0.5722,0.5517
|
6 |
+
15k,0.6172,0.6038,0.5848,0.6046,,,0.5927,0.5951,0.5919,0.5699,,0.6014,0.5880,0.5856,0.5991,0.5943,0.5896,0.5841,0.5596,0.5470,0.5391,0.5414,0.5509,0.5454,0.5620,0.5612
|
7 |
+
20k,0.6109,0.6267,0.5935,0.6085,,,0.6448,0.6022,0.6006,,0.6204,,0.6180,0.6235,0.6014,0.6062,0.5935,0.6014,0.5580,0.5691,0.5533,0.5328,0.5659,0.5580,0.5943,0.5714
|
8 |
+
25k,0.6417,0.6369,0.5998,0.6140,,,0.6196,0.6164,0.6125,0.5998,0.6093,0.6117,0.6062,0.6117,0.6212,0.6014,0.6101,0.6188,0.5785,0.5770,0.5556,0.5572,0.5762,0.5691,0.5904,0.5864
|
9 |
+
30k,0.6377,0.6456,0.6251,0.6361,,,0.6488,0.6251,,,0.6330,0.5983,0.6140,0.6022,0.6338,0.6109,0.6322,0.6188,0.5667,0.5770,0.5612,0.5493,0.5919,0.5604,0.6054,0.5935
|
10 |
+
35k,0.6472,0.6456,0.6196,0.5935,,,0.6440,0.6227,0.6030,0.6172,0.6393,0.6243,0.6259,0.6093,0.6172,0.6164,0.6212,0.6188,0.5817,0.5572,0.5848,0.5612,0.5777,0.5714,0.6172,0.5975
|
11 |
+
40k,0.6606,0.6630,0.6369,0.6259,,,0.6496,0.6077,0.6338,0.5951,,,0.6267,0.6101,0.6417,0.6235,0.6433,0.6361,0.5856,0.5770,0.5817,0.5777,0.5927,0.5635,0.6117,0.5959
|
12 |
+
45k,0.6717,0.6496,0.6196,0.6425,,,0.6456,0.6417,0.6172,0.6109,0.6448,0.6235,0.6393,0.6417,0.6425,0.6369,0.6393,0.6346,0.5777,0.5699,0.5872,0.5809,0.5951,0.5833,0.6156,0.6085
|
13 |
+
50k,0.6756,0.6725,0.6369,0.6377,,,0.6464,0.6275,0.6401,0.5975,0.6322,0.6322,0.6164,0.614,0.6425,0.6425,0.6472,0.6393,0.5833,0.5683,0.5872,0.5675,0.5967,0.5580,0.6338,0.6022
|
14 |
+
55k,0.6661,0.6590,0.6496,0.6614,,,0.6567,0.6314,0.6235,0.6062,0.6519,0.6133,0.6314,0.6235,0.6377,0.648,0.6464,0.6322,0.5738,0.5612,0.5935,0.5722,0.6148,0.5533,0.6140,0.614
|
15 |
+
60k,0.6622,0.6511,0.6377,0.6582,,,0.6480,0.6464,0.6251,0.6117,0.6535,0.6251,0.6219,0.6235,0.6480,0.6196,0.6369,0.6338,0.5864,0.5730,0.5746,0.5683,0.5896,0.5785,0.6062,0.6283
|
16 |
+
65k,0.6669,0.6772,0.6590,0.6685,,,0.6654,0.6354,0.6283,0.6180,0.6519,0.6196,0.6401,0.6393,0.6559,0.633,0.6504,0.6275,0.5919,0.5754,0.5825,0.5793,0.6101,0.5959,0.6290,0.6125
|
17 |
+
70k,0.6827,0.6811,0.6567,0.6701,,,0.6709,0.6401,0.6322,0.6235,0.6622,,0.6417,0.6409,0.6433,0.6338,0.6559,0.6361,0.5975,0.5927,0.5738,0.5588,0.5983,0.5738,0.6330,0.6101
|
18 |
+
75k,0.6788,0.6819,0.6543,0.6685,,,0.6709,0.6283,0.6480,0.6172,0.6654,0.6409,0.6527,0.6267,0.6488,0.6551,0.6527,0.6638,0.5959,0.5817,0.5880,0.5517,0.6164,0.5793,0.6196,0.6164
|
19 |
+
80k,0.6835,0.6882,0.6748,0.6638,,,0.6843,0.6464,0.6504,0.6275,0.6677,0.6401,0.6369,0.6298,0.6606,0.6488,0.6519,0.6440,0.5872,0.5675,0.5856,0.5564,0.6148,0.6038,0.6188,0.5983
|
20 |
+
85k,0.6867,0.6882,0.6638,0.6590,,,0.6875,0.6504,0.6409,0.6188,,0.6535,0.6575,0.6283,0.6606,0.6393,0.6393,0.6543,0.6085,0.5919,0.5872,0.5738,0.6156,0.5817,0.6361,0.6196
|
21 |
+
90k,0.6827,0.6803,0.6740,0.6598,,,0.6740,0.6393,0.6369,0.6306,0.6496,0.6543,0.6488,0.6409,0.6606,0.6314,0.6527,0.6409,0.5927,0.5825,0.5951,0.5888,0.6148,0.6077,0.6259,0.6164
|
22 |
+
95k,0.6859,0.6859,0.6764,0.6575,,,0.6835,0.6401,0.6369,0.6361,0.6551,0.6322,0.6654,0.6338,0.6630,0.6322,0.6409,0.6582,0.6156,0.5927,0.5864,0.5919,0.6164,0.5809,0.6283,0.6014
|
23 |
+
100k,0.6898,,0.6661,0.6851,,,0.6756,0.6567,,,0.6567,0.6472,0.6590,0.6488,0.6748,0.6204,0.6511,0.6519,0.6109,0.5817,0.5919,0.5746,0.6133,0.6030,0.6488,0.6338
|
24 |
+
105k,0.6811,0.6772,0.6654,0.6646,,,0.6772,0.6519,,,0.6661,0.6472,0.6732,0.6369,0.6638,0.633,0.6740,0.6638,0.6140,0.5991,0.5912,0.5833,0.6046,0.5943,0.6496,0.618
|
25 |
+
110k,0.7017,0.6867,0.6701,0.6654,,,0.6669,0.6480,0.6559,0.6456,0.6756,0.6551,0.6567,0.6401,0.6661,0.6456,0.6551,0.6535,0.6196,0.6006,,,0.6219,0.6069,0.6417,0.6338
|
26 |
+
115k,0.6890,0.7040,0.6701,0.6654,,,0.6732,0.6511,0.6456,0.6227,0.6559,0.6456,0.6661,0.6488,0.6748,0.6527,0.6622,0.6448,0.6156,0.6014,,,0.6338,0.6069,0.6575,0.6196
|
27 |
+
120k,0.6930,0.6953,0.6717,0.6701,,,0.6764,0.6464,0.6519,0.6275,0.6622,0.6480,0.6590,0.6322,0.6732,0.6377,0.6519,0.6622,0.5872,0.5612,,,0.6227,0.6077,0.6504,0.6275
|
28 |
+
125k,0.6961,0.6977,0.6811,0.6819,,,0.6985,0.6433,0.6393,0.6417,0.6685,0.6433,0.6646,0.6338,0.6740,0.6559,0.6803,0.6693,0.6014,0.5888,,,0.6298,0.6243,0.6488,0.6117
|
29 |
+
130k,0.6922,0.7056,0.6859,0.6717,,,0.6811,0.6330,0.6614,0.6393,0.6780,0.6322,0.6590,0.6361,0.6748,0.6456,0.6559,0.6472,0.6085,0.5880,,,,,0.6614,0.6235
|
30 |
+
135k,0.6961,0.6953,0.6788,0.6756,,,0.6827,0.6614,,,0.6606,0.6575,0.6551,0.6464,0.6748,0.629,0.6677,0.6535,0.5991,0.5959,,,,,0.6433,0.6417
|
31 |
+
140k,,,0.6819,0.6827,,,0.6867,0.6630,,,0.6598,0.6551,0.6567,0.6369,0.6709,0.6551,0.6638,0.6519,0.6038,0.5809,,,,,0.6472,0.6314
|
32 |
+
145k,,,,,,,0.6819,0.6504,,,0.6717,0.6480,0.6669,0.6551,0.6661,0.6433,0.6725,0.6630,0.6180,0.5801,,,,,0.6606,0.644
|
33 |
+
150k,,,,,,,0.6835,0.6646,,,0.6693,0.6654,,,0.6732,0.6148,0.6788,0.6409,0.6062,0.5991,,,,,0.6567,0.6361
|
34 |
+
155k,,,,,,,0.6748,0.6590,,,0.6772,0.6677,0.6630,0.648,0.6851,0.6409,0.6922,0.6764,0.6204,0.6006,,,,,0.6677,0.6401
|
35 |
+
160k,,,,,,,0.6875,0.6614,,,0.6709,0.6669,0.6748,0.648,0.6622,0.6638,0.6811,0.6803,0.6133,0.5864,,,,,0.6567,0.6322
|
36 |
+
165k,,,,,,,0.6788,0.6661,,,0.6709,0.6717,0.6725,0.6433,,,,,0.6006,0.5856,,,,,0.6669,0.6472
|
37 |
+
170k,,,,,,,0.6938,0.6709,,,0.6701,0.6598,0.6725,0.6354,,,0.6717,0.6867,0.6085,0.5833,,,,,0.6685,0.6575
|
38 |
+
175k,,,,,,,0.6938,0.6693,,,0.6590,0.6622,0.6693,0.6614,,,0.6890,0.6867,0.6133,0.5754,,,,,0.6622,0.6598
|
39 |
+
180k,,,,,,,0.6977,0.6646,,,0.6661,0.6646,0.6740,0.6661,,,0.6685,0.6504,0.6235,0.5967,,,,,0.6732,0.6535
|
40 |
+
185k,,,,,,,0.6875,0.6519,,,0.6930,0.6535,0.6811,0.663,,,0.6851,0.6819,0.6156,0.5833,,,,,0.6661,0.648
|
41 |
+
190k,,,,,,,0.6914,0.6859,,,0.6819,0.6606,,,,,0.6693,0.6638,0.6361,0.6006,,,,,0.6701,0.6488
|
42 |
+
195k,,,,,,,0.6859,0.6614,,,0.6946,0.6732,,,,,0.6756,0.6638,0.6259,0.5841,,,,,0.6606,0.6543
|
43 |
+
200k,,,,,,,0.6875,0.6669,,,0.6898,0.6780,,,,,0.7017,0.6701,0.6227,0.5872,,,,,0.6590,0.648
|
44 |
+
205k,,,,,,,0.7072,0.6906,,,0.6969,0.6780,,,,,0.6827,0.6748,0.6306,0.5888,,,,,0.6725,0.659
|
45 |
+
210k,,,,,,,0.6859,0.6661,,,0.6827,0.6748,,,,,0.6882,0.6717,0.6322,0.5919,,,,,0.6669,0.6488
|
46 |
+
215k,,,,,,,0.7017,0.6780,,,0.6748,0.6772,,,,,0.6922,0.6709,0.6346,0.6006,,,,,0.6709,0.6661
|
47 |
+
220k,,,,,,,0.7040,0.6788,,,0.6859,0.6732,,,,,0.6969,0.6638,0.6346,0.5983,,,,,,
|
48 |
+
225k,,,,,,,0.7111,0.6717,,,0.6843,0.6685,,,,,0.6756,0.6606,0.6188,0.5935,,,,,,
|
49 |
+
230k,,,,,,,0.7103,0.6811,,,,,,,,,0.7096,0.6701,0.6235,0.5935,,,,,,
|
50 |
+
235k,,,,,,,0.7040,0.6772,,,,,,,,,0.7096,0.6764,0.6306,0.6062,,,,,,
|
51 |
+
240k,,,,,,,0.7080,0.6851,,,,,,,,,,,0.6219,,,,,,,
|
52 |
+
245k,,,,,,,0.6985,0.6938,,,,,,,,,,,0.6267,0.5888,,,,,,
|
53 |
+
250k,,,,,,,0.7127,0.6938,,,,,,,,,,,0.6361,0.6006,,,,,,
|
54 |
+
255k,,,,,,,0.7119,0.6827,,,,,,,,,,,0.6440,0.5998,,,,,,
|
55 |
+
260k,,,,,,,0.7056,0.6867,,,,,,,,,,,0.6322,0.5975,,,,,,
|
56 |
+
265k,,,,,,,0.7040,0.6756,,,,,,,,,,,0.6338,0.6069,,,,,,
|
57 |
+
270k,,,,,,,0.7111,0.6819,,,,,,,,,,,0.6314,0.5991,,,,,,
|
58 |
+
275k,,,,,,,0.7127,0.6811,,,,,,,,,,,0.6306,0.6148,,,,,,
|
59 |
+
280k,,,,,,,0.7064,0.6914,,,,,,,,,,,0.6251,0.6054,,,,,,
|
60 |
+
285k,,,,,,,0.7096,0.6977,,,,,,,,,,,0.6385,,,,,,,
|
61 |
+
290k,,,,,,,,,,,,,,,,,,,0.6338,0.6077,,,,,,
|
62 |
+
300k,,,,,,,,,,,,,,,,,,,0.6227,0.6093,,,,,,
|
63 |
+
305k,,,,,,,,,,,,,,,,,,,0.6290,0.6069,,,,,,
|
64 |
+
310k,,,,,,,,,,,,,,,,,,,0.6267,0.6156,,,,,,
|
65 |
+
315k,,,,,,,,,,,,,,,,,,,0.6314,0.6101,,,,,,
|
66 |
+
320k,,,,,,,,,,,,,,,,,,,0.6401,0.5991,,,,,,
|
67 |
+
325k,,,,,,,,,,,,,,,,,,,,,,,,,,
|
68 |
+
330k,,,,,,,,,,,,,,,,,,,,,,,,,,
|
69 |
+
335k,,,,,,,,,,,,,,,,,,,,,,,,,,
|
main.py
CHANGED
@@ -150,7 +150,7 @@ def main():
|
|
150 |
return Div(
|
151 |
D_title(
|
152 |
H1(
|
153 |
-
"TxT360:
|
154 |
cls="l-body",
|
155 |
style="text-align: center;",
|
156 |
),
|
@@ -192,7 +192,7 @@ def main():
|
|
192 |
),
|
193 |
Li(
|
194 |
A(
|
195 |
-
"Motivation Behind
|
196 |
href="/intro#section2",
|
197 |
hx_get="/intro#section2",
|
198 |
hx_target="#inner-text",
|
@@ -298,7 +298,7 @@ def main():
|
|
298 |
),
|
299 |
Div(
|
300 |
A(
|
301 |
-
"
|
302 |
href="/common#section1",
|
303 |
hx_get="/common#section1",
|
304 |
hx_target="#inner-text",
|
@@ -883,18 +883,17 @@ def intro():
|
|
883 |
return Div(
|
884 |
Section(
|
885 |
H2("About TxT360"),
|
886 |
-
P(
|
887 |
-
B(
|
888 |
-
"We introduce TxT360 (Trillion eXtracted Text) the first dataset to globally deduplicate 99 CommonCrawl snapshots and 14 commonly used non-web data sources (e.g. FreeLaw, PG-19, etc.) providing pretraining teams with a recipe to easily adjust data weighting and train the most performant models."
|
889 |
)
|
890 |
),
|
891 |
P(
|
892 |
-
"Building on top of the prior studies on pre-training data
|
893 |
D_cite(bibtex_key="refinedweb"),
|
894 |
D_cite(bibtex_key="fineweb"),
|
895 |
D_cite(bibtex_key="c4"),
|
896 |
D_cite(bibtex_key="muennighoff2023scaling"),
|
897 |
-
"TxT360 carefully implements data processing steps including extraction, filtering, deduplication, personally identifiable information removal, and other steps.",
|
898 |
),
|
899 |
P(
|
900 |
"Metadata is stored to recover the raw distribution for each dataset, enabling fine-grained control to create data distributions and corpus of desired size. As an example, we present one simple upsampling scheme that takes into account the duplication counts, resulting in a 15~16 trillion token corpus, outperforming FineWeb and our non-upsampling baselines, on diverse evaluations. Unlike DCLM",
|
@@ -909,7 +908,7 @@ def intro():
|
|
909 |
id="section1",
|
910 |
),
|
911 |
Section(
|
912 |
-
H2("Motivation Behind
|
913 |
H3(
|
914 |
"TxT360 is the first dataset to combine both web and curated data sources commonly used in pretraining."
|
915 |
),
|
@@ -939,7 +938,7 @@ def intro():
|
|
939 |
),
|
940 |
table_div_data,
|
941 |
P(
|
942 |
-
"We provide details and context for the choices behind TxT360 in the respective Web Data Processing and Curated Source Processing section. A deep dive describing the deduplication process can be found in the
|
943 |
),
|
944 |
# Img(src="images/pipeline.png", height="300", width="600"),
|
945 |
# P(
|
|
|
150 |
return Div(
|
151 |
D_title(
|
152 |
H1(
|
153 |
+
"TxT360: A Top-Quality LLM Pre-training Dataset Requires the Perfect Blend",
|
154 |
cls="l-body",
|
155 |
style="text-align: center;",
|
156 |
),
|
|
|
192 |
),
|
193 |
Li(
|
194 |
A(
|
195 |
+
"Motivation Behind TxT360",
|
196 |
href="/intro#section2",
|
197 |
hx_get="/intro#section2",
|
198 |
hx_target="#inner-text",
|
|
|
298 |
),
|
299 |
Div(
|
300 |
A(
|
301 |
+
"Shared Processing Steps",
|
302 |
href="/common#section1",
|
303 |
hx_get="/common#section1",
|
304 |
hx_target="#inner-text",
|
|
|
883 |
return Div(
|
884 |
Section(
|
885 |
H2("About TxT360"),
|
886 |
+
P( "TL;DR ",
|
887 |
+
B("We introduce TxT360 (Trillion eXtracted Text), the first dataset to globally deduplicate 99 CommonCrawl snapshots and 14 high-quality data sources from diverse domains (e.g., FreeLaw, PG-19, etc.). Our large-scale deduplication process enables precise control over data weighting. In addition to document selection, TxT360, along with its rich metadata, allows for the assignment of optimal data weights. We demonstrate a simple but effective upsampling recipe that creates a 15+ trillion-token corpus, outperforming FineWeb 15T. Furthermore, TxT360 empowers pre-trainers to explore more advanced weighting techniques, a capability not commonly available in previous pre-training datasets."
|
|
|
888 |
)
|
889 |
),
|
890 |
P(
|
891 |
+
"Building on top of the prior studies on pre-training data",
|
892 |
D_cite(bibtex_key="refinedweb"),
|
893 |
D_cite(bibtex_key="fineweb"),
|
894 |
D_cite(bibtex_key="c4"),
|
895 |
D_cite(bibtex_key="muennighoff2023scaling"),
|
896 |
+
", TxT360 carefully implements data processing steps including extraction, filtering, deduplication, personally identifiable information removal, and other steps.",
|
897 |
),
|
898 |
P(
|
899 |
"Metadata is stored to recover the raw distribution for each dataset, enabling fine-grained control to create data distributions and corpus of desired size. As an example, we present one simple upsampling scheme that takes into account the duplication counts, resulting in a 15~16 trillion token corpus, outperforming FineWeb and our non-upsampling baselines, on diverse evaluations. Unlike DCLM",
|
|
|
908 |
id="section1",
|
909 |
),
|
910 |
Section(
|
911 |
+
H2("Motivation Behind TxT360"),
|
912 |
H3(
|
913 |
"TxT360 is the first dataset to combine both web and curated data sources commonly used in pretraining."
|
914 |
),
|
|
|
938 |
),
|
939 |
table_div_data,
|
940 |
P(
|
941 |
+
"We provide details and context for the choices behind TxT360 in the respective Web Data Processing and Curated Source Processing section. A deep dive describing the deduplication process can be found in the Shared Processing Steps section."
|
942 |
),
|
943 |
# Img(src="images/pipeline.png", height="300", width="600"),
|
944 |
# P(
|
overview.py
CHANGED
@@ -276,7 +276,7 @@ overview_div = Div(
|
|
276 |
Li("The Highlevel Data Process Approach", style = "margin-bottom: 5px"),
|
277 |
Li("Introduction to Global Deduplication", style = "margin-bottom: 5px"),
|
278 |
),
|
279 |
-
H2("Motivation Behind
|
280 |
H3("TxT360 is the first dataset to combine both web and curated data sources commonly used in pretraining."),
|
281 |
P("The quality and size of a pre-training dataset play a crucial role in the performance of large language models (LLMs). Data is often referred as low quality if it has not been filtered to review unwanted text. The community has introduced a variety of filtered datasets including purely web-based datasets. Commonly used pretraining datasets are:"),
|
282 |
Ul(
|
|
|
276 |
Li("The Highlevel Data Process Approach", style = "margin-bottom: 5px"),
|
277 |
Li("Introduction to Global Deduplication", style = "margin-bottom: 5px"),
|
278 |
),
|
279 |
+
H2("Motivation Behind TxT360"),
|
280 |
H3("TxT360 is the first dataset to combine both web and curated data sources commonly used in pretraining."),
|
281 |
P("The quality and size of a pre-training dataset play a crucial role in the performance of large language models (LLMs). Data is often referred as low quality if it has not been filtered to review unwanted text. The community has introduced a variety of filtered datasets including purely web-based datasets. Commonly used pretraining datasets are:"),
|
282 |
Ul(
|
results.py
CHANGED
@@ -1,5 +1,10 @@
|
|
1 |
from fasthtml.common import *
|
2 |
from fasthtml.components import *
|
|
|
|
|
|
|
|
|
|
|
3 |
import json
|
4 |
from fh_plotly import plotly2fasthtml
|
5 |
from plotly import graph_objects as go
|
@@ -7,6 +12,74 @@ import pandas as pd
|
|
7 |
import plotly.express as px
|
8 |
|
9 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
10 |
##upsampling validation loss graph
|
11 |
|
12 |
# Data
|
@@ -45,7 +118,7 @@ txt360 = [2.589649677, 2.438303471, 2.383416414, 2.337049007, 2.300292492,
|
|
45 |
fig_val = go.Figure()
|
46 |
|
47 |
# Add lines
|
48 |
-
fig_val.add_trace(go.Scatter(x=steps, y=fineweb, mode='lines', name='FineWeb
|
49 |
fig_val.add_trace(go.Scatter(x=steps, y=txt360, mode='lines', name='TxT360'))
|
50 |
|
51 |
# Update layout
|
@@ -56,8 +129,6 @@ fig_val.update_layout(
|
|
56 |
legend_title='Models'
|
57 |
)
|
58 |
|
59 |
-
# Show plot
|
60 |
-
|
61 |
# Show the plot
|
62 |
validation_loss_graph = fig_val
|
63 |
|
@@ -716,7 +787,7 @@ dataset_comparison = pd.DataFrame(
|
|
716 |
"28.04",
|
717 |
"25.61",
|
718 |
],
|
719 |
-
"FineWeb
|
720 |
"71.5",
|
721 |
"82.1",
|
722 |
"79.46",
|
@@ -763,24 +834,61 @@ table_div_1 = Div(NotStr(table_html),
|
|
763 |
intro_div = Div(
|
764 |
H2("TxT360 Studies"),
|
765 |
H3("What This Section Contains"),
|
766 |
-
P("This section
|
|
|
767 |
Ul(
|
768 |
-
Li("
|
769 |
-
Li("Perplexity
|
770 |
),
|
771 |
)
|
772 |
|
773 |
|
774 |
upsampling_exp = Div(
|
775 |
-
H2("
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
776 |
H3("Experiment Setup"),
|
777 |
-
P(
|
778 |
-
|
779 |
-
|
780 |
-
|
781 |
-
|
782 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
783 |
plotly2fasthtml(lm_loss_graph),
|
|
|
784 |
)
|
785 |
|
786 |
preplexity_intro_div = Div(
|
|
|
1 |
from fasthtml.common import *
|
2 |
from fasthtml.components import *
|
3 |
+
from fasthtml.components import (
|
4 |
+
D_cite,
|
5 |
+
)
|
6 |
+
|
7 |
+
import os
|
8 |
import json
|
9 |
from fh_plotly import plotly2fasthtml
|
10 |
from plotly import graph_objects as go
|
|
|
12 |
import plotly.express as px
|
13 |
|
14 |
|
15 |
+
## Evaluation Graphs
|
16 |
+
|
17 |
+
# Load the data
|
18 |
+
all_eval_results = {}
|
19 |
+
for fname in os.listdir("data/txt360_eval"):
|
20 |
+
if fname.endswith(".csv"):
|
21 |
+
metric_name = fname.replace("CKPT Eval - ", "").replace(".csv", "")
|
22 |
+
all_eval_results[metric_name] = {}
|
23 |
+
|
24 |
+
# with open(os.path.join("data/txt360_eval", fname)) as f:
|
25 |
+
df = pd.read_csv(os.path.join("data/txt360_eval", fname))
|
26 |
+
|
27 |
+
# slimpajama_res = df.iloc[2:, 2].astype(float).fillna(0.0) # slimpajama
|
28 |
+
fineweb_res = df.iloc[2:, 4].astype(float).fillna(method="bfill") # fineweb
|
29 |
+
txt360_base = df.iloc[2:, 5].astype(float).fillna(method="bfill") # txt360-dedup-only
|
30 |
+
txt360_web_up = df.iloc[2:, 7].astype(float).fillna(method="bfill") # txt360-web-only-upsampled
|
31 |
+
txt360_all_up_stack = df.iloc[2:, 9].astype(float).fillna(method="bfill") # txt360-all-upsampled + stackv2
|
32 |
+
|
33 |
+
# each row is 20B tokens.
|
34 |
+
# all_eval_results[metric_name]["slimpajama"] = slimpajama_res
|
35 |
+
all_eval_results[metric_name]["fineweb"] = fineweb_res
|
36 |
+
all_eval_results[metric_name]["txt360-dedup-only"] = txt360_base
|
37 |
+
all_eval_results[metric_name]["txt360-web-only-upsampled"] = txt360_web_up
|
38 |
+
all_eval_results[metric_name]["txt360-all-upsampled + stackv2"] = txt360_all_up_stack
|
39 |
+
all_eval_results[metric_name]["token"] = [20 * i for i in range(len(fineweb_res))]
|
40 |
+
|
41 |
+
|
42 |
+
# Eval Result Plots
|
43 |
+
all_eval_res_figs = {}
|
44 |
+
for metric_name, res in all_eval_results.items():
|
45 |
+
fig_res = go.Figure()
|
46 |
+
|
47 |
+
# Add lines
|
48 |
+
fig_res.add_trace(go.Scatter(
|
49 |
+
x=all_eval_results[metric_name]["token"],
|
50 |
+
y=all_eval_results[metric_name]["fineweb"],
|
51 |
+
mode='lines', name='FineWeb'
|
52 |
+
))
|
53 |
+
fig_res.add_trace(go.Scatter(
|
54 |
+
x=all_eval_results[metric_name]["token"],
|
55 |
+
y=all_eval_results[metric_name]["txt360-web-only-upsampled"],
|
56 |
+
mode='lines', name='TxT360 - CC Data Upsampled'
|
57 |
+
))
|
58 |
+
fig_res.add_trace(go.Scatter(
|
59 |
+
x=all_eval_results[metric_name]["token"],
|
60 |
+
y=all_eval_results[metric_name]["txt360-dedup-only"],
|
61 |
+
mode='lines', name='TxT360 - CC Data Dedup'
|
62 |
+
))
|
63 |
+
fig_res.add_trace(go.Scatter(
|
64 |
+
x=all_eval_results[metric_name]["token"],
|
65 |
+
y=all_eval_results[metric_name]["txt360-all-upsampled + stackv2"],
|
66 |
+
mode='lines', name='TxT360 - Full Upsampled + Stack V2'
|
67 |
+
))
|
68 |
+
|
69 |
+
print(all_eval_results[metric_name]["token"])
|
70 |
+
print(all_eval_results[metric_name]["fineweb"].tolist())
|
71 |
+
print(all_eval_results[metric_name]["txt360-web-only-upsampled"].tolist())
|
72 |
+
|
73 |
+
# Update layout
|
74 |
+
fig_res.update_layout(
|
75 |
+
title=f"{metric_name} Performance",
|
76 |
+
title_x=0.5, # Centers the title
|
77 |
+
xaxis_title="Billion Tokens",
|
78 |
+
yaxis_title=metric_name,
|
79 |
+
legend_title="Dataset",
|
80 |
+
)
|
81 |
+
all_eval_res_figs[metric_name] = fig_res
|
82 |
+
|
83 |
##upsampling validation loss graph
|
84 |
|
85 |
# Data
|
|
|
118 |
fig_val = go.Figure()
|
119 |
|
120 |
# Add lines
|
121 |
+
fig_val.add_trace(go.Scatter(x=steps, y=fineweb, mode='lines', name='FineWeb'))
|
122 |
fig_val.add_trace(go.Scatter(x=steps, y=txt360, mode='lines', name='TxT360'))
|
123 |
|
124 |
# Update layout
|
|
|
129 |
legend_title='Models'
|
130 |
)
|
131 |
|
|
|
|
|
132 |
# Show the plot
|
133 |
validation_loss_graph = fig_val
|
134 |
|
|
|
787 |
"28.04",
|
788 |
"25.61",
|
789 |
],
|
790 |
+
"FineWeb": [
|
791 |
"71.5",
|
792 |
"82.1",
|
793 |
"79.46",
|
|
|
834 |
intro_div = Div(
|
835 |
H2("TxT360 Studies"),
|
836 |
H3("What This Section Contains"),
|
837 |
+
P("This section shows the learning curve when pre-training on TxT360, with a proper upsampling approach. We compare several simple strategies and demonstrate that one particular upsampling method, inspired by the natural data distribution, performs exceptionally well. In our preliminary experiments, the model learns significantly faster on TxT360 compared to a similarly scaled dataset, FineWeb. We believe that a more carefully designed upsampling strategy could further enhance the use of our data."),
|
838 |
+
P("In addition to the training results, we also provide an analysis of the dataset, including perplexity trends over time across the CommonCrawl snapshots. This section is organized into the following topic areas:"),
|
839 |
Ul(
|
840 |
+
Li("The Learning Curve of TxT360 with an Upsampling Recipe", style = "margin-bottom: 5px"),
|
841 |
+
Li("Perplexity Analysis across time", style = "margin-bottom: 5px"),
|
842 |
),
|
843 |
)
|
844 |
|
845 |
|
846 |
upsampling_exp = Div(
|
847 |
+
H2("A Simple Data Mix Creates a Good Learning Curve"),
|
848 |
+
P(
|
849 |
+
"As discussed in prior sections, duplicated documents can significantly reduce training efficiency (i.e., the ratio of model performance to the number of pre-trained tokens). Previous work, such as RefinedWeb",
|
850 |
+
D_cite(bibtex_key="refinedweb"),
|
851 |
+
", emphasizes the importance of deduplication. Recently, the FineWeb study conducted an interesting analysis, comparing LLM performance when pre-trained on globally deduplicated versus locally deduplicated datasets. They found that training efficiency with a globally deduplicated dataset can be worse",
|
852 |
+
D_cite(bibtex_key="fineweb"),
|
853 |
+
". Fineweb hypothesize that global deduplication may remove a higher proportion of high-quality documents."
|
854 |
+
),
|
855 |
+
P(
|
856 |
+
"This finding led us to consider that a pre-training corpus based on crawled websites is naturally upsampled for a variety of reasons. For example, commonly used templates or boilerplates may appear millions of times; a well-regarded article reposted by different users may surface across multiple sites; and the same web pages, crawled by CommonCrawl at different times, will duplicate each other. The reasons behind these duplications vary: some may serve as indirect indicators of high-quality content, while others may not. Therefore, curating a pre-training dataset should involve leveraging these signals and considering data weighting schemes — or at the very least, provide users with the necessary information to control it effectively."
|
857 |
+
),
|
858 |
+
P(
|
859 |
+
"To this end, we store rich metadata for each document source, including features like user votes from StackExchange. One crucial piece of metadata is the number of duplicates detected for a document. This information allows users to reconstruct the natural web distribution, but more importantly, we will demonstrate that a simple upsampling recipe based on this metadata can create a high-quality data mix."
|
860 |
+
),
|
861 |
H3("Experiment Setup"),
|
862 |
+
P(
|
863 |
+
"Motivated by the FineWeb study, we opted to upsample documents based on their natural distribution. However, since duplication is only an indirect indicator of quality, we upsample documents to a few predefined levels rather than using their exact count. Specifically, we set the upsampling weight to 3 for documents with 2 to 5 duplicates, 5 for those with 5 to 100 duplicates, 8 for 101 to 1000 duplicates, and 10 for documents with over 1000 duplicates. These values were selected heuristically and informed by preliminary small-scale experiments. For non-CommonCrawl data sources, we assign a weight of 2 if the document appears more than once. This straightforward approach results in a corpus exceeding 15 trillion tokens, making it one of the largest open-access pre-training datasets available."
|
864 |
+
),
|
865 |
+
P(
|
866 |
+
"To evaluate the training efficiency of our dataset, we sampled 1.5T tokens from both FineWeb and TxT360 (using the aforementioned weighting) and conducted a training ablation on an 8x8B Mixture-of-Experts architecture, similar to Mixtral. We compared the learning curves by tracking training loss, validation scores, and performance across a wide array of diverse evaluation benchmarks. The validation set was sampled independently from SlimPajama",
|
867 |
+
D_cite(bibtex_key="cerebras2023slimpajama"),
|
868 |
+
". Note that this experiment is done on a slightly earlier version of the dataset."
|
869 |
+
),
|
870 |
+
H3("Learning Curves on the Evaluation Metrics"),
|
871 |
+
P(
|
872 |
+
"Evaluation results are the most direct indicator of model quality. We assess the intermediate results of the models across multiple metrics and plot the learning curves. Our findings indicate that the model learns significantly faster with TxT360. For a fair comparison, we evaluate TxT360 against FineWeb using only the CommonCrawl data sources, and we also show the curves after incorporating the 14 curated sources and coding data (Stack V2), demonstrating the full potential of the dataset. Due to computation resource constraints, we stop running experiments when we can observe clear trends."
|
873 |
+
),
|
874 |
+
P(
|
875 |
+
"Based on the metrics, we find that TxT360’s CommonCrawl portion consistently outperforms FineWeb after upsampling, particularly on challenging tasks like MMLU and generation tasks such as NQ. Similar to the findings in DCLM, adding non-CommonCrawl data sources produces mixed results, especially when testing with that specific version of the data. We have since updated the non-CC data to further reduce noise."
|
876 |
+
),
|
877 |
+
plotly2fasthtml(all_eval_res_figs["MMLU"]),
|
878 |
+
plotly2fasthtml(all_eval_res_figs["NQ"]),
|
879 |
+
# plotly2fasthtml(all_eval_res_figs["GSM8K"]),
|
880 |
+
plotly2fasthtml(all_eval_res_figs["HellaSwag"]),
|
881 |
+
plotly2fasthtml(all_eval_res_figs["MedQA"]),
|
882 |
+
plotly2fasthtml(all_eval_res_figs["PIQA"]),
|
883 |
+
plotly2fasthtml(all_eval_res_figs["TriviaQA"]),
|
884 |
+
plotly2fasthtml(all_eval_res_figs["WinoGrande"]),
|
885 |
+
|
886 |
+
H3("Comparing the Loss Curves"),
|
887 |
+
P(
|
888 |
+
"We also plot the training and validation loss curves for each dataset, showing that TxT360 achieves both lower training and validation losses compared to FineWeb. Although training loss may not correlate directly with final model performance, we observe that the loss curve for TxT360 exhibits fewer spikes compared to FineWeb, indicating more stable training dynamics."
|
889 |
+
),
|
890 |
plotly2fasthtml(lm_loss_graph),
|
891 |
+
plotly2fasthtml(validation_loss_graph),
|
892 |
)
|
893 |
|
894 |
preplexity_intro_div = Div(
|