optimize the iteration when tokenizing large datasets (#332)

src/axolotl/utils/data.py (+10 -1)
@@ -1,5 +1,6 @@
 """Module containing data utilities"""
 import functools
+import itertools
 import logging
 from hashlib import md5
 from pathlib import Path
@@ -264,8 +265,16 @@ def load_tokenized_prepared_datasets(
         LOG.info("tokenizing, merging, and shuffling master dataset")
 
         samples: List[int] = []
+        chunk_size = 1000
         for d in datasets:
-
+            d_iter = iter(d)
+            while True:
+                chunk = list(itertools.islice(d_iter, chunk_size))
+                if not chunk:
+                    break
+                samples.extend(chunk)
+
+        LOG.info("shuffle")
         dataset = Dataset.from_list(samples).shuffle(seed=seed)
         if cfg.local_rank == 0:
             LOG.info(f"Saving merged prepared dataset to disk... {prepared_ds_path}")
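For reference, the following is a minimal, self-contained sketch of the chunked-iteration pattern this patch introduces. It is not the axolotl implementation: merge_in_chunks and dummy_datasets are hypothetical names used only for illustration, and the real code iterates over tokenized HF Dataset objects rather than plain generators.

    import itertools

    def merge_in_chunks(datasets, chunk_size=1000):
        # Accumulate samples from several iterables, pulling at most
        # chunk_size items per pass so each intermediate chunk stays small.
        samples = []
        for d in datasets:
            d_iter = iter(d)
            while True:
                chunk = list(itertools.islice(d_iter, chunk_size))
                if not chunk:
                    break
                # extend() appends in place (amortized O(1) per item) instead of
                # rebuilding the accumulator list on every pass.
                samples.extend(chunk)
        return samples

    if __name__ == "__main__":
        # Generators stand in for the tokenized datasets.
        dummy_datasets = [
            ({"input_ids": [i]} for i in range(2500)),
            ({"input_ids": [i]} for i in range(750)),
        ]
        merged = merge_in_chunks(dummy_datasets)
        print(len(merged))  # -> 3250

The design choice: pulling fixed-size chunks with itertools.islice and appending with list.extend keeps per-iteration work bounded and avoids repeatedly rebuilding the accumulator list, which is the cost pattern of samples = samples + [...] style concatenation. The exact line removed by this commit is not visible in this view, so the comparison above is stated in general terms.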