NEOX / tools /datasets /dataset_token_count.py
akswelh's picture
Upload 251 files
d90b3a8 verified
# Script counts tokens in a pretokenized dataset from preprocess_data.py
# Necessary for setting batch size, train_iters, etc
import sys
import os
## Necessary for the import
project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))
sys.path.insert(0, project_root)
from megatron.data import indexed_dataset
import numpy as np
if len(sys.argv) < 2:
print(
"Usage: python dataset_token_count.py /absolute/file/path/to/dataset1 /absolute/file/path/to/dataset2 ..."
)
sys.exit(1)
# Access the command-line arguments
arguments = sys.argv[1:]
for arg in arguments:
print("Checking file", arg)
try:
dataset = indexed_dataset.make_dataset(arg, "mmap")
size = np.sum(dataset.sizes)
print("Dataset size in tokens is", size)
except AttributeError:
print("Dataset could not be loaded", arg)