File size: 876 Bytes
d90b3a8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
# Counts the total number of tokens in each pretokenized dataset produced by preprocess_data.py.
# The totals are needed when choosing the batch size, train_iters, etc.

import sys
import os

# Put the project root (two directories above this file) on sys.path so the megatron import below resolves
project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))
sys.path.insert(0, project_root)

from megatron.data import indexed_dataset
import numpy as np

# At least one dataset path is required; otherwise show usage and exit non-zero.
if len(sys.argv) < 2:
    print(
        "Usage: python dataset_token_count.py /absolute/file/path/to/dataset1 /absolute/file/path/to/dataset2 ..."
    )
    sys.exit(1)

# Every positional command-line argument is treated as one dataset path.
arguments = sys.argv[1:]

for dataset_path in arguments:
    print("Checking file", dataset_path)
    try:
        dataset = indexed_dataset.make_dataset(dataset_path, "mmap")
        # Accumulate explicitly in 64-bit: np.sum otherwise uses the platform
        # integer, which can be 32-bit and would wrap for multi-billion-token
        # datasets.
        # NOTE(review): `sizes` is presumably an array of per-document token
        # counts -- confirm against megatron.data.indexed_dataset.
        size = np.sum(dataset.sizes, dtype=np.int64)
    except AttributeError:
        # NOTE(review): presumably make_dataset returns None when the dataset
        # cannot be opened, so `.sizes` raises AttributeError -- verify.
        # A failed dataset is reported and the remaining paths are still processed.
        print("Dataset could not be loaded", dataset_path)
    else:
        print("Dataset size in tokens is", size)