File size: 2,314 Bytes
d90b3a8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
#!/bin/bash

# USAGE:
# This script allows you to prepare your dataset using multiple nodes by splitting the list of input files into chunks
# and distributing the chunks over the processes.
# This bash script takes a single text file as input argument.
# The text file contains a valid filepath in each line, leading to a jsonl-file.
# Furthermore, environment variables for the rank and the world size need to be set.
# These default to the SLURM and OMPI variables in this order of priority, but they can be set manually as well
# using the variables $RANK and $WORLD_SIZE, which will overwrite the cluster-specific variables.
# You can also add all arguments of the preprocess_data.py script to this script and it will simply pass them through.

# Print an error message to stderr and abort with a non-zero status.
die() {
    printf 'Error: %s\n' "$*" >&2
    exit 1
}

# Parse command-line arguments
# $1 is the text file listing one jsonl path per line; rank and world size
# come from RANK/WORLD_SIZE, falling back to the SLURM and then OMPI variables.
text_file="${1:-}"
rank="${RANK:-${SLURM_PROCID:-${OMPI_COMM_WORLD_RANK:-}}}"
world_size="${WORLD_SIZE:-${SLURM_NTASKS:-${OMPI_COMM_WORLD_SIZE:-}}}"

# Validate the inputs up front; otherwise the arithmetic below fails with
# cryptic errors (or divides by zero) when a variable is missing.
[[ -n "$text_file" ]] || die "usage: $0 <file-list.txt> [preprocess_data.py arguments...]"
[[ -f "$text_file" && -r "$text_file" ]] || die "cannot read file list: $text_file"
# Reject leading zeros as well, since bash arithmetic would treat them as octal.
rank_re='^(0|[1-9][0-9]*)$'
size_re='^[1-9][0-9]*$'
[[ "$rank" =~ $rank_re ]] \
    || die "rank must be a non-negative integer (RANK, SLURM_PROCID or OMPI_COMM_WORLD_RANK), got '$rank'"
[[ "$world_size" =~ $size_re ]] \
    || die "world size must be a positive integer (WORLD_SIZE, SLURM_NTASKS or OMPI_COMM_WORLD_SIZE), got '$world_size'"
(( rank < world_size )) || die "rank $rank is out of range for world size $world_size"

# Read the whole list into an array. The `|| [[ -n ... ]]` part keeps a final
# line that has no trailing newline (which `wc -l` would not count).
all_files=()
while IFS= read -r path_line || [[ -n "$path_line" ]]; do
    all_files+=("$path_line")
done < "$text_file"
num_lines=${#all_files[@]}

# Every rank gets `chunk_size` consecutive lines.
chunk_size=$(( num_lines / world_size ))
start_index=$(( rank * chunk_size ))   # 0-based index of the first line of this rank
count=$chunk_size

# Make sure the last chunk includes all remaining lines
if (( rank == world_size - 1 )); then
    count=$(( num_lines - start_index ))
fi

# Select the chunk of the file list that corresponds to the rank, dropping
# blank lines so that no empty path ends up in the --input list.
chunk=()
for path_line in "${all_files[@]:start_index:count}"; do
    [[ -n "$path_line" ]] && chunk+=("$path_line")
done

# Parse additional flags to be passed to the Python script
shift 1  # Drop the file-list argument; everything left belongs to the Python script
py_args=()
prefix_args=()
while (( $# > 0 )); do
    case "$1" in
        --output-prefix=*)
            # The rank is appended so that every process writes its own output.
            prefix_args=("${1}${rank}")
            shift
            ;;
        --output-prefix)
            (( $# >= 2 )) || die "--output-prefix requires a value"
            prefix_args=("$1" "${2}${rank}")
            shift 2
            ;;
        --*)
            # Pass the flag through together with however many values follow
            # it (none for boolean flags, one or more otherwise).
            py_args+=("$1")
            shift
            while (( $# > 0 )) && [[ "$1" != --* ]]; do
                py_args+=("$1")
                shift
            done
            ;;
        *)
            echo "Unknown argument: $1" >&2
            exit 1
            ;;
    esac
done

# Inject a default --output-prefix argument containing the rank if none was given
if (( ${#prefix_args[@]} == 0 )); then
    prefix_args=(--output-prefix "rank${rank}")
fi

# With more processes than files, some ranks have nothing to do.
if (( ${#chunk[@]} == 0 )); then
    echo "rank $rank at world size $world_size has no files to process"
    exit 0
fi

# Join the paths of the chunk with commas, as expected by --input.
input_list=$(IFS=,; printf '%s' "${chunk[*]}")

echo "processing lines $(( start_index + 1 ))-$(( start_index + count )) of $text_file with rank $rank at world size $world_size"
echo "using the following args: ${py_args[*]} ${prefix_args[*]}"
# Call the Python script with the list of file paths in the chunk.
# It is the last command, so the script exits with the status of the Python script.
python tools/datasets/preprocess_data.py --input "$input_list" "${py_args[@]}" "${prefix_args[@]}"