akswelh
/

NEOX

Model card Files Files and versions Community

NEOX / tools /datasets /multinode_prepare_data.sh

akswelh's picture

Upload 251 files

d90b3a8 verified 26 days ago

history blame contribute delete

2.31 kB

	#!/bin/bash

	# USAGE:
	# This script allows you to prepare your dataset using multiple nodes by chunking the individual files and distributed the chunks
	# over the processes.
	# This bash script takes a single text file as input argument.
	# The text file contains a valid filepath in each line, leading to a jsonl-file.
	# Furthermore an environment variable for the rank and the world size needs to be set.
	# These default to the SLURM and OMPI variables in this order of priority, but they can be set manually as well
	# using the variables $RANK and $WORLD_SIZE, which will overwrite the cluster-specific variables.
	# You can also add all arguments of the prepare_data.py script to this script and it will simply pass them through.

	# Parse command-line arguments
	text_file="$1"
	rank="${RANK:-${SLURM_PROCID:-$OMPI_COMM_WORLD_RANK}}"
	world_size="${WORLD_SIZE:-${SLURM_NTASKS:-$OMPI_COMM_WORLD_SIZE}}"
	num_lines=$(wc -l < "$text_file")
	chunk_size=$((num_lines / world_size))
	start_line=$((rank * chunk_size + 1))
	end_line=$((start_line + chunk_size - 1))

	# Make sure the last chunk includes all remaining lines
	if [[ $rank == $((world_size - 1)) ]]; then
	end_line=$num_lines
	fi

	# Select the chunk of the text file that corresponds to the rank
	chunk_file="chunk_${rank}.txt"
	sed -n "${start_line},${end_line}p" "$text_file" > "$chunk_file"

	# Parse additional flags to be passed to the Python script
	shift 1 # Shift past the first three arguments
	py_args=""
	prefix_arg=""
	while [[ $# -gt 0 ]]; do
	case "$1" in
	--output-prefix=*) prefix_arg="$1"; shift;;
	--output-prefix) prefix_arg="$1 $2"; shift 2;;
	--*) py_args="$py_args $1 $2"; shift 2;;
	*) echo "Unknown argument: $1"; exit 1;;
	esac
	done

	# Add the rank to the --output-prefix argument if it is set
	if [[ -n "$prefix_arg" ]]; then
	py_args="$py_args $prefix_arg$rank"
	else
	# Inject a default --output-prefix argument containing the rank
	py_args="$py_args --output-prefix rank${rank}"
	fi


	echo "processing $chunk_file with rank $rank at world size $world_size"
	echo "using the following args: $py_args"
	# Call the Python script with the list of file paths in the chunk
	python tools/datasets/preprocess_data.py --input $(tr '\n' ',' < "$chunk_file" \| sed 's/,$/\n/') $py_args

	# Clean up
	rm "$chunk_file"