mazpie's picture
Initial commit
2d9a728
raw
history blame
335 Bytes
#!/bin/bash
#
# Launch torchrun inside a SLURM allocation, using the first host of the
# job's node list as the rendezvous endpoint.
#
# Usage:    <this script> [torchrun options...] <training script> [args...]
# Requires: SLURM_JOB_NODELIST in the environment; scontrol and torchrun
#           on PATH.
# Exit:     non-zero if the node list cannot be resolved, otherwise the
#           exit status of torchrun.

set -euo pipefail

# Fixed rendezvous port. The script previously also computed a random
# MASTER_PORT that was never passed to torchrun; it has been removed.
# NOTE(review): if this script is started once per node (e.g. via srun),
# the port must be identical on every node, so a per-process random value
# would not be usable here anyway -- confirm against the submitting script.
RDZV_PORT=10069

if [[ -z "${SLURM_JOB_NODELIST:-}" ]]; then
  printf 'error: SLURM_JOB_NODELIST is not set\n' >&2
  exit 1
fi

# Expand the compact SLURM node list once into one hostname per line.
if ! node_lines=$(scontrol show hostnames "$SLURM_JOB_NODELIST"); then
  printf 'error: scontrol could not expand node list: %s\n' \
    "$SLURM_JOB_NODELIST" >&2
  exit 1
fi
if [[ -z "$node_lines" ]]; then
  printf 'error: scontrol returned no hostnames for: %s\n' \
    "$SLURM_JOB_NODELIST" >&2
  exit 1
fi

mapfile -t nodes <<<"$node_lines"
ALL_NODES="${nodes[*]}"    # space-joined, for single-line logging
MASTER_NODE="${nodes[0]}"  # first listed host acts as the endpoint

printf '%s\n' "All nodes used:"
printf '%s\n' "$ALL_NODES"
printf '%s\n' "Master node:"
printf '%s\n' "$MASTER_NODE"
printf '%s\n' "Args:"
printf '%s\n' "$*"

# "$@" is quoted so that every caller argument reaches torchrun as a
# single word, even when it contains spaces or glob characters.
torchrun --rdzv_endpoint="${MASTER_NODE}:${RDZV_PORT}" "$@"