# domtts / melo / train.sh
# File-viewer header residue (not part of the script):
#   CDOM201 - "Created app" - a7d2bd1 - 391 Bytes
# Auto-resuming launcher for distributed training via torchrun.
#
# Usage: bash train.sh CONFIG GPUS
#   CONFIG  path to the training config; passed to train.py as --c. The name
#           of its parent directory is passed to train.py as --model.
#   GPUS    value handed to torchrun --nproc_per_node.
#
# The script loops forever: every time torchrun returns, leftover python
# processes whose command line mentions CONFIG are terminated, and training
# is launched again after a pause. Stop it with Ctrl-C.

set -u

if [ "$#" -lt 2 ] || [ -z "$1" ] || [ -z "$2" ]; then
  printf 'Usage: %s CONFIG GPUS\n' "${0##*/}" >&2
  exit 2
fi

CONFIG=$1
GPUS=$2
MODEL_NAME=$(basename -- "$(dirname -- "$CONFIG")")
PORT=10902
TERM_GRACE=5    # seconds a worker gets to exit after SIGTERM before SIGKILL
RESTART_DELAY=30

# Print the PIDs of python processes whose command line contains CONFIG.
# CONFIG is matched as a fixed string (not a regex) and is handed to awk via
# the environment so that awk's own command line never contains it; the
# [p]ython bracket trick keeps the literal word out of awk's arguments too,
# so the helper pipeline cannot match itself. This script's PID is excluded.
list_stale_workers() {
  ps -A -o pid= -o args= \
    | STALE_CFG="$CONFIG" awk -v self="$$" \
        '$1 != self && index($0, ENVIRON["STALE_CFG"]) && /[p]ython/ { print $1 }'
}

# Terminate leftover workers: polite SIGTERM first, SIGKILL for whatever is
# still alive after the grace period. Each targeted PID is echoed to stdout.
kill_stale_workers() {
  stale=$(list_stale_workers)
  [ -n "$stale" ] || return 0

  for PID in $stale; do
    printf '%s\n' "$PID"
    # The process may already be gone; that is not an error here.
    kill -TERM "$PID" 2>/dev/null || true
  done

  sleep "$TERM_GRACE"

  # Re-list instead of reusing $stale so only still-running workers are hit.
  for PID in $(list_stale_workers); do
    kill -KILL "$PID" 2>/dev/null || true
  done
}

# Without this trap, a Ctrl-C that torchrun handles itself (exiting normally)
# would let the loop continue; bash runs the trap once the foreground
# command has finished, which ends the restart loop.
trap 'exit 130' INT

while :; do # auto-resume: training sometimes crashes due to a gloo bug on some GPUs
  torchrun --nproc_per_node="$GPUS" \
    --master_port="$PORT" \
    train.py --c "$CONFIG" --model "$MODEL_NAME"

  kill_stale_workers
  sleep "$RESTART_DELAY"
done