CONFIG=$1 | |
GPUS=$2 | |
MODEL_NAME=$(basename "$(dirname $CONFIG)") | |
PORT=10902 | |
while : # auto-resume: the code sometimes crash due to bug of gloo on some gpus | |
do | |
torchrun --nproc_per_node=$GPUS \ | |
--master_port=$PORT \ | |
train.py --c $CONFIG --model $MODEL_NAME | |
for PID in $(ps -aux | grep $CONFIG | grep python | awk '{print $2}') | |
do | |
echo $PID | |
kill -9 $PID | |
done | |
sleep 30 | |
done |