aiben / tests /test4gpus.sh
abugaber's picture
Upload folder using huggingface_hub
3943768 verified
raw
history blame
2.09 kB
#!/bin/bash
# Settings for the parallel test run, followed by cleanup of processes that
# may be left over from an earlier run. Later parts of this script start one
# background pytest process per value of 0..TESTMODULOTOTAL-1.

# CHOOSE:
ngpus=4
# below has to match GPUs for A6000s due to long context tests
export TESTMODULOTOTAL=4

# Best effort: the script continues even when the plugin cannot be installed.
pip install pytest-instafail || true

# Stop running containers whose `docker ps` line mentions
# text-generation-inference. -r (--no-run-if-empty) keeps xargs from invoking
# "docker stop" with no arguments when nothing matches.
docker ps | grep text-generation-inference | awk '{print $1}' | xargs -r docker stop

# Signal any running pytest processes: SIGINT, then SIGTERM, then SIGKILL.
killall -s SIGINT pytest
killall -s SIGTERM pytest
killall -s 9 pytest
# SIGKILL any process whose command line matches weaviate-embedded/weaviate.
pkill --signal 9 -f weaviate-embedded/weaviate
# Count CPUs from `lscpu -p`, skipping its '#' header lines.
# NPHYSICAL: number of distinct values of comma-separated fields 2-4
#            (presumably core,socket,node in lscpu's default column order).
# NPROCS:    number of logical CPUs (one line each).
NPHYSICAL=$(lscpu -p | grep -v '^#' | sort -u -t, -k 2,4 | wc -l)
NPROCS=$(lscpu -p | grep -vc '^#')
#
# Threads available to each parallel test process.
n_jobs=$((NPROCS / TESTMODULOTOTAL))
# Integer division yields 0 when there are fewer CPUs than test processes;
# a thread count of 0 is not usable, so fall back to a single thread.
if ((n_jobs < 1)); then
  n_jobs=1
fi
echo "CORES: $NPHYSICAL $NPROCS $n_jobs"
# GENERAL:
# Start one background pytest process for every TESTMODULO value in
# [low, high] and collect the process IDs in $pids (space separated).
lowergpuid=0
low=0
high=$((TESTMODULOTOTAL - 1))
pids=""
next_mod=$low
while ((next_mod <= high)); do
  mod=$next_mod

  # in some cases launch gradio server, TGI server, or gradio server as inference server with +1 and +2 off base port
  # ports always increment by 3
  GRADIO_SERVER_PORT=$((7860 + mod * 3))
  TESTMODULO=$mod
  # CVD loops over number of GPUs
  CUDA_VISIBLE_DEVICES=$((lowergpuid + mod % ngpus))
  H2OGPT_BASE_PATH="./base_$mod"
  export GRADIO_SERVER_PORT TESTMODULO CUDA_VISIBLE_DEVICES H2OGPT_BASE_PATH

  # Every thread-count variable gets the same per-process value.
  # By default, OpenBLAS will restrict the Cpus_allowed to be 0x1
  # (hence OPENBLAS_MAIN_FREE is set as well).
  export n_jobs="$n_jobs"
  for thread_var in OMP_NUM_THREADS NUMEXPR_MAX_THREADS OPENBLAS_NUM_THREADS \
    OPENBLAS_MAIN_FREE MKL_NUM_THREADS; do
    export "$thread_var=$n_jobs"
  done

  # huggyllama test uses alot of memory, requires TESTMODULOTOTAL=ngpus for even A6000s
  # pytest --instafail -s -v -n 1 tests -k "not test_huggyllama_transformers_pr" &> testsparallel"${mod}".log &
  # stdout and stderr of each process go to its own log file.
  pytest --instafail -s -v -n 1 tests > "testsparallel${mod}.log" 2>&1 &
  pid=$!
  echo "MODS: $mod $GRADIO_SERVER_PORT $CUDA_VISIBLE_DEVICES $H2OGPT_BASE_PATH"
  pids+=" $pid"

  next_mod=$((next_mod + 1))
done
#######################################
# Wait for each given background job and report whether all of them succeeded.
# Arguments: PIDs of background jobs started by this shell
# Returns:   0 if every job exited with status 0 (or no PID was given),
#            1 if at least one job exited non-zero
#######################################
wait_for_tests() {
  local job_pid failed=0
  for job_pid in "$@"; do
    wait "$job_pid" || failed=1
  done
  return "$failed"
}

# $pids is expanded right now on purpose: the handler should kill exactly the
# processes that were started above.
# shellcheck disable=SC2064
trap "kill $pids; exit 1" INT
echo "to check on results while running, do:"
echo "grep -a PASSED testsparallel*.log | sed 's/.*PASSED//g' | sort | uniq |wc -l"
echo "grep -a FAILED testsparallel*.log | sed 's/.*FAILED//g' | sort | uniq |wc -l"
echo "to interrupt but still get some results, do:"
#echo "ps -auxwf | grep -v "[g]rep" | grep pytest | awk '{print $2}' |xargs kill -s SIGINT"
echo "kill -s SIGINT $pids"
# A bare `wait` always returns 0 and would hide failing test processes.
# Waiting on each PID makes this (last) command, and therefore the script,
# finish with a non-zero status when any of them failed.
# Word splitting of the space-separated $pids list is intended.
# shellcheck disable=SC2086
wait_for_tests $pids