Spaces:
Runtime error
Runtime error
stevengrove
committed on
Commit
•
54cb9a9
1
Parent(s):
e6a8b63
Delete taiji
Browse files- taiji/drun +0 -35
- taiji/erun +0 -23
- taiji/etorchrun +0 -51
- taiji/jizhi_run_vanilla +0 -105
taiji/drun
DELETED
@@ -1,35 +0,0 @@
|
|
1 |
-
#!/bin/bash
# Run a command inside the fastdet Docker image through nvidia-docker.
#
# Usage: drun <command> [args...]
#
# The container is started with host networking, the host DISPLAY and
# ~/.Xauthority, the /apdcephfs*, /data mounts, and the current directory as
# its working directory. Inside the container the cache directories are
# pointed at ./work_dirs/.cache, the project is pip-installed in editable
# mode, and then the given command is run.
#
# Environment:
#   DEBUG  when non-empty, the packages under third_party/ are installed in
#          editable mode as well, before the project itself.

DOCKER_IMAGE="mirrors.tencent.com/ronnysong_rd/fastdet:torch2.0.1-cuda11.7"

if [[ -z "$DEBUG" ]]; then
    COMMAND_PREFIX="pip3 install -e ."
else
    COMMAND_PREFIX="pip3 install -q -e third_party/mmengine;
        pip3 install -q -e third_party/mmdetection;
        pip3 install -q -e third_party/mmcv;
        pip3 install -q -e third_party/mmyolo;
        pip3 install -q -e ."
fi

# The cache root is spliced into the `bash -c` string below and re-parsed by
# the inner shell, so it is shell-escaped once here. For a path without
# special characters the escaped form is identical to the plain path.
CACHE_ROOT=$(printf '%q' "$PWD/work_dirs/.cache")

# "$*" joins the caller's arguments into one command line that the inner
# shell parses again; it runs right after the install step.
sudo nvidia-docker run \
    --rm \
    -it \
    -e NVIDIA_VISIBLE_DEVICES=all \
    --env="DISPLAY" \
    --env="QT_X11_NO_MITSHM=1" \
    --volume="$HOME/.Xauthority:/root/.Xauthority:rw" \
    --shm-size=20gb \
    --network=host \
    -v /apdcephfs/:/apdcephfs/ \
    -v /apdcephfs_cq2/:/apdcephfs_cq2/ \
    -v /apdcephfs_cq3/:/apdcephfs_cq3/ \
    -v /data/:/data/ \
    -w "$PWD" \
    "$DOCKER_IMAGE" \
    bash -c "export TRANSFORMERS_CACHE=$CACHE_ROOT/transformers;
        export TORCH_HOME=$CACHE_ROOT/torch;
        export CLIP_CACHE=$CACHE_ROOT/clip;
        export HF_HOME=$CACHE_ROOT/hf;
        export TOKENIZERS_PARALLELISM=false;
        $COMMAND_PREFIX
        $*"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
taiji/erun
DELETED
@@ -1,23 +0,0 @@
|
|
1 |
-
#!/bin/bash
# Job entry point: set up caches, threading limits, offline mode and the
# HTTP proxy, install the project in editable mode, then run the given command.
#
# Usage: erun <command> [args...]
#
# Exits with the status of <command>. /tmp/.unhold exists while the install
# step and the command are running and is removed afterwards.

export NCCL_IB_GID_INDEX=3

# Keep every model/tokenizer cache under the project's work_dirs.
export TRANSFORMERS_CACHE="$PWD/work_dirs/.cache/transformers"
export TORCH_HOME="$PWD/work_dirs/.cache/torch"
export CLIP_CACHE="$PWD/work_dirs/.cache/clip"
export HF_HOME="$PWD/work_dirs/.cache/hf"
export TOKENIZERS_PARALLELISM=false
export MKL_NUM_THREADS=1
export OMP_NUM_THREADS=1
export TORCH_DISTRIBUTED_DEBUG=INFO
export HF_DATASETS_OFFLINE=1
export TRANSFORMERS_OFFLINE=1

export http_proxy="http://star-proxy.oa.com:3128"
export https_proxy="http://star-proxy.oa.com:3128"
export ftp_proxy="http://star-proxy.oa.com:3128"
export no_proxy=".woa.com,mirrors.cloud.tencent.com,tlinux-mirror.tencent-cloud.com,tlinux-mirrorlist.tencent-cloud.com,localhost,127.0.0.1,mirrors-tlinux.tencentyun.com,.oa.com,.local,.3gqq.com,.7700.org,.ad.com,.ada_sixjoy.com,.addev.com,.app.local,.apps.local,.aurora.com,.autotest123.com,.bocaiwawa.com,.boss.com,.cdc.com,.cdn.com,.cds.com,.cf.com,.cjgc.local,.cm.com,.code.com,.datamine.com,.dvas.com,.dyndns.tv,.ecc.com,.expochart.cn,.expovideo.cn,.fms.com,.great.com,.hadoop.sec,.heme.com,.home.com,.hotbar.com,.ibg.com,.ied.com,.ieg.local,.ierd.com,.imd.com,.imoss.com,.isd.com,.isoso.com,.itil.com,.kao5.com,.kf.com,.kitty.com,.lpptp.com,.m.com,.matrix.cloud,.matrix.net,.mickey.com,.mig.local,.mqq.com,.oiweb.com,.okbuy.isddev.com,.oss.com,.otaworld.com,.paipaioa.com,.qqbrowser.local,.qqinternal.com,.qqwork.com,.rtpre.com,.sc.oa.com,.sec.com,.server.com,.service.com,.sjkxinternal.com,.sllwrnm5.cn,.sng.local,.soc.com,.t.km,.tcna.com,.teg.local,.tencentvoip.com,.tenpayoa.com,.test.air.tenpay.com,.tr.com,.tr_autotest123.com,.vpn.com,.wb.local,.webdev.com,.webdev2.com,.wizard.com,.wqq.com,.wsd.com,.sng.com,.music.lan,.mnet2.com,.tencentb2.com,.tmeoa.com,.pcg.com,www.wip3.adobe.com,www-mm.wip3.adobe.com,mirrors.tencent.com,csighub.tencentyun.com"

# Patch the installed lvis package in place, replacing `np.float` with the
# builtin `float` (presumably because the installed NumPy no longer provides
# the alias -- TODO confirm). The pattern is anchored on word boundaries with
# an escaped dot so names such as np.float64 or np.floating are left alone.
sed -i 's/\bnp\.float\b/float/g' /usr/local/python/lib/python3.8/site-packages/lvis/eval.py

touch /tmp/.unhold

pip3 install -e .

# NOTE(review): the arguments are deliberately left unquoted so that a command
# passed as a single string keeps working; the cost is that arguments
# containing whitespace or glob characters are re-split/expanded.
# shellcheck disable=SC2048
$*
status=$?

rm -f /tmp/.unhold

# Report the wrapped command's result instead of the result of `rm`.
exit "$status"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
taiji/etorchrun
DELETED
@@ -1,51 +0,0 @@
|
|
1 |
-
#!/bin/bash
# Job entry point for multi-node training: configure NCCL, caches and the
# HTTP proxy, install the project in editable mode, then launch torchrun.
#
# Usage: etorchrun NNODES NPROC_PER_NODE <torchrun args...>
#
# Environment:
#   SH        when non-empty, use the InfiniBand / bond1 NCCL profile;
#             otherwise InfiniBand and P2P are disabled and eth1 is used.
#   INDEX     passed to torchrun as --node_rank.
#   CHIEF_IP  passed to torchrun as --master_addr.
#             (INDEX and CHIEF_IP are presumably injected by the cluster
#             scheduler -- TODO confirm.)
#
# Exits with the status of torchrun. /tmp/.unhold exists while the install
# step and torchrun are running and is removed afterwards.

if [[ -z "$SH" ]]; then
    #export NCCL_IB_GID_INDEX=3
    export NCCL_IB_DISABLE=1
    export NCCL_P2P_DISABLE=1
    export NCCL_SOCKET_IFNAME=eth1
else
    export NCCL_IB_GID_INDEX=3
    export NCCL_IB_SL=3
    export NCCL_CHECKS_DISABLE=1
    export NCCL_P2P_DISABLE=0
    export NCCL_IB_DISABLE=0
    export NCCL_LL_THRESHOLD=16384
    export NCCL_IB_CUDA_SUPPORT=1
    export NCCL_SOCKET_IFNAME=bond1
    export UCX_NET_DEVICES=bond1
    export NCCL_IB_HCA=mlx5_bond_1,mlx5_bond_5,mlx5_bond_3,mlx5_bond_7,mlx5_bond_4,mlx5_bond_8,mlx5_bond_2,mlx5_bond_6
    export NCCL_COLLNET_ENABLE=0
    export SHARP_COLL_ENABLE_SAT=0
    export NCCL_NET_GDR_LEVEL=2
    export NCCL_IB_QPS_PER_CONNECTION=4
    export NCCL_IB_TC=160
    export NCCL_PXN_DISABLE=1
    export GLOO_SOCKET_IFNAME=bond1
    export NCCL_DEBUG=info
fi

# Keep every model/tokenizer cache under the project's work_dirs.
export TRANSFORMERS_CACHE="$PWD/work_dirs/.cache/transformers"
export TORCH_HOME="$PWD/work_dirs/.cache/torch"
export CLIP_CACHE="$PWD/work_dirs/.cache/clip"
export HF_HOME="$PWD/work_dirs/.cache/hf"
export TOKENIZERS_PARALLELISM=false
export MKL_NUM_THREADS=1
export OMP_NUM_THREADS=1
export TORCH_DISTRIBUTED_DEBUG=INFO
export HF_DATASETS_OFFLINE=1
export TRANSFORMERS_OFFLINE=1

export http_proxy="http://star-proxy.oa.com:3128"
export https_proxy="http://star-proxy.oa.com:3128"
export ftp_proxy="http://star-proxy.oa.com:3128"
export no_proxy=".woa.com,mirrors.cloud.tencent.com,tlinux-mirror.tencent-cloud.com,tlinux-mirrorlist.tencent-cloud.com,localhost,127.0.0.1,mirrors-tlinux.tencentyun.com,.oa.com,.local,.3gqq.com,.7700.org,.ad.com,.ada_sixjoy.com,.addev.com,.app.local,.apps.local,.aurora.com,.autotest123.com,.bocaiwawa.com,.boss.com,.cdc.com,.cdn.com,.cds.com,.cf.com,.cjgc.local,.cm.com,.code.com,.datamine.com,.dvas.com,.dyndns.tv,.ecc.com,.expochart.cn,.expovideo.cn,.fms.com,.great.com,.hadoop.sec,.heme.com,.home.com,.hotbar.com,.ibg.com,.ied.com,.ieg.local,.ierd.com,.imd.com,.imoss.com,.isd.com,.isoso.com,.itil.com,.kao5.com,.kf.com,.kitty.com,.lpptp.com,.m.com,.matrix.cloud,.matrix.net,.mickey.com,.mig.local,.mqq.com,.oiweb.com,.okbuy.isddev.com,.oss.com,.otaworld.com,.paipaioa.com,.qqbrowser.local,.qqinternal.com,.qqwork.com,.rtpre.com,.sc.oa.com,.sec.com,.server.com,.service.com,.sjkxinternal.com,.sllwrnm5.cn,.sng.local,.soc.com,.t.km,.tcna.com,.teg.local,.tencentvoip.com,.tenpayoa.com,.test.air.tenpay.com,.tr.com,.tr_autotest123.com,.vpn.com,.wb.local,.webdev.com,.webdev2.com,.wizard.com,.wqq.com,.wsd.com,.sng.com,.music.lan,.mnet2.com,.tencentb2.com,.tmeoa.com,.pcg.com,www.wip3.adobe.com,www-mm.wip3.adobe.com,mirrors.tencent.com,csighub.tencentyun.com"

# Patch the installed lvis package in place, replacing `np.float` with the
# builtin `float` (presumably because the installed NumPy no longer provides
# the alias -- TODO confirm). The pattern is anchored on word boundaries with
# an escaped dot so names such as np.float64 or np.floating are left alone.
sed -i 's/\bnp\.float\b/float/g' /usr/local/python/lib/python3.8/site-packages/lvis/eval.py

touch /tmp/.unhold

pip3 install -e .

# NOTE(review): the trailing arguments are deliberately left unquoted so that
# a script-plus-options passed as one string keeps working; the cost is that
# arguments containing whitespace or glob characters are re-split/expanded.
# shellcheck disable=SC2068
torchrun --nnodes="$1" --nproc_per_node="$2" --node_rank="$INDEX" --master_addr="$CHIEF_IP" ${@:3}
status=$?

rm -f /tmp/.unhold

# Report torchrun's result instead of the result of `rm`.
exit "$status"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
taiji/jizhi_run_vanilla
DELETED
@@ -1,105 +0,0 @@
|
|
1 |
-
#!/bin/bash
# Submit a job through jizhi_client.
#
# Positional arguments:
#   $1     number of hosts          (written as "host_num")
#   $2     GPUs per host            (written as "host_gpu_num")
#   $3     task name; the submission time in epoch seconds is appended
#   $4...  command line that the generated start.sh runs
#
# Environment (each default applies when the variable is unset or empty):
#   TOKEN               written as "Token" (no default)
#   IMAGE_FULL_NAME     mirrors.tencent.com/ronnysong_rd/fastdet:torch2.0.1-cuda11.7
#   BUSINESS_FLAG       TEG_AILab_CVC_chongqing
#   CEPH_BUSINESS_FLAG  TEG_AILab_CVC_chongqing
#   GPU_NAME            V100
#   PRIORITY_LEVEL      HIGH
#   ELASTIC_LEVEL       1
#   RDMA                false
#   CUDA                11.0
#
# start.sh and jizhi_conf.json are generated in the current directory and
# removed again when the script exits. The exit status is jizhi_client's.

usage() {
    echo "Usage: jizhi_run NUM_MECHINES NUM_GPUS TASK_NAME <CMDS>"
}

# Asking for help must not fall through and submit a job.
if [[ $1 = "--help" ]] || [[ $1 = "-h" ]]; then
    usage
    exit 0
fi

# Without all four arguments the generated JSON would be malformed.
if (( $# < 4 )); then
    usage >&2
    exit 2
fi

# user configuration
: "${IMAGE_FULL_NAME:=mirrors.tencent.com/ronnysong_rd/fastdet:torch2.0.1-cuda11.7}"
: "${BUSINESS_FLAG:=TEG_AILab_CVC_chongqing}"
: "${CEPH_BUSINESS_FLAG:=TEG_AILab_CVC_chongqing}"
: "${GPU_NAME:=V100}"
: "${PRIORITY_LEVEL:=HIGH}"
: "${ELASTIC_LEVEL:=1}"
: "${RDMA:=false}"
: "${CUDA:=11.0}"

CMD_PATH="start.sh"
CONF_PATH="jizhi_conf.json"
ROOT_PATH=$PWD
UUID=$(date +%s)

rm -f -- "$CMD_PATH" "$CONF_PATH"

# The config file contains the token, so both generated files are removed
# when the script exits, including after a failed submission.
trap 'rm -f -- "$CMD_PATH" "$CONF_PATH"' EXIT

# --- start.sh: environment setup followed by the user command -------------
{
    # %q leaves an ordinary path unchanged and escapes one containing spaces.
    printf 'cd %q\n' "$ROOT_PATH"
    cat <<EOF
export HF_HOME="$ROOT_PATH/work_dirs/.cache/hf"
export TORCH_HOME="$ROOT_PATH/work_dirs/.cache/torch"
export CLIP_CACHE="$ROOT_PATH/work_dirs/.cache/clip"
export TRANSFORMERS_CACHE="$ROOT_PATH/work_dirs/.cache/transformers"
export MKL_NUM_THREADS=1
export OMP_NUM_THREADS=1
export TOKENIZERS_PARALLELISM=false
export TORCH_DISTRIBUTED_DEBUG=INFO
export NCCL_IB_GID_INDEX=3
EOF
    # Extra NCCL settings emitted only for this business flag. The repeated
    # NCCL_IB_GID_INDEX line is kept so the generated file is unchanged.
    if [[ "$BUSINESS_FLAG" == "TaiJi_HYAide_BUFFER_SH_A800H" ]]; then
        cat <<'EOF'
export NCCL_IB_GID_INDEX=3
export NCCL_IB_SL=3
export NCCL_CHECKS_DISABLE=1
export NCCL_P2P_DISABLE=0
export NCCL_IB_DISABLE=0
export NCCL_LL_THRESHOLD=16384
export NCCL_IB_CUDA_SUPPORT=1
export NCCL_SOCKET_IFNAME=bond1
export UCX_NET_DEVICES=bond1
export NCCL_IB_HCA=mlx5_bond_1,mlx5_bond_5,mlx5_bond_3,mlx5_bond_7,mlx5_bond_4,mlx5_bond_8,mlx5_bond_2,mlx5_bond_6
export NCCL_COLLNET_ENABLE=0
export SHARP_COLL_ENABLE_SAT=0
export NCCL_NET_GDR_LEVEL=2
export NCCL_IB_QPS_PER_CONNECTION=4
export NCCL_IB_TC=160
export NCCL_PXN_DISABLE=1
EOF
    fi
    # Join the command words with spaces. printf is used instead of an
    # unquoted echo so glob characters are not expanded at submit time and a
    # leading -n/-e is not taken as an echo option.
    printf '%s\n' "${*:4}"
} > "$CMD_PATH"

chmod +x "$CMD_PATH"

#INIT_CMD="jizhi_client mount -bf TEG_AILab_CVC_chongqing -tk $TOKEN"
INIT_CMD=""

# --- jizhi_conf.json -------------------------------------------------------
# Values are inserted verbatim (no JSON escaping), as before.
cat > "$CONF_PATH" <<EOF
{
"Token": "$TOKEN",
"business_flag": "$BUSINESS_FLAG",
"model_local_file_path": "$ROOT_PATH/$CMD_PATH",
"host_num": $1,
"host_gpu_num": $2,
"task_flag": "${3}_$UUID",
"priority_level": "$PRIORITY_LEVEL",
"elastic_level": $ELASTIC_LEVEL,
"cuda_version": "$CUDA",
"image_full_name": "$IMAGE_FULL_NAME",
"GPUName": "$GPU_NAME",
"mount_ceph_business_flag": "$CEPH_BUSINESS_FLAG",
"exec_start_in_all_mpi_pods": true,
"enable_rdma": $RDMA,
"init_cmd": "$INIT_CMD",
"envs": {
 "HUNYUAN_TASK_CATEGORY": "LLM",
 "HUNYUAN_TASK_MODEL_TYPE": "SFT",
 "HUNYUAN_TASK_DOMAIN": "NLP",
 "HUNYUAN_TASK_START_MODEL_TYPE": "7B冷启"}
}
EOF

# Last command: its status becomes the script's exit status; the EXIT trap
# then removes the generated files.
jizhi_client start -scfg "$CONF_PATH"
|
105 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|