alvarobartt committed on
Commit 59af7ca
Parent(s): 57492d6

Update README.md

Files changed (1):
  1. README.md +2 -2

README.md CHANGED
@@ -135,7 +135,7 @@ Then you just need to run the TGI v2.2.0 (or higher) Docker container as follows
  docker run --gpus all --shm-size 1g -ti -p 8080:80 \
  -v hf_cache:/data \
  -e MODEL_ID=hugging-quants/Meta-Llama-3.1-405B-Instruct-GPTQ-INT4 \
- -e NUM_SHARD=4 \
+ -e NUM_SHARD=8 \
  -e QUANTIZE=gptq \
  -e HF_TOKEN=$(cat ~/.cache/huggingface/token) \
  -e MAX_INPUT_LENGTH=4000 \
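Once the TGI container above is running, the server is exposed on host port 8080. A minimal sketch of querying it with `huggingface_hub.InferenceClient`, assuming the container is reachable at `http://localhost:8080`; the prompt and generation parameters are illustrative:

```python
from huggingface_hub import InferenceClient

# Point the client at the local TGI container started above (host port 8080)
client = InferenceClient("http://localhost:8080")

# Illustrative prompt; adjust max_new_tokens as needed
output = client.text_generation(
    "What is Deep Learning?",
    max_new_tokens=128,
)
print(output)
```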
@@ -214,7 +214,7 @@ docker run --runtime nvidia --gpus all --ipc=host -p 8000:8000 \
  vllm/vllm-openai:latest \
  --model hugging-quants/Meta-Llama-3.1-405B-Instruct-GPTQ-INT4 \
  --quantization gptq_marlin \
- --tensor-parallel-size 4 \
+ --tensor-parallel-size 8 \
  --max-model-len 4096
 ```
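Likewise, the vLLM container above exposes an OpenAI-compatible API on host port 8000. A minimal sketch using the `openai` Python client, assuming no `--api-key` was set on the server (so any placeholder key works) and an illustrative prompt:

```python
from openai import OpenAI

# vLLM's OpenAI-compatible server from the command above (host port 8000)
client = OpenAI(base_url="http://localhost:8000/v1", api_key="dummy")

response = client.chat.completions.create(
    model="hugging-quants/Meta-Llama-3.1-405B-Instruct-GPTQ-INT4",
    messages=[{"role": "user", "content": "What is Deep Learning?"}],
    max_tokens=128,
)
print(response.choices[0].message.content)
```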