Commit: 59af7ca
Parent(s): 57492d6

Update README.md
README.md (CHANGED)
@@ -135,7 +135,7 @@ Then you just need to run the TGI v2.2.0 (or higher) Docker container as follows
 docker run --gpus all --shm-size 1g -ti -p 8080:80 \
     -v hf_cache:/data \
     -e MODEL_ID=hugging-quants/Meta-Llama-3.1-405B-Instruct-GPTQ-INT4 \
-    -e NUM_SHARD=
+    -e NUM_SHARD=8 \
     -e QUANTIZE=gptq \
     -e HF_TOKEN=$(cat ~/.cache/huggingface/token) \
     -e MAX_INPUT_LENGTH=4000 \
@@ -214,7 +214,7 @@ docker run --runtime nvidia --gpus all --ipc=host -p 8000:8000 \
     vllm/vllm-openai:latest \
     --model hugging-quants/Meta-Llama-3.1-405B-Instruct-GPTQ-INT4 \
     --quantization gptq_marlin \
-    --tensor-parallel-size
+    --tensor-parallel-size 8 \
     --max-model-len 4096
 ```
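The change fills in the missing NUM_SHARD value: with NUM_SHARD=8, TGI shards the GPTQ-INT4 405B weights across 8 GPUs. Once that container is up, a minimal smoke test could look like the sketch below (assuming the container runs locally with the -p 8080:80 mapping from the diff; the prompt text is illustrative):

    # Query TGI's /generate endpoint on the host port mapped above
    curl http://localhost:8080/generate \
        -H "Content-Type: application/json" \
        -d '{"inputs": "What is Deep Learning?", "parameters": {"max_new_tokens": 128}}'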
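Likewise, --tensor-parallel-size 8 tells vLLM to split the model across 8 GPUs. A matching smoke test against vLLM's OpenAI-compatible server (again a sketch, assuming the container runs locally with the -p 8000:8000 mapping shown above):

    # Query vLLM's OpenAI-compatible chat completions endpoint
    curl http://localhost:8000/v1/chat/completions \
        -H "Content-Type: application/json" \
        -d '{
            "model": "hugging-quants/Meta-Llama-3.1-405B-Instruct-GPTQ-INT4",
            "messages": [{"role": "user", "content": "What is Deep Learning?"}],
            "max_tokens": 128
        }'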