nm-research committed
Commit 936ddd4
1 Parent(s): 554cf8b
Update README.md

README.md CHANGED
@@ -47,7 +47,7 @@ This model can be deployed efficiently using the [vLLM](https://docs.vllm.ai/en/
 from vllm import LLM, SamplingParams
 from transformers import AutoTokenizer
 
-model_id = "neuralmagic/Qwen2.5-1.5B-quantized.w8a8"
+model_id = "neuralmagic-ent/Qwen2.5-1.5B-quantized.w8a8"
 number_gpus = 1
 max_model_len = 8192
 
@@ -74,7 +74,7 @@ The model was evaluated on the [OpenLLM](https://huggingface.co/spaces/open-llm-
 ```
 lm_eval \
   --model vllm \
-  --model_args pretrained="neuralmagic/Qwen2.5-1.5B-quantized.w8a8",dtype=auto,gpu_memory_utilization=0.9,add_bos_token=True,max_model_len=4096,enable_chunk_prefill=True,tensor_parallel_size=1 \
+  --model_args pretrained="neuralmagic-ent/Qwen2.5-1.5B-quantized.w8a8",dtype=auto,gpu_memory_utilization=0.9,add_bos_token=True,max_model_len=4096,enable_chunk_prefill=True,tensor_parallel_size=1 \
   --tasks openllm \
   --batch_size auto
 ```
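The first hunk only shows the top of the README's vLLM deployment snippet. A minimal sketch of how such an example is typically completed, assuming the standard vLLM offline-generation API and the updated `neuralmagic-ent/Qwen2.5-1.5B-quantized.w8a8` model id; the chat prompt and sampling settings below are illustrative and are not part of this commit:

```python
from vllm import LLM, SamplingParams
from transformers import AutoTokenizer

model_id = "neuralmagic-ent/Qwen2.5-1.5B-quantized.w8a8"
number_gpus = 1
max_model_len = 8192

# Sampling settings are illustrative placeholders, not taken from the README diff.
sampling_params = SamplingParams(temperature=0.7, top_p=0.8, max_tokens=256)

# Format a chat prompt with the model's chat template.
tokenizer = AutoTokenizer.from_pretrained(model_id)
messages = [{"role": "user", "content": "Give me a short introduction to large language models."}]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

# Load the quantized model with vLLM and generate a completion.
llm = LLM(model=model_id, tensor_parallel_size=number_gpus, max_model_len=max_model_len)
outputs = llm.generate(prompt, sampling_params)

print(outputs[0].outputs[0].text)
```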