chuanli-lambda
committed on
Commit
•
a70257c
1
Parent(s):
e841326
Update README.md
Browse files
README.md
CHANGED
@@ -24,4 +24,48 @@ model.quantize(tokenizer, quant_config=quant_config)
|
|
24 |
# Save quantized model
|
25 |
model.save_quantized(quant_path)
|
26 |
tokenizer.save_pretrained(quant_path)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
27 |
```
|
|
|
24 |
# Save quantized model
|
25 |
model.save_quantized(quant_path)
|
26 |
tokenizer.save_pretrained(quant_path)
|
27 |
+
```
|
28 |
+
|
29 |
+
|
30 |
+
vLLM serve
|
31 |
+
```
|
32 |
+
vllm serve lambdalabs/Llama-3.3-70B-Instruct-AWQ-4bit \
|
33 |
+
--swap-space 16 \
|
34 |
+
--disable-log-requests \
|
35 |
+
--tokenizer meta-llama/Llama-3.3-70B-Instruct \
|
36 |
+
--tensor-parallel-size 2
|
37 |
+
```
|
38 |
+
|
39 |
+
|
40 |
+
Benchmark
|
41 |
+
```
|
42 |
+
python benchmark_serving.py \
|
43 |
+
--backend vllm \
|
44 |
+
--model lambdalabs/Llama-3.3-70B-Instruct-AWQ-4bit \
|
45 |
+
--tokenizer meta-llama/Meta-Llama-3-70B \
|
46 |
+
--dataset-name sharegpt \
|
47 |
+
--dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json \
|
48 |
+
--num-prompts 1000
|
49 |
+
|
50 |
+
============ Serving Benchmark Result ============
|
51 |
+
Successful requests: 902
|
52 |
+
Benchmark duration (s): 128.07
|
53 |
+
Total input tokens: 177877
|
54 |
+
Total generated tokens: 182359
|
55 |
+
Request throughput (req/s): 7.04
|
56 |
+
Output token throughput (tok/s): 1423.85
|
57 |
+
Total Token throughput (tok/s): 2812.71
|
58 |
+
---------------Time to First Token----------------
|
59 |
+
Mean TTFT (ms): 47225.59
|
60 |
+
Median TTFT (ms): 43313.95
|
61 |
+
P99 TTFT (ms): 105587.66
|
62 |
+
-----Time per Output Token (excl. 1st token)------
|
63 |
+
Mean TPOT (ms): 141.01
|
64 |
+
Median TPOT (ms): 148.94
|
65 |
+
P99 TPOT (ms): 174.16
|
66 |
+
---------------Inter-token Latency----------------
|
67 |
+
Mean ITL (ms): 131.55
|
68 |
+
Median ITL (ms): 150.82
|
69 |
+
P99 ITL (ms): 344.50
|
70 |
+
==================================================
|
71 |
```
|