Update README.md
Browse files
README.md
CHANGED
@@ -22,6 +22,37 @@ FP8 (F8_E4M3) quantized version of Mistral-Nemo-Instruct-2407 with 512 epochs.
|
|
22 |
Should work with transformers, but you need this patch to use it with vLLM: https://github.com/vllm-project/vllm/pull/6548
|
23 |
Or simply wait for vLLM 0.5.3...
|
24 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
25 |
|
26 |
# Original model README.md file:
|
27 |
|
|
|
22 |
Should work with transformers, but you need this patch to use it with vLLM: https://github.com/vllm-project/vllm/pull/6548
|
23 |
Or simply wait for vLLM 0.5.3...
|
24 |
|
25 |
```diff
--- vllm/model_executor/models/llama.py	2024-07-19 02:01:59.192831673 +0200
+++ vllm/model_executor/models/llama.py	2024-07-19 02:01:36.752721235 +0200
@@ -89,6 +89,7 @@

     def __init__(
         self,
+        config: LlamaConfig,
         hidden_size: int,
         num_heads: int,
         num_kv_heads: int,
@@ -115,7 +116,8 @@
         # the KV heads across multiple tensor parallel GPUs.
         assert tp_size % self.total_num_kv_heads == 0
         self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
-        self.head_dim = hidden_size // self.total_num_heads
+        # MistralConfig has an optional head_dim introduced by Mistral-Nemo
+        self.head_dim = getattr(config, "head_dim", self.hidden_size // self.total_num_heads)
         self.q_size = self.num_heads * self.head_dim
         self.kv_size = self.num_kv_heads * self.head_dim
         self.scaling = self.head_dim**-0.5
@@ -189,6 +191,7 @@
         attention_bias = getattr(config, "attention_bias", False) or getattr(
             config, "bias", False)
         self.self_attn = LlamaAttention(
+            config=config,
             hidden_size=self.hidden_size,
             num_heads=config.num_attention_heads,
             num_kv_heads=getattr(config, "num_key_value_heads",
```
|
56 |
|
57 |
# Original model README.md file:
|
58 |
|