FlorianJc committed
Commit 69e3ae5
1 Parent(s): 749e4ee

Update README.md

Files changed (1)
  1. README.md +31 -0
README.md CHANGED
@@ -22,6 +22,37 @@ FP8 (F8_E4M3) quantized version of Mistral-Nemo-Instruct-2407 with 512 epochs.
Should work with transformers, but you need this patch (shown below) to use it with vLLM: https://github.com/vllm-project/vllm/pull/6548
Or simply wait for vLLM 0.5.3...

+ ```diff
+ --- vllm/model_executor/models/llama.py 2024-07-19 02:01:59.192831673 +0200
+ +++ vllm/model_executor/models/llama.py 2024-07-19 02:01:36.752721235 +0200
+ @@ -89,6 +89,7 @@
+ 
+      def __init__(
+          self,
+ +        config: LlamaConfig,
+          hidden_size: int,
+          num_heads: int,
+          num_kv_heads: int,
+ @@ -115,7 +116,8 @@
+          # the KV heads across multiple tensor parallel GPUs.
+          assert tp_size % self.total_num_kv_heads == 0
+          self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size)
+ -        self.head_dim = hidden_size // self.total_num_heads
+ +        # MistralConfig has an optional head_dim introduced by Mistral-Nemo
+ +        self.head_dim = getattr(config, "head_dim", self.hidden_size // self.total_num_heads)
+          self.q_size = self.num_heads * self.head_dim
+          self.kv_size = self.num_kv_heads * self.head_dim
+          self.scaling = self.head_dim**-0.5
+ @@ -189,6 +191,7 @@
+          attention_bias = getattr(config, "attention_bias", False) or getattr(
+              config, "bias", False)
+          self.self_attn = LlamaAttention(
+ +            config=config,
+              hidden_size=self.hidden_size,
+              num_heads=config.num_attention_heads,
+              num_kv_heads=getattr(config, "num_key_value_heads",
+ ```
+
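To make the patch concrete: Mistral-Nemo sets an explicit `head_dim` (128) that differs from `hidden_size // num_attention_heads` (5120 // 32 = 160), so deriving the head dimension from the hidden size loads the checkpoint with the wrong shapes. Here is a rough sketch of the fallback the patched line adds; the config values are my reading of Mistral-Nemo's config.json, so treat them as assumptions.

```python
# Sketch only: mimics the patched getattr fallback with assumed Mistral-Nemo config values.
from types import SimpleNamespace

nemo_config = SimpleNamespace(hidden_size=5120, num_attention_heads=32, head_dim=128)

# Old behaviour: derive head_dim from hidden_size -> 160, which does not match the weights.
derived = nemo_config.hidden_size // nemo_config.num_attention_heads
print(derived)  # 160

# Patched behaviour: prefer the explicit head_dim when the config provides one.
head_dim = getattr(nemo_config, "head_dim", derived)
print(head_dim)  # 128
```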
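Once a vLLM build that includes the patch (or vLLM 0.5.3+) is installed, offline inference could look roughly like the sketch below; the repo id and the generation settings are placeholders, not tested values.

```python
# Illustrative sketch, not a verified recipe: serve this FP8 checkpoint with vLLM >= 0.5.3.
from vllm import LLM, SamplingParams

llm = LLM(
    model="FlorianJc/Mistral-Nemo-Instruct-2407-fp8",  # placeholder repo id, adjust to the actual one
    max_model_len=8192,  # example value only
)
params = SamplingParams(temperature=0.3, max_tokens=256)
outputs = llm.generate(["Explain FP8 (E4M3) quantization in two sentences."], params)
print(outputs[0].outputs[0].text)
```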
 
# Original model README.md file: