msr2000 committed
Commit: 413b83f
Parent(s): 896a24c

Update code & readme

- README.md +4 -2
- modeling_deepseek.py +0 -12
README.md CHANGED

@@ -189,7 +189,8 @@ model_name = "deepseek-ai/DeepSeek-V2"
 tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
 # `max_memory` should be set based on your devices
 max_memory = {i: "75GB" for i in range(8)}
-
+# `device_map` cannot be set to `auto`
+model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, device_map="sequential", torch_dtype=torch.bfloat16, max_memory=max_memory, attn_implementation="eager")
 model.generation_config = GenerationConfig.from_pretrained(model_name)
 model.generation_config.pad_token_id = model.generation_config.eos_token_id
 
@@ -210,7 +211,8 @@ model_name = "deepseek-ai/DeepSeek-V2-Chat"
 tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
 # `max_memory` should be set based on your devices
 max_memory = {i: "75GB" for i in range(8)}
-
+# `device_map` cannot be set to `auto`
+model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, device_map="sequential", torch_dtype=torch.bfloat16, max_memory=max_memory, attn_implementation="eager")
 model.generation_config = GenerationConfig.from_pretrained(model_name)
 model.generation_config.pad_token_id = model.generation_config.eos_token_id
 
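After this change, both README examples (DeepSeek-V2 and DeepSeek-V2-Chat) load the model with `device_map="sequential"` and `attn_implementation="eager"`. For context, a minimal end-to-end sketch of the updated base-model snippet is shown below; only the lines visible in the hunks above come from this diff, while the imports and the final generation call are assumptions about the surrounding README and are purely illustrative.

```python
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig

model_name = "deepseek-ai/DeepSeek-V2"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# `max_memory` should be set based on your devices
max_memory = {i: "75GB" for i in range(8)}
# `device_map` cannot be set to `auto`; eager attention is required now that SDPA is removed
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    device_map="sequential",
    torch_dtype=torch.bfloat16,
    max_memory=max_memory,
    attn_implementation="eager",
)
model.generation_config = GenerationConfig.from_pretrained(model_name)
model.generation_config.pad_token_id = model.generation_config.eos_token_id

# Illustrative only, not part of this diff: a minimal completion call
inputs = tokenizer("An attention function can be described as", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```

The switch to `attn_implementation="eager"` matches the modeling_deepseek.py changes below, which drop SDPA support from the bundled modeling code.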
modeling_deepseek.py CHANGED

@@ -34,7 +34,6 @@ from transformers.modeling_attn_mask_utils import (
     AttentionMaskConverter,
     _prepare_4d_attention_mask,
     _prepare_4d_causal_attention_mask,
-    _prepare_4d_causal_attention_mask_for_sdpa,
 )
 from transformers.modeling_outputs import (
     BaseModelOutputWithPast,
@@ -1295,7 +1294,6 @@ class DeepseekV2PreTrainedModel(PreTrainedModel):
     _no_split_modules = ["DeepseekV2DecoderLayer"]
     _skip_keys_device_placement = "past_key_values"
     _supports_flash_attn_2 = True
-    _supports_sdpa = True
     _supports_cache_class = True
 
     def _init_weights(self, module):
@@ -1406,7 +1404,6 @@ class DeepseekV2Model(DeepseekV2PreTrainedModel):
                 for layer_idx in range(config.num_hidden_layers)
             ]
         )
-        self._use_sdpa = config._attn_implementation == "sdpa"
         self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
         self.norm = DeepseekV2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
 
@@ -1495,15 +1492,6 @@ class DeepseekV2Model(DeepseekV2PreTrainedModel):
                 if (attention_mask is not None and 0 in attention_mask)
                 else None
             )
-        elif self._use_sdpa and not output_attentions:
-            # output_attentions=True can not be supported when using SDPA, and we fall back on
-            # the manual implementation that requires a 4D causal mask in all cases.
-            attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
-                attention_mask,
-                (batch_size, seq_length),
-                inputs_embeds,
-                past_key_values_length,
-            )
         else:
             # 4d mask is passed through the layers
             attention_mask = _prepare_4d_causal_attention_mask(
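With `_supports_sdpa`, `self._use_sdpa`, and the SDPA mask branch removed, any forward pass that does not use FlashAttention-2 now always builds a full 4D causal mask via `_prepare_4d_causal_attention_mask`. A standalone sketch of that remaining path is below; the helper is the one imported in the first hunk, and the argument order mirrors the removed SDPA call, but the toy batch and hidden sizes are made up for illustration and this assumes a transformers version that still ships this internal helper.

```python
import torch
from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask

# Toy shapes, for illustration only
batch_size, seq_length, hidden_size = 1, 4, 8
attention_mask = torch.ones(batch_size, seq_length, dtype=torch.long)  # 2D padding mask from the tokenizer
inputs_embeds = torch.zeros(batch_size, seq_length, hidden_size)

# The `else:` branch that eager attention now always takes in DeepseekV2Model.forward
mask_4d = _prepare_4d_causal_attention_mask(
    attention_mask,
    (batch_size, seq_length),
    inputs_embeds,
    0,  # past_key_values_length
)
print(mask_4d.shape)  # torch.Size([1, 1, 4, 4]): additive causal mask passed to every layer
```

Since the class no longer declares `_supports_sdpa`, explicitly requesting `attn_implementation="sdpa"` is expected to be rejected by transformers, which is why the README now passes `attn_implementation="eager"`.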