msr2000 committed
Commit: 413b83f
Parent(s): 896a24c

Update code & readme

- README.md +4 -2
- modeling_deepseek.py +0 -12
README.md CHANGED

@@ -189,7 +189,8 @@ model_name = "deepseek-ai/DeepSeek-V2"
 tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
 # `max_memory` should be set based on your devices
 max_memory = {i: "75GB" for i in range(8)}
-
+# `device_map` cannot be set to `auto`
+model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, device_map="sequential", torch_dtype=torch.bfloat16, max_memory=max_memory, attn_implementation="eager")
 model.generation_config = GenerationConfig.from_pretrained(model_name)
 model.generation_config.pad_token_id = model.generation_config.eos_token_id
 
@@ -210,7 +211,8 @@ model_name = "deepseek-ai/DeepSeek-V2-Chat"
 tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
 # `max_memory` should be set based on your devices
 max_memory = {i: "75GB" for i in range(8)}
-
+# `device_map` cannot be set to `auto`
+model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, device_map="sequential", torch_dtype=torch.bfloat16, max_memory=max_memory, attn_implementation="eager")
 model.generation_config = GenerationConfig.from_pretrained(model_name)
 model.generation_config.pad_token_id = model.generation_config.eos_token_id
 
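After this change, both README examples (DeepSeek-V2 and DeepSeek-V2-Chat) load the model with `device_map="sequential"` and `attn_implementation="eager"`. For context, a minimal end-to-end sketch of the updated base-model snippet is shown below; only the lines visible in the hunks above come from this diff, while the imports and the final generation call are assumptions about the surrounding README and are purely illustrative.

```python
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig

model_name = "deepseek-ai/DeepSeek-V2"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# `max_memory` should be set based on your devices
max_memory = {i: "75GB" for i in range(8)}
# `device_map` cannot be set to `auto`; eager attention is required now that SDPA is removed
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    device_map="sequential",
    torch_dtype=torch.bfloat16,
    max_memory=max_memory,
    attn_implementation="eager",
)
model.generation_config = GenerationConfig.from_pretrained(model_name)
model.generation_config.pad_token_id = model.generation_config.eos_token_id

# Illustrative only, not part of this diff: a minimal completion call
inputs = tokenizer("An attention function can be described as", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```

The switch to `attn_implementation="eager"` matches the modeling_deepseek.py changes below, which drop SDPA support from the bundled modeling code.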
modeling_deepseek.py CHANGED

@@ -34,7 +34,6 @@ from transformers.modeling_attn_mask_utils import (
     AttentionMaskConverter,
     _prepare_4d_attention_mask,
     _prepare_4d_causal_attention_mask,
-    _prepare_4d_causal_attention_mask_for_sdpa,
 )
 from transformers.modeling_outputs import (
     BaseModelOutputWithPast,
@@ -1295,7 +1294,6 @@ class DeepseekV2PreTrainedModel(PreTrainedModel):
     _no_split_modules = ["DeepseekV2DecoderLayer"]
     _skip_keys_device_placement = "past_key_values"
     _supports_flash_attn_2 = True
-    _supports_sdpa = True
     _supports_cache_class = True
 
     def _init_weights(self, module):
@@ -1406,7 +1404,6 @@ class DeepseekV2Model(DeepseekV2PreTrainedModel):
                 for layer_idx in range(config.num_hidden_layers)
             ]
         )
-        self._use_sdpa = config._attn_implementation == "sdpa"
         self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
         self.norm = DeepseekV2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
 
@@ -1495,15 +1492,6 @@ class DeepseekV2Model(DeepseekV2PreTrainedModel):
                 if (attention_mask is not None and 0 in attention_mask)
                 else None
             )
-        elif self._use_sdpa and not output_attentions:
-            # output_attentions=True can not be supported when using SDPA, and we fall back on
-            # the manual implementation that requires a 4D causal mask in all cases.
-            attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
-                attention_mask,
-                (batch_size, seq_length),
-                inputs_embeds,
-                past_key_values_length,
-            )
         else:
             # 4d mask is passed through the layers
             attention_mask = _prepare_4d_causal_attention_mask(
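With `_supports_sdpa`, `self._use_sdpa`, and the SDPA mask branch removed, any forward pass that does not use FlashAttention-2 now always builds a full 4D causal mask via `_prepare_4d_causal_attention_mask`. A standalone sketch of that remaining path is below; the helper is the one imported in the first hunk, and the argument order mirrors the removed SDPA call, but the toy batch and hidden sizes are made up for illustration and this assumes a transformers version that still ships this internal helper.

```python
import torch
from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask

# Toy shapes, for illustration only
batch_size, seq_length, hidden_size = 1, 4, 8
attention_mask = torch.ones(batch_size, seq_length, dtype=torch.long)  # 2D padding mask from the tokenizer
inputs_embeds = torch.zeros(batch_size, seq_length, hidden_size)

# The `else:` branch that eager attention now always takes in DeepseekV2Model.forward
mask_4d = _prepare_4d_causal_attention_mask(
    attention_mask,
    (batch_size, seq_length),
    inputs_embeds,
    0,  # past_key_values_length
)
print(mask_4d.shape)  # torch.Size([1, 1, 4, 4]): additive causal mask passed to every layer
```

Since the class no longer declares `_supports_sdpa`, explicitly requesting `attn_implementation="sdpa"` is expected to be rejected by transformers, which is why the README now passes `attn_implementation="eager"`.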