internlm
/

internlm2-chat-20b-sft

Text Generation

Model card Files Files and versions Community

x54-729 committed on Aug 20

Commit

627be28

•

1 Parent(s): 6e22a64

update modeling file to newest

Files changed (2) hide show

configuration_internlm2.py +1 -1
modeling_internlm2.py +9 -1

configuration_internlm2.py CHANGED Viewed

@@ -177,4 +177,4 @@ class InternLM2Config(PretrainedConfig):
             raise ValueError(
                 f"`rope_scaling`'s factor field must be a number >= 1, got {rope_scaling_factor} "
                 f"of type {type(rope_scaling_factor)}"
-            )

             raise ValueError(
                 f"`rope_scaling`'s factor field must be a number >= 1, got {rope_scaling_factor} "
                 f"of type {type(rope_scaling_factor)}"
+            )

modeling_internlm2.py CHANGED Viewed

@@ -59,6 +59,10 @@ try:
 except:
     pass
 logger = logging.get_logger(__name__)
@@ -1093,7 +1097,11 @@ class InternLM2Model(InternLM2PreTrainedModel):
         else:
             causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device)
             if sequence_length != 1:
-                causal_mask = torch.triu(causal_mask, diagonal=1)
             causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
             causal_mask = causal_mask[None, None, :, :].expand(input_tensor.shape[0], 1, -1, -1)
             if attention_mask is not None:

 except:
     pass
+try:
+    support_bf16_triu = torch.__version__ >= "2.1.0"
+except Exception:
+    support_bf16_triu = False
 logger = logging.get_logger(__name__)
         else:
             causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device)
             if sequence_length != 1:
+                if support_bf16_triu or dtype == torch.float32:
+                    causal_mask = torch.triu(causal_mask, diagonal=1)
+                else:
+                    triu_mask = torch.triu(torch.ones(causal_mask.size(), device=device), diagonal=1).bool()
+                    causal_mask.masked_fill_(~triu_mask, 0)
             causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
             causal_mask = causal_mask[None, None, :, :].expand(input_tensor.shape[0], 1, -1, -1)
             if attention_mask is not None: