oweller2 committed
Commit b9219f0 • 1 Parent: 322b01b

try unpadding in inference

modeling_flexbert.py CHANGED (+26 -6)
@@ -1724,12 +1724,32 @@ class FlexBertForCausalLM(FlexBertPreTrainedModel):
         if attention_mask is not None:
             attention_mask = attention_mask[:, -1:]
 
-        return {
-            "input_ids": input_ids,
-            "past_key_values": past_key_values,
-            "use_cache": kwargs.get("use_cache", True),
-            "attention_mask": attention_mask,
-        }
+        # Handle unpadding for the last token if needed
+        if self.unpad_embeddings:
+            batch_size, seq_len = input_ids.shape[:2]
+            if attention_mask is None:
+                # create all ones, except for padding (TODO?)
+                attention_mask = torch.ones_like(input_ids)
+            input_ids, indices, cu_seqlens, max_seqlen, position_ids, _ = self.unpad_inputs(
+                input_ids, attention_mask, None, None
+            )
+            return {
+                "input_ids": input_ids,
+                "past_key_values": past_key_values,
+                "use_cache": kwargs.get("use_cache", True),
+                "attention_mask": None,  # FA handles this
+                "indices": indices,
+                "cu_seqlens": cu_seqlens,
+                "max_seqlen": max_seqlen,
+                "position_ids": position_ids,
+            }
+        else:
+            return {
+                "input_ids": input_ids,
+                "past_key_values": past_key_values,
+                "use_cache": kwargs.get("use_cache", True),
+                "attention_mask": attention_mask,
+            }
 
     def get_number_parameters(self, count_embeddings: bool = True, trainable: bool = True) -> int:
         """Returns the number of parameters in the model.
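For context on what the unpadded branch produces: unpad_inputs itself is not shown in this diff, but the call site matches the standard Flash Attention variable-length ("varlen") convention, where a padded [batch, seq_len] batch is flattened into one packed token stream and sequence boundaries travel in cu_seqlens instead of an attention mask. Below is a minimal sketch of that convention; the name unpad_inputs_sketch and its body are illustrative assumptions, not the repository's implementation. Only the six-value return order mirrors the call site above.

import torch
import torch.nn.functional as F

def unpad_inputs_sketch(input_ids, attention_mask):
    # Hypothetical stand-in for self.unpad_inputs, assuming the standard
    # Flash Attention varlen layout: keep only positions where
    # attention_mask == 1 and pack them into a single 1-D token stream.
    seqlens = attention_mask.sum(dim=-1, dtype=torch.int32)      # tokens per sequence
    indices = torch.nonzero(attention_mask.flatten()).flatten()  # kept flat positions
    cu_seqlens = F.pad(
        torch.cumsum(seqlens, 0, dtype=torch.int32), (1, 0)
    )                                                            # [0, l0, l0+l1, ...]
    max_seqlen = int(seqlens.max())
    packed_ids = input_ids.flatten()[indices]                    # packed tokens, no padding
    # Positions restart at 0 at every sequence boundary in the packed stream
    position_ids = torch.cat(
        [torch.arange(n, device=input_ids.device) for n in seqlens.tolist()]
    )
    return packed_ids, indices, cu_seqlens, max_seqlen, position_ids, None  # last slot: labels

Under this layout, a mask batch of [[1,1,1,0],[1,1,0,0]] packs to five tokens with cu_seqlens == [0, 3, 5], which is why the diff returns attention_mask as None in the unpadded branch: the varlen kernel reads sequence boundaries from cu_seqlens directly.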