oweller2 committed on
Commit 81b671b (1 parent: b9219f0)

no unpad at inference

Files changed (2)
  1. config.json +4 -4
  2. modeling_flexbert.py +6 -26
config.json CHANGED
@@ -69,9 +69,9 @@
   "num_attention_heads": 12,
   "num_hidden_layers": 22,
   "num_initial_layers": 1,
-  "pad_logits": true,
-  "pad_token_id": 0,
-  "padding": "unpadded",
+  "pad_logits": false,
+  "pad_token_id": 50283,
+  "padding": "padded",
   "pooling_type": "cls",
   "position_embedding_type": "absolute",
   "rotary_emb_base": 10000.0,
@@ -82,7 +82,7 @@
   "sliding_window": 128,
   "transformers_version": "4.44.1",
   "type_vocab_size": 2,
-  "unpad_embeddings": true,
+  "unpad_embeddings": false,
   "use_cache": true,
   "use_fa2": true,
   "use_sdpa_attn_mask": false,
modeling_flexbert.py CHANGED
@@ -1724,32 +1724,12 @@ class FlexBertForCausalLM(FlexBertPreTrainedModel):
         if attention_mask is not None:
             attention_mask = attention_mask[:, -1:]
 
-        # Handle unpadding for the last token if needed
-        if self.unpad_embeddings:
-            batch_size, seq_len = input_ids.shape[:2]
-            if attention_mask is None:
-                # create all ones, except for padding (TODO?)
-                attention_mask = torch.ones_like(input_ids)
-            input_ids, indices, cu_seqlens, max_seqlen, position_ids, _ = self.unpad_inputs(
-                input_ids, attention_mask, None, None
-            )
-            return {
-                "input_ids": input_ids,
-                "past_key_values": past_key_values,
-                "use_cache": kwargs.get("use_cache", True),
-                "attention_mask": None,  # FA handles this
-                "indices": indices,
-                "cu_seqlens": cu_seqlens,
-                "max_seqlen": max_seqlen,
-                "position_ids": position_ids,
-            }
-        else:
-            return {
-                "input_ids": input_ids,
-                "past_key_values": past_key_values,
-                "use_cache": kwargs.get("use_cache", True),
-                "attention_mask": attention_mask,
-            }
+        return {
+            "input_ids": input_ids,
+            "past_key_values": past_key_values,
+            "use_cache": kwargs.get("use_cache", True),
+            "attention_mask": attention_mask,
+        }
 
     def get_number_parameters(self, count_embeddings: bool = True, trainable: bool = True) -> int:
         """Returns the number of parameters in the model.