oweller2 committed
Commit
f66abc1
1 Parent(s): 6d1817e
Files changed (2)
  1. attention.py +1 -1
  2. modeling_flexbert.py +19 -11
attention.py CHANGED
@@ -863,7 +863,7 @@ class FlexBertUnpadRopeAttention(FlexBertAttentionBase):
863
  qkv = self.Wqkv(hidden_states)
864
 
865
  # only needed for inference when we have KV cache
866
- seqlen_offset = 0
867
 
868
  # (total_seqlen, 3, nheads, headdim)
869
  qkv = qkv.view(-1, 3, self.num_attention_heads, self.attn_head_size)
 
863
  qkv = self.Wqkv(hidden_states)
864
 
865
  # only needed for inference when we have KV cache
866
+ seqlen_offset = max_seqlen * (len(cu_seqlens) - 2) if len(cu_seqlens) > 1 else 0
867
 
868
  # (total_seqlen, 3, nheads, headdim)
869
  qkv = qkv.view(-1, 3, self.num_attention_heads, self.attn_head_size)
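For intuition, a minimal sketch of what the new seqlen_offset expression evaluates to; the concrete cu_seqlens and max_seqlen values below are made up for illustration, and the comment in the diff suggests the offset is only relevant when decoding with a KV cache (presumably as a position offset for the rotary embedding).

import torch

# Toy unpadded batch: cu_seqlens holds cumulative sequence lengths
# (batch_size + 1 entries), max_seqlen is the longest sequence.
cu_seqlens = torch.tensor([0, 5, 9, 12], dtype=torch.int32)  # 3 sequences: lengths 5, 4, 3
max_seqlen = 5

# The committed expression: zero offset for a single sequence, otherwise
# max_seqlen scaled by (number of sequences - 1).
seqlen_offset = max_seqlen * (len(cu_seqlens) - 2) if len(cu_seqlens) > 1 else 0
print(seqlen_offset)  # 10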
modeling_flexbert.py CHANGED
@@ -1715,20 +1715,28 @@ class FlexBertForCausalLM(FlexBertPreTrainedModel):
1715
  def prepare_inputs_for_generation(
1716
  self,
1717
  input_ids: torch.Tensor,
1718
- past_key_values: Optional[torch.FloatTensor] = None,
1719
  attention_mask: Optional[torch.Tensor] = None,
 
1720
  **kwargs
1721
  ) -> dict:
1722
- # only last token for inputs if past is defined
1723
- if past_key_values is not None:
1724
- input_ids = input_ids[:, -1].unsqueeze(-1)
1725
-
1726
- return {
1727
- "input_ids": input_ids,
1728
- "past_key_values": past_key_values,
1729
- "use_cache": kwargs.get("use_cache", True),
1730
- "attention_mask": None,
1731
- }
 
 
 
 
 
 
 
 
1732
 
1733
  def get_number_parameters(self, count_embeddings: bool = True, trainable: bool = True) -> int:
1734
  """Returns the number of parameters in the model.
 
1715
  def prepare_inputs_for_generation(
1716
  self,
1717
  input_ids: torch.Tensor,
 
1718
  attention_mask: Optional[torch.Tensor] = None,
1719
+ position_ids: Optional[torch.Tensor] = None,
1720
  **kwargs
1721
  ) -> dict:
1722
+ if attention_mask is None:
1723
+ attention_mask = torch.ones_like(input_ids)
1724
+
1725
+ batch_size, seq_len = input_ids.shape[:2]
1726
+ input_ids, indices, cu_seqlens, max_seqlen, position_ids, _ = self.unpad_inputs(
1727
+ input_ids, attention_mask, position_ids, None
1728
+ )
1729
+ breakpoint()
1730
+ return {
1731
+ "input_ids": input_ids,
1732
+ "attention_mask": attention_mask,
1733
+ "position_ids": position_ids,
1734
+ "indices": indices,
1735
+ "cu_seqlens": cu_seqlens,
1736
+ "max_seqlen": max_seqlen,
1737
+ "batch_size": batch_size,
1738
+ "seq_len": seq_len
1739
+ }
1740
 
1741
  def get_number_parameters(self, count_embeddings: bool = True, trainable: bool = True) -> int:
1742
  """Returns the number of parameters in the model.