fix bert_padding

- __pycache__/attention.cpython-311.pyc +0 -0
- __pycache__/layers.cpython-311.pyc +0 -0
- attention.py +13 -13
- layers.py +5 -5

__pycache__/attention.cpython-311.pyc
CHANGED
Binary files a/__pycache__/attention.cpython-311.pyc and b/__pycache__/attention.cpython-311.pyc differ

__pycache__/layers.cpython-311.pyc
CHANGED
Binary files a/__pycache__/layers.cpython-311.pyc and b/__pycache__/layers.cpython-311.pyc differ
attention.py
CHANGED

@@ -24,7 +24,7 @@ import sys
 import os
 # Add src folder root to path to allow us to use relative imports regardless of what directory the script is run from
 sys.path.append(os.path.dirname(os.path.realpath(__file__)))
-import
+from .bert_padding import pad_input, unpad_input_only, index_first_axis
 from .configuration_bert import FlexBertConfig, maybe_add_padding
 from .normalization import get_norm_layer
 from .initialization import ModuleType, init_weights

@@ -161,7 +161,7 @@ class BertAlibiUnpadSelfAttention(nn.Module):
 alibi_slopes=slopes,
 )
 else:
-qkv =
+qkv = pad_input(qkv, indices, cu_seqlens.shape[0] - 1, max_seqlen)  # batch, max_seqlen, thd
 unpad_bs, *_ = qkv.shape
 qkv = qkv.view(unpad_bs, -1, 3, self.num_attention_heads, self.attention_head_size)
 # if we have nonzero attention dropout (e.g. during fine-tuning) or no Triton, compute attention in PyTorch

@@ -174,7 +174,7 @@ class BertAlibiUnpadSelfAttention(nn.Module):
 attention_probs = self.dropout(attention_probs)
 attention = torch.matmul(attention_probs, v).permute(0, 2, 1, 3)  # b s h d

-attention =
+attention = unpad_input_only(attention, torch.squeeze(attn_mask) == 1)

 return attention.view(bs, dim)

@@ -240,8 +240,8 @@ class BertAlibiUnpadAttention(nn.Module):
 self_output = self.self(input_tensor, cu_seqlens, max_s, indices, attn_mask, bias, slopes)
 if subset_idx is not None:
 return self.output(
-
-
+index_first_axis(self_output, subset_idx),
+index_first_axis(input_tensor, subset_idx),
 )
 else:
 return self.output(self_output, input_tensor)

@@ -415,7 +415,7 @@ class FlexBertUnpadAttention(FlexBertAttentionBase):
 )
 attn = attn.view(bs, dim)
 else:
-qkv =
+qkv = pad_input(qkv, indices, cu_seqlens.shape[0] - 1, max_seqlen)  # batch, max_seqlen, thd
 unpad_bs, seqlen, _ = qkv.shape

 qkv = qkv.view(unpad_bs, -1, 3, self.num_attention_heads, self.attn_head_size)

@@ -430,7 +430,7 @@ class FlexBertUnpadAttention(FlexBertAttentionBase):
 else None,
 )
 attn = attn.transpose(1, 2).view(unpad_bs, -1, dim)  # b s h d
-attn =
+attn = unpad_input_only(attn, torch.squeeze(attn_mask) == 1)

 return self.out_drop(self.Wo(attn))

@@ -565,7 +565,7 @@ class FlexBertUnpadParallelAttention(FlexBertAttentionBase):
 )
 attn = attn.view(bs, dim)
 else:
-qkv =
+qkv = pad_input(qkv, indices, cu_seqlens.shape[0] - 1, max_seqlen)  # batch, max_seqlen, thd
 unpad_bs, seqlen, _ = qkv.shape

 qkv = qkv.view(unpad_bs, -1, 3, self.num_attention_heads, self.attn_head_size)

@@ -580,7 +580,7 @@ class FlexBertUnpadParallelAttention(FlexBertAttentionBase):
 else None,
 )
 attn = attn.transpose(1, 2).view(unpad_bs, -1, dim)  # b s h d
-attn =
+attn = unpad_input_only(attn, torch.squeeze(attn_mask) == 1)

 return self.out_drop(self.Wo(attn.view(bs, dim)))

@@ -913,7 +913,7 @@ class FlexBertUnpadRopeAttention(FlexBertAttentionBase):
 )
 attn = attn.view(bs, dim)
 else:
-qkv =
+qkv = pad_input(
 qkv, indices, cu_seqlens.shape[0] - 1, attn_mask.shape[-1]
 )  # batch, max_seqlen, thd
 unpad_bs, seqlen, *_ = qkv.shape

@@ -929,7 +929,7 @@ class FlexBertUnpadRopeAttention(FlexBertAttentionBase):
 else None,
 )
 attn = attn.transpose(1, 2).view(unpad_bs, -1, dim)  # b s h d
-attn =
+attn = unpad_input_only(attn, torch.squeeze(attn_mask) == 1)

 return self.out_drop(self.Wo(attn))

@@ -1244,7 +1244,7 @@ class FlexBertUnpadRopeParallelAttention(FlexBertAttentionBase):
 )
 attn = attn.view(bs, dim)
 else:
-qkv =
+qkv = pad_input(
 qkv, indices, cu_seqlens.shape[0] - 1, attn_mask.shape[-1]
 )  # batch, max_seqlen, thd
 unpad_bs, seqlen, *_ = qkv.shape

@@ -1260,7 +1260,7 @@ class FlexBertUnpadRopeParallelAttention(FlexBertAttentionBase):
 else None,
 )
 attn = attn.transpose(1, 2).view(unpad_bs, -1, dim)  # b s h d
-attn =
+attn = unpad_input_only(attn, torch.squeeze(attn_mask) == 1)

 return self.out_drop(self.Wo(attn))
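For orientation, here is a minimal sketch of the padding round trip that the restored attention.py calls rely on. The *_sketch helpers below are hypothetical stand-ins whose behavior is inferred only from the call sites in the diff (pad_input scattering unpadded tokens back into a [batch, max_seqlen, ...] tensor, unpad_input_only keeping the tokens where the attention mask is 1, index_first_axis selecting a row subset); the real implementations live in the repo's bert_padding module and may differ.

# Minimal sketch, assuming the semantics implied by the call sites above.
import torch

def pad_input_sketch(unpadded, indices, batch, seqlen):
    # Scatter flattened non-pad tokens [total_nnz, ...] back into [batch, seqlen, ...].
    out = torch.zeros(batch * seqlen, *unpadded.shape[1:], dtype=unpadded.dtype)
    out[indices] = unpadded
    return out.view(batch, seqlen, *unpadded.shape[1:])

def unpad_input_only_sketch(padded, mask_bool):
    # Keep only tokens where the attention mask is True: [batch, seqlen, ...] -> [total_nnz, ...].
    return padded[mask_bool]

attn_mask = torch.tensor([[1, 1, 1, 0], [1, 1, 0, 0]])            # [batch=2, seqlen=4]
indices = torch.nonzero(attn_mask.flatten(), as_tuple=False).flatten()
hidden = torch.randn(indices.numel(), 8)                          # 5 unpadded tokens, dim=8

padded = pad_input_sketch(hidden, indices, 2, 4)                  # [2, 4, 8], zeros at pad positions
restored = unpad_input_only_sketch(padded, attn_mask == 1)        # [5, 8]
assert torch.equal(restored, hidden)

# index_first_axis(x, subset_idx) in the diff selects a row subset (e.g. masked positions);
# as a sketch, it behaves like plain first-axis indexing:
subset = hidden[torch.tensor([0, 3])]                             # [2, 8]
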
layers.py
CHANGED

@@ -20,7 +20,7 @@ from typing import Optional, Union, List
 import torch
 import torch.nn as nn

-import
+from .bert_padding import unpad_input, pad_input

 from .activation import get_act_fn
 from .attention import FlexBertAttentionBase, BertAlibiUnpadAttention, get_attention_layer

@@ -155,7 +155,7 @@ class BertAlibiEncoder(nn.Module):
 # and ntokens_unpad is total number of non-padded tokens.
 # Then unpadding performs the following compression of the inputs:
 # hidden_states[ntokens,hidden] -> hidden_states[ntokens_unpad,hidden]
-hidden_states, indices, cu_seqlens, _ =
+hidden_states, indices, cu_seqlens, _ = unpad_input(hidden_states, attention_mask_bool)

 # Add alibi matrix to extended_attention_mask
 if self._current_alibi_size < seqlen:

@@ -190,7 +190,7 @@ class BertAlibiEncoder(nn.Module):
 # and ntokens_unpad is total number of non-padded tokens.
 # Then padding performs the following de-compression:
 # hidden_states[ntokens_unpad,hidden] -> hidden_states[ntokens,hidden]
-hidden_states =
+hidden_states = pad_input(hidden_states, indices, batch, seqlen)
 else:
 for i in range(len(self.layer) - 1):
 layer_module = self.layer[i]

@@ -636,7 +636,7 @@ class FlexBertUnpadEncoder(FlexBertEncoderBase):
 if indices is None and cu_seqlens is None and max_seqlen is None:
 attention_mask_bool = attention_mask.bool()
 batch, seqlen = hidden_states.shape[:2]
-hidden_states, indices, cu_seqlens, max_seqlen =
+hidden_states, indices, cu_seqlens, max_seqlen = unpad_input(
 hidden_states, attention_mask_bool
 )

@@ -649,7 +649,7 @@ class FlexBertUnpadEncoder(FlexBertEncoderBase):
 attn_mask=attention_mask,
 )

-return
+return pad_input(hidden_states, indices, batch, seqlen)
 else:
 for layer_module in self.layers:
 hidden_states = layer_module(
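Similarly, a minimal sketch of the compression and de-compression the encoder comments describe (hidden_states[ntokens,hidden] -> hidden_states[ntokens_unpad,hidden] and back). The unpad_input_sketch helper is a hypothetical stand-in whose four-value return (unpadded states, indices, cu_seqlens, max_seqlen) is inferred from the FlexBertUnpadEncoder call in the diff, not taken from bert_padding itself.

# Minimal sketch, assuming the return signature used in FlexBertUnpadEncoder above.
import torch
import torch.nn.functional as F

def unpad_input_sketch(hidden_states, attention_mask_bool):
    # [batch, seqlen, hidden] -> ([ntokens_unpad, hidden], indices, cu_seqlens, max_seqlen)
    batch, seqlen, hidden = hidden_states.shape
    seqlens = attention_mask_bool.sum(dim=-1, dtype=torch.int32)       # non-pad tokens per sequence
    indices = torch.nonzero(attention_mask_bool.flatten(), as_tuple=False).flatten()
    cu_seqlens = F.pad(torch.cumsum(seqlens, dim=0, dtype=torch.int32), (1, 0))
    unpadded = hidden_states.reshape(batch * seqlen, hidden)[indices]
    return unpadded, indices, cu_seqlens, int(seqlens.max())

mask = torch.tensor([[1, 1, 1, 0], [1, 1, 0, 0]]).bool()               # [batch=2, seqlen=4]
x = torch.randn(2, 4, 16)
unpadded, indices, cu_seqlens, max_seqlen = unpad_input_sketch(x, mask)
print(unpadded.shape, cu_seqlens.tolist(), max_seqlen)                 # torch.Size([5, 16]) [0, 3, 5] 3

# The final `return pad_input(hidden_states, indices, batch, seqlen)` undoes the compression;
# padding positions come back as zeros, so only the non-pad tokens need to match:
restored = torch.zeros(2 * 4, 16).index_copy_(0, indices, unpadded).view(2, 4, 16)
assert torch.equal(restored[mask], x[mask])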