jinaai
/

jina-bert-flash-implementation

Markus28 committed on Feb 21, 2024

Commit

a2c07ba

1 Parent(s): bfc0b2d

feat: added back option to disable flash attention

Files changed (2) hide show

configuration_bert.py CHANGED Viewed

@@ -57,6 +57,7 @@ class JinaBertConfig(PretrainedConfig):
         layer_norm_eps (`float`, *optional*, defaults to 1e-12):
             The epsilon used by the layer normalization layers.
         window_size (`tuple`, *optional*, defaults to `(-1, -1)`): If not the default, use local attention
     """
     model_type = "bert"
@@ -76,6 +77,7 @@ class JinaBertConfig(PretrainedConfig):
         layer_norm_eps=1e-12,
         pad_token_id=0,
         window_size=(-1, -1),
         **kwargs,
     ):
         super().__init__(pad_token_id=pad_token_id, **kwargs)
@@ -92,4 +94,5 @@ class JinaBertConfig(PretrainedConfig):
         self.initializer_range = initializer_range
         self.layer_norm_eps = layer_norm_eps
         self.window_size = window_size

         layer_norm_eps (`float`, *optional*, defaults to 1e-12):
             The epsilon used by the layer normalization layers.
         window_size (`tuple`, *optional*, defaults to `(-1, -1)`): If not the default, use local attention
+        use_flash_attn (`bool`, *optional*, defaults to `True`): Whether or not to use flash attention
     """
     model_type = "bert"
         layer_norm_eps=1e-12,
         pad_token_id=0,
         window_size=(-1, -1),
+        use_flash_attn=True,
         **kwargs,
     ):
         super().__init__(pad_token_id=pad_token_id, **kwargs)
         self.initializer_range = initializer_range
         self.layer_norm_eps = layer_norm_eps
         self.window_size = window_size
+        self.use_flash_attn = use_flash_attn

modeling_bert.py CHANGED Viewed

@@ -62,6 +62,7 @@ logger = logging.getLogger(__name__)
 def create_mixer_cls(config, cross_attn=False, return_residual=False):
     fused_bias_fc = getattr(config, "fused_bias_fc", False)
     window_size = getattr(config, "window_size", (-1, -1))
     mixer_cls = partial(
@@ -71,7 +72,7 @@ def create_mixer_cls(config, cross_attn=False, return_residual=False):
         dropout=config.attention_probs_dropout_prob,
         causal=False,
         fused_bias_fc=fused_bias_fc,
-        use_flash_attn=True,
         return_residual=return_residual,
         use_alibi=True,
         window_size=window_size,
@@ -154,6 +155,7 @@ def _init_weights(module, initializer_range=0.02):
 class BertEncoder(nn.Module):
     def __init__(self, config: JinaBertConfig):
         super().__init__()
         self.layers = nn.ModuleList(
             [create_block(config, layer_idx=i) for i in range(config.num_hidden_layers)]
         )
@@ -163,7 +165,7 @@ class BertEncoder(nn.Module):
         This means that we only compute the last layer output for these tokens.
         subset_mask: (batch, seqlen), dtype=torch.bool
         """
-        if key_padding_mask is None:
             mixer_kwargs = (
                 {"key_padding_mask": key_padding_mask} if key_padding_mask is not None else None
             )

 def create_mixer_cls(config, cross_attn=False, return_residual=False):
+    use_flash_attn = getattr(config, "use_flash_attn", True)
     fused_bias_fc = getattr(config, "fused_bias_fc", False)
     window_size = getattr(config, "window_size", (-1, -1))
     mixer_cls = partial(
         dropout=config.attention_probs_dropout_prob,
         causal=False,
         fused_bias_fc=fused_bias_fc,
+        use_flash_attn=use_flash_attn,
         return_residual=return_residual,
         use_alibi=True,
         window_size=window_size,
 class BertEncoder(nn.Module):
     def __init__(self, config: JinaBertConfig):
         super().__init__()
+        self.use_flash_attn = getattr(config, "use_flash_attn", True)
         self.layers = nn.ModuleList(
             [create_block(config, layer_idx=i) for i in range(config.num_hidden_layers)]
         )
         This means that we only compute the last layer output for these tokens.
         subset_mask: (batch, seqlen), dtype=torch.bool
         """
+        if key_padding_mask is None or not self.use_flash_attn:
             mixer_kwargs = (
                 {"key_padding_mask": key_padding_mask} if key_padding_mask is not None else None
             )