Markus28 committed on
Commit
bfc0b2d
·
1 Parent(s): 953f39e

fix: always use flash attention

Browse files
Files changed (1) hide show
  1. modeling_bert.py +1 -3
modeling_bert.py CHANGED
@@ -154,7 +154,6 @@ def _init_weights(module, initializer_range=0.02):
154
  class BertEncoder(nn.Module):
155
  def __init__(self, config: JinaBertConfig):
156
  super().__init__()
157
- self.use_flash_attn = getattr(config, "use_flash_attn", False)
158
  self.layers = nn.ModuleList(
159
  [create_block(config, layer_idx=i) for i in range(config.num_hidden_layers)]
160
  )
@@ -164,13 +163,12 @@ class BertEncoder(nn.Module):
164
  This means that we only compute the last layer output for these tokens.
165
  subset_mask: (batch, seqlen), dtype=torch.bool
166
  """
167
- if key_padding_mask is None or not self.use_flash_attn:
168
  mixer_kwargs = (
169
  {"key_padding_mask": key_padding_mask} if key_padding_mask is not None else None
170
  )
171
  for layer in self.layers:
172
  hidden_states = layer(hidden_states, mixer_kwargs=mixer_kwargs)
173
- print(hidden_states)
174
  if subset_mask is not None:
175
  hidden_states = hidden_states[subset_mask]
176
  else:
 
154
  class BertEncoder(nn.Module):
155
  def __init__(self, config: JinaBertConfig):
156
  super().__init__()
 
157
  self.layers = nn.ModuleList(
158
  [create_block(config, layer_idx=i) for i in range(config.num_hidden_layers)]
159
  )
 
163
  This means that we only compute the last layer output for these tokens.
164
  subset_mask: (batch, seqlen), dtype=torch.bool
165
  """
166
+ if key_padding_mask is None:
167
  mixer_kwargs = (
168
  {"key_padding_mask": key_padding_mask} if key_padding_mask is not None else None
169
  )
170
  for layer in self.layers:
171
  hidden_states = layer(hidden_states, mixer_kwargs=mixer_kwargs)
 
172
  if subset_mask is not None:
173
  hidden_states = hidden_states[subset_mask]
174
  else: