oweller2 committed
Commit e9e8f85
1 Parent(s): 46797c8
Files changed (2)
  1. config.json +3 -3
  2. modeling_flexbert.py +3 -2
config.json CHANGED
@@ -69,9 +69,9 @@
   "num_attention_heads": 12,
   "num_hidden_layers": 22,
   "num_initial_layers": 1,
-  "pad_logits": false,
+  "pad_logits": true,
   "pad_token_id": 50283,
-  "padding": "padded",
+  "padding": "unpadded",
   "pooling_type": "cls",
   "position_embedding_type": "absolute",
   "rotary_emb_base": 10000.0,
@@ -82,7 +82,7 @@
   "sliding_window": 128,
   "transformers_version": "4.44.1",
   "type_vocab_size": 2,
-  "unpad_embeddings": false,
+  "unpad_embeddings": true,
   "use_cache": true,
   "use_fa2": true,
   "use_sdpa_attn_mask": false,
modeling_flexbert.py CHANGED
@@ -935,6 +935,7 @@ class FlexBertModel(FlexBertPreTrainedModel):
         else:
             self.final_norm = None
         self.unpad_embeddings = config.unpad_embeddings
+        self.is_decoder = config.causal_mask

     def post_init(self):
         self._init_weights(reset_params=False)
@@ -956,7 +957,7 @@ class FlexBertModel(FlexBertPreTrainedModel):
         max_seqlen: Optional[int] = None,
         **kwargs,
     ) -> Tuple[Union[List[torch.Tensor], torch.Tensor], Optional[torch.Tensor]]:
-        if attention_mask is None:
+        if attention_mask is None and not self.is_decoder:
             attention_mask = torch.ones_like(input_ids)

         embedding_output = self.embeddings(input_ids, position_ids)
@@ -1529,7 +1530,7 @@ class FlexBertForCausalLM(FlexBertPreTrainedModel):
         self.unpad_embeddings = config.unpad_embeddings
         self.pad_logits = config.pad_logits
         self.compile_model = config.compile_model
-        # self.masked_prediction = config.masked_prediction
+        self.masked_prediction = config.masked_prediction

         # Initialize weights and apply final processing
         self._init_weights(reset_params=False)
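Two behavioral notes on the Python side of this commit: FlexBertModel now only fabricates an all-ones attention mask when it is not configured as a decoder (config.causal_mask), and FlexBertForCausalLM now actually reads config.masked_prediction instead of leaving it commented out. A common use of such a flag is to run the LM head only on positions whose labels are not -100, skipping most of the vocabulary-projection compute; the sketch below shows that general pattern and is an assumption about intent, not this repo's actual helper.

import torch
import torch.nn as nn

def lm_head_loss(hidden: torch.Tensor,
                 labels: torch.Tensor,
                 head: nn.Module,
                 masked_prediction: bool) -> torch.Tensor:
    # hidden: (total_tokens, dim) in the unpadded layout; labels: (total_tokens,)
    if masked_prediction:
        # Project to the vocabulary only where a label exists, so ignored
        # tokens never pass through the (large) LM head.
        keep = labels != -100
        logits = head(hidden[keep])
        return nn.functional.cross_entropy(logits, labels[keep])
    logits = head(hidden)
    return nn.functional.cross_entropy(logits, labels, ignore_index=-100)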