LeroyDyer commited on
Commit
d7c7584
·
verified ·
1 Parent(s): da43130

Upload 2 files

Browse files
configuration_mistral_advanced.py ADDED
@@ -0,0 +1,445 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Mistral model configuration."""

from typing import TYPE_CHECKING, Any, Mapping, Optional, OrderedDict

from packaging import version
from transformers.configuration_utils import PretrainedConfig

# `AutoConfig` lives under `transformers.models.auto`; there is no `transformers.auto` package.
from transformers.models.auto.configuration_auto import AutoConfig
from transformers.utils import logging


if TYPE_CHECKING:
    # Needed for annotations only. The package-relative form (`from ... import`) cannot be resolved
    # from a standalone module, so the names are taken from the public `transformers` namespace.
    from transformers import PreTrainedTokenizerBase, TensorType

logger = logging.get_logger(__name__)
15
+
16
+
17
+
18
# Maps each published Mistral checkpoint id to the URL of its reference `config.json` on the Hub.
MISTRAL_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    checkpoint: f"https://huggingface.co/{checkpoint}/resolve/main/config.json"
    for checkpoint in (
        "mistralai/Mistral-7B-v0.1",
        "mistralai/Mistral-7B-Instruct-v0.1",
    )
}
22
+
23
class EncoderDecoderConfig(PretrainedConfig):
    r"""
    Composite configuration that bundles one encoder configuration and one decoder configuration.

    Args:
        kwargs (*optional*):
            Dictionary of keyword arguments. The following two are required:

            - **encoder** (`dict` or [`PretrainedConfig`]) -- The encoder configuration. A dictionary must contain
              a `model_type` entry, which selects the configuration class built through `AutoConfig.for_model`.
            - **decoder** (`dict` or [`PretrainedConfig`]) -- The decoder configuration, same format as `encoder`.

            All remaining keyword arguments are forwarded to [`PretrainedConfig`].

    Raises:
        ValueError: If `encoder` or `decoder` is not passed.
        KeyError: If a sub-configuration has no `model_type` entry.
    """

    # Without an explicit identifier the inherited empty `model_type` would end up in error messages
    # and in the serialized configuration.
    model_type = "encoder-decoder"
    is_composition = True

    def __init__(self, **kwargs):
        if "encoder" not in kwargs or "decoder" not in kwargs:
            raise ValueError(
                f"A configuraton of type {self.model_type} cannot be instantiated because "
                f"not both `encoder` and `decoder` sub-configurations are passed, but only {kwargs}"
            )

        # Take the sub-configurations out before calling the parent constructor so that the raw
        # dictionaries are not first stored as attributes and then overwritten.
        encoder_config = kwargs.pop("encoder")
        decoder_config = kwargs.pop("decoder")
        super().__init__(**kwargs)

        self.encoder = self._build_sub_config(encoder_config)
        self.decoder = self._build_sub_config(decoder_config)
        self.is_encoder_decoder = True

    @staticmethod
    def _build_sub_config(sub_config):
        """Build a configuration object from a dict or config instance without mutating the input."""
        if isinstance(sub_config, PretrainedConfig):
            config_dict = sub_config.to_dict()
        else:
            # Work on a copy: popping `model_type` must not alter the caller's dictionary.
            config_dict = dict(sub_config)
        sub_model_type = config_dict.pop("model_type")
        return AutoConfig.for_model(sub_model_type, **config_dict)

    @classmethod
    def from_encoder_decoder_configs(
        cls, encoder_config: PretrainedConfig, decoder_config: PretrainedConfig, **kwargs
    ) -> PretrainedConfig:
        r"""
        Instantiate an [`EncoderDecoderConfig`] (or a derived class) from an encoder configuration and a decoder
        configuration.

        Note that `decoder_config` is modified in place: `is_decoder` and `add_cross_attention` are set to `True`.

        Returns:
            [`EncoderDecoderConfig`]: An instance of a configuration object
        """
        logger.info("Setting `config.is_decoder=True` and `config.add_cross_attention=True` for decoder_config")
        decoder_config.is_decoder = True
        decoder_config.add_cross_attention = True

        return cls(encoder=encoder_config.to_dict(), decoder=decoder_config.to_dict(), **kwargs)
58
+
59
class VisionEncoderDecoderConfig(PretrainedConfig):
    r"""
    [`VisionEncoderDecoderConfig`] is the configuration class to store the configuration of a
    [`VisionEncoderDecoderModel`]. It is used to instantiate a Vision-Encoder-Text-Decoder model according to the
    specified arguments, defining the encoder and decoder configs.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        kwargs (*optional*):
            Dictionary of keyword arguments. Notably (both are required):

                - **encoder** (`dict` or [`PretrainedConfig`]) -- The encoder config, either as a configuration
                  object or as a dictionary containing a `model_type` entry.
                - **decoder** (`dict` or [`PretrainedConfig`]) -- The decoder config, same format as `encoder`.

    Raises:
        ValueError: If `encoder` or `decoder` is not passed.
        KeyError: If a sub-configuration has no `model_type` entry.

    Examples:

    ```python
    >>> from transformers import BertConfig, ViTConfig, VisionEncoderDecoderConfig, VisionEncoderDecoderModel

    >>> # Initializing a ViT & BERT style configuration
    >>> config_encoder = ViTConfig()
    >>> config_decoder = BertConfig()

    >>> config = VisionEncoderDecoderConfig.from_encoder_decoder_configs(config_encoder, config_decoder)

    >>> # Initializing a ViTBert model (with random weights) from a ViT & google-bert/bert-base-uncased style configurations
    >>> model = VisionEncoderDecoderModel(config=config)

    >>> # Accessing the model configuration
    >>> config_encoder = model.config.encoder
    >>> config_decoder = model.config.decoder
    >>> # set decoder config to causal lm
    >>> config_decoder.is_decoder = True
    >>> config_decoder.add_cross_attention = True

    >>> # Saving the model, including its configuration
    >>> model.save_pretrained("my-model")

    >>> # loading model and config from pretrained folder
    >>> encoder_decoder_config = VisionEncoderDecoderConfig.from_pretrained("my-model")
    >>> model = VisionEncoderDecoderModel.from_pretrained("my-model", config=encoder_decoder_config)
    ```"""

    model_type = "vision-encoder-decoder"
    is_composition = True

    def __init__(self, **kwargs):
        if "encoder" not in kwargs or "decoder" not in kwargs:
            raise ValueError(
                f"A configuraton of type {self.model_type} cannot be instantiated because "
                f"not both `encoder` and `decoder` sub-configurations are passed, but only {kwargs}"
            )

        # Take the sub-configurations out before calling the parent constructor so that the raw
        # dictionaries are not first stored as attributes and then overwritten.
        encoder_config = kwargs.pop("encoder")
        decoder_config = kwargs.pop("decoder")
        super().__init__(**kwargs)

        self.encoder = self._build_sub_config(encoder_config)
        self.decoder = self._build_sub_config(decoder_config)
        self.is_encoder_decoder = True

    @staticmethod
    def _build_sub_config(sub_config):
        """Build a configuration object from a dict or config instance without mutating the input."""
        if isinstance(sub_config, PretrainedConfig):
            config_dict = sub_config.to_dict()
        else:
            # Work on a copy: popping `model_type` must not alter the caller's dictionary.
            config_dict = dict(sub_config)
        sub_model_type = config_dict.pop("model_type")
        return AutoConfig.for_model(sub_model_type, **config_dict)

    @classmethod
    def from_encoder_decoder_configs(
        cls, encoder_config: PretrainedConfig, decoder_config: PretrainedConfig, **kwargs
    ) -> PretrainedConfig:
        r"""
        Instantiate a [`VisionEncoderDecoderConfig`] (or a derived class) from a pre-trained encoder model
        configuration and decoder model configuration.

        Note that `decoder_config` is modified in place: `is_decoder` and `add_cross_attention` are set to `True`.

        Returns:
            [`VisionEncoderDecoderConfig`]: An instance of a configuration object
        """
        logger.info("Setting `config.is_decoder=True` and `config.add_cross_attention=True` for decoder_config")
        decoder_config.is_decoder = True
        decoder_config.add_cross_attention = True

        return cls(encoder=encoder_config.to_dict(), decoder=decoder_config.to_dict(), **kwargs)
142
+
143
class SpeechEncoderDecoderConfig(PretrainedConfig):
    r"""
    [`SpeechEncoderDecoderConfig`] is the configuration class to store the configuration of a
    [`SpeechEncoderDecoderModel`]. It is used to instantiate an Encoder Decoder model according to the specified
    arguments, defining the encoder and decoder configs.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        kwargs (*optional*):
            Dictionary of keyword arguments. Notably (both are required):

                - **encoder** (`dict` or [`PretrainedConfig`]) -- The encoder config, either as a configuration
                  object or as a dictionary containing a `model_type` entry.
                - **decoder** (`dict` or [`PretrainedConfig`]) -- The decoder config, same format as `encoder`.

    Raises:
        ValueError: If `encoder` or `decoder` is not passed.
        KeyError: If a sub-configuration has no `model_type` entry.

    Examples:

    ```python
    >>> from transformers import BertConfig, Wav2Vec2Config, SpeechEncoderDecoderConfig, SpeechEncoderDecoderModel

    >>> # Initializing a Wav2Vec2 & BERT style configuration
    >>> config_encoder = Wav2Vec2Config()
    >>> config_decoder = BertConfig()

    >>> config = SpeechEncoderDecoderConfig.from_encoder_decoder_configs(config_encoder, config_decoder)

    >>> # Initializing a Wav2Vec2Bert model from a Wav2Vec2 & google-bert/bert-base-uncased style configurations
    >>> model = SpeechEncoderDecoderModel(config=config)

    >>> # Accessing the model configuration
    >>> config_encoder = model.config.encoder
    >>> config_decoder = model.config.decoder
    >>> # set decoder config to causal lm
    >>> config_decoder.is_decoder = True
    >>> config_decoder.add_cross_attention = True

    >>> # Saving the model, including its configuration
    >>> model.save_pretrained("my-model")

    >>> # loading model and config from pretrained folder
    >>> encoder_decoder_config = SpeechEncoderDecoderConfig.from_pretrained("my-model")
    >>> model = SpeechEncoderDecoderModel.from_pretrained("my-model", config=encoder_decoder_config)
    ```"""

    model_type = "speech-encoder-decoder"
    is_composition = True

    def __init__(self, **kwargs):
        if "encoder" not in kwargs or "decoder" not in kwargs:
            raise ValueError(
                f"A configuraton of type {self.model_type} cannot be instantiated because not both `encoder` and"
                f" `decoder` sub-configurations are passed, but only {kwargs}"
            )

        # Take the sub-configurations out before calling the parent constructor so that the raw
        # dictionaries are not first stored as attributes and then overwritten.
        encoder_config = kwargs.pop("encoder")
        decoder_config = kwargs.pop("decoder")
        super().__init__(**kwargs)

        self.encoder = self._build_sub_config(encoder_config)
        self.decoder = self._build_sub_config(decoder_config)
        self.is_encoder_decoder = True

    @staticmethod
    def _build_sub_config(sub_config):
        """Build a configuration object from a dict or config instance without mutating the input."""
        if isinstance(sub_config, PretrainedConfig):
            config_dict = sub_config.to_dict()
        else:
            # Work on a copy: popping `model_type` must not alter the caller's dictionary.
            config_dict = dict(sub_config)
        sub_model_type = config_dict.pop("model_type")
        return AutoConfig.for_model(sub_model_type, **config_dict)

    @classmethod
    def from_encoder_decoder_configs(
        cls, encoder_config: PretrainedConfig, decoder_config: PretrainedConfig, **kwargs
    ) -> PretrainedConfig:
        r"""
        Instantiate a [`SpeechEncoderDecoderConfig`] (or a derived class) from a pre-trained encoder model
        configuration and decoder model configuration.

        Note that `decoder_config` is modified in place: `is_decoder` and `add_cross_attention` are set to `True`.

        Returns:
            [`SpeechEncoderDecoderConfig`]: An instance of a configuration object
        """
        logger.info("Setting `config.is_decoder=True` and `config.add_cross_attention=True` for decoder_config")
        decoder_config.is_decoder = True
        decoder_config.add_cross_attention = True

        return cls(encoder=encoder_config.to_dict(), decoder=decoder_config.to_dict(), **kwargs)
226
+
227
class MistralConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`MistralModel`]. It is used to instantiate an
    Mistral model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the Mistral-7B-v0.1 or Mistral-7B-Instruct-v0.1.

    [mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1)
    [mistralai/Mistral-7B-Instruct-v0.1](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1)

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.


    Args:
        vocab_size (`int`, *optional*, defaults to 32000):
            Vocabulary size of the Mistral model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`MistralModel`]
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 14336):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_key_value_heads (`int`, *optional*, defaults to 8):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details checkout [this
            paper](https://arxiv.org/pdf/2305.13245.pdf). If `None` is passed, `num_attention_heads` is used.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to `4096*32`):
            The maximum sequence length that this model might ever be used with. Mistral's sliding window attention
            allows sequence of up to 4096*32 tokens.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*):
            The id of the padding token.
        bos_token_id (`int`, *optional*, defaults to 1):
            The id of the "beginning-of-sequence" token.
        eos_token_id (`int`, *optional*, defaults to 2):
            The id of the "end-of-sequence" token.
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether the model's input and output word embeddings should be tied.
        sliding_window (`int`, *optional*, defaults to 4096):
            Sliding window attention window size. If not specified, will default to `4096`.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
        rope_scaling (`dict`, *optional*):
            RoPE scaling settings. When given, it must be a dictionary whose `type` is one of `"linear"`,
            `"dynamic"`, `"yarn"` or `"dynamic-yarn"` and whose `factor` is a float greater than 1. The two yarn
            types additionally require an integer `original_max_position_embeddings` entry.
        max_thoughts (`int`, *optional*, defaults to 16), max_temperature (*optional*, defaults to 10),
        complexity_factor (`float`, *optional*, defaults to 0.5), merged_talk_heads (`bool`, defaults to `True`),
        merged_lm_and_talk_heads (`bool`, defaults to `False`), merged_lm_and_think_heads (`bool`, defaults to `True`),
        use_concat_talk_head (`bool`, defaults to `True`), use_shallow_think (`bool`, defaults to `True`),
        use_shallow_talk (`bool`, defaults to `False`), use_complex_think_head (`bool`, defaults to `False`),
        use_complex_talk_head (`bool`, defaults to `True`), use_weighted_talk_head (`bool`, defaults to `True`),
        hidden_dropout_prob (`float`, defaults to 0.0):
            Thought-generation and dropout settings. They are stored unchanged as attributes of the same name; their
            meaning is defined by the modeling code that consumes this configuration, which is not part of this file.
        kwargs (*optional*):
            Remaining keyword arguments, forwarded to [`PretrainedConfig`]. Notably:

                - **encoder** / **decoder** (`dict` or [`PretrainedConfig`], *optional*) -- Sub-configurations of an
                  encoder-decoder setup. Either pass both or neither. A dictionary must contain a `model_type`
                  entry. When both are given, `config.encoder` and `config.decoder` are built through
                  `AutoConfig.for_model` and `config.is_encoder_decoder` is set to `True`; when neither is given
                  these two attributes are not created.

    Raises:
        ValueError: If only one of `encoder` / `decoder` is passed, or if `rope_scaling` is invalid.
        KeyError: If a sub-configuration has no `model_type` entry.

    ```python
    >>> from transformers import MistralModel, MistralConfig

    >>> # Initializing a Mistral 7B style configuration
    >>> configuration = MistralConfig()

    >>> # Initializing a model from the Mistral 7B style configuration
    >>> model = MistralModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    is_composition = True

    # NOTE(review): `PretrainedConfig.model_type` is documented as a single string identifier, but a list is used
    # here (a string alternative, "mistral-encoder-decoder", was previously left commented out). The value is kept
    # unchanged because the modeling code that may depend on it is not visible from this file -- confirm and
    # replace it with one string.
    model_type = ["mistral", "speech-encoder-decoder", "image-encoder-decoder", "mistral-encoder-decoder"]
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=32000,
        hidden_size=4096,
        intermediate_size=14336,
        num_hidden_layers=32,
        num_attention_heads=32,
        num_key_value_heads=8,
        hidden_act="silu",
        max_position_embeddings=4096 * 32,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        pad_token_id=None,
        bos_token_id=1,
        eos_token_id=2,
        tie_word_embeddings=False,
        sliding_window=4096,
        attention_dropout=0.0,
        # RoPE / yarn
        rope_theta=10000.0,
        rope_scaling=None,
        # Thought generation
        max_thoughts=16,
        max_temperature=10,
        complexity_factor=0.5,
        merged_talk_heads=True,
        merged_lm_and_talk_heads=False,
        merged_lm_and_think_heads=True,
        use_concat_talk_head=True,
        use_shallow_think=True,
        use_shallow_talk=False,
        use_complex_think_head=False,
        use_complex_talk_head=True,
        use_weighted_talk_head=True,
        hidden_dropout_prob=0.00,
        **kwargs,
    ):
        # The sub-configurations are optional so that a plain `MistralConfig()` works, but a half-specified
        # pair is still rejected. `None` counts as "not given" so a serialized null value round-trips.
        has_encoder = kwargs.get("encoder") is not None
        has_decoder = kwargs.get("decoder") is not None
        if has_encoder != has_decoder:
            raise ValueError(
                f"A configuraton of type {self.model_type} cannot be instantiated because "
                f"not both `encoder` and `decoder` sub-configurations are passed, but only {kwargs}"
            )

        # Remove them before calling the parent constructor so the raw dictionaries are not stored as attributes.
        encoder_config = kwargs.pop("encoder", None)
        decoder_config = kwargs.pop("decoder", None)

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.sliding_window = sliding_window

        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.attention_dropout = attention_dropout

        # RoPE / yarn
        self.rope_scaling = rope_scaling
        self._rope_scaling_validation()
        self.rope_theta = rope_theta

        # Thought generation
        self.max_thoughts = max_thoughts
        self.complexity_factor = complexity_factor
        self.max_temperature = max_temperature
        self.merged_talk_heads = merged_talk_heads
        self.merged_lm_and_talk_heads = merged_lm_and_talk_heads
        self.merged_lm_and_think_heads = merged_lm_and_think_heads
        self.use_concat_talk_head = use_concat_talk_head
        self.use_shallow_think = use_shallow_think
        self.use_shallow_talk = use_shallow_talk
        self.use_complex_think_head = use_complex_think_head
        self.use_complex_talk_head = use_complex_talk_head
        self.use_weighted_talk_head = use_weighted_talk_head
        self.hidden_dropout_prob = hidden_dropout_prob

        # Encoder-decoder: only a single encoder/decoder pair is supported for now.
        if has_encoder:
            self.encoder = self._build_sub_config(encoder_config)
            self.decoder = self._build_sub_config(decoder_config)
            self.is_encoder_decoder = True

    @staticmethod
    def _build_sub_config(sub_config):
        """Build a configuration object from a dict or config instance without mutating the input."""
        if isinstance(sub_config, PretrainedConfig):
            config_dict = sub_config.to_dict()
        else:
            # Work on a copy: popping `model_type` must not alter the caller's dictionary.
            config_dict = dict(sub_config)
        sub_model_type = config_dict.pop("model_type")
        return AutoConfig.for_model(sub_model_type, **config_dict)

    @classmethod
    def from_encoder_decoder_configs(
        cls, encoder_config: PretrainedConfig, decoder_config: PretrainedConfig, **kwargs
    ) -> PretrainedConfig:
        r"""
        Instantiate a [`MistralConfig`] (or a derived class) from an encoder configuration and a decoder
        configuration.

        Note that `decoder_config` is modified in place: `is_decoder` and `add_cross_attention` are set to `True`.

        Returns:
            [`MistralConfig`]: An instance of a configuration object
        """
        logger.info("Setting `config.is_decoder=True` and `config.add_cross_attention=True` for decoder_config")
        decoder_config.is_decoder = True
        decoder_config.add_cross_attention = True

        return cls(encoder=encoder_config.to_dict(), decoder=decoder_config.to_dict(), **kwargs)

    def _rope_scaling_validation(self):
        """
        Validate the `rope_scaling` configuration.

        Raises:
            ValueError: If `rope_scaling` is set but is not a dictionary, has an unsupported `type`, has a `factor`
                that is not a float greater than 1, or uses a yarn type without an integer
                `original_max_position_embeddings`.
        """
        if self.rope_scaling is None:
            return

        if not isinstance(self.rope_scaling, dict):
            raise ValueError(f"`rope_scaling` must be a dictionary, got {self.rope_scaling}")

        rope_scaling_type = self.rope_scaling.get("type", None)
        rope_scaling_factor = self.rope_scaling.get("factor", None)

        if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic", "yarn", "dynamic-yarn"]:
            # The key that is read is `type`, so the message names that field.
            raise ValueError(
                "`rope_scaling`'s type field must be one of ['linear', 'dynamic', 'yarn', 'dynamic-yarn'], "
                f"got {rope_scaling_type}"
            )
        if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0:
            raise ValueError(f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}")
        if rope_scaling_type in ("yarn", "dynamic-yarn"):
            original_max_position_embeddings = self.rope_scaling.get("original_max_position_embeddings", None)
            # `None` is not an int, so a single isinstance check covers the missing-key case as well.
            if not isinstance(original_max_position_embeddings, int):
                raise ValueError(
                    "`rope_scaling.original_max_position_embeddings` must be set to an int when using yarn, and dynamic-yarn"
                )
modeling_mistral_advanced.py ADDED
The diff for this file is too large to render. See raw diff