Text Generation · Transformers · Safetensors · English · doge · conversational · custom_code

JingzeShi committed (verified) · Commit a4361c2 · 1 Parent(s): 24f92db

Upload DogeForCausalLM
config.json CHANGED
@@ -8,28 +8,34 @@
     "AutoConfig": "configuration_doge.DogeConfig",
     "AutoModelForCausalLM": "modeling_doge.DogeForCausalLM"
   },
-  "bos_token_id": 1,
-  "eos_token_id": 2,
+  "bos_token_id": 0,
+  "eos_token_id": 1,
   "expert_retrieval_size": 256,
   "hidden_act": "silu",
   "hidden_bias": false,
   "hidden_dropout": 0.0,
   "hidden_size": 512,
   "initializer_range": 0.02,
-  "intermediate_size": 2048,
+  "intermediate_size": 1024,
   "is_moe": false,
   "max_position_embeddings": 2048,
   "model_type": "doge",
   "num_attention_heads": 4,
-  "num_cdmmoe_experts": 4096,
+  "num_cdmmoe_experts": 2048,
   "num_cdmmoe_experts_per_head": 8,
   "num_cdmmoe_heads": 4,
-  "num_hidden_layers": 8,
-  "pad_token_id": 0,
+  "num_channels": 3,
+  "num_hidden_layers": 16,
+  "num_key_value_heads": 2,
+  "pad_token_id": 2,
+  "patch_size": 16,
   "rms_norm_eps": 1e-06,
-  "rope_scaling": null,
+  "rope_scaling": {
+    "factor": 4.0,
+    "original_max_position_embeddings": 2048,
+    "rope_type": "dynamic"
+  },
   "rope_theta": 10000.0,
-  "tie_word_embeddings": false,
   "torch_dtype": "float32",
   "transformers_version": "4.46.1",
   "use_cache": true,
configuration_doge.py CHANGED
@@ -25,20 +25,23 @@ from transformers.modeling_rope_utils import rope_config_validation
25
  class DogeConfig(PretrainedConfig):
26
  r"""
27
  This is the configuration class to store the configuration of a [`DogeModel`]. It is used to instantiate an Doge
28
- model according to the specified arguments, defining the model architecture like [LoserCheems/doge-tiny-test](https://huggingface.co/LoserCheems/doge-tiny-test)
29
 
30
  Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
31
  documentation from [`PretrainedConfig`] for more information.
32
 
33
  Args:
34
  vocab_size (`int`, *optional*, defaults to 32768):
35
- Vocabulary size of the Doge model. Defines the number of different tokens that can be represented by the
36
- `inputs_ids` passed when calling [`DogeModel`]
 
 
 
37
  hidden_size (`int`, *optional*, defaults to 1024):
38
  Dimension of the hidden representations.
39
- intermediate_size (`int`, *optional*, defaults to 4096):
40
  Dimension of the CDMoE representations.
41
- num_hidden_layers (`int`, *optional*, defaults to 16):
42
  Number of hidden layers in the Transformer decoder.
43
  hidden_bias (`bool`, *optional*, defaults to `False`):
44
  Whether to use bias in the hidden layers.
@@ -51,24 +54,21 @@ class DogeConfig(PretrainedConfig):
51
  rope_theta (`float`, *optional*, defaults to 10000.0):
52
  The base period of the RoPE embeddings.
53
  rope_scaling (`Dict`, *optional*):
54
- Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
55
- and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
56
- accordingly.
57
  Expected contents:
58
  `rope_type` (`str`):
59
- The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
60
- 'llama3'], with 'default' being the original RoPE implementation.
61
  `factor` (`float`, *optional*):
62
- Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
63
- most scaling types, a `factor` of x will enable the model to handle sequences of length x *
64
- original maximum pre-trained length.
65
  `original_max_position_embeddings` (`int`, *optional*):
66
- Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
67
- pretraining.
68
  `attention_factor` (`float`, *optional*):
69
  Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
70
- computation. If unspecified, it defaults to value recommended by the implementation, using the
71
- `factor` field to infer the suggested value.
72
  `beta_fast` (`float`, *optional*):
73
  Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
74
  ramp function. If unspecified, it defaults to 32.
@@ -76,13 +76,11 @@ class DogeConfig(PretrainedConfig):
76
  Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
77
  ramp function. If unspecified, it defaults to 1.
78
  `short_factor` (`List[float]`, *optional*):
79
- Only used with 'longrope'. The scaling factor to be applied to short contexts (<
80
- `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
81
- size divided by the number of attention heads divided by 2
82
  `long_factor` (`List[float]`, *optional*):
83
- Only used with 'longrope'. The scaling factor to be applied to long contexts (<
84
- `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
85
- size divided by the number of attention heads divided by 2
86
  `low_freq_factor` (`float`, *optional*):
87
  Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
88
  `high_freq_factor` (`float`, *optional*):
@@ -100,15 +98,22 @@ class DogeConfig(PretrainedConfig):
100
  Beginning of stream token id.
101
  eos_token_id (`int`, *optional*, defaults to 2):
102
  End of stream token id.
103
- tie_word_embeddings (`bool`, *optional*, defaults to `False`):
104
  Whether to tie weight embeddings
105
  num_attention_heads (`int`, *optional*, defaults to 8):
106
  Number of attention heads for each attention layer in the Transformer decoder.
107
  attention_dropout (`float`, *optional*, defaults to 0.0):
108
  The dropout ratio for the attention probabilities.
109
  is_moe (`bool`, *optional*, defaults to `False`):
110
  Whether to use the Cross Domain Mixture of Experts, if `True`, the MoE will inherit the MLP to initialize
111
- num_cdmmoe_experts (`int`, *optional*, defaults to 4096):
112
  Number of Private Experts for the Cross Domain Mixture of Experts.
113
  num_cdmmoe_heads (`int`, *optional*, defaults to 4):
114
  Number of heads of Private Experts for the Cross Domain Mixture of Experts.
@@ -124,32 +129,41 @@ class DogeConfig(PretrainedConfig):
124
  def __init__(
125
  self,
126
  vocab_size=32768,
 
 
127
  hidden_size=1024,
128
- intermediate_size=4096,
129
- num_hidden_layers=16,
130
  hidden_bias=False,
131
  hidden_dropout=0.0,
132
  hidden_act="silu",
133
  max_position_embeddings=2048,
134
  rope_theta=10000.0,
135
- rope_scaling=None,
136
  initializer_range=0.02,
137
  rms_norm_eps=1e-06,
138
  use_cache=True,
139
- pad_token_id=0,
140
- bos_token_id=1,
141
- eos_token_id=2,
142
- tie_word_embeddings=False,
143
  num_attention_heads=8,
 
144
  attention_dropout=0.0,
145
  is_moe=False,
146
- num_cdmmoe_experts=4096,
147
  num_cdmmoe_heads=4,
148
  num_cdmmoe_experts_per_head=8,
149
  expert_retrieval_size=256,
150
  **kwargs,
151
  ):
152
  self.vocab_size = vocab_size
 
 
153
  self.hidden_size = hidden_size
154
  self.intermediate_size = intermediate_size
155
  self.num_hidden_layers = num_hidden_layers
@@ -162,11 +176,12 @@ class DogeConfig(PretrainedConfig):
162
  self.initializer_range = initializer_range
163
  self.rms_norm_eps = rms_norm_eps
164
  self.use_cache = use_cache
165
- self.pad_token_id = pad_token_id
166
  self.bos_token_id = bos_token_id
167
  self.eos_token_id = eos_token_id
 
168
  self.tie_word_embeddings = tie_word_embeddings
169
  self.num_attention_heads = num_attention_heads
 
170
  self.attention_dropout = attention_dropout
171
  self.is_moe = is_moe
172
  self.num_cdmmoe_experts = num_cdmmoe_experts
@@ -181,9 +196,9 @@ class DogeConfig(PretrainedConfig):
181
  rope_config_validation(self)
182
 
183
  super().__init__(
184
- pad_token_id=pad_token_id,
185
  bos_token_id=bos_token_id,
186
  eos_token_id=eos_token_id,
 
187
  tie_word_embeddings=tie_word_embeddings,
188
  **kwargs,
189
  )
 
25
  class DogeConfig(PretrainedConfig):
26
  r"""
27
  This is the configuration class to store the configuration of a [`DogeModel`]. It is used to instantiate an Doge
28
+ model according to the specified arguments, defining the model architecture like [JingzeShi/Doge-20M](https://huggingface.co/JingzeShi/Doge-20M).
29
 
30
  Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
31
  documentation from [`PretrainedConfig`] for more information.
32
 
33
  Args:
34
  vocab_size (`int`, *optional*, defaults to 32768):
35
+ Vocabulary size of the Doge model. Defines the number of different tokens that can be represented by the `inputs_ids` passed when calling [`DogeModel`]
36
+ num_channels (`int`, *optional*, defaults to 3):
37
+ Number of channels in the input image.
38
+ patch_size (`int`, *optional*, defaults to 16):
39
+ Patch size of Vision Transformer Embeddings.
40
  hidden_size (`int`, *optional*, defaults to 1024):
41
  Dimension of the hidden representations.
42
+ intermediate_size (`int`, *optional*, defaults to 2048):
43
  Dimension of the CDMoE representations.
44
+ num_hidden_layers (`int`, *optional*, defaults to 32):
45
  Number of hidden layers in the Transformer decoder.
46
  hidden_bias (`bool`, *optional*, defaults to `False`):
47
  Whether to use bias in the hidden layers.
 
54
  rope_theta (`float`, *optional*, defaults to 10000.0):
55
  The base period of the RoPE embeddings.
56
  rope_scaling (`Dict`, *optional*):
57
+ Dictionary containing the scaling configuration for the RoPE embeddings.
58
+ NOTE: if you apply new rope type and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value accordingly.
 
59
  Expected contents:
60
  `rope_type` (`str`):
61
+ The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', 'llama3'], with 'default' being the original RoPE implementation.
 
62
  `factor` (`float`, *optional*):
63
+ Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings.
64
+ In most scaling types, a `factor` of x will enable the model to handle sequences of length x * original maximum pre-trained length.
 
65
  `original_max_position_embeddings` (`int`, *optional*):
66
+ Used with 'dynamic', 'longrope' and 'llama3'.
67
+ The original max position embeddings used during pretraining.
68
  `attention_factor` (`float`, *optional*):
69
  Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
70
+ computation.
71
+ If unspecified, it defaults to value recommended by the implementation, using the `factor` field to infer the suggested value.
72
  `beta_fast` (`float`, *optional*):
73
  Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
74
  ramp function. If unspecified, it defaults to 32.
 
76
  Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
77
  ramp function. If unspecified, it defaults to 1.
78
  `short_factor` (`List[float]`, *optional*):
79
+ Only used with 'longrope'. The scaling factor to be applied to short contexts (<`original_max_position_embeddings`).
80
+ Must be a list of numbers with the same length as the hidden size divided by the number of attention heads divided by 2
 
81
  `long_factor` (`List[float]`, *optional*):
82
+ Only used with 'longrope'. The scaling factor to be applied to long contexts (<`original_max_position_embeddings`).
83
+ Must be a list of numbers with the same length as the hidden size divided by the number of attention heads divided by 2
 
84
  `low_freq_factor` (`float`, *optional*):
85
  Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
86
  `high_freq_factor` (`float`, *optional*):
 
98
  Beginning of stream token id.
99
  eos_token_id (`int`, *optional*, defaults to 2):
100
  End of stream token id.
101
+ tie_word_embeddings (`bool`, *optional*, defaults to `True`):
102
  Whether to tie weight embeddings
103
  num_attention_heads (`int`, *optional*, defaults to 8):
104
  Number of attention heads for each attention layer in the Transformer decoder.
105
+ num_key_value_heads (`int`, *optional*, defaults to `None`):
106
+ This is the number of key_value heads that should be used to implement Grouped Query Attention.
107
+ If `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
108
+ `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used.
109
+ When converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed by meanpooling all the original heads within that group.
110
+ For more details checkout [this paper](https://arxiv.org/pdf/2305.13245.pdf).
111
+ If it is not specified, will default to `num_attention_heads`.
112
  attention_dropout (`float`, *optional*, defaults to 0.0):
113
  The dropout ratio for the attention probabilities.
114
  is_moe (`bool`, *optional*, defaults to `False`):
115
  Whether to use the Cross Domain Mixture of Experts, if `True`, the MoE will inherit the MLP to initialize
116
+ num_cdmmoe_experts (`int`, *optional*, defaults to 2048):
117
  Number of Private Experts for the Cross Domain Mixture of Experts.
118
  num_cdmmoe_heads (`int`, *optional*, defaults to 4):
119
  Number of heads of Private Experts for the Cross Domain Mixture of Experts.
 
129
  def __init__(
130
  self,
131
  vocab_size=32768,
132
+ num_channels=3,
133
+ patch_size=16,
134
  hidden_size=1024,
135
+ intermediate_size=2048,
136
+ num_hidden_layers=32,
137
  hidden_bias=False,
138
  hidden_dropout=0.0,
139
  hidden_act="silu",
140
  max_position_embeddings=2048,
141
  rope_theta=10000.0,
142
+ rope_scaling={
143
+ "rope_type": "dynamic",
144
+ "factor": 4.0,
145
+ "original_max_position_embeddings": 2048,
146
+ },
147
  initializer_range=0.02,
148
  rms_norm_eps=1e-06,
149
  use_cache=True,
150
+ bos_token_id=0,
151
+ eos_token_id=1,
152
+ pad_token_id=2,
153
+ tie_word_embeddings=True,
154
  num_attention_heads=8,
155
+ num_key_value_heads=None,
156
  attention_dropout=0.0,
157
  is_moe=False,
158
+ num_cdmmoe_experts=2048,
159
  num_cdmmoe_heads=4,
160
  num_cdmmoe_experts_per_head=8,
161
  expert_retrieval_size=256,
162
  **kwargs,
163
  ):
164
  self.vocab_size = vocab_size
165
+ self.num_channels = num_channels
166
+ self.patch_size = patch_size
167
  self.hidden_size = hidden_size
168
  self.intermediate_size = intermediate_size
169
  self.num_hidden_layers = num_hidden_layers
 
176
  self.initializer_range = initializer_range
177
  self.rms_norm_eps = rms_norm_eps
178
  self.use_cache = use_cache
 
179
  self.bos_token_id = bos_token_id
180
  self.eos_token_id = eos_token_id
181
+ self.pad_token_id = pad_token_id
182
  self.tie_word_embeddings = tie_word_embeddings
183
  self.num_attention_heads = num_attention_heads
184
+ self.num_key_value_heads = num_key_value_heads
185
  self.attention_dropout = attention_dropout
186
  self.is_moe = is_moe
187
  self.num_cdmmoe_experts = num_cdmmoe_experts
 
196
  rope_config_validation(self)
197
 
198
  super().__init__(
 
199
  bos_token_id=bos_token_id,
200
  eos_token_id=eos_token_id,
201
+ pad_token_id=pad_token_id,
202
  tie_word_embeddings=tie_word_embeddings,
203
  **kwargs,
204
  )
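
To make the new defaults concrete, here is a small, hypothetical construction of DogeConfig mirroring the values in config.json above (4 query heads sharing 2 key/value heads, plus dynamic RoPE scaling); it assumes configuration_doge.py is importable from the working directory:

# Sketch only: mirrors the values introduced in this commit.
from configuration_doge import DogeConfig

cfg = DogeConfig(
    hidden_size=512,
    num_hidden_layers=16,
    num_attention_heads=4,
    num_key_value_heads=2,  # GQA: each key/value head serves 2 query heads
    rope_scaling={"rope_type": "dynamic", "factor": 4.0, "original_max_position_embeddings": 2048},
)
print(cfg.num_attention_heads // cfg.num_key_value_heads)  # 2 query heads per KV head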
generation_config.json CHANGED
@@ -1,7 +1,7 @@
 {
   "_from_model_config": true,
-  "bos_token_id": 1,
-  "eos_token_id": 2,
-  "pad_token_id": 0,
+  "bos_token_id": 0,
+  "eos_token_id": 1,
+  "pad_token_id": 2,
   "transformers_version": "4.46.1"
 }
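
These generation defaults have to stay in sync with the special-token ids written to config.json and the tokenizer; a quick sanity check, assuming the same Hub repo id as above:

# Sketch only: inspect the regenerated generation defaults.
from transformers import GenerationConfig

gen_cfg = GenerationConfig.from_pretrained("JingzeShi/Doge-20M")
assert (gen_cfg.bos_token_id, gen_cfg.eos_token_id, gen_cfg.pad_token_id) == (0, 1, 2)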
model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:26d80cdf90d4f053299b962b1ede76f0fe30ed31ebcb95e5dbd730ce23ffd36a
-size 268580408
+oid sha256:f6ff7db0f6721882934053a9c20eec73c33b55fc47ef428e20a0e91391738985
+size 218391112
modeling_doge.py CHANGED
@@ -79,7 +79,7 @@ class Residual(nn.Module):
79
  def __init__(self, hidden_size):
80
  super().__init__()
81
  self.weight = nn.Parameter(torch.ones(hidden_size))
82
-
83
  def forward(self, residual_states, hidden_states):
84
  return self.weight * residual_states + hidden_states
85
 
@@ -92,10 +92,10 @@ class RotaryEmbedding(nn.Module):
92
  super().__init__()
93
  self.rope_kwargs = {}
94
 
95
- if config.rope_scaling is None:
96
- self.rope_type = "default"
97
  else:
98
- self.rope_type = config.rope_scaling
99
  self.max_seq_len_cached = config.max_position_embeddings
100
  self.original_max_seq_len = config.max_position_embeddings
101
  self.base = config.rope_theta
@@ -133,6 +133,7 @@ class RotaryEmbedding(nn.Module):
133
  # core RoPE block
134
  inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
135
  position_ids_expanded = position_ids[:, None, :].float()
 
136
  device_type = x.device.type
137
  device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
138
  with torch.autocast(device_type=device_type, enabled=False):
@@ -141,6 +142,7 @@ class RotaryEmbedding(nn.Module):
141
  cos = emb.cos()
142
  sin = emb.sin()
143
 
 
144
  cos = cos * self.attention_scaling
145
  sin = sin * self.attention_scaling
146
 
@@ -168,11 +170,10 @@ def apply_QK_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
168
  Deprecated and unused.
169
  unsqueeze_dim (`int`, *optional*, defaults to 1):
170
  The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
171
- sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
172
- that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
173
- k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
174
- cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
175
- the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
176
  Returns:
177
  `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
178
  """
@@ -183,6 +184,18 @@ def apply_QK_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
183
  return q_embed, k_embed
184
 
185
 
186
  class DogeDynamicMaskAttention(nn.Module):
187
  """Dynamic Mask Attention from 'Wonderful Matrices' paper."""
188
 
@@ -193,46 +206,25 @@ class DogeDynamicMaskAttention(nn.Module):
193
  self.layer_idx = layer_idx
194
  if layer_idx is None:
195
  logger.warning_once(
196
- f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
197
- "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
198
- "when creating this class."
199
  )
200
 
201
  self.hidden_dim = config.hidden_size
202
- self.num_attention_heads = config.num_attention_heads
 
 
 
203
  self.attention_dropout = config.attention_dropout
204
- self.attention_head_dim = self.hidden_dim // self.num_attention_heads
205
 
206
  # Q K V O projections
207
- self.q_proj = nn.Linear(
208
- self.hidden_dim,
209
- self.num_attention_heads * self.attention_head_dim,
210
- bias=config.hidden_bias,
211
- )
212
- self.k_proj = nn.Linear(
213
- self.hidden_dim,
214
- self.num_attention_heads * self.attention_head_dim,
215
- bias=config.hidden_bias,
216
- )
217
  # dynamic mask for the QK^T attention score matrix
218
- self.A = nn.Parameter(
219
- torch.ones(self.num_attention_heads)
220
- )
221
- self.dt_proj = nn.Linear(
222
- self.hidden_dim,
223
- self.num_attention_heads,
224
- bias=config.hidden_bias,
225
- )
226
- self.v_proj = nn.Linear(
227
- self.hidden_dim,
228
- self.num_attention_heads * self.attention_head_dim,
229
- bias=config.hidden_bias,
230
- )
231
- self.o_proj = nn.Linear(
232
- self.hidden_dim,
233
- self.hidden_dim,
234
- bias=config.hidden_bias,
235
- )
236
 
237
  def forward(
238
  self,
@@ -250,15 +242,9 @@ class DogeDynamicMaskAttention(nn.Module):
250
  key_states = self.k_proj(hidden_states)
251
  value_states = self.v_proj(hidden_states)
252
 
253
- query_states = query_states.view(bsz, q_len, self.num_attention_heads, self.attention_head_dim).transpose(
254
- 1, 2
255
- )
256
- key_states = key_states.view(bsz, q_len, self.num_attention_heads, self.attention_head_dim).transpose(
257
- 1, 2
258
- )
259
- value_states = value_states.view(bsz, q_len, self.num_attention_heads, self.attention_head_dim).transpose(
260
- 1, 2
261
- )
262
 
263
  cos, sin = position_embeddings
264
  query_states, key_states = apply_QK_rotary_pos_emb(query_states, key_states, cos, sin)
@@ -268,8 +254,12 @@ class DogeDynamicMaskAttention(nn.Module):
268
  cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
269
  key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
270
 
 
 
 
 
271
  # compute attention scores matrix
272
- attn_weights = torch.matmul(query_states, key_states.transpose(-1, -2)) / math.sqrt(self.attention_head_dim)
273
 
274
  # add mask to attention scores
275
  if attention_mask is not None:
@@ -311,9 +301,9 @@ class DogeSdpaDynamicMaskAttn(DogeDynamicMaskAttention):
311
  key_states = self.k_proj(hidden_states)
312
  value_states = self.v_proj(hidden_states)
313
 
314
- query_states = query_states.view(bsz, q_len, self.num_attention_heads, self.attention_head_dim).transpose(1, 2)
315
- key_states = key_states.view(bsz, q_len, self.num_attention_heads, self.attention_head_dim).transpose(1, 2)
316
- value_states = value_states.view(bsz, q_len, self.num_attention_heads, self.attention_head_dim).transpose(1, 2)
317
 
318
  cos, sin = position_embeddings
319
  query_states, key_states = apply_QK_rotary_pos_emb(query_states, key_states, cos, sin)
@@ -323,6 +313,11 @@ class DogeSdpaDynamicMaskAttn(DogeDynamicMaskAttention):
323
  cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
324
  key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
325
 
 
 
 
 
 
326
  if attention_mask is not None:
327
  dt_states = self.dt_proj(value_states.transpose(1, 2).reshape(bsz, value_states.shape[-2], -1))
328
  dynamic_mask = torch.exp(self.A * F.softplus(dt_states)).transpose(-1, -2)
@@ -333,12 +328,18 @@ class DogeSdpaDynamicMaskAttn(DogeDynamicMaskAttention):
333
  key_states = key_states.contiguous()
334
  value_states = value_states.contiguous()
335
 
 
 
 
 
 
336
  attn_output = F.scaled_dot_product_attention(
337
  query_states,
338
  key_states,
339
  value_states,
340
  attn_mask=causal_mask,
341
- dropout_p=self.attention_dropout,
 
342
  )
343
 
344
  attn_output = attn_output.transpose(1, 2).contiguous()
@@ -362,21 +363,9 @@ class DogeMLP(nn.Module):
362
  self.intermediate_dim = config.intermediate_size
363
  self.act_fn = ACT2FN[config.hidden_act]
364
 
365
- self.gate_proj = nn.Linear(
366
- self.hidden_dim,
367
- self.intermediate_dim,
368
- bias=config.hidden_bias,
369
- )
370
- self.up_proj = nn.Linear(
371
- self.hidden_dim,
372
- self.intermediate_dim,
373
- bias=config.hidden_bias,
374
- )
375
- self.down_proj = nn.Linear(
376
- self.intermediate_dim,
377
- self.hidden_dim,
378
- bias=config.hidden_bias,
379
- )
380
 
381
  def forward(
382
  self,
@@ -402,30 +391,12 @@ class DogeCDMoE(DogeMLP):
402
  self.num_keys = int(math.sqrt(self.num_cdmmoe_experts))
403
 
404
  # queries and keys for retrieval experts
405
- self.queries = nn.Linear(
406
- self.hidden_dim,
407
- self.num_cdmmoe_heads * self.expert_retrieval_dim,
408
- bias=False,
409
- )
410
- self.keys = nn.Parameter(
411
- torch.zeros(
412
- self.num_cdmmoe_heads,
413
- self.num_keys,
414
- 2,
415
- self.expert_retrieval_dim // 2,
416
- )
417
- )
418
 
419
  # experts
420
- self.down_embed = nn.Embedding(
421
- self.num_cdmmoe_experts,
422
- self.hidden_dim,
423
- )
424
- self.up_embed = nn.Embedding(
425
- self.num_cdmmoe_experts,
426
- self.hidden_dim,
427
- )
428
-
429
 
430
  def forward(
431
  self,
@@ -468,13 +439,13 @@ class DogeDecoderLayer(nn.Module):
468
  super().__init__()
469
  self.hidden_dropout = config.hidden_dropout
470
 
471
- self.pre_sequence_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
472
- self.attn = DOGE_ATTENTION_CLASSES[config._attn_implementation](config=config, layer_idx=layer_idx)
473
- self.post_sequence_residual = Residual(config.hidden_size)
474
 
475
- self.pre_state_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
476
  self.feed_forward = DogeMLP(config) if config.is_moe == False else DogeCDMoE(config)
477
- self.post_state_residual = Residual(config.hidden_size)
478
 
479
  def forward(
480
  self,
@@ -492,29 +463,25 @@ class DogeDecoderLayer(nn.Module):
492
  Args:
493
  hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
494
  attention_mask (`torch.FloatTensor`, *optional*):
495
- attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
496
- query_sequence_length, key_sequence_length)` if default attention is used.
497
  output_attentions (`bool`, *optional*):
498
- Whether or not to return the attentions tensors of all attention layers. See `attentions` under
499
- returned tensors for more detail.
500
  use_cache (`bool`, *optional*):
501
- If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
502
- (see `past_key_values`).
503
  past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
504
  cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
505
  Indices depicting the position of the input sequence tokens in the sequence
506
  position_embeddings (`Tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
507
- Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
508
- with `head_dim` being the embedding dimension of each attention head.
509
  kwargs (`dict`, *optional*):
510
- Arbitrary kwargs to be ignored, used for FSDP and other methods that injects code
511
- into the model
512
  """
513
 
514
  # sequence transformation
515
  residual = hidden_states
516
- hidden_states = self.pre_sequence_layernorm(hidden_states)
517
- hidden_states, present_key_value = self.attn(
518
  hidden_states=hidden_states,
519
  attention_mask=attention_mask,
520
  position_ids=position_ids,
@@ -525,14 +492,14 @@ class DogeDecoderLayer(nn.Module):
525
  )
526
  self_attn_weights = None
527
  hidden_states = F.dropout(hidden_states, p=self.hidden_dropout, training=self.training)
528
- hidden_states = self.post_sequence_residual(residual, hidden_states)
529
 
530
  # state transformation
531
  residual = hidden_states
532
- hidden_states = self.pre_state_layernorm(hidden_states)
533
  hidden_states = self.feed_forward(hidden_states)
534
  hidden_states = F.dropout(hidden_states, p=self.hidden_dropout, training=self.training)
535
- hidden_states = self.post_state_residual(residual, hidden_states)
536
 
537
  outputs = (hidden_states,)
538
 
@@ -572,11 +539,10 @@ class DogePreTrainedModel(PreTrainedModel):
572
  DOGE_INPUTS_DOCSTRING = r"""
573
  Args:
574
  input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
575
- Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
576
- it.
577
 
578
- Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
579
- [`PreTrainedTokenizer.__call__`] for details.
580
 
581
  [What are input IDs?](../glossary#input-ids)
582
  attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -587,60 +553,48 @@ DOGE_INPUTS_DOCSTRING = r"""
587
 
588
  [What are attention masks?](../glossary#attention-mask)
589
 
590
- Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
591
- [`PreTrainedTokenizer.__call__`] for details.
592
 
593
- If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
594
- `past_key_values`).
595
 
596
- If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
597
- and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
598
- information on the default strategy.
599
 
600
  - 1 indicates the head is **not masked**,
601
  - 0 indicates the head is **masked**.
602
  position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
603
- Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
604
- config.n_positions - 1]`.
605
 
606
  [What are position IDs?](../glossary#position-ids)
607
  past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
608
- Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
609
- blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
610
- returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
611
 
612
  Two formats are allowed:
613
- - a [`~cache_utils.Cache`] instance, see our
614
- [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache);
615
- - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
616
- shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
617
- cache format.
618
-
619
- The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
620
- legacy cache format will be returned.
621
-
622
- If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
623
- have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
624
- of shape `(batch_size, sequence_length)`.
625
  inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
626
- Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
627
- is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
628
- model's internal embedding lookup matrix.
629
  use_cache (`bool`, *optional*):
630
- If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
631
- `past_key_values`).
632
  output_attentions (`bool`, *optional*):
633
- Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
634
- tensors for more detail.
635
  output_hidden_states (`bool`, *optional*):
636
- Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
637
- more detail.
638
  return_dict (`bool`, *optional*):
639
  Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
640
  cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
641
- Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,
642
- this tensor is not affected by padding. It is used to update the cache in the correct position and to infer
643
- the complete sequence length.
644
  """
645
 
646
 
@@ -711,9 +665,9 @@ class DogeModel(DogePreTrainedModel):
711
  else:
712
  past_key_values = DynamicCache.from_legacy_cache(past_key_values)
713
  logger.warning_once(
714
- "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and "
715
- "will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class "
716
- "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)"
717
  )
718
 
719
  if cache_position is None:
@@ -842,18 +796,15 @@ class DogeModel(DogePreTrainedModel):
842
  **kwargs,
843
  ):
844
  """
845
- Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
846
- `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.
847
 
848
  Args:
849
  attention_mask (`torch.Tensor`):
850
- A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
851
- `(batch_size, 1, query_length, key_value_length)`.
852
  sequence_length (`int`):
853
  The sequence length being processed.
854
  target_length (`int`):
855
- The target length: when generating with static cache, the mask should be as long as the static cache,
856
- to account for the 0 padding, the part of the cache that is not filled yet.
857
  dtype (`torch.dtype`):
858
  The dtype to use for the 4D attention mask.
859
  device (`torch.device`):
@@ -912,13 +863,13 @@ class DogeForCausalLM(DogePreTrainedModel, GenerationMixin):
912
 
913
  def set_output_embeddings(self, new_embeddings):
914
  self.lm_head = new_embeddings
 
 
 
915
 
916
  def set_decoder(self, decoder):
917
  self.model = decoder
918
 
919
- def get_decoder(self):
920
- return self.model
921
-
922
  @add_start_docstrings_to_model_forward(DOGE_INPUTS_DOCSTRING)
923
  @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
924
  def forward(
@@ -940,14 +891,14 @@ class DogeForCausalLM(DogePreTrainedModel, GenerationMixin):
940
  r"""
941
  Args:
942
  labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
943
- Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
944
- config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
945
- (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
946
 
947
  num_logits_to_keep (`int`, *optional*):
948
- Calculate logits for the last `num_logits_to_keep` tokens. If `0`, calculate logits for all
949
- `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
950
- token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
951
 
952
  Returns:
953
  """
@@ -993,18 +944,98 @@ class DogeForCausalLM(DogePreTrainedModel, GenerationMixin):
993
  )
994
 
995
 
996
  @add_start_docstrings(
997
  """
998
  The Doge Model transformer with a sequence classification head on top (linear layer).
999
 
1000
- [`DogeForSequenceClassification`] uses the last token in order to do the classification, as other causal models
1001
- (e.g. GPT-2) do.
1002
 
1003
- Since it does classification on the last token, it requires to know the position of the last token. If a
1004
- `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
1005
- no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
1006
- padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
1007
- each row of the batch).
1008
  """
1009
  )
1010
  class DogeForSequenceClassification(DogePreTrainedModel):
@@ -1041,9 +1072,9 @@ class DogeForSequenceClassification(DogePreTrainedModel):
1041
  ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
1042
  r"""
1043
  labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1044
- Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
1045
- config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
1046
- `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
1047
  """
1048
  return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1049
 
 
79
  def __init__(self, hidden_size):
80
  super().__init__()
81
  self.weight = nn.Parameter(torch.ones(hidden_size))
82
+
83
  def forward(self, residual_states, hidden_states):
84
  return self.weight * residual_states + hidden_states
85
 
 
92
  super().__init__()
93
  self.rope_kwargs = {}
94
 
95
+ if config.rope_scaling is not None:
96
+ self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
97
  else:
98
+ self.rope_type = "default"
99
  self.max_seq_len_cached = config.max_position_embeddings
100
  self.original_max_seq_len = config.max_position_embeddings
101
  self.base = config.rope_theta
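
The reordered branch above now reads the RoPE variant out of the rope_scaling dict (falling back to the legacy "type" key) instead of assigning the whole dict as the type. With the config.json values from this commit it resolves as follows (illustrative sketch):

# Sketch: rope_type resolution as implemented above, using this repo's rope_scaling values.
rope_scaling = {"rope_type": "dynamic", "factor": 4.0, "original_max_position_embeddings": 2048}
if rope_scaling is not None:
    rope_type = rope_scaling.get("rope_type", rope_scaling.get("type"))
else:
    rope_type = "default"
print(rope_type)  # -> "dynamic"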
 
133
  # core RoPE block
134
  inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
135
  position_ids_expanded = position_ids[:, None, :].float()
136
+ # Force float32 (see https://github.com/huggingface/transformers/pull/29285)
137
  device_type = x.device.type
138
  device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
139
  with torch.autocast(device_type=device_type, enabled=False):
 
142
  cos = emb.cos()
143
  sin = emb.sin()
144
 
145
+ # Advanced RoPE types (e.g. yarn) apply a post-processing scaling factor, equivalent to scaling attention
146
  cos = cos * self.attention_scaling
147
  sin = sin * self.attention_scaling
148
 
 
170
  Deprecated and unused.
171
  unsqueeze_dim (`int`, *optional*, defaults to 1):
172
  The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
173
+ sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k.
174
+ For example, note that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim].
175
+ Then, if q and k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k.
176
+ Similarly, if q and k have the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
 
177
  Returns:
178
  `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
179
  """
 
184
  return q_embed, k_embed
185
 
186
 
187
+ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
188
+ """
189
+ This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep).
190
+ The hidden states go from (batch, num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
191
+ """
192
+ batch, num_key_value_heads, slen, head_dim = hidden_states.shape
193
+ if n_rep == 1:
194
+ return hidden_states
195
+ hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
196
+ return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
197
+
198
+
199
  class DogeDynamicMaskAttention(nn.Module):
200
  """Dynamic Mask Attention from 'Wonderful Matrices' paper."""
201
 
 
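The repeat_kv helper added above is what lets the 2 key/value heads from config.json serve all 4 query heads in the attention forward passes below; a tiny shape sketch with hypothetical tensors:

# Sketch: GQA head expansion via the repeat_kv helper defined in this file.
import torch

kv = torch.randn(1, 2, 6, 128)     # (batch, num_key_value_heads, seq_len, head_dim)
expanded = repeat_kv(kv, n_rep=2)  # -> (1, 4, 6, 128), matching num_attention_heads
print(expanded.shape)
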
206
  self.layer_idx = layer_idx
207
  if layer_idx is None:
208
  logger.warning_once(
209
+ f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will lead to errors during the forward call if caching is used. "
210
+ "Please make sure to provide a `layer_idx` when creating this class."
 
211
  )
212
 
213
  self.hidden_dim = config.hidden_size
214
+ self.num_heads = config.num_attention_heads
215
+ self.head_dim = self.hidden_dim // self.num_heads
216
+ self.num_key_value_heads = config.num_key_value_heads
217
+ self.num_key_value_groups = self.num_heads // self.num_key_value_heads
218
  self.attention_dropout = config.attention_dropout
 
219
 
220
  # Q K V O projections
221
+ self.q_proj = nn.Linear(self.hidden_dim, self.num_heads * self.head_dim, bias=config.hidden_bias)
222
+ self.k_proj = nn.Linear(self.hidden_dim, self.num_key_value_heads * self.head_dim, bias=config.hidden_bias)
223
  # dynamic mask for the QK^T attention score matrix
224
+ self.A = nn.Parameter(torch.ones(self.num_heads))
225
+ self.dt_proj = nn.Linear(self.hidden_dim, self.num_heads, bias=config.hidden_bias)
226
+ self.v_proj = nn.Linear(self.hidden_dim, self.num_key_value_heads * self.head_dim, bias=config.hidden_bias)
227
+ self.o_proj = nn.Linear(self.hidden_dim, self.hidden_dim, bias=config.hidden_bias)
228
 
229
  def forward(
230
  self,
 
242
  key_states = self.k_proj(hidden_states)
243
  value_states = self.v_proj(hidden_states)
244
 
245
+ query_states = query_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
246
+ key_states = key_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
247
+ value_states = value_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
248
 
249
  cos, sin = position_embeddings
250
  query_states, key_states = apply_QK_rotary_pos_emb(query_states, key_states, cos, sin)
 
254
  cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
255
  key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
256
 
257
+ # repeat key and value states
258
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
259
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
260
+
261
  # compute attention scores matrix
262
+ attn_weights = torch.matmul(query_states, key_states.transpose(-1, -2)) / math.sqrt(self.head_dim)
263
 
264
  # add mask to attention scores
265
  if attention_mask is not None:
 
301
  key_states = self.k_proj(hidden_states)
302
  value_states = self.v_proj(hidden_states)
303
 
304
+ query_states = query_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
305
+ key_states = key_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
306
+ value_states = value_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2)
307
 
308
  cos, sin = position_embeddings
309
  query_states, key_states = apply_QK_rotary_pos_emb(query_states, key_states, cos, sin)
 
313
  cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
314
  key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
315
 
316
+ # repeat key and value states
317
+ key_states = repeat_kv(key_states, self.num_key_value_groups)
318
+ value_states = repeat_kv(value_states, self.num_key_value_groups)
319
+
320
+ causal_mask = attention_mask
321
  if attention_mask is not None:
322
  dt_states = self.dt_proj(value_states.transpose(1, 2).reshape(bsz, value_states.shape[-2], -1))
323
  dynamic_mask = torch.exp(self.A * F.softplus(dt_states)).transpose(-1, -2)
 
328
  key_states = key_states.contiguous()
329
  value_states = value_states.contiguous()
330
 
331
+ # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
332
+ is_causal = True if causal_mask is None and q_len > 1 else False
333
+
334
+ # NOTE: As of pytorch 2.5.1, cuDNN's SDPA backward pass is still incorrect, so we disable cuDNN SDPA (see https://github.com/pytorch/pytorch/issues/138581)
335
+ torch.backends.cuda.enable_cudnn_sdp(False)
336
  attn_output = F.scaled_dot_product_attention(
337
  query_states,
338
  key_states,
339
  value_states,
340
  attn_mask=causal_mask,
341
+ dropout_p=self.attention_dropout if self.training else 0.0,
342
+ is_causal=is_causal,
343
  )
344
 
345
  attn_output = attn_output.transpose(1, 2).contiguous()
 
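Two behavioural notes on the SDPA path above: attention dropout is now disabled outside training, and the explicit is_causal flag replaces an inline conditional so torch.compile's dynamic shapes can handle it. A stand-alone sketch of the same call pattern with hypothetical tensors:

# Sketch: the F.scaled_dot_product_attention dispatch used above, in isolation.
import torch
import torch.nn.functional as F

q = torch.randn(1, 4, 6, 128)  # (batch, num_heads, seq_len, head_dim)
k = torch.randn(1, 4, 6, 128)
v = torch.randn(1, 4, 6, 128)

causal_mask = None  # no explicit mask prepared
is_causal = True if causal_mask is None and q.shape[-2] > 1 else False
out = F.scaled_dot_product_attention(q, k, v, attn_mask=causal_mask, dropout_p=0.0, is_causal=is_causal)
print(out.shape)  # torch.Size([1, 4, 6, 128])
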
363
  self.intermediate_dim = config.intermediate_size
364
  self.act_fn = ACT2FN[config.hidden_act]
365
 
366
+ self.gate_proj = nn.Linear(self.hidden_dim, self.intermediate_dim, bias=config.hidden_bias)
367
+ self.up_proj = nn.Linear(self.hidden_dim, self.intermediate_dim, bias=config.hidden_bias)
368
+ self.down_proj = nn.Linear(self.intermediate_dim, self.hidden_dim, bias=config.hidden_bias)
369
 
370
  def forward(
371
  self,
 
391
  self.num_keys = int(math.sqrt(self.num_cdmmoe_experts))
392
 
393
  # queries and keys for retrieval experts
394
+ self.queries = nn.Linear(self.hidden_dim, self.num_cdmmoe_heads * self.expert_retrieval_dim, bias=False)
395
+ self.keys = nn.Parameter(torch.zeros(self.num_cdmmoe_heads, self.num_keys, 2, self.expert_retrieval_dim // 2))
396
 
397
  # experts
398
+ self.down_embed = nn.Embedding(self.num_cdmmoe_experts, self.hidden_dim)
399
+ self.up_embed = nn.Embedding(self.num_cdmmoe_experts, self.hidden_dim)
400
 
401
  def forward(
402
  self,
 
439
  super().__init__()
440
  self.hidden_dropout = config.hidden_dropout
441
 
442
+ self.pre_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
443
+ self.self_attn = DOGE_ATTENTION_CLASSES[config._attn_implementation](config=config, layer_idx=layer_idx)
444
+ self.pre_residual = Residual(config.hidden_size)
445
 
446
+ self.post_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
447
  self.feed_forward = DogeMLP(config) if config.is_moe == False else DogeCDMoE(config)
448
+ self.post_residual = Residual(config.hidden_size)
449
 
450
  def forward(
451
  self,
 
463
  Args:
464
  hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
465
  attention_mask (`torch.FloatTensor`, *optional*):
466
+ attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1, query_sequence_length, key_sequence_length)` if default attention is used.
 
467
  output_attentions (`bool`, *optional*):
468
+ Whether or not to return the attentions tensors of all attention layers.
469
+ See `attentions` under returned tensors for more detail.
470
  use_cache (`bool`, *optional*):
471
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see `past_key_values`).
 
472
  past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
473
  cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
474
  Indices depicting the position of the input sequence tokens in the sequence
475
  position_embeddings (`Tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
476
+ Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`, with `head_dim` being the embedding dimension of each attention head.
 
477
  kwargs (`dict`, *optional*):
478
+ Arbitrary kwargs to be ignored, used for FSDP and other methods that injects code into the model
 
479
  """
480
 
481
  # sequence transformation
482
  residual = hidden_states
483
+ hidden_states = self.pre_layernorm(hidden_states)
484
+ hidden_states, present_key_value = self.self_attn(
485
  hidden_states=hidden_states,
486
  attention_mask=attention_mask,
487
  position_ids=position_ids,
 
492
  )
493
  self_attn_weights = None
494
  hidden_states = F.dropout(hidden_states, p=self.hidden_dropout, training=self.training)
495
+ hidden_states = self.pre_residual(residual, hidden_states)
496
 
497
  # state transformation
498
  residual = hidden_states
499
+ hidden_states = self.post_layernorm(hidden_states)
500
  hidden_states = self.feed_forward(hidden_states)
501
  hidden_states = F.dropout(hidden_states, p=self.hidden_dropout, training=self.training)
502
+ hidden_states = self.post_residual(residual, hidden_states)
503
 
504
  outputs = (hidden_states,)
505
 
 
539
  DOGE_INPUTS_DOCSTRING = r"""
540
  Args:
541
  input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
542
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide it.
 
543
 
544
+ Indices can be obtained using [`AutoTokenizer`].
545
+ See [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details.
546
 
547
  [What are input IDs?](../glossary#input-ids)
548
  attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
 
553
 
554
  [What are attention masks?](../glossary#attention-mask)
555
 
556
+ Indices can be obtained using [`AutoTokenizer`].
557
+ See [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details.
558
 
559
+ If `past_key_values` is used, optionally only the last `input_ids` have to be input (see `past_key_values`).
 
560
 
561
+ If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] and modify to your needs.
562
+ See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more information on the default strategy.
 
563
 
564
  - 1 indicates the head is **not masked**,
565
  - 0 indicates the head is **masked**.
566
  position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
567
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, config.n_positions - 1]`.
 
568
 
569
  [What are position IDs?](../glossary#position-ids)
570
  past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
571
+ Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention blocks) that can be used to speed up sequential decoding.
572
+ This typically consists in the `past_key_values` returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
 
573
 
574
  Two formats are allowed:
575
+ - a [`~cache_utils.Cache`] instance, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache);
576
+ - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy cache format.
577
+
578
+ The model will output the same cache format that is fed as input.
579
+ If no `past_key_values` are passed, the legacy cache format will be returned.
580
+
581
+ If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` of shape `(batch_size, sequence_length)`.
 
 
 
 
 
582
  inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
583
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
584
+ This is useful if you want more control over how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix.
 
585
  use_cache (`bool`, *optional*):
586
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see `past_key_values`).
 
587
  output_attentions (`bool`, *optional*):
588
+ Whether or not to return the attentions tensors of all attention layers.
589
+ See `attentions` under returned tensors for more detail.
590
  output_hidden_states (`bool`, *optional*):
591
+ Whether or not to return the hidden states of all layers.
592
+ See `hidden_states` under returned tensors for more detail.
593
  return_dict (`bool`, *optional*):
594
  Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
595
  cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
596
+ Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`, this tensor is not affected by padding.
597
+ It is used to update the cache in the correct position and to infer the complete sequence length.
 
598
  """
599
 
600
 
 
665
  else:
666
  past_key_values = DynamicCache.from_legacy_cache(past_key_values)
667
  logger.warning_once(
668
+ "We detected that you are passing `past_key_values` as a tuple of tuples."
669
+ "This is deprecated and will be removed in v4.47."
670
+ "Please convert your cache or use an appropriate `Cache` class (https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)"
671
  )
672
 
673
  if cache_position is None:
 
796
  **kwargs,
797
  ):
798
  """
799
+ Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.
 
800
 
801
  Args:
802
  attention_mask (`torch.Tensor`):
803
+ A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape `(batch_size, 1, query_length, key_value_length)`.
 
804
  sequence_length (`int`):
805
  The sequence length being processed.
806
  target_length (`int`):
807
+ The target length: when generating with static cache, the mask should be as long as the static cache, to account for the 0 padding, the part of the cache that is not filled yet.
 
808
  dtype (`torch.dtype`):
809
  The dtype to use for the 4D attention mask.
810
  device (`torch.device`):
 
863
 
864
  def set_output_embeddings(self, new_embeddings):
865
  self.lm_head = new_embeddings
866
+
867
+ def get_decoder(self):
868
+ return self.model
869
 
870
  def set_decoder(self, decoder):
871
  self.model = decoder
872
 
 
 
 
873
  @add_start_docstrings_to_model_forward(DOGE_INPUTS_DOCSTRING)
874
  @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
875
  def forward(
 
891
  r"""
892
  Args:
893
  labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
894
+ Labels for computing the masked language modeling loss.
895
+ Indices should either be in `[0, ..., config.vocab_size]` or -100 (see `input_ids` docstring).
896
+ Tokens with indices set to `-100` are ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
897
 
898
  num_logits_to_keep (`int`, *optional*):
899
+ Calculate logits for the last `num_logits_to_keep` tokens.
900
+ If `0`, calculate logits for all `input_ids` (special case).
901
+ Only last token logits are needed for generation, and calculating them only for that token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
902
 
903
  Returns:
904
  """
 
944
  )
945
 
946
 
947
+ class DogePatchEmbedding(nn.Module):
948
+ """
949
+ This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial `hidden_states` of shape `(batch_size, seq_len, hidden_size)` to be consumed by a Transformer.
950
+ """
951
+
952
+ def __init__(self, config: DogeConfig):
953
+ super().__init__()
954
+
955
+ self.num_channels = config.num_channels
956
+ self.patch_size = config.patch_size
957
+ self.hidden_dim = config.hidden_size
958
+
959
+ self.sequence_proj = nn.Conv2d(self.num_channels, self.hidden_dim, kernel_size=self.patch_size, stride=self.patch_size)
960
+ self.state_proj = nn.Linear(self.hidden_dim, self.hidden_dim, bias=config.hidden_bias)
961
+
962
+ def forward(
963
+ self,
964
+ pixel_values: torch.Tensor,
965
+ ) -> torch.Tensor:
966
+ image_embedding = self.sequence_proj(pixel_values).flatten(2).transpose(1, 2)
967
+ image_embedding = self.state_proj(image_embedding)
968
+ return image_embedding
969
+
970
+
971
+ class DogeForCausalVLM(DogeForCausalLM):
972
+ _tied_weights_keys = ["lm_head.weight"]
973
+
974
+ def __init__(self, config: DogeConfig):
975
+ super().__init__(config)
976
+ self.config = config
977
+ self.pixel_embed = DogePatchEmbedding(config)
978
+
979
+ # Initialize weights and apply final processing
980
+ self.post_init()
981
+
982
+ def forward(
983
+ self,
984
+ input_ids: torch.LongTensor = None,
985
+ pixel_values: torch.FloatTensor = None,
986
+ attention_mask: Optional[torch.Tensor] = None,
987
+ position_ids: Optional[torch.LongTensor] = None,
988
+ past_key_values: Optional[torch.Tensor] = None,
989
+ inputs_embeds: Optional[torch.FloatTensor] = None,
990
+ labels: Optional[torch.LongTensor] = None,
991
+ use_cache: Optional[bool] = None,
992
+ output_attentions: Optional[bool] = None,
993
+ output_hidden_states: Optional[bool] = None,
994
+ return_dict: Optional[bool] = None,
995
+ cache_position: Optional[torch.LongTensor] = None,
996
+ num_logits_to_keep: int = 0,
997
+ **loss_kwargs,
998
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
999
+ # TODO: @wubingheng111: refer to Llava for implementating the forward method
1000
+ ...
1001
+
1002
+ def prepare_inputs_for_generation(
1003
+ self,
1004
+ input_ids=None,
1005
+ pixel_values=None,
1006
+ past_key_values=None,
1007
+ input_embeds=None,
1008
+ attention_mask=None,
1009
+ cache_position=None,
1010
+ num_logits_to_keep=None,
1011
+ **kwargs,
1012
+ ):
1013
+ model_inputs = self.model.prepare_inputs_for_generation(
1014
+ input_ids,
1015
+ past_key_values=past_key_values,
1016
+ inputs_embeds=input_embeds,
1017
+ attention_mask=attention_mask,
1018
+ cache_position=cache_position,
1019
+ num_logits_to_keep=num_logits_to_keep,
1020
+ **kwargs,
1021
+ )
1022
+
1023
+ if cache_position[0] == 0:
1024
+ model_inputs["pixel_values"] = pixel_values
1025
+
1026
+ return model_inputs
1027
+
1028
+
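
The new DogePatchEmbedding / DogeForCausalVLM scaffolding above is what the added num_channels and patch_size config fields feed (the VLM forward is still a TODO in this commit). A shape sketch for the patch embedding, assuming a DogeConfig with num_channels=3, patch_size=16, hidden_size=512 and a hypothetical 224x224 input:

# Sketch only: output shape of DogePatchEmbedding for a 224x224 RGB image.
import torch

patch_embed = DogePatchEmbedding(cfg)       # cfg: the DogeConfig sketched earlier
pixel_values = torch.randn(1, 3, 224, 224)
tokens = patch_embed(pixel_values)
print(tokens.shape)  # torch.Size([1, 196, 512]); (224 / 16) ** 2 = 196 patches
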
1029
  @add_start_docstrings(
1030
  """
1031
  The Doge Model transformer with a sequence classification head on top (linear layer).
1032
 
1033
+ [`DogeForSequenceClassification`] uses the last token in order to do the classification, as other causal models (e.g. GPT-2) do.
 
1034
 
1035
+ Since it does classification on the last token, it requires to know the position of the last token.
1036
+ If a `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row.
1037
+ If no `pad_token_id` is defined, it simply takes the last value in each row of the batch.
1038
+ Since it cannot guess the padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in each row of the batch).
 
1039
  """
1040
  )
1041
  class DogeForSequenceClassification(DogePreTrainedModel):
 
1072
  ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
1073
  r"""
1074
  labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1075
+ Labels for computing the sequence classification/regression loss.
1076
+ Indices should be in `[0, ..., config.num_labels - 1]`.
1077
+ If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
1078
  """
1079
  return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1080