IvanHU committed on
Commit 13226fe · 1 Parent(s): fa6ee02

Upload correct optimizer states

This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50)
  1. configuration_yulanmini.py +310 -0
  2. global_step243198_universal/mp_rank_00_model_states.pt +3 -0
  3. training_args.bin → global_step243198_universal/zero/lm_head_alpha/exp_avg.pt +2 -2
  4. global_step243198_universal/zero/lm_head_alpha/exp_avg_sq.pt +3 -0
  5. global_step243198_universal/zero/lm_head_alpha/fp32.pt +3 -0
  6. global_step243198_universal/zero/lm_head_alpha/step.pt +3 -0
  7. global_step243198_universal/zero/model.embed_tokens.weight/exp_avg.pt +3 -0
  8. global_step243198_universal/zero/model.embed_tokens.weight/exp_avg_sq.pt +3 -0
  9. global_step243198_universal/zero/model.embed_tokens.weight/fp32.pt +3 -0
  10. global_step243198_universal/zero/model.embed_tokens.weight/step.pt +3 -0
  11. global_step243198_universal/zero/model.layers.0.down_proj_alpha/exp_avg.pt +3 -0
  12. global_step243198_universal/zero/model.layers.0.down_proj_alpha/exp_avg_sq.pt +3 -0
  13. global_step243198_universal/zero/model.layers.0.down_proj_alpha/fp32.pt +3 -0
  14. global_step243198_universal/zero/model.layers.0.down_proj_alpha/step.pt +3 -0
  15. global_step243198_universal/zero/model.layers.0.gate_up_proj_alpha/exp_avg.pt +3 -0
  16. global_step243198_universal/zero/model.layers.0.gate_up_proj_alpha/exp_avg_sq.pt +3 -0
  17. global_step243198_universal/zero/model.layers.0.gate_up_proj_alpha/fp32.pt +3 -0
  18. global_step243198_universal/zero/model.layers.0.gate_up_proj_alpha/step.pt +3 -0
  19. global_step243198_universal/zero/model.layers.0.input_layernorm.weight/exp_avg.pt +3 -0
  20. global_step243198_universal/zero/model.layers.0.input_layernorm.weight/exp_avg_sq.pt +3 -0
  21. global_step243198_universal/zero/model.layers.0.input_layernorm.weight/fp32.pt +3 -0
  22. global_step243198_universal/zero/model.layers.0.input_layernorm.weight/step.pt +3 -0
  23. global_step243198_universal/zero/model.layers.0.input_layernorm_alpha/exp_avg.pt +3 -0
  24. global_step243198_universal/zero/model.layers.0.input_layernorm_alpha/exp_avg_sq.pt +3 -0
  25. global_step243198_universal/zero/model.layers.0.input_layernorm_alpha/fp32.pt +3 -0
  26. global_step243198_universal/zero/model.layers.0.input_layernorm_alpha/step.pt +3 -0
  27. global_step243198_universal/zero/model.layers.0.mlp.down_proj.weight/exp_avg.pt +3 -0
  28. global_step243198_universal/zero/model.layers.0.mlp.down_proj.weight/exp_avg_sq.pt +3 -0
  29. global_step243198_universal/zero/model.layers.0.mlp.down_proj.weight/fp32.pt +3 -0
  30. global_step243198_universal/zero/model.layers.0.mlp.down_proj.weight/step.pt +3 -0
  31. global_step243198_universal/zero/model.layers.0.mlp.gate_proj.weight/exp_avg.pt +3 -0
  32. global_step243198_universal/zero/model.layers.0.mlp.gate_proj.weight/exp_avg_sq.pt +3 -0
  33. global_step243198_universal/zero/model.layers.0.mlp.gate_proj.weight/fp32.pt +3 -0
  34. global_step243198_universal/zero/model.layers.0.mlp.gate_proj.weight/step.pt +3 -0
  35. global_step243198_universal/zero/model.layers.0.mlp.up_proj.weight/exp_avg.pt +3 -0
  36. global_step243198_universal/zero/model.layers.0.mlp.up_proj.weight/exp_avg_sq.pt +3 -0
  37. global_step243198_universal/zero/model.layers.0.mlp.up_proj.weight/fp32.pt +3 -0
  38. global_step243198_universal/zero/model.layers.0.mlp.up_proj.weight/step.pt +3 -0
  39. global_step243198_universal/zero/model.layers.0.post_attention_layernorm.weight/exp_avg.pt +3 -0
  40. global_step243198_universal/zero/model.layers.0.post_attention_layernorm.weight/exp_avg_sq.pt +3 -0
  41. global_step243198_universal/zero/model.layers.0.post_attention_layernorm.weight/fp32.pt +3 -0
  42. global_step243198_universal/zero/model.layers.0.post_attention_layernorm.weight/step.pt +3 -0
  43. global_step243198_universal/zero/model.layers.0.post_attention_layernorm_alpha/exp_avg.pt +3 -0
  44. global_step243198_universal/zero/model.layers.0.post_attention_layernorm_alpha/exp_avg_sq.pt +3 -0
  45. global_step243198_universal/zero/model.layers.0.post_attention_layernorm_alpha/fp32.pt +3 -0
  46. global_step243198_universal/zero/model.layers.0.post_attention_layernorm_alpha/step.pt +3 -0
  47. global_step243198_universal/zero/model.layers.0.self_attn.k_proj.bias/exp_avg.pt +3 -0
  48. global_step243198_universal/zero/model.layers.0.self_attn.k_proj.bias/exp_avg_sq.pt +3 -0
  49. global_step243198_universal/zero/model.layers.0.self_attn.k_proj.bias/fp32.pt +3 -0
  50. global_step243198_universal/zero/model.layers.0.self_attn.k_proj.bias/step.pt +3 -0
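The `global_step243198_universal/` directory listed above follows DeepSpeed's "universal checkpoint" layout: one sub-directory per parameter under `zero/`, each holding fp32 master weights (`fp32.pt`), the Adam moments (`exp_avg.pt`, `exp_avg_sq.pt`), and a `step.pt` counter. The sketch below is not part of the commit; it only illustrates how one might inspect a single shard, assuming the LFS objects have been pulled and that each `.pt` file deserializes with plain `torch.load`.

```python
# Hypothetical inspection snippet (not part of this repository).
from pathlib import Path

import torch

shard = Path("global_step243198_universal/zero/model.embed_tokens.weight")

for name in ("fp32", "exp_avg", "exp_avg_sq", "step"):
    obj = torch.load(shard / f"{name}.pt", map_location="cpu")
    if torch.is_tensor(obj):
        print(name, tuple(obj.shape), obj.dtype)
    else:
        # step.pt (and possibly others) may hold a scalar or a small dict instead.
        print(name, type(obj).__name__, obj)
```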
configuration_yulanmini.py ADDED
@@ -0,0 +1,310 @@
1
+ # coding=utf-8
2
+ # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
5
+ # and OPT implementations in this library. It has been modified from its
6
+ # original forms to accommodate minor architectural differences compared
7
+ # to GPT-NeoX and OPT used by the Meta AI team that trained the model.
8
+ #
9
+ # Licensed under the Apache License, Version 2.0 (the "License");
10
+ # you may not use this file except in compliance with the License.
11
+ # You may obtain a copy of the License at
12
+ #
13
+ # http://www.apache.org/licenses/LICENSE-2.0
14
+ #
15
+ # Unless required by applicable law or agreed to in writing, software
16
+ # distributed under the License is distributed on an "AS IS" BASIS,
17
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18
+ # See the License for the specific language governing permissions and
19
+ # limitations under the License.
20
+ """ YuLanMinimodel configuration"""
21
+
22
+ import math
23
+
24
+ from transformers.configuration_utils import PretrainedConfig
25
+ from transformers.utils import logging
26
+
27
+ logger = logging.get_logger(__name__)
28
+
29
+ YULANMINI_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
30
+
31
+
32
+ class YuLanMiniConfig(PretrainedConfig):
33
+ r"""
34
+ This is the configuration class to store the configuration of a [`YuLanMiniModel`]. It is used to instantiate a YuLanMini
35
+ model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
36
+ defaults will yield a similar configuration to that of the YuLanMini model.
37
+
38
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
39
+ documentation from [`PretrainedConfig`] for more information.
40
+
41
+
42
+ Args:
43
+ vocab_size (`int`, *optional*, defaults to 99000):
44
+ Vocabulary size of the YuLanMini model. Defines the number of different tokens that can be represented by the
45
+ `inputs_ids` passed when calling [`YuLanMiniModel`]
46
+ hidden_size (`int`, *optional*, defaults to 1920):
47
+ Dimension of the hidden representations.
48
+ intermediate_size (`int`, *optional*, defaults to 4800):
49
+ Dimension of the MLP representations.
50
+ num_hidden_layers (`int`, *optional*, defaults to 56):
51
+ Number of hidden layers in the Transformer decoder.
52
+ num_attention_heads (`int`, *optional*, defaults to 30):
53
+ Number of attention heads for each attention layer in the Transformer decoder.
54
+ num_key_value_heads (`int`, *optional*):
55
+ This is the number of key_value heads that should be used to implement Grouped Query Attention. If
56
+ `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
57
+ `num_key_value_heads=1` the model will use Multi Query Attention (MQA), otherwise GQA is used. When
58
+ converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
59
+ by meanpooling all the original heads within that group. For more details checkout [this
60
+ paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
61
+ `num_attention_heads`.
62
+ hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
63
+ The non-linear activation function (function or string) in the decoder.
64
+ max_position_embeddings (`int`, *optional*, defaults to 4096):
65
+ The maximum sequence length that this model might ever be used with (4096 in the default
66
+ configuration).
67
+ initializer_range (`float`, *optional*, defaults to 0.02):
68
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
69
+ rms_norm_eps (`float`, *optional*, defaults to 1e-06):
70
+ The epsilon used by the rms normalization layers.
71
+ use_cache (`bool`, *optional*, defaults to `True`):
72
+ Whether or not the model should return the last key/values attentions (not used by all models). Only
73
+ relevant if `config.is_decoder=True`.
74
+ pad_token_id (`int`, *optional*):
75
+ Padding token id.
76
+ bos_token_id (`int`, *optional*, defaults to 1):
77
+ Beginning of stream token id.
78
+ eos_token_id (`int`, *optional*, defaults to 2):
79
+ End of stream token id.
80
+ pretraining_tp (`int`, *optional*, defaults to 1):
81
+ Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this
82
+ document](https://huggingface.co/docs/transformers/parallelism) to understand more about it. This value is
83
+ necessary to ensure exact reproducibility of the pretraining results. Please refer to [this
84
+ issue](https://github.com/pytorch/pytorch/issues/76232).
85
+ tie_word_embeddings (`bool`, *optional*, defaults to `False`):
86
+ Whether to tie weight embeddings
87
+ rope_theta (`float`, *optional*, defaults to 10000.0):
88
+ The base period of the RoPE embeddings.
89
+ rope_scaling (`Dict`, *optional*):
90
+ Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
91
+ strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is
92
+ `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
93
+ `max_position_embeddings` to the expected new maximum. See the following thread for more information on how
94
+ these scaling strategies behave:
95
+ https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an
96
+ experimental feature, subject to breaking API changes in future versions.
97
+ attention_bias (`bool`, *optional*, defaults to `True`):
98
+ Whether to use a bias in the query, key, value and output projection layers during self-attention.
99
+ attention_dropout (`float`, *optional*, defaults to 0.0):
100
+ The dropout ratio for the attention probabilities.
101
+
102
+ ```python
103
+ >>> from transformers import YuLanMiniModel, YuLanMiniConfig
104
+
105
+ >>> # Initializing a YuLanMini style configuration
106
+ >>> configuration = YuLanMiniConfig()
107
+
108
+ >>> # Initializing a model from the YuLanMini style configuration
109
+ >>> model = YuLanMiniModel(configuration)
110
+
111
+ >>> # Accessing the model configuration
112
+ >>> configuration = model.config
113
+ ```"""
114
+
115
+ model_type = "yulanmini"
116
+ keys_to_ignore_at_inference = ["past_key_values"]
117
+
118
+ def __init__(
119
+ self,
120
+ vocab_size=99000,
121
+ hidden_size=1920,
122
+ intermediate_size=4800,
123
+ num_hidden_layers=56,
124
+ num_attention_heads=30,
125
+ num_key_value_heads=6,
126
+
127
+ # Less frequently changed options
128
+ hidden_act="silu",
129
+ max_position_embeddings=4096,
130
+ rms_norm_eps=1e-6,
131
+ use_cache=True,
132
+ pad_token_id=None, # /home/u20140041/pretrain-mini/preprocess/modify_tokenizer/1731
133
+ bos_token_id=1,
134
+ eos_token_id=2,
135
+ tie_word_embeddings=False,
136
+ rope_theta=10000.0,
137
+ use_sliding_window=False,
138
+ sliding_window=4096,
139
+ rope_scaling=None,
140
+ attention_bias=True, # qwen
141
+ attention_dropout=0.0,
142
+ # Scaling for the embedding gradient
143
+ shrink_alpha=1,
144
+ shrink_alpha2=1,
145
+ use_liger=False,
146
+ # Initialization
147
+ initializer_range=0.014434,
148
+ init_scale_o=10.582218,
149
+ model_reproduce="transformer",
150
+ # The parameters below are for muParam; the defaults must correspond to not using any muParam scaling
151
+ hidden_states_shrink=1,
152
+ dim_model_base=None,
153
+ dim_ffn_base_init=None,  # no longer used in the new muParam version
154
+ dim_model_base_init=None,
155
+ dim_model_base_attn=None,
156
+ dim_model_base_lmh=None,
157
+ dim_model_base_logits=None,
158
+ dim_model_base_lr=None,
159
+ scale_emb=1,
160
+ # qk_layernorm
161
+ qk_layernorm=False,
162
+ layer_norm_eps=1e-6,
163
+ embedding_ln=False,
164
+ embedding_rmsln=False,
165
+ ln_scale=1.,
166
+ z_loss=0.0001,
167
+ # wesar
168
+ wesar_weights=True,
169
+ embed_tokens_alpha=1,
170
+ q_proj_alpha=1,
171
+ k_proj_alpha=1,
172
+ v_proj_alpha=1,
173
+ o_proj_alpha=1,
174
+ down_proj_alpha=1,
175
+ gate_up_proj_alpha=1,
176
+ input_layernorm_alpha=1,
177
+ post_attention_layernorm_alpha=1,
178
+ norm_alpha=1,
179
+ lm_head_alpha=1,
180
+ use_norm_alpha=True,
181
+ use_emb_alpha=False,
182
+ rms_type="llama",
183
+ num_steps_trained_before_this_epoch=0,
184
+ num_epochs_trained_before_this_epoch=0,
185
+ # Speedup
186
+ gradient_checkpointing_step=7,
187
+ **kwargs,
188
+ ):
189
+ # Training state, updated once per epoch and constant within an epoch. For example, while training on the 4th pass over the data, these two values hold the last step of the 3rd pass (epochs=3, steps=xxx); they keep those values throughout the 4th pass regardless of the step. Whether they are updated is controlled by update_trained_steps_and_epochs.
190
+ self.num_steps_trained_before_this_epoch = num_steps_trained_before_this_epoch
191
+ self.num_epochs_trained_before_this_epoch = num_epochs_trained_before_this_epoch
192
+
193
+ self.vocab_size = vocab_size
194
+ self.max_position_embeddings = max_position_embeddings
195
+ self.hidden_size = hidden_size
196
+ self.intermediate_size = intermediate_size
197
+ self.num_hidden_layers = num_hidden_layers
198
+ self.num_attention_heads = num_attention_heads
199
+ self.use_sliding_window = use_sliding_window
200
+ self.sliding_window = sliding_window if use_sliding_window else None
201
+
202
+ # for backward compatibility
203
+ if num_key_value_heads is None:
204
+ num_key_value_heads = num_attention_heads
205
+
206
+ self.num_key_value_heads = num_key_value_heads
207
+ self.hidden_act = hidden_act
208
+ self.initializer_range = initializer_range
209
+ self.rms_norm_eps = rms_norm_eps
210
+ self.use_cache = use_cache
211
+ self.rope_theta = rope_theta
212
+ self.rope_scaling = rope_scaling
213
+ self._rope_scaling_validation()
214
+ self.attention_bias = attention_bias
215
+ self.attention_dropout = attention_dropout
216
+ self.shrink_alpha = shrink_alpha
217
+ self.use_liger = use_liger
218
+ self.init_scale_o = init_scale_o
219
+ self.hidden_states_shrink = 1 / math.sqrt(num_hidden_layers) if hidden_states_shrink == "muparam" else hidden_states_shrink
220
+ self.dim_model_base = dim_model_base if dim_model_base is not None else hidden_size
221
+ self.dim_model_base_init = dim_model_base_init
222
+ self.dim_model_base_attn = dim_model_base_attn if dim_model_base_attn is not None else (hidden_size // num_attention_heads) # if set to 1, 1/head_dim is used
223
+ self.dim_model_base_lmh = dim_model_base_lmh if dim_model_base_lmh is not None else 1 # if set to 1, the lm_head init is not rescaled
224
+ self.scale_emb = scale_emb if scale_emb is not None else 1
225
+ self.model_reproduce = model_reproduce if model_reproduce is not None else "transformer"
226
+ self.dim_model_base_logits = dim_model_base_logits if dim_model_base_logits is not None else hidden_size
227
+ self.dim_model_base_lr = dim_model_base_lr if dim_model_base_lr is not None else hidden_size
228
+
229
+ self.qk_layernorm = qk_layernorm
230
+ self.layer_norm_eps = layer_norm_eps
231
+ self.embedding_ln = embedding_ln
232
+ self.embedding_rmsln = embedding_rmsln
233
+ self.ln_scale = ln_scale
234
+ self.z_loss = z_loss
235
+
236
+ if embedding_ln and embedding_rmsln:
237
+ raise ValueError("Only one of embedding_ln and embedding_rmsln should be True")
238
+
239
+ self.wesar_weights = wesar_weights
240
+ self.embed_tokens_alpha = embed_tokens_alpha
241
+ self.q_proj_alpha = q_proj_alpha
242
+ self.k_proj_alpha = k_proj_alpha
243
+ self.v_proj_alpha = v_proj_alpha
244
+ self.o_proj_alpha = o_proj_alpha
245
+ self.down_proj_alpha = down_proj_alpha
246
+ self.gate_up_proj_alpha = gate_up_proj_alpha
247
+ self.input_layernorm_alpha = input_layernorm_alpha
248
+ self.post_attention_layernorm_alpha = post_attention_layernorm_alpha
249
+ self.norm_alpha = norm_alpha
250
+ self.lm_head_alpha = lm_head_alpha
251
+ self.use_norm_alpha = use_norm_alpha
252
+ self.use_emb_alpha = use_emb_alpha
253
+ self.rms_type = rms_type
254
+
255
+ self.gradient_checkpointing_step = gradient_checkpointing_step
256
+
257
+ if self.dim_model_base != hidden_size or self.dim_model_base_init is not None or self.dim_model_base_attn != (hidden_size // num_attention_heads) or self.dim_model_base_lmh != 1:
258
+ if init_scale_o != 1:
259
+ raise ValueError("When using muparam, init_scale_o should be 1")
260
+
261
+ # multiplier
262
+ print("Attention放缩:", math.sqrt(self.dim_model_base_attn) / (hidden_size // num_attention_heads))
263
+ print("Residual链接处的Hidden States放缩:", hidden_states_shrink)
264
+ print("Logits放缩:", 1 / (hidden_size / self.dim_model_base))
265
+
266
+ # initializer
267
+ if dim_model_base_init is not None:
268
+ print("o_proj,down_proj初始化STD:", initializer_range / math.sqrt(2 * (hidden_size / dim_model_base_init) * num_hidden_layers))
269
+ print("gate_proj,up_proj,q_proj,k_proj,v_proj初始化STD:", initializer_range / math.sqrt(self.hidden_size / self.dim_model_base_init))
270
+ else:
271
+ print("o_proj,down_proj初始化STD:", initializer_range / init_scale_o)
272
+ print("gate_proj,up_proj,q_proj,k_proj,v_proj初始化STD:", initializer_range)
273
+ print("lm_head初始化STD:", initializer_range / math.sqrt(self.dim_model_base_lmh))
274
+
275
+ if not tie_word_embeddings and self.scale_emb != 1:
276
+ raise ValueError("When using scale_emb, tie_word_embeddings should be False")
277
+
278
+ super().__init__(
279
+ pad_token_id=pad_token_id,
280
+ bos_token_id=bos_token_id,
281
+ eos_token_id=eos_token_id,
282
+ tie_word_embeddings=tie_word_embeddings,
283
+ **kwargs,
284
+ )
285
+ try:
286
+ import flash_attn
287
+ self._attn_implementation = "flash_attention_2"
288
+ except ImportError:
289
+ pass
290
+
291
+ def _rope_scaling_validation(self):
292
+ """
293
+ Validate the `rope_scaling` configuration.
294
+ """
295
+ if self.rope_scaling is None:
296
+ return
297
+
298
+ if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
299
+ raise ValueError(
300
+ "`rope_scaling` must be a dictionary with with two fields, `type` and `factor`, "
301
+ f"got {self.rope_scaling}"
302
+ )
303
+ rope_scaling_type = self.rope_scaling.get("type", None)
304
+ rope_scaling_factor = self.rope_scaling.get("factor", None)
305
+ if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]:
306
+ raise ValueError(
307
+ f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
308
+ )
309
+ if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0:
310
+ raise ValueError(f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}")
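As a quick sanity check of the class above, the following sketch (not part of the commit; it assumes `configuration_yulanmini.py` is importable from the working directory) constructs the default configuration and one with RoPE scaling, exercising `_rope_scaling_validation`:

```python
from configuration_yulanmini import YuLanMiniConfig

# Defaults as defined in __init__ above (hidden_size=1920, 56 layers, vocab 99000, ...).
config = YuLanMiniConfig()

# rope_scaling must be a two-field dict {"type", "factor"} with a float factor > 1;
# anything else raises a ValueError in _rope_scaling_validation.
scaled = YuLanMiniConfig(rope_scaling={"type": "linear", "factor": 2.0})
print(config.max_position_embeddings, scaled.rope_scaling)
```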
global_step243198_universal/mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:db6985e1ae1d922aeb69f6ee9e4f3622bb7cc6bc39c84b4e7fb205bf424ffd16
3
+ size 4468641136
training_args.bin → global_step243198_universal/zero/lm_head_alpha/exp_avg.pt RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:27a990c962875cc639fbd23de403192229520a08145bf10c4a0d1cb426e63ccb
3
- size 10872
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5eb07a5a76a984fd7c285015c8133863bc36be788dbcabbb37dadc5ee39daf25
3
+ size 1180
global_step243198_universal/zero/lm_head_alpha/exp_avg_sq.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d7922757b2cd71f4c6f9e8f22d5da4455206f1e8235d399154cba99a25aa10f7
3
+ size 1195
global_step243198_universal/zero/lm_head_alpha/fp32.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c20c4ae233bac39d7f1c734eecf1ecfd49ecb24d4456e841c3277defee67c0cc
3
+ size 1165
global_step243198_universal/zero/lm_head_alpha/step.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b40343668ce95ee7677821672b43b1c1c43ed7c6b638a4da642b389ad816ced6
3
+ size 852
global_step243198_universal/zero/model.embed_tokens.weight/exp_avg.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:66b366212fbe90c8a0f71848be5713b32b1b4afaa4ee7daebbb3a0267d4b34f3
3
+ size 760321244
global_step243198_universal/zero/model.embed_tokens.weight/exp_avg_sq.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aaa812bc06938dc09913fd4af9b54fe35d5f43b06d58dd4769decf72b11bba34
3
+ size 760321259
global_step243198_universal/zero/model.embed_tokens.weight/fp32.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1dbfb5af6c84319b82b3bbb36115d30c429b49ff2bf73d021b18bcddf9d68d1c
3
+ size 760321165
global_step243198_universal/zero/model.embed_tokens.weight/step.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b40343668ce95ee7677821672b43b1c1c43ed7c6b638a4da642b389ad816ced6
3
+ size 852
global_step243198_universal/zero/model.layers.0.down_proj_alpha/exp_avg.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7ae4c645770e222fb1f0f6523d5562f36611152c7f98c328ca9466acd40d1365
3
+ size 1180
global_step243198_universal/zero/model.layers.0.down_proj_alpha/exp_avg_sq.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b354c1435a7f3425ab4e830d05db3cf4869e7bfc74b507c8e7116e493ec53c83
3
+ size 1195
global_step243198_universal/zero/model.layers.0.down_proj_alpha/fp32.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6bcab85cc4015d87092df58f21e7cd1e742e9bb8206c231a56e1d5d4619f9183
3
+ size 1165
global_step243198_universal/zero/model.layers.0.down_proj_alpha/step.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b40343668ce95ee7677821672b43b1c1c43ed7c6b638a4da642b389ad816ced6
3
+ size 852
global_step243198_universal/zero/model.layers.0.gate_up_proj_alpha/exp_avg.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:65b57c152cbb08ce685446051cd06ff9f1e2d5bd6887714cdf660cf9a3de017d
3
+ size 1180
global_step243198_universal/zero/model.layers.0.gate_up_proj_alpha/exp_avg_sq.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:565662ac6efe6c7cef53e77bc4e477ec5ec939596624953fb13a1010eee355d7
3
+ size 1195
global_step243198_universal/zero/model.layers.0.gate_up_proj_alpha/fp32.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4df6e1db7f668d5fda482d2bfce264b78ddb7011e260994070a31d775aef2e5a
3
+ size 1165
global_step243198_universal/zero/model.layers.0.gate_up_proj_alpha/step.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b40343668ce95ee7677821672b43b1c1c43ed7c6b638a4da642b389ad816ced6
3
+ size 852
global_step243198_universal/zero/model.layers.0.input_layernorm.weight/exp_avg.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:56437f32584526c52e3e64cd6f8178abd65ae00f23d92cda8b270703a5303546
3
+ size 8860
global_step243198_universal/zero/model.layers.0.input_layernorm.weight/exp_avg_sq.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9ff19e5a59d8191c86ea05d045183068e57d96342047845828d558e9ceb15620
3
+ size 8875
global_step243198_universal/zero/model.layers.0.input_layernorm.weight/fp32.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2dda538f899b574fb54055ea16478fe46b19cf517d36da128993f92d05994a61
3
+ size 8781
global_step243198_universal/zero/model.layers.0.input_layernorm.weight/step.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b40343668ce95ee7677821672b43b1c1c43ed7c6b638a4da642b389ad816ced6
3
+ size 852
global_step243198_universal/zero/model.layers.0.input_layernorm_alpha/exp_avg.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bc159c005aacaf434ab98d149cf0285e5649a99db6aa92744b337b142b7dca34
3
+ size 1180
global_step243198_universal/zero/model.layers.0.input_layernorm_alpha/exp_avg_sq.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:21322db8b68329175d238406561cf4d5bb5ca42ee4d7aa6b413bf1e1e39d1bf7
3
+ size 1195
global_step243198_universal/zero/model.layers.0.input_layernorm_alpha/fp32.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b9cf7b71ff8af16fd86bf824579093669cd08ca7647bdf58950c745f955dddb3
3
+ size 1165
global_step243198_universal/zero/model.layers.0.input_layernorm_alpha/step.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b40343668ce95ee7677821672b43b1c1c43ed7c6b638a4da642b389ad816ced6
3
+ size 852
global_step243198_universal/zero/model.layers.0.mlp.down_proj.weight/exp_avg.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6a5a995a8f87848019493c2b98cef7baef0185dff5eb1127eff94fb97605e59f
3
+ size 36865244
global_step243198_universal/zero/model.layers.0.mlp.down_proj.weight/exp_avg_sq.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:af930bef14ae385ae5232c116600d1e943d8bafd585a1c47cff167613c7d9aa8
3
+ size 36865259
global_step243198_universal/zero/model.layers.0.mlp.down_proj.weight/fp32.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4b4fe272c9c3469c50e587a2cdba134c8853de0b8fbd7d434bbc32262e06579c
3
+ size 36865165
global_step243198_universal/zero/model.layers.0.mlp.down_proj.weight/step.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b40343668ce95ee7677821672b43b1c1c43ed7c6b638a4da642b389ad816ced6
3
+ size 852
global_step243198_universal/zero/model.layers.0.mlp.gate_proj.weight/exp_avg.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:09f5e7e4b6f4f76d9905221fdcb37126d77b4a81c97adab024b614eb53897f96
3
+ size 36865244
global_step243198_universal/zero/model.layers.0.mlp.gate_proj.weight/exp_avg_sq.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:abb476f2a4f0a97f32014ac80c232a5af2af99e3406d47240a992501f1c8008c
3
+ size 36865259
global_step243198_universal/zero/model.layers.0.mlp.gate_proj.weight/fp32.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:32edb29fb545e7a6e65df5ce13f9ce82765a7acc34973fb7371e7a42f20aecd2
3
+ size 36865165
global_step243198_universal/zero/model.layers.0.mlp.gate_proj.weight/step.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b40343668ce95ee7677821672b43b1c1c43ed7c6b638a4da642b389ad816ced6
3
+ size 852
global_step243198_universal/zero/model.layers.0.mlp.up_proj.weight/exp_avg.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:66908726e6ea3a2a9becc5d89c6f6112ff523f2a284b8a95b148bc5653677830
3
+ size 36865244
global_step243198_universal/zero/model.layers.0.mlp.up_proj.weight/exp_avg_sq.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b5a0bb8ff896f33ac32fda4a0172d34b2f50fced9dc23d3ca29fc001f08419c6
3
+ size 36865259
global_step243198_universal/zero/model.layers.0.mlp.up_proj.weight/fp32.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e5b81eaf4bcc3879df3bc692d9e4652ad9ace3a7c80a197d7f949645d7ca314f
3
+ size 36865165
global_step243198_universal/zero/model.layers.0.mlp.up_proj.weight/step.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b40343668ce95ee7677821672b43b1c1c43ed7c6b638a4da642b389ad816ced6
3
+ size 852
global_step243198_universal/zero/model.layers.0.post_attention_layernorm.weight/exp_avg.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:403047234e9fc921ecb9d0e4ac97e2cc169c94727c4370996e3d13ca0965d6ef
3
+ size 8860
global_step243198_universal/zero/model.layers.0.post_attention_layernorm.weight/exp_avg_sq.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a4346ce7447f73b7134e4156814493c77496ad05d62c13a5ae068b6f58f1c51f
3
+ size 8875
global_step243198_universal/zero/model.layers.0.post_attention_layernorm.weight/fp32.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9ded81e4d339d6394f1a69aff90f9a5c2e4c745252badb57964f99e95c0e3d0c
3
+ size 8781
global_step243198_universal/zero/model.layers.0.post_attention_layernorm.weight/step.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b40343668ce95ee7677821672b43b1c1c43ed7c6b638a4da642b389ad816ced6
3
+ size 852
global_step243198_universal/zero/model.layers.0.post_attention_layernorm_alpha/exp_avg.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:106125c7121823feebee36822359de0acda248d8a33f6aeb742ee49f69b242ec
3
+ size 1180
global_step243198_universal/zero/model.layers.0.post_attention_layernorm_alpha/exp_avg_sq.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9f73b5e2b79fc60328eda519f0e0d55a740ad0b26f74ac77fada9cbbc4ed2273
3
+ size 1195
global_step243198_universal/zero/model.layers.0.post_attention_layernorm_alpha/fp32.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:06b155f15ab3e8a66e65a5abede44dc417fa2413777cf97d9bc5149c19be7639
3
+ size 1165
global_step243198_universal/zero/model.layers.0.post_attention_layernorm_alpha/step.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b40343668ce95ee7677821672b43b1c1c43ed7c6b638a4da642b389ad816ced6
3
+ size 852
global_step243198_universal/zero/model.layers.0.self_attn.k_proj.bias/exp_avg.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2b1b92c6e5d2e9bb51a8189a62df50f1840210bf7a9e74bcbd4eae1b17ce837c
3
+ size 2716
global_step243198_universal/zero/model.layers.0.self_attn.k_proj.bias/exp_avg_sq.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b79f7644943018df3c1a82fb2e7269bf2a648d92325561a64d9a26fbed095003
3
+ size 2731
global_step243198_universal/zero/model.layers.0.self_attn.k_proj.bias/fp32.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:303c3b60f63ca63af702d730e4f122d49e2a1e4aa34b9759205e90d76cf4abc0
3
+ size 2637
global_step243198_universal/zero/model.layers.0.self_attn.k_proj.bias/step.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b40343668ce95ee7677821672b43b1c1c43ed7c6b638a4da642b389ad816ced6
3
+ size 852