Upload deepspeed checkpoint
Browse files
This view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +1 -0
- config.json +69 -0
- global_step262772_universal/mp_rank_00_model_states.pt +3 -0
- global_step262772_universal/zero/lm_head_alpha/exp_avg.pt +3 -0
- global_step262772_universal/zero/lm_head_alpha/exp_avg_sq.pt +3 -0
- global_step262772_universal/zero/lm_head_alpha/fp32.pt +3 -0
- global_step262772_universal/zero/lm_head_alpha/step.pt +3 -0
- global_step262772_universal/zero/model.embed_tokens.weight/exp_avg.pt +3 -0
- global_step262772_universal/zero/model.embed_tokens.weight/exp_avg_sq.pt +3 -0
- global_step262772_universal/zero/model.embed_tokens.weight/fp32.pt +3 -0
- global_step262772_universal/zero/model.embed_tokens.weight/step.pt +3 -0
- global_step262772_universal/zero/model.layers.0.down_proj_alpha/exp_avg.pt +3 -0
- global_step262772_universal/zero/model.layers.0.down_proj_alpha/exp_avg_sq.pt +3 -0
- global_step262772_universal/zero/model.layers.0.down_proj_alpha/fp32.pt +3 -0
- global_step262772_universal/zero/model.layers.0.down_proj_alpha/step.pt +3 -0
- global_step262772_universal/zero/model.layers.0.gate_up_proj_alpha/exp_avg.pt +3 -0
- global_step262772_universal/zero/model.layers.0.gate_up_proj_alpha/exp_avg_sq.pt +3 -0
- global_step262772_universal/zero/model.layers.0.gate_up_proj_alpha/fp32.pt +3 -0
- global_step262772_universal/zero/model.layers.0.gate_up_proj_alpha/step.pt +3 -0
- global_step262772_universal/zero/model.layers.0.input_layernorm.weight/exp_avg.pt +3 -0
- global_step262772_universal/zero/model.layers.0.input_layernorm.weight/exp_avg_sq.pt +3 -0
- global_step262772_universal/zero/model.layers.0.input_layernorm.weight/fp32.pt +3 -0
- global_step262772_universal/zero/model.layers.0.input_layernorm.weight/step.pt +3 -0
- global_step262772_universal/zero/model.layers.0.input_layernorm_alpha/exp_avg.pt +3 -0
- global_step262772_universal/zero/model.layers.0.input_layernorm_alpha/exp_avg_sq.pt +3 -0
- global_step262772_universal/zero/model.layers.0.input_layernorm_alpha/fp32.pt +3 -0
- global_step262772_universal/zero/model.layers.0.input_layernorm_alpha/step.pt +3 -0
- global_step262772_universal/zero/model.layers.0.mlp.down_proj.weight/exp_avg.pt +3 -0
- global_step262772_universal/zero/model.layers.0.mlp.down_proj.weight/exp_avg_sq.pt +3 -0
- global_step262772_universal/zero/model.layers.0.mlp.down_proj.weight/fp32.pt +3 -0
- global_step262772_universal/zero/model.layers.0.mlp.down_proj.weight/step.pt +3 -0
- global_step262772_universal/zero/model.layers.0.mlp.gate_proj.weight/exp_avg.pt +3 -0
- global_step262772_universal/zero/model.layers.0.mlp.gate_proj.weight/exp_avg_sq.pt +3 -0
- global_step262772_universal/zero/model.layers.0.mlp.gate_proj.weight/fp32.pt +3 -0
- global_step262772_universal/zero/model.layers.0.mlp.gate_proj.weight/step.pt +3 -0
- global_step262772_universal/zero/model.layers.0.mlp.up_proj.weight/exp_avg.pt +3 -0
- global_step262772_universal/zero/model.layers.0.mlp.up_proj.weight/exp_avg_sq.pt +3 -0
- global_step262772_universal/zero/model.layers.0.mlp.up_proj.weight/fp32.pt +3 -0
- global_step262772_universal/zero/model.layers.0.mlp.up_proj.weight/step.pt +3 -0
- global_step262772_universal/zero/model.layers.0.post_attention_layernorm.weight/exp_avg.pt +3 -0
- global_step262772_universal/zero/model.layers.0.post_attention_layernorm.weight/exp_avg_sq.pt +3 -0
- global_step262772_universal/zero/model.layers.0.post_attention_layernorm.weight/fp32.pt +3 -0
- global_step262772_universal/zero/model.layers.0.post_attention_layernorm.weight/step.pt +3 -0
- global_step262772_universal/zero/model.layers.0.post_attention_layernorm_alpha/exp_avg.pt +3 -0
- global_step262772_universal/zero/model.layers.0.post_attention_layernorm_alpha/exp_avg_sq.pt +3 -0
- global_step262772_universal/zero/model.layers.0.post_attention_layernorm_alpha/fp32.pt +3 -0
- global_step262772_universal/zero/model.layers.0.post_attention_layernorm_alpha/step.pt +3 -0
- global_step262772_universal/zero/model.layers.0.self_attn.k_proj.bias/exp_avg.pt +3 -0
- global_step262772_universal/zero/model.layers.0.self_attn.k_proj.bias/exp_avg_sq.pt +3 -0
- global_step262772_universal/zero/model.layers.0.self_attn.k_proj.bias/fp32.pt +3 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
global_step262772_universal/**/* filter=lfs diff=lfs merge=lfs -text
|
config.json
ADDED
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"architectures": [
|
3 |
+
"YuLanMiniForCausalLM"
|
4 |
+
],
|
5 |
+
"auto_map": {
|
6 |
+
"AutoConfig": "configuration_yulanmini.YuLanMiniConfig",
|
7 |
+
"AutoModel": "modeling_yulanmini.YuLanMiniModel",
|
8 |
+
"AutoModelForCausalLM": "modeling_yulanmini.YuLanMiniForCausalLM"
|
9 |
+
},
|
10 |
+
"attention_bias": true,
|
11 |
+
"attention_dropout": 0.0,
|
12 |
+
"bos_token_id": 1,
|
13 |
+
"dim_model_base": 1920,
|
14 |
+
"dim_model_base_attn": 64,
|
15 |
+
"dim_model_base_init": null,
|
16 |
+
"dim_model_base_lmh": 1,
|
17 |
+
"dim_model_base_logits": 1920.0,
|
18 |
+
"dim_model_base_lr": 256.0,
|
19 |
+
"down_proj_alpha": 0.03450327796711771,
|
20 |
+
"embed_tokens_alpha": 1,
|
21 |
+
"embedding_ln": false,
|
22 |
+
"embedding_rmsln": false,
|
23 |
+
"eos_token_id": 2,
|
24 |
+
"gate_up_proj_alpha": 0.3651483716701107,
|
25 |
+
"gradient_checkpointing_step": 56,
|
26 |
+
"hidden_act": "silu",
|
27 |
+
"hidden_size": 1920,
|
28 |
+
"hidden_states_shrink": 0.18708286933869706,
|
29 |
+
"init_scale_o": 1,
|
30 |
+
"initializer_range": 5e-05,
|
31 |
+
"input_layernorm_alpha": 1.0,
|
32 |
+
"intermediate_size": 4800,
|
33 |
+
"k_proj_alpha": 0.3651483716701107,
|
34 |
+
"layer_norm_eps": 1e-06,
|
35 |
+
"lm_head_alpha": 1.0,
|
36 |
+
"ln_scale": 1,
|
37 |
+
"max_position_embeddings": 28723,
|
38 |
+
"model_reproduce": "transformer",
|
39 |
+
"model_type": "yulanmini",
|
40 |
+
"norm_alpha": 1.0,
|
41 |
+
"num_attention_heads": 30,
|
42 |
+
"num_epochs_trained_before_this_epoch": 26,
|
43 |
+
"num_hidden_layers": 56,
|
44 |
+
"num_key_value_heads": 6,
|
45 |
+
"num_steps_trained_before_this_epoch": 253006,
|
46 |
+
"o_proj_alpha": 0.03450327796711771,
|
47 |
+
"post_attention_layernorm_alpha": 1.0,
|
48 |
+
"q_proj_alpha": 0.3651483716701107,
|
49 |
+
"qk_layernorm": false,
|
50 |
+
"rms_norm_eps": 1e-06,
|
51 |
+
"rms_type": "llama",
|
52 |
+
"rope_scaling": null,
|
53 |
+
"rope_theta": 490000.0,
|
54 |
+
"scale_emb": 10.0,
|
55 |
+
"shrink_alpha": 1,
|
56 |
+
"sliding_window": null,
|
57 |
+
"tie_word_embeddings": true,
|
58 |
+
"torch_dtype": "bfloat16",
|
59 |
+
"transformers_version": "4.44.0",
|
60 |
+
"use_cache": true,
|
61 |
+
"use_emb_alpha": true,
|
62 |
+
"use_liger": true,
|
63 |
+
"use_norm_alpha": true,
|
64 |
+
"use_sliding_window": false,
|
65 |
+
"v_proj_alpha": 0.3651483716701107,
|
66 |
+
"vocab_size": 99000,
|
67 |
+
"wesar_weights": true,
|
68 |
+
"z_loss": 0.0001
|
69 |
+
}
|
global_step262772_universal/mp_rank_00_model_states.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:068960e69bea17a9db7d28394f5d4188548e03b6123f2523d3306b4ea7453d3a
|
3 |
+
size 4468641200
|
global_step262772_universal/zero/lm_head_alpha/exp_avg.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:24d1e97f32ee6a14d3b980485c33afd6eff0be75132a5e6c4420616ff70ba33a
|
3 |
+
size 1180
|
global_step262772_universal/zero/lm_head_alpha/exp_avg_sq.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:488a84a8d79d80262280c382dc3084ae6a17c283073d19c6b8f3624a7e30504a
|
3 |
+
size 1195
|
global_step262772_universal/zero/lm_head_alpha/fp32.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:920a12ef25b001861b5826242427f2a5a12cd1ab5c0994646ac3a00744359739
|
3 |
+
size 1165
|
global_step262772_universal/zero/lm_head_alpha/step.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:31c7b9f13cc6e441f30fe561a052ed7fe853d93f8c150906b9656a8040e2ae39
|
3 |
+
size 852
|
global_step262772_universal/zero/model.embed_tokens.weight/exp_avg.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:421f96fa124ae49c657d5be3c502391f9e3cb8147eed8c191851902a96d22b5e
|
3 |
+
size 760321244
|
global_step262772_universal/zero/model.embed_tokens.weight/exp_avg_sq.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e2bf52fee54eac12bad80a185c257169bdfc0d128b378c42d44f9161ea3ea7f8
|
3 |
+
size 760321259
|
global_step262772_universal/zero/model.embed_tokens.weight/fp32.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:72496ea0b76d71296d456c776e28a2614bee98cb7e37c9016aa7be2c0518d440
|
3 |
+
size 760321165
|
global_step262772_universal/zero/model.embed_tokens.weight/step.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:31c7b9f13cc6e441f30fe561a052ed7fe853d93f8c150906b9656a8040e2ae39
|
3 |
+
size 852
|
global_step262772_universal/zero/model.layers.0.down_proj_alpha/exp_avg.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:eec0ee5600fa9b05480124179b95824614d6dd26ad3fb6caadf510d2e6969608
|
3 |
+
size 1180
|
global_step262772_universal/zero/model.layers.0.down_proj_alpha/exp_avg_sq.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1dbb3f87cb27b5f6ff2ee19db88cb77df6116edefdb304413066049db1e2d1ce
|
3 |
+
size 1195
|
global_step262772_universal/zero/model.layers.0.down_proj_alpha/fp32.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9557f20dc5e6ff6c7b550489769bd9682302a2573064285af046614557a5c642
|
3 |
+
size 1165
|
global_step262772_universal/zero/model.layers.0.down_proj_alpha/step.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:31c7b9f13cc6e441f30fe561a052ed7fe853d93f8c150906b9656a8040e2ae39
|
3 |
+
size 852
|
global_step262772_universal/zero/model.layers.0.gate_up_proj_alpha/exp_avg.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a8de139b0406580e783c6c0734de73ae7a99f77e76981a4271107a26a9fb37e2
|
3 |
+
size 1180
|
global_step262772_universal/zero/model.layers.0.gate_up_proj_alpha/exp_avg_sq.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8a7b0967aff9b563dee22481fafdfa316d0b84101c6a07aad6c3542643f6df04
|
3 |
+
size 1195
|
global_step262772_universal/zero/model.layers.0.gate_up_proj_alpha/fp32.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f6dcfec001a806aa802f80e13df9a12f8fb938d0d95356ff766172d117efd756
|
3 |
+
size 1165
|
global_step262772_universal/zero/model.layers.0.gate_up_proj_alpha/step.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:31c7b9f13cc6e441f30fe561a052ed7fe853d93f8c150906b9656a8040e2ae39
|
3 |
+
size 852
|
global_step262772_universal/zero/model.layers.0.input_layernorm.weight/exp_avg.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b633dfc682f32f3af5721f1e3e0f74de925ed6bf41e108347972271ef13f2e9b
|
3 |
+
size 8860
|
global_step262772_universal/zero/model.layers.0.input_layernorm.weight/exp_avg_sq.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c7413a968724e9d33504e0c55dffaf905eea10453636f2653b0d14f7cb59c16f
|
3 |
+
size 8875
|
global_step262772_universal/zero/model.layers.0.input_layernorm.weight/fp32.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0edcc90e8d3e55908e1568e75c4a61a180dc276a963dbff07eae0e77f71f1a96
|
3 |
+
size 8781
|
global_step262772_universal/zero/model.layers.0.input_layernorm.weight/step.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:31c7b9f13cc6e441f30fe561a052ed7fe853d93f8c150906b9656a8040e2ae39
|
3 |
+
size 852
|
global_step262772_universal/zero/model.layers.0.input_layernorm_alpha/exp_avg.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a41d5db3d8cacbd3dbb42afedfcc7907df0a4c91ba4e95e833bdd6e961281406
|
3 |
+
size 1180
|
global_step262772_universal/zero/model.layers.0.input_layernorm_alpha/exp_avg_sq.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c67b9840fd22c47fc0db75fb4070cc7c0df66a45a6dc834942d25b202a6df1cb
|
3 |
+
size 1195
|
global_step262772_universal/zero/model.layers.0.input_layernorm_alpha/fp32.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:cd4e0b75f2f2fdaaf690fb830f58bf40df16b01b8c8dbba760f95a74d3fb5b45
|
3 |
+
size 1165
|
global_step262772_universal/zero/model.layers.0.input_layernorm_alpha/step.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:31c7b9f13cc6e441f30fe561a052ed7fe853d93f8c150906b9656a8040e2ae39
|
3 |
+
size 852
|
global_step262772_universal/zero/model.layers.0.mlp.down_proj.weight/exp_avg.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8a4083737526eb251d4607ad0d638422fff8e0e1e7f86308d6c41784ad0d211c
|
3 |
+
size 36865244
|
global_step262772_universal/zero/model.layers.0.mlp.down_proj.weight/exp_avg_sq.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ead5e3dc0d98cfe7817fed158c0ca54504c6cc1149677627b2c39831b673c7ee
|
3 |
+
size 36865259
|
global_step262772_universal/zero/model.layers.0.mlp.down_proj.weight/fp32.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5f6aa9574d69070bb7ecb6ef7b0e8858090bc10c08aec506e4f3112e21144bc4
|
3 |
+
size 36865165
|
global_step262772_universal/zero/model.layers.0.mlp.down_proj.weight/step.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:31c7b9f13cc6e441f30fe561a052ed7fe853d93f8c150906b9656a8040e2ae39
|
3 |
+
size 852
|
global_step262772_universal/zero/model.layers.0.mlp.gate_proj.weight/exp_avg.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:96f6eb1b2f763978c092f284f62204be74c428b1a6c18290ba899d2c0623df16
|
3 |
+
size 36865244
|
global_step262772_universal/zero/model.layers.0.mlp.gate_proj.weight/exp_avg_sq.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d484c7d9b0de884d18649bb0a1f335884b9242d4292d5ba430056f913414a4b1
|
3 |
+
size 36865259
|
global_step262772_universal/zero/model.layers.0.mlp.gate_proj.weight/fp32.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f4bb6e12bbc1413cda9cc0951e1f91dc995cc85593c668a6f03acaa9e58319d6
|
3 |
+
size 36865165
|
global_step262772_universal/zero/model.layers.0.mlp.gate_proj.weight/step.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:31c7b9f13cc6e441f30fe561a052ed7fe853d93f8c150906b9656a8040e2ae39
|
3 |
+
size 852
|
global_step262772_universal/zero/model.layers.0.mlp.up_proj.weight/exp_avg.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b3c6617a041f3748e8181289f24ceff5f2cc982d43fd975da2c8366b91bf3651
|
3 |
+
size 36865244
|
global_step262772_universal/zero/model.layers.0.mlp.up_proj.weight/exp_avg_sq.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f297e1d3bbc8ea0c21fc1b11b90970681b704e3b662b99473cf859c1884133ef
|
3 |
+
size 36865259
|
global_step262772_universal/zero/model.layers.0.mlp.up_proj.weight/fp32.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e20916aea6240cd434aa26da48df675e4f4ba6b59d9a067e4a2d2a8fd56bdd48
|
3 |
+
size 36865165
|
global_step262772_universal/zero/model.layers.0.mlp.up_proj.weight/step.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:31c7b9f13cc6e441f30fe561a052ed7fe853d93f8c150906b9656a8040e2ae39
|
3 |
+
size 852
|
global_step262772_universal/zero/model.layers.0.post_attention_layernorm.weight/exp_avg.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5c1be7eadf6d9dc033847ee913ade2676ff54832e06de6640078ca66c7450a6d
|
3 |
+
size 8860
|
global_step262772_universal/zero/model.layers.0.post_attention_layernorm.weight/exp_avg_sq.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8783d4267134189b045a6bf632c72918e7774fdde3dbe0a4af6b64f4966e28d6
|
3 |
+
size 8875
|
global_step262772_universal/zero/model.layers.0.post_attention_layernorm.weight/fp32.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:14dd67d7ce0a219a1514cbf1e711ce7454a70f808e9eeafb5b266e5c26a2eb0b
|
3 |
+
size 8781
|
global_step262772_universal/zero/model.layers.0.post_attention_layernorm.weight/step.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:31c7b9f13cc6e441f30fe561a052ed7fe853d93f8c150906b9656a8040e2ae39
|
3 |
+
size 852
|
global_step262772_universal/zero/model.layers.0.post_attention_layernorm_alpha/exp_avg.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2d428b0db470fb2efd3ce6dff50c56de02f6d82c00c930714f7bd4ac34e5c5df
|
3 |
+
size 1180
|
global_step262772_universal/zero/model.layers.0.post_attention_layernorm_alpha/exp_avg_sq.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:55c91a7433fefb18434ce776e6dddc567b4b258b026ed78286a6ad87f8a06080
|
3 |
+
size 1195
|
global_step262772_universal/zero/model.layers.0.post_attention_layernorm_alpha/fp32.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3f815161135baf75aa6a0cac8111d03984316e61209efd4ec2100cdc6a8a66a8
|
3 |
+
size 1165
|
global_step262772_universal/zero/model.layers.0.post_attention_layernorm_alpha/step.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:31c7b9f13cc6e441f30fe561a052ed7fe853d93f8c150906b9656a8040e2ae39
|
3 |
+
size 852
|
global_step262772_universal/zero/model.layers.0.self_attn.k_proj.bias/exp_avg.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f9c337d7fe1cfa0f72e2f62ef43cbd3c435a3c88c021608bb2d18d7e1aaacd57
|
3 |
+
size 2716
|
global_step262772_universal/zero/model.layers.0.self_attn.k_proj.bias/exp_avg_sq.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0bdfe78e0197b464163d167b81c4d61ebe17878506184d7ef06442370fcbf185
|
3 |
+
size 2731
|
global_step262772_universal/zero/model.layers.0.self_attn.k_proj.bias/fp32.pt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c026e2229dedc3ec003318235c3e237568a65105983663ef2eda4cf610e37ee4
|
3 |
+
size 2637
|