add pretrained model
This view is limited to 50 files because it contains too many changes.
- .gitignore +1 -0
- config_mistral.py +70 -0
- config_mistral_7b.py +88 -0
- config_mistral_7b.yaml +53 -0
- config_tiny_mistral.py +7 -42
- config_tiny_mistral.yaml +92 -0
- convert_trfrs_to_brrr.py +262 -0
- modeling_mistral.py +50 -27
- pretrained/Mistral-7B-v0.1/checkpoint_metadata.json +9 -0
- pretrained/Mistral-7B-v0.1/config.yaml +53 -0
- pretrained/Mistral-7B-v0.1/model/model/decoder/0/pp_block/attn/o_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors +3 -0
- pretrained/Mistral-7B-v0.1/model/model/decoder/0/pp_block/attn/qkv_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors +3 -0
- pretrained/Mistral-7B-v0.1/model/model/decoder/0/pp_block/input_layernorm/model_weight.safetensors +3 -0
- pretrained/Mistral-7B-v0.1/model/model/decoder/0/pp_block/mlp/down_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors +3 -0
- pretrained/Mistral-7B-v0.1/model/model/decoder/0/pp_block/mlp/gate_up_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors +3 -0
- pretrained/Mistral-7B-v0.1/model/model/decoder/0/pp_block/post_attention_layernorm/model_weight.safetensors +3 -0
- pretrained/Mistral-7B-v0.1/model/model/decoder/1/pp_block/attn/o_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors +3 -0
- pretrained/Mistral-7B-v0.1/model/model/decoder/1/pp_block/attn/qkv_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors +3 -0
- pretrained/Mistral-7B-v0.1/model/model/decoder/1/pp_block/input_layernorm/model_weight.safetensors +3 -0
- pretrained/Mistral-7B-v0.1/model/model/decoder/1/pp_block/mlp/down_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors +3 -0
- pretrained/Mistral-7B-v0.1/model/model/decoder/1/pp_block/mlp/gate_up_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors +3 -0
- pretrained/Mistral-7B-v0.1/model/model/decoder/1/pp_block/post_attention_layernorm/model_weight.safetensors +3 -0
- pretrained/Mistral-7B-v0.1/model/model/decoder/10/pp_block/attn/o_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors +3 -0
- pretrained/Mistral-7B-v0.1/model/model/decoder/10/pp_block/attn/qkv_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors +3 -0
- pretrained/Mistral-7B-v0.1/model/model/decoder/10/pp_block/input_layernorm/model_weight.safetensors +3 -0
- pretrained/Mistral-7B-v0.1/model/model/decoder/10/pp_block/mlp/down_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors +3 -0
- pretrained/Mistral-7B-v0.1/model/model/decoder/10/pp_block/mlp/gate_up_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors +3 -0
- pretrained/Mistral-7B-v0.1/model/model/decoder/10/pp_block/post_attention_layernorm/model_weight.safetensors +3 -0
- pretrained/Mistral-7B-v0.1/model/model/decoder/11/pp_block/attn/o_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors +3 -0
- pretrained/Mistral-7B-v0.1/model/model/decoder/11/pp_block/attn/qkv_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors +3 -0
- pretrained/Mistral-7B-v0.1/model/model/decoder/11/pp_block/input_layernorm/model_weight.safetensors +3 -0
- pretrained/Mistral-7B-v0.1/model/model/decoder/11/pp_block/mlp/down_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors +3 -0
- pretrained/Mistral-7B-v0.1/model/model/decoder/11/pp_block/mlp/gate_up_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors +3 -0
- pretrained/Mistral-7B-v0.1/model/model/decoder/11/pp_block/post_attention_layernorm/model_weight.safetensors +3 -0
- pretrained/Mistral-7B-v0.1/model/model/decoder/12/pp_block/attn/o_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors +3 -0
- pretrained/Mistral-7B-v0.1/model/model/decoder/12/pp_block/attn/qkv_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors +3 -0
- pretrained/Mistral-7B-v0.1/model/model/decoder/12/pp_block/input_layernorm/model_weight.safetensors +3 -0
- pretrained/Mistral-7B-v0.1/model/model/decoder/12/pp_block/mlp/down_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors +3 -0
- pretrained/Mistral-7B-v0.1/model/model/decoder/12/pp_block/mlp/gate_up_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors +3 -0
- pretrained/Mistral-7B-v0.1/model/model/decoder/12/pp_block/post_attention_layernorm/model_weight.safetensors +3 -0
- pretrained/Mistral-7B-v0.1/model/model/decoder/13/pp_block/attn/o_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors +3 -0
- pretrained/Mistral-7B-v0.1/model/model/decoder/13/pp_block/attn/qkv_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors +3 -0
- pretrained/Mistral-7B-v0.1/model/model/decoder/13/pp_block/input_layernorm/model_weight.safetensors +3 -0
- pretrained/Mistral-7B-v0.1/model/model/decoder/13/pp_block/mlp/down_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors +3 -0
- pretrained/Mistral-7B-v0.1/model/model/decoder/13/pp_block/mlp/gate_up_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors +3 -0
- pretrained/Mistral-7B-v0.1/model/model/decoder/13/pp_block/post_attention_layernorm/model_weight.safetensors +3 -0
- pretrained/Mistral-7B-v0.1/model/model/decoder/14/pp_block/attn/o_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors +3 -0
- pretrained/Mistral-7B-v0.1/model/model/decoder/14/pp_block/attn/qkv_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors +3 -0
- pretrained/Mistral-7B-v0.1/model/model/decoder/14/pp_block/input_layernorm/model_weight.safetensors +3 -0
- pretrained/Mistral-7B-v0.1/model/model/decoder/14/pp_block/mlp/down_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors +3 -0
.gitignore
ADDED
@@ -0,0 +1 @@
+__pycache__
config_mistral.py
ADDED
@@ -0,0 +1,70 @@
+""" Example python script to generate a YAML config file which can be used to run a training with nanotron. Refer to "examples" section in the `/README.md` for more information.
+
+Usage:
+```
+python config_tiny_mistral.py
+```
+"""
+import os
+from dataclasses import dataclass
+from typing import Optional
+
+from nanotron.config import (
+    CheckpointsArgs,
+    Config,
+    DataArgs,
+    GeneralArgs,
+    LoggingArgs,
+    LRSchedulerArgs,
+    ModelArgs,
+    OptimizerArgs,
+    ParallelismArgs,
+    PretrainDatasetsArgs,
+    RandomInit,
+    TokenizerArgs,
+    TokensArgs,
+)
+from nanotron.logging import human_format
+
+
+@dataclass
+class MistralConfig:
+    """Configuration for a MISTRAL model
+
+    Be careful on having a coherent typing as we use it to reconstruct the model from yaml
+    """
+
+    attn_pdrop: float = 0.0
+    bos_token_id: int = 1
+    eos_token_id: int = 2
+    hidden_act: str = "silu"
+    hidden_size: int = 4096
+    initializer_range: float = 0.02
+    intermediate_size: int = 14336
+    is_mistral_config: bool = True  # We use this help differentiate models in yaml/python conversion
+    max_position_embeddings: int = 32768
+    num_attention_heads: int = 32
+    num_hidden_layers: int = 32
+    num_key_value_heads: Optional[int] = 8
+    pad_token_id: Optional[int] = None
+    pretraining_tp: int = 1
+    rms_norm_eps: float = 1e-05
+    rope_theta: float = 10000.0
+    sliding_window_size: int = 4096
+    tie_word_embeddings: bool = False
+    use_cache: bool = True
+    vocab_size: int = 32000
+
+    def __post_init__(self):
+        # for backward compatibility
+        if self.num_key_value_heads is None:
+            self.num_key_value_heads = self.num_attention_heads
+
+def get_num_params(model_config: MistralConfig) -> int:
+    num_params = model_config.vocab_size * model_config.hidden_size * 2 + \
+        model_config.num_hidden_layers * (
+            3 * model_config.hidden_size * model_config.intermediate_size
+            + 2 * model_config.hidden_size * model_config.hidden_size
+            + 2 * model_config.hidden_size * (model_config.hidden_size / (model_config.num_attention_heads / model_config.num_key_value_heads))
+        )
+    return num_params
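
The `get_num_params` helper above counts the embedding/LM-head weights plus, per layer, the gate/up/down MLP projections, the q/o projections and the grouped-query k/v projections. A minimal sketch of the arithmetic with the Mistral-7B defaults (nothing here is read from a checkpoint; it just re-evaluates the formula above):

```python
# Sketch only: re-evaluates the parameter-count formula from config_mistral.py
# with the Mistral-7B defaults; no checkpoint is loaded.
from config_mistral import MistralConfig, get_num_params

cfg = MistralConfig()  # defaults above already describe Mistral-7B-v0.1

embed_and_lm_head = cfg.vocab_size * cfg.hidden_size * 2          # 262,144,000
mlp = 3 * cfg.hidden_size * cfg.intermediate_size                 # 176,160,768 per layer
attn_q_and_o = 2 * cfg.hidden_size * cfg.hidden_size              #  33,554,432 per layer
kv_dim = cfg.hidden_size // (cfg.num_attention_heads // cfg.num_key_value_heads)  # 1024
attn_k_and_v = 2 * cfg.hidden_size * kv_dim                       #   8,388,608 per layer

total = embed_and_lm_head + cfg.num_hidden_layers * (mlp + attn_q_and_o + attn_k_and_v)
print(total)  # 7_241_465_856, i.e. roughly 7.24B parameters
assert total == get_num_params(cfg)
```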
config_mistral_7b.py
ADDED
@@ -0,0 +1,88 @@
+""" Example python script to generate a YAML config file which can be used to run a training with nanotron. Refer to "examples" section in the `/README.md` for more information.
+
+Usage:
+```
+python config_mistral_7b.py
+```
+"""
+import os
+from dataclasses import dataclass
+from typing import Optional
+
+from nanotron.config import (
+    CheckpointsArgs,
+    Config,
+    DataArgs,
+    GeneralArgs,
+    LoggingArgs,
+    LRSchedulerArgs,
+    ModelArgs,
+    OptimizerArgs,
+    ParallelismArgs,
+    PretrainDatasetsArgs,
+    RandomInit,
+    TokenizerArgs,
+    TokensArgs,
+)
+from nanotron.logging import human_format
+
+from config_mistral import MistralConfig, get_num_params
+
+
+MODEL_CONFIG = MistralConfig(
+    # Config for Mistral 7B
+    attn_pdrop=0.0,
+    bos_token_id=1,
+    eos_token_id=2,
+    hidden_act="silu",
+    hidden_size=4096,
+    initializer_range=0.02,
+    intermediate_size=14336,
+    max_position_embeddings=32768,
+    num_attention_heads=32,
+    num_hidden_layers=32,
+    num_key_value_heads=8,
+    pretraining_tp=1,
+    rms_norm_eps=1e-05,
+    rope_theta=10000.0,
+    sliding_window_size=4096,
+    tie_word_embeddings=False,
+    use_cache=True,
+    vocab_size=32000,
+)
+
+num_params = human_format(get_num_params(MODEL_CONFIG)).replace(".", "p")
+
+print(f"Model has {num_params} parameters")
+
+PARALLELISM = ParallelismArgs(
+    dp=2,
+    pp=2,
+    tp=2,
+    pp_engine="1f1b",
+    tp_mode="REDUCE_SCATTER",
+    tp_linear_async_communication=True,
+    recompute_granularity="selective",
+)
+
+CONFIG = Config(
+    general=GeneralArgs(project="mistralai", run="Mistral-7B-v0.1", seed=42),
+    checkpoints=None,
+    parallelism=PARALLELISM,
+    model=ModelArgs(init_method=RandomInit(std=0.025), model_config=MODEL_CONFIG),
+    tokenizer=TokenizerArgs("mistralai/Mistral-7B-v0.1"),
+    optimizer=None,
+    logging=None,
+    tokens=None,
+    data=None,
+    profiler=None,
+)
+
+if __name__ == "__main__":
+    file_path = os.path.abspath(__file__)
+
+    file_path = file_path.replace(".py", ".yaml")
+    # Save config as YAML file
+    CONFIG.save_as_yaml(file_path)
+
+    # You can now train a model with this config using `/run_train.py`
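
After running `python config_mistral_7b.py`, a quick spot-check of the generated YAML (reproduced in the next file) can be done with plain PyYAML; a hedged sketch, not nanotron's own config loader:

```python
# Sketch only: spot-check the YAML emitted by config_mistral_7b.py with PyYAML.
import yaml

with open("config_mistral_7b.yaml") as f:
    cfg = yaml.safe_load(f)

model_cfg = cfg["model"]["model_config"]
assert model_cfg["hidden_size"] == 4096
assert model_cfg["num_key_value_heads"] == 8
assert model_cfg["sliding_window_size"] == 4096

# dp * pp * tp is the world size this config expects: 2 * 2 * 2 = 8 ranks.
par = cfg["parallelism"]
print("expected world size:", par["dp"] * par["pp"] * par["tp"])
```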
config_mistral_7b.yaml
ADDED
@@ -0,0 +1,53 @@
+checkpoints: null
+data: null
+general:
+  benchmark_csv_path: null
+  consumed_train_samples: null
+  ignore_sanity_checks: false
+  project: mistralai
+  run: Mistral-7B-v0.1
+  seed: 42
+  step: null
+logging: null
+model:
+  ddp_bucket_cap_mb: 25
+  dtype: bfloat16
+  init_method:
+    std: 0.025
+  make_vocab_size_divisible_by: 1
+  model_config:
+    attn_pdrop: 0.0
+    bos_token_id: 1
+    eos_token_id: 2
+    hidden_act: silu
+    hidden_size: 4096
+    initializer_range: 0.02
+    intermediate_size: 14336
+    is_mistral_config: true
+    max_position_embeddings: 32768
+    num_attention_heads: 32
+    num_hidden_layers: 32
+    num_key_value_heads: 8
+    pad_token_id: null
+    pretraining_tp: 1
+    rms_norm_eps: 1.0e-05
+    rope_theta: 10000.0
+    sliding_window_size: 4096
+    tie_word_embeddings: false
+    use_cache: true
+    vocab_size: 32000
+optimizer: null
+parallelism:
+  dp: 2
+  pp: 2
+  pp_engine: 1f1b
+  recompute_granularity: SELECTIVE
+  tp: 2
+  tp_linear_async_communication: true
+  tp_mode: REDUCE_SCATTER
+profiler: null
+tokenizer:
+  tokenizer_max_length: null
+  tokenizer_name_or_path: mistralai/Mistral-7B-v0.1
+  tokenizer_revision: null
+tokens: null
config_tiny_mistral.py
CHANGED
@@ -26,41 +26,12 @@ from nanotron.config import (
 )
 from nanotron.logging import human_format
 
-
-@dataclass
-class MistralConfig:
-    """Configuration for a MISTRAL model
-
-    Be careful on having a coherent typing as we use it to reconstruct the model from yaml
-    """
-
-    bos_token_id: int = 1
-    eos_token_id: int = 2
-    hidden_act: str = "silu"
-    hidden_size: int = 4096
-    initializer_range: float = 0.02
-    intermediate_size: int = 11008
-    is_mistral_config: bool = True  # We use this help differentiate models in yaml/python conversion
-    max_position_embeddings: int = 2048
-    num_attention_heads: int = 32
-    num_hidden_layers: int = 32
-    num_key_value_heads: Optional[int] = None
-    pad_token_id: Optional[int] = None
-    pretraining_tp: int = 1
-    rms_norm_eps: float = 1e-6
-    rope_scaling: Optional[dict] = None
-    tie_word_embeddings: bool = False
-    use_cache: bool = True
-    vocab_size: int = 32000
-
-    def __post_init__(self):
-        # for backward compatibility
-        if self.num_key_value_heads is None:
-            self.num_key_value_heads = self.num_attention_heads
+from config_mistral import MistralConfig, get_num_params
 
 
 model_config = MistralConfig(
     # Config for a tiny model model with 1.62M parameters
+    attn_pdrop=0.0,
     bos_token_id=1,
     eos_token_id=2,
     hidden_act="silu",
@@ -73,20 +44,13 @@ model_config = MistralConfig(
     num_key_value_heads=4,
     pretraining_tp=1,
     rms_norm_eps=1e-05,
-
+    rope_theta=10000.0,
     tie_word_embeddings=True,
     use_cache=True,
     vocab_size=256,
 )
 
-num_params = human_format(
-    model_config.vocab_size * model_config.hidden_size * 2
-    + model_config.num_hidden_layers
-    * (
-        3 * model_config.hidden_size * model_config.intermediate_size
-        + 4 * model_config.hidden_size * model_config.hidden_size
-    )
-).replace(".", "p")
+num_params = human_format(get_num_params(model_config)).replace(".", "p")
 
 print(f"Model has {num_params} parameters")
 
@@ -141,9 +105,10 @@ config = Config(
 )
 
 if __name__ == "__main__":
-
+    file_path = os.path.abspath(__file__)
 
+    file_path = file_path.replace(".py", ".yaml")
     # Save config as YAML file
-    config.save_as_yaml(
+    config.save_as_yaml(file_path)
 
     # You can now train a model with this config using `/run_train.py`
config_tiny_mistral.yaml
ADDED
@@ -0,0 +1,92 @@
+checkpoints:
+  checkpoint_interval: 10
+  checkpoints_path: /fsx/thomwolf/github/textbooks-proj/brrr/models/checkpoints
+  checkpoints_path_is_shared_file_system: false
+  resume_checkpoint_path: null
+  save_initial_state: false
+data:
+  dataset:
+    dataset_overwrite_cache: false
+    dataset_processing_num_proc_per_process: 1
+    hf_dataset_config_name: null
+    hf_dataset_or_datasets: HuggingFaceH4/testing_alpaca_small
+    hf_dataset_splits: train
+    text_column_name: completion
+  num_loading_workers: 1
+  seed: 42
+general:
+  benchmark_csv_path: null
+  consumed_train_samples: null
+  ignore_sanity_checks: false
+  project: debug
+  run: tiny_mistral
+  seed: 42
+  step: null
+logging:
+  iteration_step_info_interval: 1
+  log_level: info
+  log_level_replica: info
+model:
+  ddp_bucket_cap_mb: 25
+  dtype: bfloat16
+  init_method:
+    std: 0.025
+  make_vocab_size_divisible_by: 1
+  model_config:
+    attn_pdrop: 0.0
+    bos_token_id: 1
+    eos_token_id: 2
+    hidden_act: silu
+    hidden_size: 16
+    initializer_range: 0.02
+    intermediate_size: 64
+    is_mistral_config: true
+    max_position_embeddings: 256
+    num_attention_heads: 4
+    num_hidden_layers: 2
+    num_key_value_heads: 4
+    pad_token_id: null
+    pretraining_tp: 1
+    rms_norm_eps: 1.0e-05
+    rope_theta: 10000.0
+    sliding_window_size: 4096
+    tie_word_embeddings: true
+    use_cache: true
+    vocab_size: 256
+optimizer:
+  accumulate_grad_in_fp32: true
+  adam_beta1: 0.9
+  adam_beta2: 0.95
+  adam_eps: 1.0e-08
+  clip_grad: 1.0
+  learning_rate_scheduler:
+    learning_rate: 0.0003
+    lr_decay_steps: 8
+    lr_decay_style: cosine
+    lr_warmup_steps: 2
+    lr_warmup_style: linear
+    min_decay_lr: 1.0e-05
+  torch_adam_is_fused: true
+  weight_decay: 0.01
+  zero_stage: 0
+parallelism:
+  dp: 2
+  pp: 2
+  pp_engine: 1f1b
+  recompute_granularity: SELECTIVE
+  tp: 2
+  tp_linear_async_communication: true
+  tp_mode: REDUCE_SCATTER
+profiler: null
+tokenizer:
+  tokenizer_max_length: null
+  tokenizer_name_or_path: gpt2
+  tokenizer_revision: null
+tokens:
+  batch_accumulation_per_replica: 1
+  limit_test_batches: 0
+  limit_val_batches: 0
+  micro_batch_size: 2
+  sequence_length: 32
+  train_steps: 10
+  val_check_interval: -1
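
The `tokens` and `parallelism` blocks above fully determine the batch geometry of this debug run; the sketch below simply re-derives it from the YAML values, assuming the usual global-batch definition dp × micro_batch_size × batch_accumulation_per_replica:

```python
# Sketch only: batch geometry implied by the tokens/parallelism blocks above.
dp = 2
micro_batch_size = 2
batch_accumulation_per_replica = 1
sequence_length = 32
train_steps = 10

samples_per_step = dp * micro_batch_size * batch_accumulation_per_replica  # 4
tokens_per_step = samples_per_step * sequence_length                       # 128
total_tokens = tokens_per_step * train_steps                               # 1280 tokens over the whole run
print(samples_per_step, tokens_per_step, total_tokens)
```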
convert_trfrs_to_brrr.py
ADDED
@@ -0,0 +1,262 @@
+# ruff: noqa: E402
+"""
+This module converts a transformers LlamaForCausalLM to a brrr model
+
+Command:
+    torchrun --nproc_per_node=1 convert_trfrs_to_brrr.py \
+        --model_name mistralai/Mistral-7B-v0.1 \
+        --save_path ./pretrained/Mistral-7B-v0.1
+"""
+import argparse
+import sys
+from dataclasses import asdict
+from pathlib import Path
+from typing import Dict, List
+
+import torch
+
+from brrr.trainer import DistributedTrainer
+
+sys.path.append(Path(__file__).parent.parent.as_posix())
+import os
+
+from nanotron.parallel.parameters import NanotronParameter, sanity_check
+from nanotron.parallel.pipeline_parallel.engine import (
+    AllForwardAllBackwardPipelineEngine,
+)
+from nanotron.parallel.tensor_parallel.nn import TensorParallelLinearMode
+from transformers import MistralConfig as MistralConfig_trfs, MistralForCausalLM
+
+import nanotron.distributed as dist
+from nanotron.config import ParallelismArgs, RecomputeGranularity
+from nanotron.parallel.context import ParallelContext
+from nanotron.models import build_model
+from nanotron.trainer import mark_tied_parameters
+from nanotron.serialize import save_meta, save_weights, save
+
+from modeling_mistral import MistralForTraining
+from config_mistral_7b import PARALLELISM as PARALLELISM_BRRR, CONFIG as CONFIG_BRRR
+
+
+def get_args():
+    parser = argparse.ArgumentParser(description="Convert transformers weights to brrr weights")
+    parser.add_argument("--model_name", type=str, default="mistralai/Mistral-7B-v0.1")
+    parser.add_argument("--save_path", type=str, default="pretrained/Mistral-7B-v0.1")
+    parser.add_argument("--dp", type=int, default=1)
+    parser.add_argument("--pp", type=int, default=1)
+    parser.add_argument("--tp", type=int, default=1)
+    return parser.parse_args()
+
+
+def permute_for_rotary(tensor, num_heads, per_head_hidden_size, hidden_size):
+    return (
+        tensor.view(num_heads, 2, per_head_hidden_size // 2, hidden_size)
+        .transpose(1, 2)
+        .contiguous()
+        .view(num_heads * per_head_hidden_size, hidden_size)
+    )
+
+
+def get_transformers_weight(
+    name: str, ref_module_state_dict: Dict[str, torch.Tensor], ref_module: MistralForCausalLM, get_grad: bool = False
+) -> torch.Tensor:
+    """From our brrr implementation, we get the equivalent tensor in transformers implementation"""
+    config = ref_module.config
+    brrr_prefix = "model."
+    assert name.startswith(brrr_prefix)
+    name = name[len(brrr_prefix) :]
+
+    path = name.split(".")
+    path.remove("pp_block")
+    name = ".".join(path)
+
+    if get_grad is False:
+
+        def get_tensor(path: str):
+            return ref_module_state_dict[path]
+
+        def get_tensors(path: List[str]):
+            return [get_tensor(p) for p in path]
+
+    else:
+
+        def get_tensor(path: str):
+            weight = ref_module.get_parameter(path)
+            return weight.grad
+
+        def get_tensors(path: List[str]):
+            return [get_tensor(p) for p in path]
+
+    if name == "token_position_embeddings.token_embedding.weight":
+        return get_tensor("model.embed_tokens.weight")
+
+    elif name == "lm_head.weight":
+        # This only used when weights are not shared
+        return get_tensor("lm_head.weight")
+
+    elif name == "final_layer_norm.weight":
+        return get_tensor("model.norm.weight")
+
+    if path[0] == "decoder":
+        transformer_path = ["model"] + ["layers"] + [path[1]]
+
+        if path[2] == "attn":
+            path[2] = "self_attn"
+
+        if path[2] == "ff":
+            path[2] = "mlp"
+
+        if path[3] == "qkv_proj":
+            proj_names = ["q_proj", "k_proj", "v_proj"]
+            tensor_list = get_tensors(
+                [".".join(transformer_path + path[2:3] + [proj_name] + path[4:]) for proj_name in proj_names]
+            )
+            # Permute q/k
+            per_head_hidden_size = config.hidden_size // config.num_attention_heads
+            # Permute q
+            print(f"Permuting q {tensor_list[0].shape}")
+            tensor_list[0] = permute_for_rotary(
+                tensor=tensor_list[0],
+                num_heads=config.num_attention_heads,
+                per_head_hidden_size=per_head_hidden_size,
+                hidden_size=config.hidden_size,
+            )
+            # Permute k
+            print(f"Permuting k {tensor_list[1].shape}")
+            tensor_list[1] = permute_for_rotary(
+                tensor=tensor_list[1],
+                num_heads=config.num_key_value_heads,
+                per_head_hidden_size=per_head_hidden_size,
+                hidden_size=config.hidden_size,
+            )
+            return torch.cat(tensor_list, dim=0)
+
+        if path[3] == "gate_up_proj":
+            tensor_list = get_tensors(
+                [
+                    ".".join(transformer_path + path[2:3] + [proj_name] + path[4:])
+                    for proj_name in ["gate_proj", "up_proj"]
+                ]
+            )
+            return torch.cat(tensor_list, dim=0)
+
+        return get_tensor(".".join(transformer_path + path[2:]))
+
+    else:
+        raise ValueError(f"Couldn't find transformer equivalent of {name}")
+
+
+def convert_trfrs_to_brrr(dp, pp, tp, model_name="huggyllama/llama-7b", save_path="pretrained/llama-7b"):
+    # check save_path doesnt exist or is empty
+    save_path = Path(save_path)
+    # assert not save_path.exists() or len(list(save_path.iterdir())) == 0, f"save_path {save_path} is not empty"
+
+    parallel_config = PARALLELISM_BRRR
+
+    parallel_config.dp = dp
+    parallel_config.pp = pp
+    parallel_config.tp = tp
+
+    # Initialise all process groups
+    parallel_context = ParallelContext(
+        data_parallel_size=parallel_config.dp,
+        pipeline_parallel_size=parallel_config.pp,
+        tensor_parallel_size=parallel_config.tp,
+    )
+    # params
+    dtype = torch.bfloat16  # Flash attention doesn't support fp32
+
+    # Initialise brrr model
+    model_config_brrr = CONFIG_BRRR.model.model_config
+
+    model = build_model(
+        model_builder=lambda: MistralForTraining(
+            config=model_config_brrr,
+            parallel_context=parallel_context,
+            parallel_config=parallel_config,
+            random_states=None,
+        ),
+        dtype=dtype,
+        parallel_context=parallel_context,
+        device=torch.device("cpu"),
+    )
+
+    # Initialise transformers model
+    device_map = {}
+    current_pp_rank = dist.get_rank(group=parallel_context.pp_pg)
+    device_map["model.embed_tokens"] = (
+        model.model.token_position_embeddings.rank
+        if current_pp_rank == model.model.token_position_embeddings.rank
+        else "meta"
+    )
+    for i in range(model_config_brrr.num_hidden_layers):
+        device_map[f"model.layers.{i}"] = (
+            model.model.decoder[i].rank if current_pp_rank == model.model.decoder[i].rank else "meta"
+        )
+    device_map["model.norm"] = (
+        model.model.final_layer_norm.rank if current_pp_rank == model.model.final_layer_norm.rank else "meta"
+    )
+    device_map["lm_head"] = model.model.lm_head.rank if current_pp_rank == model.model.lm_head.rank else "meta"
+    model_ref = MistralForCausalLM.from_pretrained(model_name, torch_dtype=dtype, device_map=device_map)
+
+    # Copy weights from trfrs to brrr
+    ref_state_dict = model_ref.state_dict()
+    for name, param in model.named_parameters():
+        print(f"Syncing {name}")
+        ref_param = get_transformers_weight(name=name, ref_module_state_dict=ref_state_dict, ref_module=model_ref)
+
+        param_is_tp_sharded = (
+            isinstance(param, NanotronParameter)
+            and param.is_sharded
+            and parallel_context.world_ranks_to_pg[param.get_sharded_info().global_ranks] == parallel_context.tp_pg
+        )
+
+        if param_is_tp_sharded:
+            sharded_info = param.get_sharded_info()
+            # copy param data (not just the reference)
+            with torch.no_grad():
+                for local_global_slices_pair in sharded_info.local_global_slices_pairs:
+                    local_slices = local_global_slices_pair.local_slices
+                    global_slices = local_global_slices_pair.global_slices
+                    param[local_slices].copy_(ref_param[global_slices])
+        else:
+            assert (
+                ref_param.shape == param.shape
+            ), f"Parameter shape don't match for {name}\n{ref_param.shape} != {param.shape}"
+            # copy param data (not just the reference)
+            with torch.no_grad():
+                param.copy_(ref_param)
+        ref_param = None
+        # torch.cuda.empty_cache()
+
+    # TODO @nouamanetazi: assert weights are the same
+    # Marks parameters as NanotronParameters
+    mark_tied_parameters(model=model, parallel_context=parallel_context, parallel_config=parallel_config)
+
+    sanity_check(root_module=model)
+
+    checkpoint_metadata = {
+        "last_train_step": 0,
+        "consumed_train_samples": 0,
+    }
+    save(config=CONFIG_BRRR, model=model, optimizer=None, lr_scheduler=None, parallel_context=parallel_context, root_folder=save_path,
+         should_save_optimizer=False, should_save_lr_scheduler=False, checkpoint_metadata=checkpoint_metadata,
+         sanity_checks=False)
+    # save_weights(model=model, parallel_context=parallel_context, root_folder=save_path)
+    # save_meta(root_folder=save_path, parallel_context=parallel_context, checkpoint_metadata=checkpoint_metadata)
+
+    if dist.get_rank(parallel_context.world_pg) == 0:
+        print(save_path)
+        import json
+
+        with open(save_path / "model_config.json", mode="w") as fo:
+            fo.write(json.dumps(asdict(CONFIG_BRRR.model.model_config), indent=4))
+
+
+def main():
+    args = get_args()
+    convert_trfrs_to_brrr(**vars(args))
+
+
+if __name__ == "__main__":
+    main()
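
The only non-trivial tensor surgery in the converter is `permute_for_rotary`, which interleaves the two rotary halves of each attention head, after which q/k/v (and gate/up) are concatenated row-wise into the fused brrr weights. A self-contained sketch on random tensors with Mistral-7B shapes (the helper is copied from the file above; no checkpoint is loaded):

```python
# Sketch only: exercise the rotary permutation and qkv fusion on random
# tensors with Mistral-7B shapes; no checkpoint or GPU needed.
import torch


def permute_for_rotary(tensor, num_heads, per_head_hidden_size, hidden_size):
    # copied verbatim from convert_trfrs_to_brrr.py above
    return (
        tensor.view(num_heads, 2, per_head_hidden_size // 2, hidden_size)
        .transpose(1, 2)
        .contiguous()
        .view(num_heads * per_head_hidden_size, hidden_size)
    )


hidden_size, num_heads, num_kv_heads = 4096, 32, 8
per_head = hidden_size // num_heads  # 128

q = torch.randn(num_heads * per_head, hidden_size)     # stands in for q_proj.weight
k = torch.randn(num_kv_heads * per_head, hidden_size)  # stands in for k_proj.weight
v = torch.randn(num_kv_heads * per_head, hidden_size)  # stands in for v_proj.weight

q_perm = permute_for_rotary(q, num_heads, per_head, hidden_size)
k_perm = permute_for_rotary(k, num_kv_heads, per_head, hidden_size)

# The permutation only reorders rows inside each head block: head 0 of q_perm
# holds rows [0, 64, 1, 65, ...] of the original head 0.
assert q_perm.shape == q.shape
assert torch.equal(q_perm[1], q[64])

# Fused row-wise layout used for the brrr/nanotron qkv_proj weight
# (before any tensor-parallel sharding).
qkv = torch.cat([q_perm, k_perm, v], dim=0)
assert qkv.shape == (hidden_size + 2 * num_kv_heads * per_head, hidden_size)  # (6144, 4096)
```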
modeling_mistral.py
CHANGED
@@ -15,6 +15,7 @@
 """ PyTorch Mistral model.
 """
 from typing import Dict, Optional, Union
+import inspect
 
 import torch
 from flash_attn import bert_padding
@@ -46,12 +47,15 @@ from nanotron.parallel.tensor_parallel.nn import (
 )
 from nanotron.random import RandomStates
 from nanotron.utils import checkpoint_method
+from nanotron.nn.activations import ACT2FN
 from torch import nn
-
-from
+
+from config_mistral_7b import MistralConfig
 
 logger = logging.get_logger(__name__)
 
+_flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_varlen_func).parameters)
+
 
 class RotaryEmbedding(nn.Module):
     def __init__(self, dim: int, end: int, theta: float = 10000.0):
@@ -189,15 +193,22 @@ class CoreAttention(nn.Module):
         ), f"Hidden size {config.hidden_size} must be divisible by number of attention heads {config.num_attention_heads}."
         self.d_qk = config.hidden_size // config.num_attention_heads
         self.d_v = config.hidden_size // config.num_attention_heads
+        self.dropout = config.attn_pdrop
 
         self.checkpoint_attention = False  # Because flash_attn already does checkpointing
 
+        if config.sliding_window_size is not None:
+            assert (
+                _flash_supports_window_size
+            ), "Current version of flash-attn doesn't support sliding window: `pip install flash-attn>=2.3`"
+        self.sliding_window_size = config.sliding_window_size  # if layer_idx not in config.global_attn_layers else None
+
     @checkpoint_method(attr_name="checkpoint_attention")
     def forward(
         self,
-        query_states: torch.Tensor,  # [batch_size * q_length,
-        key_states: torch.Tensor,  # [batch_size * kv_length,
-        value_states: torch.Tensor,  # [batch_size * kv_length,
+        query_states: torch.Tensor,  # [batch_size * q_length, num_heads, inner_dim]
+        key_states: torch.Tensor,  # [batch_size * kv_length, 1, inner_dim]
+        value_states: torch.Tensor,  # [batch_size * kv_length, 1, inner_dim]
         q_sequence_mask: torch.Tensor,  # torch.BoolTensor [batch_size, q_length] (can be broadcasted to that size)
         kv_sequence_mask: torch.Tensor,  # torch.BoolTensor [batch_size, kv_length] (can be broadcasted to that size)
     ):
@@ -218,9 +229,10 @@ class CoreAttention(nn.Module):
             cu_seqlens_k=cu_seqlens_k,
             max_seqlen_q=q_sequence_mask.shape[1],
             max_seqlen_k=kv_sequence_mask.shape[1],
-            dropout_p=0.0,
-            softmax_scale=None,  #
+            dropout_p=self.dropout if self.training else 0.0,
+            softmax_scale=None,  # defaults to 1/sqrt(d_qk)
             causal=causal,
+            window_size=(self.sliding_window_size - 1, 0) if self.sliding_window_size is not None else (-1, -1),
             return_attn_probs=False,
         )
 
@@ -318,10 +330,11 @@ class CausalSelfAttention(nn.Module, AttachableStore):
         self.rotary_embedding = RotaryEmbedding(
             dim=self.d_qk,
             end=config.max_position_embeddings,
+            theta=config.rope_theta
        )
 
         # NOTE: Only supported for training (TODO(fmom): position_ids not supported yet)
-        self.flash_rotary_embedding = FlashRotaryEmbedding(dim=self.d_qk, interleaved=True)
+        self.flash_rotary_embedding = FlashRotaryEmbedding(dim=self.d_qk, base=config.rope_theta, interleaved=True)
 
         self.o_proj = TensorParallelRowLinear(
             config.num_attention_heads * self.d_qk,
@@ -852,7 +865,6 @@ class MistralForTraining(NanotronModel):
         super().__init__()
         import warnings
 
-        warnings.warn("This is just a Llama Model, not a Mistral one for demo purpose. Please fix implementation")
         self.model = MistralModel(config=config, parallel_context=parallel_context, parallel_config=parallel_config)
         self.loss = PipelineBlock(
             p2p=self.model.p2p,
@@ -1044,12 +1056,13 @@ def get_flops(
     num_layers,
     hidden_size,
    num_heads,
-    num_key_value_heads,
     vocab_size,
     seq_len,
-
+    kv_channels=None,
+    ffn_hidden_size=None,
     batch_size=1,
     recompute_granularity=None,
+    glu_activation=False,
 ):
     """Counts flops in an decoder-only model
     Args:
@@ -1066,33 +1079,43 @@
         model_flops: flops in the model (should be independent of the hardware and model implementation)
         hardware_flops: flops in the hardware (actual flops performed on the hardware). Check 6.3 in https://arxiv.org/pdf/2205.05198.pdf
     """
-    if
-
-
+    if kv_channels is None:
+        assert hidden_size % num_heads == 0
+        kv_channels = hidden_size // num_heads
+    if ffn_hidden_size is None:
+        ffn_hidden_size = 4 * hidden_size
+
     # In the following we mark the reduced dimension with parentheses
     # decoder
-    # self attention
-    ##
-
-
-
-    )
+    # self attention (MQA)
+    ## q projection
+    decoder_q_proj_flops_fwd = 2 * num_layers * batch_size * seq_len * (hidden_size) * num_heads * kv_channels
+    ## kv projection, shared across heads
+    decoder_kv_proj_flops_fwd = 2 * num_layers * batch_size * seq_len * (hidden_size) * 2 * kv_channels
     ## qk logits
-    decoder_qk_logits_flops_fwd = 2 * num_layers * batch_size * num_heads * seq_len * (
+    decoder_qk_logits_flops_fwd = 2 * num_layers * batch_size * num_heads * seq_len * (kv_channels) * seq_len
+    ### SWA (sliding window attention / local attention)
+    # window_size = 4096
+    # decoder_qk_logits_flops_fwd = 2 * num_layers * batch_size * num_heads * seq_len * (kv_channels) * window_size
    ## v logits
-    decoder_v_logits_flops_fwd = 2 * num_layers * batch_size * num_heads * seq_len * (seq_len) *
+    decoder_v_logits_flops_fwd = 2 * num_layers * batch_size * num_heads * seq_len * (seq_len) * kv_channels
+    # decoder_v_logits_flops_fwd = 2 * num_layers * batch_size * num_heads * seq_len * (window_size) * kv_channels
     ## attn out
-    decoder_attn_out_flops_fwd = (
-        2 * num_layers * batch_size * num_heads * seq_len * (hidden_size_per_head) * hidden_size
-    )
+    decoder_attn_out_flops_fwd = 2 * num_layers * batch_size * num_heads * seq_len * (kv_channels) * hidden_size
     # FF
     ## 1st layer
-    decoder_ffn_1_flops_fwd =
+    decoder_ffn_1_flops_fwd = 2 * num_layers * batch_size * seq_len * (hidden_size) * ffn_hidden_size
+    if glu_activation:
+        # 3 matmuls instead of 2 in FFN
+        # ref. https://arxiv.org/pdf/2002.05202.pdf
+        # Used for example in T5 v1.1
+        decoder_ffn_1_flops_fwd = 4 * num_layers * batch_size * seq_len * (hidden_size) * ffn_hidden_size
     ## 2nd layer
     decoder_ffn_2_flops_fwd = 2 * num_layers * batch_size * seq_len * (ffn_hidden_size) * hidden_size
 
     decoder_flops_fwd = (
-
+        decoder_q_proj_flops_fwd
+        + decoder_kv_proj_flops_fwd
         + decoder_qk_logits_flops_fwd
         + decoder_v_logits_flops_fwd
        + decoder_attn_out_flops_fwd
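
The new `window_size=(self.sliding_window_size - 1, 0)` argument follows the flash-attn convention: with `causal=True`, query i may attend to keys j with i - (w - 1) <= j <= i, i.e. at most `sliding_window_size` tokens of context including itself. A small illustration of that rule with an explicit boolean mask (flash-attn itself never materialises one):

```python
# Sketch only: the attention pattern implied by window_size=(w - 1, 0) with
# causal=True, written out as an explicit mask for a tiny sequence.
import torch

seq_len, w = 10, 4  # tiny example; the Mistral config uses sliding_window_size=4096

i = torch.arange(seq_len).unsqueeze(1)  # query positions
j = torch.arange(seq_len).unsqueeze(0)  # key positions
allowed = (j <= i) & (j >= i - (w - 1))  # causal AND within the last w tokens

print(allowed.int())
# Row t has ones on columns max(0, t - w + 1) .. t, so each token sees at
# most w tokens of context (itself included).
assert allowed[9, 6] and not allowed[9, 5]
```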
pretrained/Mistral-7B-v0.1/checkpoint_metadata.json
ADDED
@@ -0,0 +1,9 @@
+{
+    "dp": 1,
+    "metas": {
+        "consumed_train_samples": 0,
+        "last_train_step": 0
+    },
+    "tp": 1,
+    "version": "1.2"
+}
pretrained/Mistral-7B-v0.1/config.yaml
ADDED
@@ -0,0 +1,53 @@
+checkpoints: null
+data: null
+general:
+  benchmark_csv_path: null
+  consumed_train_samples: null
+  ignore_sanity_checks: false
+  project: mistralai
+  run: Mistral-7B-v0.1
+  seed: 42
+  step: null
+logging: null
+model:
+  ddp_bucket_cap_mb: 25
+  dtype: bfloat16
+  init_method:
+    std: 0.025
+  make_vocab_size_divisible_by: 1
+  model_config:
+    attn_pdrop: 0.0
+    bos_token_id: 1
+    eos_token_id: 2
+    hidden_act: silu
+    hidden_size: 4096
+    initializer_range: 0.02
+    intermediate_size: 14336
+    is_mistral_config: true
+    max_position_embeddings: 32768
+    num_attention_heads: 32
+    num_hidden_layers: 32
+    num_key_value_heads: 8
+    pad_token_id: null
+    pretraining_tp: 1
+    rms_norm_eps: 1.0e-05
+    rope_theta: 10000.0
+    sliding_window_size: 4096
+    tie_word_embeddings: false
+    use_cache: true
+    vocab_size: 32000
+optimizer: null
+parallelism:
+  dp: 1
+  pp: 1
+  pp_engine: 1f1b
+  recompute_granularity: SELECTIVE
+  tp: 1
+  tp_linear_async_communication: true
+  tp_mode: REDUCE_SCATTER
+profiler: null
+tokenizer:
+  tokenizer_max_length: null
+  tokenizer_name_or_path: mistralai/Mistral-7B-v0.1
+  tokenizer_revision: null
+tokens: null
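
The converted checkpoint stores one small safetensors file per parameter under `model/model/...`, with pipeline/tensor shard ranks encoded in the file name; the entries below are Git LFS pointers recording only the sha256 and size of each shard. A hedged sketch for inspecting the layout locally (pure pathlib, no nanotron required):

```python
# Sketch only: walk the converted checkpoint and group the safetensors
# shards per decoder layer. Paths follow the listing below.
from collections import defaultdict
from pathlib import Path

root = Path("pretrained/Mistral-7B-v0.1/model/model/decoder")
per_layer = defaultdict(list)

for shard in sorted(root.glob("*/pp_block/**/*.safetensors")):
    layer = int(shard.relative_to(root).parts[0])
    per_layer[layer].append(shard.name)

for layer in sorted(per_layer):
    # 6 files per layer: qkv_proj, o_proj, gate_up_proj, down_proj and the two RMSNorm weights
    print(layer, len(per_layer[layer]), per_layer[layer][:2])
```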
pretrained/Mistral-7B-v0.1/model/model/decoder/0/pp_block/attn/o_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0e6ef1be3d2daa611724f02567159bf507c9a9ea276d9771387a01f4942cafb6
+size 33554672
pretrained/Mistral-7B-v0.1/model/model/decoder/0/pp_block/attn/qkv_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b3d4484e1f9505c97b6ac37472d9526e95470e6aef462fec6ae461b63e4ff77a
+size 50332000
pretrained/Mistral-7B-v0.1/model/model/decoder/0/pp_block/input_layernorm/model_weight.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7c286c58dfce1f3f030c13b90c41c831d05c4323da3d50e23fe434f38f81535b
+size 8288
pretrained/Mistral-7B-v0.1/model/model/decoder/0/pp_block/mlp/down_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7d69d412c683fc926a26a71536dd2530877994cfa6e4e9ae3f3a31f6861596b0
+size 117440752
pretrained/Mistral-7B-v0.1/model/model/decoder/0/pp_block/mlp/gate_up_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6148707c761016f313ee2ada71e15b9eb977878148fa382806eea4ef30a145e6
+size 234881328
pretrained/Mistral-7B-v0.1/model/model/decoder/0/pp_block/post_attention_layernorm/model_weight.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:05588e50e8fafc16c332e2f7a3d3830c9e59d29c35858d439a98ba4e418eba78
+size 8288
pretrained/Mistral-7B-v0.1/model/model/decoder/1/pp_block/attn/o_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cffeb63cbb1f271fd7ab983b618dfe4a4fc2b6b3763b9332fc324d378207210d
+size 33554672
pretrained/Mistral-7B-v0.1/model/model/decoder/1/pp_block/attn/qkv_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5a1263d75614c4a9710ebc5a57fdec732b9348c1f57ace1887ce296e1805b529
+size 50332000
pretrained/Mistral-7B-v0.1/model/model/decoder/1/pp_block/input_layernorm/model_weight.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:71cd7738481e6bcbacbc76ce206545fb2fe6d995f7e1a733b408c3fe92f7356c
+size 8288
pretrained/Mistral-7B-v0.1/model/model/decoder/1/pp_block/mlp/down_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:31de05f9c50d9e94fe67936a973c86840f82ed2aad1494806baa81df8bbf9bf8
+size 117440752
pretrained/Mistral-7B-v0.1/model/model/decoder/1/pp_block/mlp/gate_up_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:70761ee840fbdc950501814ff397c72e9d8bbc7be2030329f391c12eb5b73a0f
+size 234881328
pretrained/Mistral-7B-v0.1/model/model/decoder/1/pp_block/post_attention_layernorm/model_weight.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1846ddc1c4ca9d8e03184f2fa34911398202f0edc310df5ea408a323a5f23ee8
+size 8288
pretrained/Mistral-7B-v0.1/model/model/decoder/10/pp_block/attn/o_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:59d82cc1e5ec1f364e7401e17e58d53f62a39658799aeb4902060236ebb0cb60
+size 33554672
pretrained/Mistral-7B-v0.1/model/model/decoder/10/pp_block/attn/qkv_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:117c7832cefd9a767282b27c852f00ed4ce7888a8abb7e2f9257a0b2fed60608
+size 50332000
pretrained/Mistral-7B-v0.1/model/model/decoder/10/pp_block/input_layernorm/model_weight.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c93dc35263c0efa22d22795777c009e4f9365cf1ef413b69880d14433d1069e8
+size 8288
pretrained/Mistral-7B-v0.1/model/model/decoder/10/pp_block/mlp/down_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ad3fefd66e98df8fee62bd0fe451b18ca1a14545b72e570d499dce0464368b81
+size 117440752
pretrained/Mistral-7B-v0.1/model/model/decoder/10/pp_block/mlp/gate_up_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6346f7c23987a4c529ac1b63b5f6f56b4392981ffcaaf2cb84cf2bf5b2bc36a7
+size 234881328
pretrained/Mistral-7B-v0.1/model/model/decoder/10/pp_block/post_attention_layernorm/model_weight.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e6cd70662e84b3d81b4f4512929d00d9377515c2dfe75d78109edce27c57d834
+size 8288
pretrained/Mistral-7B-v0.1/model/model/decoder/11/pp_block/attn/o_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8c17ee146b384be81a4b9cb06960728dd540d6650a5798abcc95315bb0daf2ca
+size 33554672
pretrained/Mistral-7B-v0.1/model/model/decoder/11/pp_block/attn/qkv_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:497bbea1882c42d134dc4458194d71cd3d7d609b06e91e715093e0c243962116
+size 50332000
pretrained/Mistral-7B-v0.1/model/model/decoder/11/pp_block/input_layernorm/model_weight.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a7fbc23c909758daf76a1d647d1beefb4c3cc62a4aa04f98679e22d79cc6813e
+size 8288
pretrained/Mistral-7B-v0.1/model/model/decoder/11/pp_block/mlp/down_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ef4f8de0f891e6d79255af98deda246f04c0775694835f696a1a8b0738f492da
+size 117440752
pretrained/Mistral-7B-v0.1/model/model/decoder/11/pp_block/mlp/gate_up_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:844adcbf23618ae38fbffaf92f7d38ce7d853be5643485bb25f8f839c0f2819c
+size 234881328
pretrained/Mistral-7B-v0.1/model/model/decoder/11/pp_block/post_attention_layernorm/model_weight.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1823cbba03a4ec4683cc6a41eab51e34cec90e92cea7af388d0c675abe451284
+size 8288
pretrained/Mistral-7B-v0.1/model/model/decoder/12/pp_block/attn/o_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9da7a13da9a196108d2efd875884aa8629533e8143255eef5915417ac592d9c0
+size 33554672
pretrained/Mistral-7B-v0.1/model/model/decoder/12/pp_block/attn/qkv_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:31c8fb0c121f6977e10c7277544259157152d28de9559c8aae8236925398329f
+size 50332000
pretrained/Mistral-7B-v0.1/model/model/decoder/12/pp_block/input_layernorm/model_weight.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:90fd4f646b1f5ca201781cc77b713093ab9a67d4ee8de11c669a486a2896d773
+size 8288
pretrained/Mistral-7B-v0.1/model/model/decoder/12/pp_block/mlp/down_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:368a3142cb9f085a2da1db74de226b13c509467cbea81da25f27db8842347443
+size 117440752
pretrained/Mistral-7B-v0.1/model/model/decoder/12/pp_block/mlp/gate_up_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:48e00b1107d1e575c2425fa8368e92eb714b59825153206ae4cccc36eb4e8e45
+size 234881328
pretrained/Mistral-7B-v0.1/model/model/decoder/12/pp_block/post_attention_layernorm/model_weight.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9c3300f1d0af46ca69fc14397728055e302b2955b8b9adfd9705b68a683377b1
+size 8288
pretrained/Mistral-7B-v0.1/model/model/decoder/13/pp_block/attn/o_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d0d2409179997ff51079156414cb112c82b964976a8023f5088b1dd7ab28f50c
+size 33554672
pretrained/Mistral-7B-v0.1/model/model/decoder/13/pp_block/attn/qkv_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c51efd448a50b2c75046bfb12f2703ce19e56b07f4f9e94f7a36f673c70517b8
+size 50332000
pretrained/Mistral-7B-v0.1/model/model/decoder/13/pp_block/input_layernorm/model_weight.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fe220fd602b0f41f30f7bca607c400adacadb7b5e31f81a28d7e103fd5c0b0a8
+size 8288
pretrained/Mistral-7B-v0.1/model/model/decoder/13/pp_block/mlp/down_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fa8d2abd973723ddb4ea4cb2188fa767ea55168bc2519e170e728619fde864c4
+size 117440752
pretrained/Mistral-7B-v0.1/model/model/decoder/13/pp_block/mlp/gate_up_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:aeeb6ae12119f5ffd390add932122f819d81d50886389567eb41103451b36d24
+size 234881328
pretrained/Mistral-7B-v0.1/model/model/decoder/13/pp_block/post_attention_layernorm/model_weight.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:55a66303f00f120db3fba8ae4d13725fc2c22cd6fd3babc1d66dc0fdee7eb45f
+size 8288
pretrained/Mistral-7B-v0.1/model/model/decoder/14/pp_block/attn/o_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b9fe17a044a248163b4c45783386f7d414e6217ae9657c8983d54a84e85aae8e
+size 33554672
pretrained/Mistral-7B-v0.1/model/model/decoder/14/pp_block/attn/qkv_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b727021f1c6625dae8fb240904bf838985311afe7b5e19f62839c3563072e75a
+size 50332000
pretrained/Mistral-7B-v0.1/model/model/decoder/14/pp_block/input_layernorm/model_weight.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e5f069023e761eacf5747a540516a9215c49e3efc8614ffaa7fa4ca016c67075
+size 8288
pretrained/Mistral-7B-v0.1/model/model/decoder/14/pp_block/mlp/down_proj/model_weight_pp-rank-0-of-1_tp-rank-0-of-1.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:993eebcd3451163de4981bd6f7cd82a2bc0bfcc29a67cbbedceb502a5036466c
+size 117440752