update README w deepspeed info (#605)

Files changed:
- README.md            +22  -8
- deepspeed/zero1.json +39 -37
- deepspeed/zero2.json +42 -40
- deepspeed/zero3.json  +3  -1
- requirements.txt      +1  -0
- setup.py              +7  -2
README.md
CHANGED
@@ -31,6 +31,7 @@ Features:
 - [How to Use Custom Pretokenized Dataset](#how-to-use-your-custom-pretokenized-dataset)
 - [Config](#config)
 - [Train](#train)
+- [Training w/ Deepspeed](#training-with-deepspeed)
 - [Inference](#inference)
 - [Merge LORA to Base](#merge-lora-to-base)
 - [Common Errors](#common-errors-)
@@ -86,7 +87,7 @@ git clone https://github.com/OpenAccess-AI-Collective/axolotl
 cd axolotl
 
 pip3 install packaging
-pip3 install -e .[flash-attn]
+pip3 install -e .[flash-attn,deepspeed]
 pip3 install -U git+https://github.com/huggingface/peft.git
 
 # finetune lora
@@ -121,7 +122,7 @@ accelerate launch -m axolotl.cli.inference examples/openllama-3b/lora.yml \
 3. Install axolotl along with python dependencies
 ```bash
 pip3 install packaging
-pip3 install -e .[flash-attn]
+pip3 install -e .[flash-attn,deepspeed]
 ```
 
 - LambdaLabs
@@ -157,7 +158,7 @@ accelerate launch -m axolotl.cli.inference examples/openllama-3b/lora.yml \
 cd axolotl
 
 pip3 install packaging
-pip3 install -e .[flash-attn]
+pip3 install -e .[flash-attn,deepspeed]
 pip3 install protobuf==3.20.3
 pip3 install -U --ignore-installed requests Pillow psutil scipy
 ```
@@ -715,11 +716,6 @@ fsdp_config:
 fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
 ```
 
-- llama Deepspeed
-```yaml
-deepspeed: deepspeed/zero3.json
-```
-
 ##### Weights & Biases Logging
 
 - wandb options
@@ -732,6 +728,24 @@ wandb_run_id:
 wandb_log_model:
 ```
 
+### Training with Deepspeed
+
+Deepspeed is an optimization suite for multi-gpu systems that lets you train much larger models than would
+typically fit into your GPUs' VRAM. More information about the various optimization types
+for deepspeed is available at https://huggingface.co/docs/accelerate/main/en/usage_guides/deepspeed#what-is-integrated
+
+We provide several default deepspeed JSON configurations for ZeRO stages 1, 2, and 3.
+
+```shell
+accelerate launch -m axolotl.cli.train examples/llama-2/config.py --deepspeed deepspeed/zero1.json
+```
+
+or
+
+```yaml
+deepspeed: deepspeed/zero1.json
+```
+
 ### Inference
 
 Pass the appropriate flag to the train command:
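A note on the `"auto"` values used throughout the configs below: the Hugging Face trainer integration resolves any field set to `"auto"` from the training arguments at launch time, so the same JSON works across runs with different batch sizes and learning rates. As a quick sanity check — a minimal stdlib-only sketch, assuming it runs from the repo root where deepspeed/zero1.json exists — you can list which knobs a config defers to the trainer:

```python
import json

def auto_fields(node, path=""):
    """Recursively yield the dotted path of every dict value set to "auto"."""
    if isinstance(node, dict):
        for key, value in node.items():
            yield from auto_fields(value, f"{path}.{key}" if path else key)
    elif node == "auto":
        yield path

with open("deepspeed/zero1.json", encoding="utf-8") as config_file:
    config = json.load(config_file)

for field in auto_fields(config):
    print(field)  # e.g. optimizer.params.lr, scheduler.params.total_num_steps
```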
deepspeed/zero1.json
CHANGED
@@ -1,39 +1,41 @@
 {
+  "zero_optimization": {
+    "stage": 1,
+    "overlap_comm": true
+  },
+  "bf16": {
+    "enabled": "auto"
+  },
+  "fp16": {
+    "enabled": "auto",
+    "auto_cast": false,
+    "loss_scale": 0,
+    "initial_scale_power": 32,
+    "loss_scale_window": 1000,
+    "hysteresis": 2,
+    "min_loss_scale": 1
+  },
+  "optimizer": {
+    "type": "AdamW",
+    "params": {
+      "lr": "auto",
+      "betas": "auto",
+      "eps": "auto",
+      "weight_decay": "auto"
+    }
+  },
+  "scheduler": {
+    "type": "WarmupDecayLR",
+    "params": {
+      "warmup_min_lr": "auto",
+      "warmup_max_lr": "auto",
+      "warmup_num_steps": "auto",
+      "warmup_type": "linear",
+      "total_num_steps": "auto"
+    }
+  },
+  "gradient_accumulation_steps": "auto",
+  "train_batch_size": "auto",
+  "train_micro_batch_size_per_gpu": "auto",
+  "wall_clock_breakdown": false
 }
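Stage 1 shards only the optimizer states across data-parallel ranks, which is already the bulk of training state for Adam-style optimizers. A back-of-envelope sketch of the per-GPU saving — hypothetical model and cluster sizes, using the roughly 12 bytes of optimizer state per parameter (fp32 master weights, momentum, and variance) from the ZeRO paper's mixed-precision accounting:

```python
# Rough per-GPU optimizer-state footprint under ZeRO stage 1.
PARAMS = 7e9               # hypothetical 7B-parameter model
OPT_BYTES_PER_PARAM = 12   # fp32 master copy + momentum + variance (ZeRO paper accounting)
GIB = 1024**3

for world_size in (1, 2, 4, 8):
    per_gpu_gib = PARAMS * OPT_BYTES_PER_PARAM / world_size / GIB
    print(f"{world_size} GPU(s): ~{per_gpu_gib:.1f} GiB of optimizer state per GPU")
```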
deepspeed/zero2.json
CHANGED
@@ -1,43 +1,45 @@
 {
+  "zero_optimization": {
+    "stage": 2,
+    "offload_optimizer": {
+      "device": "cpu"
+    },
+    "contiguous_gradients": true,
+    "overlap_comm": true
+  },
+  "bf16": {
+    "enabled": "auto"
+  },
+  "fp16": {
+    "enabled": "auto",
+    "auto_cast": false,
+    "loss_scale": 0,
+    "initial_scale_power": 32,
+    "loss_scale_window": 1000,
+    "hysteresis": 2,
+    "min_loss_scale": 1
+  },
+  "optimizer": {
+    "type": "AdamW",
+    "params": {
+      "lr": "auto",
+      "betas": "auto",
+      "eps": "auto",
+      "weight_decay": "auto"
+    }
+  },
+  "scheduler": {
+    "type": "WarmupDecayLR",
+    "params": {
+      "warmup_min_lr": "auto",
+      "warmup_max_lr": "auto",
+      "warmup_num_steps": "auto",
+      "warmup_type": "linear",
+      "total_num_steps": "auto"
+    }
+  },
+  "gradient_accumulation_steps": "auto",
+  "train_batch_size": "auto",
+  "train_micro_batch_size_per_gpu": "auto",
+  "wall_clock_breakdown": false
 }
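Relative to stage 1, the stage-2 config also shards gradients and moves the optimizer states into CPU RAM (`"offload_optimizer": {"device": "cpu"}`), trading GPU memory for host-device transfer overhead. A small sketch — assuming both JSON files are present under deepspeed/ as committed above — that prints exactly where the two zero_optimization blocks diverge:

```python
import json

def load_zero_opt(path):
    with open(path, encoding="utf-8") as f:
        return json.load(f)["zero_optimization"]

zero1 = load_zero_opt("deepspeed/zero1.json")
zero2 = load_zero_opt("deepspeed/zero2.json")

# Report keys that are missing on one side or carry different values
# (expected: stage, offload_optimizer, contiguous_gradients).
for key in sorted(set(zero1) | set(zero2)):
    if zero1.get(key) != zero2.get(key):
        print(f"{key}: {zero1.get(key)!r} -> {zero2.get(key)!r}")
```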
deepspeed/zero3.json
CHANGED
@@ -45,9 +45,11 @@
     "params": {
       "warmup_min_lr": "auto",
       "warmup_max_lr": "auto",
-      "warmup_num_steps": "auto"
+      "warmup_num_steps": "auto",
+      "warmup_type": "linear"
     }
   },
+  "gradient_accumulation_steps": "auto",
   "train_batch_size": "auto",
   "train_micro_batch_size_per_gpu": "auto",
   "wall_clock_breakdown": false
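The two keys added here bring zero3.json in line with the stage-1 and stage-2 configs: the scheduler gains an explicit linear warmup_type, and gradient_accumulation_steps joins the batch-size fields deferred via "auto". For intuition, a sketch of how those "auto" scheduler values come out of a training run — hypothetical numbers and a simplified calculation, not axolotl's actual resolution code:

```python
# How "auto" scheduler params might resolve for a hypothetical run.
dataset_size = 50_000
micro_batch_size = 2            # train_micro_batch_size_per_gpu
gradient_accumulation_steps = 4
world_size = 8
num_epochs = 3
warmup_ratio = 0.05             # hypothetical warmup fraction

effective_batch = micro_batch_size * gradient_accumulation_steps * world_size
total_num_steps = (dataset_size // effective_batch) * num_epochs
warmup_num_steps = int(total_num_steps * warmup_ratio)

print(f"total_num_steps={total_num_steps}")    # scheduler.params.total_num_steps
print(f"warmup_num_steps={warmup_num_steps}")  # scheduler.params.warmup_num_steps
```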
requirements.txt
CHANGED
@@ -7,6 +7,7 @@ peft @ git+https://github.com/huggingface/peft.git
 transformers @ git+https://github.com/huggingface/transformers.git
 bitsandbytes>=0.41.1
 accelerate @ git+https://github.com/huggingface/accelerate
+deepspeed
 addict
 evaluate
 fire
setup.py
CHANGED
@@ -13,7 +13,12 @@ def parse_requirements():
             # Handle custom index URLs
             _, url = line.split()
             _dependency_links.append(url)
-        elif "flash-attn" not in line and line and line[0] != "#":
+        elif (
+            "flash-attn" not in line
+            and "deepspeed" not in line
+            and line
+            and line[0] != "#"
+        ):
             # Handle standard packages
             _install_requires.append(line)
     return _install_requires, _dependency_links
@@ -35,7 +40,7 @@ setup(
         "flash-attn": [
             "flash-attn>=2.2.1",
         ],
-        "extras": [
+        "deepspeed": [
             "deepspeed",
         ],
     },
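The net effect of the setup.py change: flash-attn and deepspeed are filtered out of install_requires and only installed when the matching extra is requested, which is what makes the README's `pip3 install -e .[flash-attn,deepspeed]` line work. A standalone rerun of the filtering logic on sample requirement lines (hypothetical contents, same predicate as the code above):

```python
# Reproduce parse_requirements' filtering on a hypothetical requirements list.
sample_lines = [
    "--extra-index-url https://example.com/simple",  # hypothetical index URL
    "transformers @ git+https://github.com/huggingface/transformers.git",
    "deepspeed",
    "flash-attn>=2.2.1",
    "# a comment",
    "",
    "addict",
]

install_requires, dependency_links = [], []
for line in (raw.strip() for raw in sample_lines):
    if line.startswith("--extra-index-url"):
        # Custom index URLs become dependency links, not requirements
        dependency_links.append(line.split()[1])
    elif "flash-attn" not in line and "deepspeed" not in line and line and line[0] != "#":
        # Non-empty, non-comment lines that aren't extras become hard requirements
        install_requires.append(line)

print(install_requires)  # ['transformers @ git+...', 'addict']
print(dependency_links)  # ['https://example.com/simple']
```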