winglian committed
Commit c25ba79 · unverified · Parent: d5f8589

update README w deepspeed info (#605)

Files changed (6)
  1. README.md +22 -8
  2. deepspeed/zero1.json +39 -37
  3. deepspeed/zero2.json +42 -40
  4. deepspeed/zero3.json +3 -1
  5. requirements.txt +1 -0
  6. setup.py +7 -2
README.md CHANGED
@@ -31,6 +31,7 @@ Features:
 - [How to Use Custom Pretokenized Dataset](#how-to-use-your-custom-pretokenized-dataset)
 - [Config](#config)
 - [Train](#train)
+- [Training w/ Deepspeed](#training-with-deepspeed)
 - [Inference](#inference)
 - [Merge LORA to Base](#merge-lora-to-base)
 - [Common Errors](#common-errors-)
@@ -86,7 +87,7 @@ git clone https://github.com/OpenAccess-AI-Collective/axolotl
 cd axolotl

 pip3 install packaging
-pip3 install -e .[flash-attn]
+pip3 install -e .[flash-attn,deepspeed]
 pip3 install -U git+https://github.com/huggingface/peft.git

 # finetune lora
@@ -121,7 +122,7 @@ accelerate launch -m axolotl.cli.inference examples/openllama-3b/lora.yml \
 3. Install axolotl along with python dependencies
    ```bash
    pip3 install packaging
-   pip3 install -e .[flash-attn]
+   pip3 install -e .[flash-attn,deepspeed]
    ```

 - LambdaLabs
@@ -157,7 +158,7 @@ accelerate launch -m axolotl.cli.inference examples/openllama-3b/lora.yml \
 cd axolotl

 pip3 install packaging
-pip3 install -e .[flash-attn]
+pip3 install -e .[flash-attn,deepspeed]
 pip3 install protobuf==3.20.3
 pip3 install -U --ignore-installed requests Pillow psutil scipy
 ```
@@ -715,11 +716,6 @@ fsdp_config:
 fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
 ```

-- llama Deepspeed
-```yaml
-deepspeed: deepspeed/zero3.json
-```
-
 ##### Weights & Biases Logging

 - wandb options
@@ -732,6 +728,24 @@ wandb_run_id:
 wandb_log_model:
 ```

+### Training with Deepspeed
+
+Deepspeed is an optimization suite for multi-GPU systems that lets you train much larger models
+than would typically fit into your GPU's VRAM. More information about the various optimization
+types is available at https://huggingface.co/docs/accelerate/main/en/usage_guides/deepspeed#what-is-integrated
+
+We provide default deepspeed JSON configurations for ZeRO stages 1, 2, and 3.
+
+```shell
+accelerate launch -m axolotl.cli.train examples/llama-2/config.py --deepspeed deepspeed/zero1.json
+```
+
+or
+
+```yaml
+deepspeed: deepspeed/zero1.json
+```
+
 ### Inference

 Pass the appropriate flag to the train command:
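For readers skimming the new section: the three bundled configs (shown below) trade throughput for memory. ZeRO stage 1 shards only the optimizer states across GPUs, stage 2 also shards gradients (and zero2.json additionally offloads optimizer state to CPU), and stage 3 shards the model parameters as well, fitting the largest models at the cost of extra communication.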
deepspeed/zero1.json CHANGED
@@ -1,39 +1,41 @@
 {
   "zero_optimization": {
     "stage": 1,
     "overlap_comm": true
   },
   "bf16": {
     "enabled": "auto"
   },
   "fp16": {
     "enabled": "auto",
     "auto_cast": false,
     "loss_scale": 0,
     "initial_scale_power": 32,
     "loss_scale_window": 1000,
     "hysteresis": 2,
     "min_loss_scale": 1
   },
   "optimizer": {
     "type": "AdamW",
     "params": {
       "lr": "auto",
       "betas": "auto",
       "eps": "auto",
       "weight_decay": "auto"
     }
   },
   "scheduler": {
     "type": "WarmupDecayLR",
     "params": {
       "warmup_min_lr": "auto",
       "warmup_max_lr": "auto",
       "warmup_num_steps": "auto",
+      "warmup_type": "linear",
       "total_num_steps": "auto"
     }
   },
+  "gradient_accumulation_steps": "auto",
   "train_batch_size": "auto",
   "train_micro_batch_size_per_gpu": "auto",
   "wall_clock_breakdown": false
 }
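A note on the `"auto"` values above: when launched through the Hugging Face Trainer/accelerate integration, these placeholders are filled from the training arguments, and DeepSpeed requires the resolved batch fields to satisfy a fixed identity. A minimal sketch with illustrative numbers (not defaults):

```python
# Illustrative check of DeepSpeed's batch-size identity once "auto" resolves:
#   train_batch_size == train_micro_batch_size_per_gpu
#                       * gradient_accumulation_steps * world_size
micro_batch_per_gpu = 2   # fills "train_micro_batch_size_per_gpu"
grad_accum_steps = 4      # fills "gradient_accumulation_steps"
world_size = 2            # number of GPU processes launched by accelerate
print(micro_batch_per_gpu * grad_accum_steps * world_size)  # train_batch_size -> 16
```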
deepspeed/zero2.json CHANGED
@@ -1,43 +1,45 @@
 {
   "zero_optimization": {
     "stage": 2,
     "offload_optimizer": {
       "device": "cpu"
     },
     "contiguous_gradients": true,
     "overlap_comm": true
   },
   "bf16": {
     "enabled": "auto"
   },
   "fp16": {
     "enabled": "auto",
     "auto_cast": false,
     "loss_scale": 0,
     "initial_scale_power": 32,
     "loss_scale_window": 1000,
     "hysteresis": 2,
     "min_loss_scale": 1
   },
   "optimizer": {
     "type": "AdamW",
     "params": {
       "lr": "auto",
       "betas": "auto",
       "eps": "auto",
       "weight_decay": "auto"
     }
   },
   "scheduler": {
     "type": "WarmupDecayLR",
     "params": {
       "warmup_min_lr": "auto",
       "warmup_max_lr": "auto",
       "warmup_num_steps": "auto",
+      "warmup_type": "linear",
       "total_num_steps": "auto"
     }
   },
+  "gradient_accumulation_steps": "auto",
   "train_batch_size": "auto",
   "train_micro_batch_size_per_gpu": "auto",
   "wall_clock_breakdown": false
 }
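To sanity-check which optimizations a given file requests, the stage and offload settings can be read straight from the JSON. A small sketch, assuming it runs from the repo root and using only keys present in the config above:

```python
# Sketch: report the ZeRO stage and optimizer offload target of a config.
import json

with open("deepspeed/zero2.json") as f:
    cfg = json.load(f)

zero = cfg["zero_optimization"]
offload = zero.get("offload_optimizer", {}).get("device", "none")
print(f"ZeRO stage {zero['stage']}, optimizer offload: {offload}")  # stage 2, cpu
```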
deepspeed/zero3.json CHANGED
@@ -45,9 +45,11 @@
     "params": {
       "warmup_min_lr": "auto",
       "warmup_max_lr": "auto",
-      "warmup_num_steps": "auto"
+      "warmup_num_steps": "auto",
+      "warmup_type": "linear"
     }
   },
+  "gradient_accumulation_steps": "auto",
   "train_batch_size": "auto",
   "train_micro_batch_size_per_gpu": "auto",
   "wall_clock_breakdown": false
requirements.txt CHANGED
@@ -7,6 +7,7 @@ peft @ git+https://github.com/huggingface/peft.git
 transformers @ git+https://github.com/huggingface/transformers.git
 bitsandbytes>=0.41.1
 accelerate @ git+https://github.com/huggingface/accelerate
+deepspeed
 addict
 evaluate
 fire
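Note the interplay with setup.py below: deepspeed is now listed in requirements.txt, but parse_requirements filters it out of the base install set, so it is only installed when the new deepspeed extra is requested.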
setup.py CHANGED
@@ -13,7 +13,12 @@ def parse_requirements():
             # Handle custom index URLs
             _, url = line.split()
             _dependency_links.append(url)
-        elif "flash-attn" not in line and line and line[0] != "#":
+        elif (
+            "flash-attn" not in line
+            and "deepspeed" not in line
+            and line
+            and line[0] != "#"
+        ):
             # Handle standard packages
             _install_requires.append(line)
     return _install_requires, _dependency_links
@@ -35,7 +40,7 @@ setup(
         "flash-attn": [
             "flash-attn>=2.2.1",
         ],
-        "extras": [
+        "deepspeed": [
            "deepspeed",
        ],
    },
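A brief usage note: with `extras_require` keyed as `deepspeed` instead of `extras`, the optional dependency is selected at install time with `pip3 install -e .[deepspeed]`, or combined with flash attention as `pip3 install -e .[flash-attn,deepspeed]`, matching the README hunks above.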