Nanobit winglian committed on
Commit
946b497
1 Parent(s): 0ddfb24

feat: add deepspeed 3 with cpuoffload (#1466)

Browse files

* feat: add deepspeed 3 with cpuoffload

* make bf16 explicit, add param-only offload variant

---------

Co-authored-by: Wing Lian <wing.lian@gmail.com>

deepspeed_configs/zero3_bf16_cpuoffload_all.json ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "zero_optimization": {
3
+ "stage": 3,
4
+ "offload_optimizer": {
5
+ "device": "cpu",
6
+ "pin_memory": true
7
+ },
8
+ "offload_param": {
9
+ "device": "cpu",
10
+ "pin_memory": true
11
+ },
12
+ "overlap_comm": true,
13
+ "contiguous_gradients": true,
14
+ "sub_group_size": 0,
15
+ "reduce_bucket_size": "auto",
16
+ "stage3_prefetch_bucket_size": "auto",
17
+ "stage3_param_persistence_threshold": "auto",
18
+ "stage3_max_live_parameters": 0,
19
+ "stage3_max_reuse_distance": 0,
20
+ "stage3_gather_16bit_weights_on_model_save": true
21
+ },
22
+ "bf16": {
23
+ "enabled": true
24
+ },
25
+ "fp16": {
26
+ "enabled": "auto",
27
+ "auto_cast": false,
28
+ "loss_scale": 0,
29
+ "initial_scale_power": 32,
30
+ "loss_scale_window": 1000,
31
+ "hysteresis": 2,
32
+ "min_loss_scale": 1
33
+ },
34
+ "gradient_accumulation_steps": "auto",
35
+ "gradient_clipping": "auto",
36
+ "train_batch_size": "auto",
37
+ "train_micro_batch_size_per_gpu": "auto",
38
+ "wall_clock_breakdown": false
39
+ }
deepspeed_configs/zero3_bf16_cpuoffload_params.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "zero_optimization": {
3
+ "stage": 3,
4
+ "offload_param": {
5
+ "device": "cpu",
6
+ "pin_memory": true
7
+ },
8
+ "overlap_comm": true,
9
+ "contiguous_gradients": true,
10
+ "sub_group_size": 0,
11
+ "reduce_bucket_size": "auto",
12
+ "stage3_prefetch_bucket_size": "auto",
13
+ "stage3_param_persistence_threshold": "auto",
14
+ "stage3_max_live_parameters": 0,
15
+ "stage3_max_reuse_distance": 0,
16
+ "stage3_gather_16bit_weights_on_model_save": true
17
+ },
18
+ "bf16": {
19
+ "enabled": true
20
+ },
21
+ "fp16": {
22
+ "enabled": "auto",
23
+ "auto_cast": false,
24
+ "loss_scale": 0,
25
+ "initial_scale_power": 32,
26
+ "loss_scale_window": 1000,
27
+ "hysteresis": 2,
28
+ "min_loss_scale": 1
29
+ },
30
+ "gradient_accumulation_steps": "auto",
31
+ "gradient_clipping": "auto",
32
+ "train_batch_size": "auto",
33
+ "train_micro_batch_size_per_gpu": "auto",
34
+ "wall_clock_breakdown": false
35
+ }