aaditya committed on
Commit
03a40fb
1 Parent(s): aff4b40

Upload 6 files

Browse files
deepspeed_configs/zero1.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "zero_optimization": {
3
+ "stage": 1,
4
+ "overlap_comm": true
5
+ },
6
+ "bf16": {
7
+ "enabled": "auto"
8
+ },
9
+ "fp16": {
10
+ "enabled": "auto",
11
+ "auto_cast": false,
12
+ "loss_scale": 0,
13
+ "initial_scale_power": 32,
14
+ "loss_scale_window": 1000,
15
+ "hysteresis": 2,
16
+ "min_loss_scale": 1
17
+ },
18
+ "gradient_accumulation_steps": "auto",
19
+ "gradient_clipping": "auto",
20
+ "train_batch_size": "auto",
21
+ "train_micro_batch_size_per_gpu": "auto",
22
+ "wall_clock_breakdown": false
23
+ }
deepspeed_configs/zero2.json ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "zero_optimization": {
3
+ "stage": 2,
4
+ "offload_optimizer": {
5
+ "device": "cpu"
6
+ },
7
+ "contiguous_gradients": true,
8
+ "overlap_comm": true
9
+ },
10
+ "bf16": {
11
+ "enabled": "auto"
12
+ },
13
+ "fp16": {
14
+ "enabled": "auto",
15
+ "auto_cast": false,
16
+ "loss_scale": 0,
17
+ "initial_scale_power": 32,
18
+ "loss_scale_window": 1000,
19
+ "hysteresis": 2,
20
+ "min_loss_scale": 1
21
+ },
22
+ "gradient_accumulation_steps": "auto",
23
+ "gradient_clipping": "auto",
24
+ "train_batch_size": "auto",
25
+ "train_micro_batch_size_per_gpu": "auto",
26
+ "wall_clock_breakdown": false
27
+ }
deepspeed_configs/zero3.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "zero_optimization": {
3
+ "stage": 3,
4
+ "overlap_comm": true,
5
+ "contiguous_gradients": true,
6
+ "sub_group_size": 0,
7
+ "reduce_bucket_size": "auto",
8
+ "stage3_prefetch_bucket_size": "auto",
9
+ "stage3_param_persistence_threshold": "auto",
10
+ "stage3_max_live_parameters": 0,
11
+ "stage3_max_reuse_distance": 0,
12
+ "stage3_gather_16bit_weights_on_model_save": true
13
+ },
14
+ "bf16": {
15
+ "enabled": "auto"
16
+ },
17
+ "fp16": {
18
+ "enabled": "auto",
19
+ "auto_cast": false,
20
+ "loss_scale": 0,
21
+ "initial_scale_power": 32,
22
+ "loss_scale_window": 1000,
23
+ "hysteresis": 2,
24
+ "min_loss_scale": 1
25
+ },
26
+ "gradient_accumulation_steps": "auto",
27
+ "gradient_clipping": "auto",
28
+ "train_batch_size": "auto",
29
+ "train_micro_batch_size_per_gpu": "auto",
30
+ "wall_clock_breakdown": false
31
+ }
deepspeed_configs/zero3_bf16.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "zero_optimization": {
3
+ "stage": 3,
4
+ "overlap_comm": true,
5
+ "contiguous_gradients": true,
6
+ "sub_group_size": 0,
7
+ "reduce_bucket_size": "auto",
8
+ "stage3_prefetch_bucket_size": "auto",
9
+ "stage3_param_persistence_threshold": "auto",
10
+ "stage3_max_live_parameters": 0,
11
+ "stage3_max_reuse_distance": 0,
12
+ "stage3_gather_16bit_weights_on_model_save": true
13
+ },
14
+ "bf16": {
15
+ "enabled": true
16
+ },
17
+ "fp16": {
18
+ "enabled": "auto",
19
+ "auto_cast": false,
20
+ "loss_scale": 0,
21
+ "initial_scale_power": 32,
22
+ "loss_scale_window": 1000,
23
+ "hysteresis": 2,
24
+ "min_loss_scale": 1
25
+ },
26
+ "gradient_accumulation_steps": "auto",
27
+ "gradient_clipping": "auto",
28
+ "train_batch_size": "auto",
29
+ "train_micro_batch_size_per_gpu": "auto",
30
+ "wall_clock_breakdown": false
31
+ }
deepspeed_configs/zero3_bf16_cpuoffload_all.json ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "zero_optimization": {
3
+ "stage": 3,
4
+ "offload_optimizer": {
5
+ "device": "cpu",
6
+ "pin_memory": true
7
+ },
8
+ "offload_param": {
9
+ "device": "cpu",
10
+ "pin_memory": true
11
+ },
12
+ "overlap_comm": true,
13
+ "contiguous_gradients": true,
14
+ "sub_group_size": 0,
15
+ "reduce_bucket_size": "auto",
16
+ "stage3_prefetch_bucket_size": "auto",
17
+ "stage3_param_persistence_threshold": "auto",
18
+ "stage3_max_live_parameters": 0,
19
+ "stage3_max_reuse_distance": 0,
20
+ "stage3_gather_16bit_weights_on_model_save": true
21
+ },
22
+ "bf16": {
23
+ "enabled": true
24
+ },
25
+ "fp16": {
26
+ "enabled": "auto",
27
+ "auto_cast": false,
28
+ "loss_scale": 0,
29
+ "initial_scale_power": 32,
30
+ "loss_scale_window": 1000,
31
+ "hysteresis": 2,
32
+ "min_loss_scale": 1
33
+ },
34
+ "gradient_accumulation_steps": "auto",
35
+ "gradient_clipping": "auto",
36
+ "train_batch_size": "auto",
37
+ "train_micro_batch_size_per_gpu": "auto",
38
+ "wall_clock_breakdown": false
39
+ }
deepspeed_configs/zero3_bf16_cpuoffload_params.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "zero_optimization": {
3
+ "stage": 3,
4
+ "offload_param": {
5
+ "device": "cpu",
6
+ "pin_memory": true
7
+ },
8
+ "overlap_comm": true,
9
+ "contiguous_gradients": true,
10
+ "sub_group_size": 0,
11
+ "reduce_bucket_size": "auto",
12
+ "stage3_prefetch_bucket_size": "auto",
13
+ "stage3_param_persistence_threshold": "auto",
14
+ "stage3_max_live_parameters": 0,
15
+ "stage3_max_reuse_distance": 0,
16
+ "stage3_gather_16bit_weights_on_model_save": true
17
+ },
18
+ "bf16": {
19
+ "enabled": true
20
+ },
21
+ "fp16": {
22
+ "enabled": "auto",
23
+ "auto_cast": false,
24
+ "loss_scale": 0,
25
+ "initial_scale_power": 32,
26
+ "loss_scale_window": 1000,
27
+ "hysteresis": 2,
28
+ "min_loss_scale": 1
29
+ },
30
+ "gradient_accumulation_steps": "auto",
31
+ "gradient_clipping": "auto",
32
+ "train_batch_size": "auto",
33
+ "train_micro_batch_size_per_gpu": "auto",
34
+ "wall_clock_breakdown": false
35
+ }