swap batch size for gradient accumulation steps to decouple from num gpu
- README.md +1 -1
- configs/cerebras_1_3B_alpaca.yml +1 -1
- configs/galactica_1_3B.yml +1 -1
- configs/gpt_neox_20b.yml +1 -1
- configs/llama_13B_alpaca.yml +1 -1
- configs/llama_65B_alpaca.yml +1 -1
- configs/llama_7B_4bit.yml +1 -1
- configs/llama_7B_alpaca.yml +1 -1
- configs/llama_7B_jeopardy.yml +1 -1
- configs/pythia_1_2B_alpaca.yml +1 -1
- configs/quickstart.yml +1 -1
- configs/sample.yml +2 -1
- configs/stability_3b.yml +1 -1
- configs/vicuna_13B_4bit_reflect.yml +1 -1
- examples/gptq-lora-7b/config.yml +1 -1
- examples/mpt-7b/config.yml +1 -1
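The change is mechanical across the configs: the fixed batch_size entry is dropped and an explicit gradient_accumulation_steps is added next to micro_batch_size. The sketch below is illustrative Python, not code from this repository (world_size and the helper names are assumptions); it shows why an explicit accumulation setting decouples the config from the GPU count, while a fixed total batch size forces the accumulation steps to be derived from however many GPUs the job happens to run on.

# Illustrative sketch only -- not code from this repository.
# `world_size` (number of GPUs/processes) and these helper names are assumptions.

def derived_grad_accum(batch_size: int, micro_batch_size: int, world_size: int) -> int:
    """Old style: a fixed total batch_size means the accumulation steps depend
    on the GPU count, so the same config behaves differently per machine."""
    return batch_size // (micro_batch_size * world_size)

def effective_batch_size(micro_batch_size: int, grad_accum_steps: int, world_size: int) -> int:
    """New style: gradient_accumulation_steps is set explicitly in the config;
    the effective batch simply scales with the hardware."""
    return micro_batch_size * grad_accum_steps * world_size

if __name__ == "__main__":
    # e.g. configs/quickstart.yml after this change:
    #   micro_batch_size: 1, gradient_accumulation_steps: 1
    for gpus in (1, 2, 8):
        print(gpus, effective_batch_size(1, 1, gpus))  # -> 1, 2, 8
    # versus the old style with a hypothetical fixed batch_size of 8:
    for gpus in (1, 2, 8):
        print(gpus, derived_grad_accum(8, 1, gpus))    # -> 8, 4, 1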
README.md
@@ -265,7 +265,7 @@ wandb_log_model: # 'checkpoint'
 output_dir: ./completed-model
 
 # training hyperparameters
-
+gradient_accumulation_steps: 1
 micro_batch_size: 2
 eval_batch_size: 2
 num_epochs: 3
configs/cerebras_1_3B_alpaca.yml
@@ -26,7 +26,7 @@ wandb_watch:
 wandb_run_id:
 wandb_log_model:
 output_dir: ./lora-alpaca
-
+gradient_accumulation_steps: 1
 micro_batch_size: 4
 num_epochs: 5
 learning_rate: 0.0003
configs/galactica_1_3B.yml
@@ -23,7 +23,7 @@ wandb_watch:
 wandb_run_id:
 wandb_log_model:
 output_dir: ./lora-llama-alpaca
-
+gradient_accumulation_steps: 1
 micro_batch_size: 16
 num_epochs: 3
 learning_rate: 0.00003
configs/gpt_neox_20b.yml
@@ -25,7 +25,7 @@ wandb_watch:
 wandb_run_id:
 wandb_log_model:
 output_dir: ./gpt4all-neox-20b
-
+gradient_accumulation_steps: 1
 micro_batch_size: 4
 num_epochs: 5
 learning_rate: 0.00003
configs/llama_13B_alpaca.yml
@@ -23,7 +23,7 @@ wandb_watch:
 wandb_run_id:
 wandb_log_model:
 output_dir: ./llama-13b-sharegpt
-
+gradient_accumulation_steps: 1
 micro_batch_size: 2
 warmup_steps: 1000
 save_steps:
configs/llama_65B_alpaca.yml
@@ -29,7 +29,7 @@ wandb_watch:
 wandb_run_id:
 wandb_log_model:
 output_dir: ./lora-llama-alpaca
-
+gradient_accumulation_steps: 1
 micro_batch_size: 16
 warmup_steps: 1000
 save_steps:
configs/llama_7B_4bit.yml
@@ -26,7 +26,7 @@ wandb_watch:
 wandb_run_id:
 wandb_log_model:
 output_dir: ./lora-test
-
+gradient_accumulation_steps: 1
 micro_batch_size: 2
 num_epochs: 3
 warmup_steps: 100
configs/llama_7B_alpaca.yml
@@ -28,7 +28,7 @@ wandb_watch:
 wandb_run_id:
 wandb_log_model:
 output_dir: ./lora-llama-alpaca
-
+gradient_accumulation_steps: 1
 micro_batch_size: 16
 num_epochs: 5
 learning_rate: 0.00003
configs/llama_7B_jeopardy.yml
@@ -24,7 +24,7 @@ wandb_watch:
 wandb_run_id:
 wandb_log_model:
 output_dir: ./jeopardy-bot-7b
-
+gradient_accumulation_steps: 2
 micro_batch_size: 1
 num_epochs: 2
 optimizer: adamw_bnb_8bit
configs/pythia_1_2B_alpaca.yml
@@ -28,7 +28,7 @@ wandb_watch:
 wandb_run_id:
 wandb_log_model:
 output_dir: ./lora-alpaca
-
+gradient_accumulation_steps: 1
 micro_batch_size: 4
 num_epochs: 5
 learning_rate: 0.00001
configs/quickstart.yml
@@ -26,7 +26,7 @@ wandb_watch:
 wandb_run_id:
 wandb_log_model:
 output_dir: ./lora-test
-
+gradient_accumulation_steps: 1
 micro_batch_size: 1
 num_epochs: 3
 warmup_steps: 100
configs/sample.yml
@@ -53,7 +53,8 @@ wandb_log_model:
 # where to save the finsihed model to
 output_dir: ./completed-model
 # training hyperparameters
-
+gradient_accumulation_steps: 1
+batch_size:
 micro_batch_size: 2
 num_epochs: 3
 warmup_steps: 100
configs/stability_3b.yml
@@ -22,7 +22,7 @@ wandb_watch:
 wandb_run_id:
 wandb_log_model:
 output_dir: ./stable-alpaca-3b
-
+gradient_accumulation_steps: 1
 micro_batch_size: 1
 num_epochs: 1
 optimizer: adamw_bnb_8bit
configs/vicuna_13B_4bit_reflect.yml
@@ -30,7 +30,7 @@ wandb_watch:
 wandb_run_id:
 wandb_log_model:
 output_dir: ./lora-reflect
-
+gradient_accumulation_steps: 1
 micro_batch_size: 2
 num_epochs: 3
 learning_rate: 0.00003
examples/gptq-lora-7b/config.yml
@@ -26,7 +26,7 @@ wandb_watch:
 wandb_run_id:
 wandb_log_model:
 output_dir: ./llama-7b-lora-int4
-
+gradient_accumulation_steps: 1
 micro_batch_size: 1
 num_epochs: 3
 optimizer: adamw_bnb_8bit
examples/mpt-7b/config.yml
@@ -24,7 +24,7 @@ wandb_watch:
 wandb_run_id:
 wandb_log_model:
 output_dir: ./mpt-alpaca-7b
-
+gradient_accumulation_steps: 1
 micro_batch_size: 1
 num_epochs: 3
 optimizer: adamw_bnb_8bit