fix eval_steps to be a sane default (#797)
Browse files
* fix eval_steps to be a sane default
* update docs for fractional eval_steps
- README.md +2 -2
- examples/cerebras/qlora.yml +1 -1
- examples/code-llama/13b/lora.yml +2 -2
- examples/code-llama/13b/qlora.yml +2 -2
- examples/code-llama/34b/lora.yml +2 -2
- examples/code-llama/34b/qlora.yml +2 -2
- examples/code-llama/7b/lora.yml +2 -2
- examples/code-llama/7b/qlora.yml +2 -2
- examples/falcon/config-7b-qlora.yml +1 -1
- examples/gptj/qlora.yml +1 -1
- examples/jeopardy-bot/config.yml +1 -1
- examples/llama-2/gptq-lora.yml +1 -1
- examples/llama-2/lora.yml +2 -2
- examples/llama-2/qlora.yml +2 -2
- examples/llama-2/relora.yml +2 -2
- examples/llama-2/tiny-llama.yml +2 -2
- examples/mistral/config.yml +2 -2
- examples/mistral/qlora.yml +1 -1
- examples/mpt-7b/config.yml +1 -1
- examples/pythia/lora.yml +2 -2
- examples/redpajama/config-3b.yml +1 -1
- examples/replit-3b/config-lora.yml +1 -1
- examples/xgen-7b/xgen-7b-8k-qlora.yml +1 -1
README.md
CHANGED
@@ -618,14 +618,14 @@ gradient_accumulation_steps: 1
|
|
618 |
# The number of samples to include in each batch. This is the number of samples sent to each GPU.
|
619 |
micro_batch_size: 2
|
620 |
eval_batch_size:
|
621 |
-
num_epochs:
|
622 |
warmup_steps: 100
|
623 |
learning_rate: 0.00003
|
624 |
lr_quadratic_warmup:
|
625 |
logging_steps:
|
626 |
save_strategy: # Set to `no` to skip checkpoint saves
|
627 |
save_steps: # Leave empty to save at each epoch
|
628 |
-
eval_steps: # Leave empty to eval at each epoch
|
629 |
save_total_limit: # Checkpoints saved at a time
|
630 |
# Maximum number of iterations to train for. It takes precedence over num_epochs, which means that
|
631 |
# if both are set, num_epochs will not be guaranteed.
|
|
|
618 |
# The number of samples to include in each batch. This is the number of samples sent to each GPU.
|
619 |
micro_batch_size: 2
|
620 |
eval_batch_size:
|
621 |
+
num_epochs: 4
|
622 |
warmup_steps: 100
|
623 |
learning_rate: 0.00003
|
624 |
lr_quadratic_warmup:
|
625 |
logging_steps:
|
626 |
save_strategy: # Set to `no` to skip checkpoint saves
|
627 |
save_steps: # Leave empty to save at each epoch
|
628 |
+
eval_steps: # Leave empty to eval at each epoch; use an integer to eval every N steps, or a decimal for a fraction of total steps
|
629 |
save_total_limit: # Checkpoints saved at a time
|
630 |
# Maximum number of iterations to train for. It takes precedence over num_epochs, which means that
|
631 |
# if both are set, num_epochs will not be guaranteed.
|
examples/cerebras/qlora.yml
CHANGED
@@ -49,7 +49,7 @@ flash_attention:
|
|
49 |
gptq_groupsize:
|
50 |
gptq_model_v1:
|
51 |
warmup_steps: 10
|
52 |
-
eval_steps:
|
53 |
save_steps:
|
54 |
debug:
|
55 |
deepspeed:
|
|
|
49 |
gptq_groupsize:
|
50 |
gptq_model_v1:
|
51 |
warmup_steps: 10
|
52 |
+
eval_steps: 0.05
|
53 |
save_steps:
|
54 |
debug:
|
55 |
deepspeed:
|
examples/code-llama/13b/lora.yml
CHANGED
@@ -34,7 +34,7 @@ wandb_log_model:
|
|
34 |
|
35 |
gradient_accumulation_steps: 4
|
36 |
micro_batch_size: 2
|
37 |
-
num_epochs:
|
38 |
optimizer: adamw_bnb_8bit
|
39 |
lr_scheduler: cosine
|
40 |
learning_rate: 0.0002
|
@@ -54,7 +54,7 @@ xformers_attention:
|
|
54 |
flash_attention: true
|
55 |
|
56 |
warmup_steps: 10
|
57 |
-
eval_steps:
|
58 |
save_steps:
|
59 |
debug:
|
60 |
deepspeed:
|
|
|
34 |
|
35 |
gradient_accumulation_steps: 4
|
36 |
micro_batch_size: 2
|
37 |
+
num_epochs: 4
|
38 |
optimizer: adamw_bnb_8bit
|
39 |
lr_scheduler: cosine
|
40 |
learning_rate: 0.0002
|
|
|
54 |
flash_attention: true
|
55 |
|
56 |
warmup_steps: 10
|
57 |
+
eval_steps: 0.05
|
58 |
save_steps:
|
59 |
debug:
|
60 |
deepspeed:
|
examples/code-llama/13b/qlora.yml
CHANGED
@@ -36,7 +36,7 @@ wandb_log_model:
|
|
36 |
|
37 |
gradient_accumulation_steps: 4
|
38 |
micro_batch_size: 2
|
39 |
-
num_epochs:
|
40 |
optimizer: paged_adamw_32bit
|
41 |
lr_scheduler: cosine
|
42 |
learning_rate: 0.0002
|
@@ -56,7 +56,7 @@ xformers_attention:
|
|
56 |
flash_attention: true
|
57 |
|
58 |
warmup_steps: 10
|
59 |
-
eval_steps:
|
60 |
save_steps:
|
61 |
debug:
|
62 |
deepspeed:
|
|
|
36 |
|
37 |
gradient_accumulation_steps: 4
|
38 |
micro_batch_size: 2
|
39 |
+
num_epochs: 4
|
40 |
optimizer: paged_adamw_32bit
|
41 |
lr_scheduler: cosine
|
42 |
learning_rate: 0.0002
|
|
|
56 |
flash_attention: true
|
57 |
|
58 |
warmup_steps: 10
|
59 |
+
eval_steps: 0.05
|
60 |
save_steps:
|
61 |
debug:
|
62 |
deepspeed:
|
examples/code-llama/34b/lora.yml
CHANGED
@@ -34,7 +34,7 @@ wandb_log_model:
|
|
34 |
|
35 |
gradient_accumulation_steps: 4
|
36 |
micro_batch_size: 2
|
37 |
-
num_epochs:
|
38 |
optimizer: adamw_bnb_8bit
|
39 |
lr_scheduler: cosine
|
40 |
learning_rate: 0.0002
|
@@ -54,7 +54,7 @@ xformers_attention:
|
|
54 |
flash_attention: true
|
55 |
|
56 |
warmup_steps: 10
|
57 |
-
eval_steps:
|
58 |
save_steps:
|
59 |
debug:
|
60 |
deepspeed:
|
|
|
34 |
|
35 |
gradient_accumulation_steps: 4
|
36 |
micro_batch_size: 2
|
37 |
+
num_epochs: 4
|
38 |
optimizer: adamw_bnb_8bit
|
39 |
lr_scheduler: cosine
|
40 |
learning_rate: 0.0002
|
|
|
54 |
flash_attention: true
|
55 |
|
56 |
warmup_steps: 10
|
57 |
+
eval_steps: 0.05
|
58 |
save_steps:
|
59 |
debug:
|
60 |
deepspeed:
|
examples/code-llama/34b/qlora.yml
CHANGED
@@ -36,7 +36,7 @@ wandb_log_model:
|
|
36 |
|
37 |
gradient_accumulation_steps: 4
|
38 |
micro_batch_size: 2
|
39 |
-
num_epochs:
|
40 |
optimizer: paged_adamw_32bit
|
41 |
lr_scheduler: cosine
|
42 |
learning_rate: 0.0002
|
@@ -56,7 +56,7 @@ xformers_attention:
|
|
56 |
flash_attention: true
|
57 |
|
58 |
warmup_steps: 10
|
59 |
-
eval_steps:
|
60 |
save_steps:
|
61 |
debug:
|
62 |
deepspeed:
|
|
|
36 |
|
37 |
gradient_accumulation_steps: 4
|
38 |
micro_batch_size: 2
|
39 |
+
num_epochs: 4
|
40 |
optimizer: paged_adamw_32bit
|
41 |
lr_scheduler: cosine
|
42 |
learning_rate: 0.0002
|
|
|
56 |
flash_attention: true
|
57 |
|
58 |
warmup_steps: 10
|
59 |
+
eval_steps: 0.05
|
60 |
save_steps:
|
61 |
debug:
|
62 |
deepspeed:
|
examples/code-llama/7b/lora.yml
CHANGED
@@ -34,7 +34,7 @@ wandb_log_model:
|
|
34 |
|
35 |
gradient_accumulation_steps: 4
|
36 |
micro_batch_size: 2
|
37 |
-
num_epochs:
|
38 |
optimizer: adamw_bnb_8bit
|
39 |
lr_scheduler: cosine
|
40 |
learning_rate: 0.0002
|
@@ -54,7 +54,7 @@ xformers_attention:
|
|
54 |
flash_attention: true
|
55 |
|
56 |
warmup_steps: 10
|
57 |
-
eval_steps:
|
58 |
save_steps:
|
59 |
debug:
|
60 |
deepspeed:
|
|
|
34 |
|
35 |
gradient_accumulation_steps: 4
|
36 |
micro_batch_size: 2
|
37 |
+
num_epochs: 4
|
38 |
optimizer: adamw_bnb_8bit
|
39 |
lr_scheduler: cosine
|
40 |
learning_rate: 0.0002
|
|
|
54 |
flash_attention: true
|
55 |
|
56 |
warmup_steps: 10
|
57 |
+
eval_steps: 0.05
|
58 |
save_steps:
|
59 |
debug:
|
60 |
deepspeed:
|
examples/code-llama/7b/qlora.yml
CHANGED
@@ -36,7 +36,7 @@ wandb_log_model:
|
|
36 |
|
37 |
gradient_accumulation_steps: 4
|
38 |
micro_batch_size: 2
|
39 |
-
num_epochs:
|
40 |
optimizer: paged_adamw_32bit
|
41 |
lr_scheduler: cosine
|
42 |
learning_rate: 0.0002
|
@@ -56,7 +56,7 @@ xformers_attention:
|
|
56 |
flash_attention: true
|
57 |
|
58 |
warmup_steps: 10
|
59 |
-
eval_steps:
|
60 |
save_steps:
|
61 |
debug:
|
62 |
deepspeed:
|
|
|
36 |
|
37 |
gradient_accumulation_steps: 4
|
38 |
micro_batch_size: 2
|
39 |
+
num_epochs: 4
|
40 |
optimizer: paged_adamw_32bit
|
41 |
lr_scheduler: cosine
|
42 |
learning_rate: 0.0002
|
|
|
56 |
flash_attention: true
|
57 |
|
58 |
warmup_steps: 10
|
59 |
+
eval_steps: 0.05
|
60 |
save_steps:
|
61 |
debug:
|
62 |
deepspeed:
|
examples/falcon/config-7b-qlora.yml
CHANGED
@@ -53,7 +53,7 @@ output_dir: ./qlora-out
|
|
53 |
# decrease if OOM, increase for max VRAM utilization
|
54 |
micro_batch_size: 1
|
55 |
gradient_accumulation_steps: 2
|
56 |
-
num_epochs:
|
57 |
# Optimizer for QLoRA
|
58 |
optimizer: paged_adamw_32bit
|
59 |
torchdistx_path:
|
|
|
53 |
# decrease if OOM, increase for max VRAM utilization
|
54 |
micro_batch_size: 1
|
55 |
gradient_accumulation_steps: 2
|
56 |
+
num_epochs: 4
|
57 |
# Optimizer for QLoRA
|
58 |
optimizer: paged_adamw_32bit
|
59 |
torchdistx_path:
|
examples/gptj/qlora.yml
CHANGED
@@ -46,7 +46,7 @@ flash_attention:
|
|
46 |
gptq_groupsize:
|
47 |
gptq_model_v1:
|
48 |
warmup_steps: 10
|
49 |
-
eval_steps:
|
50 |
save_steps:
|
51 |
debug:
|
52 |
deepspeed:
|
|
|
46 |
gptq_groupsize:
|
47 |
gptq_model_v1:
|
48 |
warmup_steps: 10
|
49 |
+
eval_steps: 0.05
|
50 |
save_steps:
|
51 |
debug:
|
52 |
deepspeed:
|
examples/jeopardy-bot/config.yml
CHANGED
@@ -24,7 +24,7 @@ wandb_log_model:
|
|
24 |
output_dir: ./jeopardy-bot-7b
|
25 |
gradient_accumulation_steps: 1
|
26 |
micro_batch_size: 1
|
27 |
-
num_epochs:
|
28 |
optimizer: adamw_bnb_8bit
|
29 |
torchdistx_path:
|
30 |
lr_scheduler: cosine
|
|
|
24 |
output_dir: ./jeopardy-bot-7b
|
25 |
gradient_accumulation_steps: 1
|
26 |
micro_batch_size: 1
|
27 |
+
num_epochs: 4
|
28 |
optimizer: adamw_bnb_8bit
|
29 |
torchdistx_path:
|
30 |
lr_scheduler: cosine
|
examples/llama-2/gptq-lora.yml
CHANGED
@@ -37,7 +37,7 @@ wandb_log_model:
|
|
37 |
output_dir: ./model-out
|
38 |
gradient_accumulation_steps: 1
|
39 |
micro_batch_size: 1
|
40 |
-
num_epochs:
|
41 |
optimizer: adamw_torch
|
42 |
adam_beta2: 0.95
|
43 |
adam_eps: 0.00001
|
|
|
37 |
output_dir: ./model-out
|
38 |
gradient_accumulation_steps: 1
|
39 |
micro_batch_size: 1
|
40 |
+
num_epochs: 4
|
41 |
optimizer: adamw_torch
|
42 |
adam_beta2: 0.95
|
43 |
adam_eps: 0.00001
|
examples/llama-2/lora.yml
CHANGED
@@ -34,7 +34,7 @@ wandb_log_model:
|
|
34 |
|
35 |
gradient_accumulation_steps: 4
|
36 |
micro_batch_size: 2
|
37 |
-
num_epochs:
|
38 |
optimizer: adamw_bnb_8bit
|
39 |
lr_scheduler: cosine
|
40 |
learning_rate: 0.0002
|
@@ -54,7 +54,7 @@ xformers_attention:
|
|
54 |
flash_attention: true
|
55 |
|
56 |
warmup_steps: 10
|
57 |
-
eval_steps:
|
58 |
eval_table_size:
|
59 |
eval_table_max_new_tokens: 128
|
60 |
save_steps:
|
|
|
34 |
|
35 |
gradient_accumulation_steps: 4
|
36 |
micro_batch_size: 2
|
37 |
+
num_epochs: 4
|
38 |
optimizer: adamw_bnb_8bit
|
39 |
lr_scheduler: cosine
|
40 |
learning_rate: 0.0002
|
|
|
54 |
flash_attention: true
|
55 |
|
56 |
warmup_steps: 10
|
57 |
+
eval_steps: 0.05
|
58 |
eval_table_size:
|
59 |
eval_table_max_new_tokens: 128
|
60 |
save_steps:
|
examples/llama-2/qlora.yml
CHANGED
@@ -36,7 +36,7 @@ wandb_log_model:
|
|
36 |
|
37 |
gradient_accumulation_steps: 4
|
38 |
micro_batch_size: 2
|
39 |
-
num_epochs:
|
40 |
optimizer: paged_adamw_32bit
|
41 |
lr_scheduler: cosine
|
42 |
learning_rate: 0.0002
|
@@ -56,7 +56,7 @@ xformers_attention:
|
|
56 |
flash_attention: true
|
57 |
|
58 |
warmup_steps: 10
|
59 |
-
eval_steps:
|
60 |
eval_table_size:
|
61 |
save_steps:
|
62 |
debug:
|
|
|
36 |
|
37 |
gradient_accumulation_steps: 4
|
38 |
micro_batch_size: 2
|
39 |
+
num_epochs: 4
|
40 |
optimizer: paged_adamw_32bit
|
41 |
lr_scheduler: cosine
|
42 |
learning_rate: 0.0002
|
|
|
56 |
flash_attention: true
|
57 |
|
58 |
warmup_steps: 10
|
59 |
+
eval_steps: 0.05
|
60 |
eval_table_size:
|
61 |
save_steps:
|
62 |
debug:
|
examples/llama-2/relora.yml
CHANGED
@@ -40,7 +40,7 @@ wandb_log_model:
|
|
40 |
|
41 |
gradient_accumulation_steps: 4
|
42 |
micro_batch_size: 4
|
43 |
-
num_epochs:
|
44 |
optimizer: adamw_bnb_8bit
|
45 |
lr_scheduler: cosine
|
46 |
learning_rate: 0.0002
|
@@ -60,7 +60,7 @@ xformers_attention:
|
|
60 |
flash_attention: true
|
61 |
|
62 |
warmup_steps: 10
|
63 |
-
eval_steps:
|
64 |
save_steps: 50
|
65 |
debug:
|
66 |
deepspeed:
|
|
|
40 |
|
41 |
gradient_accumulation_steps: 4
|
42 |
micro_batch_size: 4
|
43 |
+
num_epochs: 4
|
44 |
optimizer: adamw_bnb_8bit
|
45 |
lr_scheduler: cosine
|
46 |
learning_rate: 0.0002
|
|
|
60 |
flash_attention: true
|
61 |
|
62 |
warmup_steps: 10
|
63 |
+
eval_steps: 0.05
|
64 |
save_steps: 50
|
65 |
debug:
|
66 |
deepspeed:
|
examples/llama-2/tiny-llama.yml
CHANGED
@@ -34,7 +34,7 @@ wandb_log_model:
|
|
34 |
|
35 |
gradient_accumulation_steps: 4
|
36 |
micro_batch_size: 2
|
37 |
-
num_epochs:
|
38 |
optimizer: adamw_bnb_8bit
|
39 |
lr_scheduler: cosine
|
40 |
learning_rate: 0.0002
|
@@ -54,7 +54,7 @@ xformers_attention:
|
|
54 |
flash_attention: true
|
55 |
|
56 |
warmup_steps: 10
|
57 |
-
eval_steps:
|
58 |
eval_table_size:
|
59 |
save_steps:
|
60 |
debug:
|
|
|
34 |
|
35 |
gradient_accumulation_steps: 4
|
36 |
micro_batch_size: 2
|
37 |
+
num_epochs: 4
|
38 |
optimizer: adamw_bnb_8bit
|
39 |
lr_scheduler: cosine
|
40 |
learning_rate: 0.0002
|
|
|
54 |
flash_attention: true
|
55 |
|
56 |
warmup_steps: 10
|
57 |
+
eval_steps: 0.05
|
58 |
eval_table_size:
|
59 |
save_steps:
|
60 |
debug:
|
examples/mistral/config.yml
CHANGED
@@ -26,7 +26,7 @@ wandb_log_model:
|
|
26 |
|
27 |
gradient_accumulation_steps: 4
|
28 |
micro_batch_size: 2
|
29 |
-
num_epochs:
|
30 |
optimizer: adamw_bnb_8bit
|
31 |
lr_scheduler: cosine
|
32 |
learning_rate: 0.000005
|
@@ -46,7 +46,7 @@ xformers_attention:
|
|
46 |
flash_attention: true
|
47 |
|
48 |
warmup_steps: 10
|
49 |
-
eval_steps:
|
50 |
eval_table_size:
|
51 |
eval_table_max_new_tokens: 128
|
52 |
save_steps:
|
|
|
26 |
|
27 |
gradient_accumulation_steps: 4
|
28 |
micro_batch_size: 2
|
29 |
+
num_epochs: 4
|
30 |
optimizer: adamw_bnb_8bit
|
31 |
lr_scheduler: cosine
|
32 |
learning_rate: 0.000005
|
|
|
46 |
flash_attention: true
|
47 |
|
48 |
warmup_steps: 10
|
49 |
+
eval_steps: 0.05
|
50 |
eval_table_size:
|
51 |
eval_table_max_new_tokens: 128
|
52 |
save_steps:
|
examples/mistral/qlora.yml
CHANGED
@@ -63,7 +63,7 @@ xformers_attention:
|
|
63 |
flash_attention: true
|
64 |
|
65 |
warmup_steps: 10
|
66 |
-
eval_steps:
|
67 |
eval_table_size:
|
68 |
eval_table_max_new_tokens: 128
|
69 |
save_steps:
|
|
|
63 |
flash_attention: true
|
64 |
|
65 |
warmup_steps: 10
|
66 |
+
eval_steps: 0.05
|
67 |
eval_table_size:
|
68 |
eval_table_max_new_tokens: 128
|
69 |
save_steps:
|
examples/mpt-7b/config.yml
CHANGED
@@ -26,7 +26,7 @@ wandb_log_model:
|
|
26 |
output_dir: ./mpt-alpaca-7b
|
27 |
gradient_accumulation_steps: 1
|
28 |
micro_batch_size: 1
|
29 |
-
num_epochs:
|
30 |
optimizer: adamw_bnb_8bit
|
31 |
torchdistx_path:
|
32 |
lr_scheduler: cosine
|
|
|
26 |
output_dir: ./mpt-alpaca-7b
|
27 |
gradient_accumulation_steps: 1
|
28 |
micro_batch_size: 1
|
29 |
+
num_epochs: 4
|
30 |
optimizer: adamw_bnb_8bit
|
31 |
torchdistx_path:
|
32 |
lr_scheduler: cosine
|
examples/pythia/lora.yml
CHANGED
@@ -23,7 +23,7 @@ wandb_log_model:
|
|
23 |
output_dir: ./lora-alpaca-pythia
|
24 |
gradient_accumulation_steps: 1
|
25 |
micro_batch_size: 4
|
26 |
-
num_epochs:
|
27 |
learning_rate: 0.00001
|
28 |
train_on_inputs: false
|
29 |
group_by_length: false
|
@@ -33,5 +33,5 @@ early_stopping_patience:
|
|
33 |
resume_from_checkpoint:
|
34 |
local_rank:
|
35 |
weight_decay: 0.1
|
36 |
-
eval_steps:
|
37 |
logging_steps: 1
|
|
|
23 |
output_dir: ./lora-alpaca-pythia
|
24 |
gradient_accumulation_steps: 1
|
25 |
micro_batch_size: 4
|
26 |
+
num_epochs: 4
|
27 |
learning_rate: 0.00001
|
28 |
train_on_inputs: false
|
29 |
group_by_length: false
|
|
|
33 |
resume_from_checkpoint:
|
34 |
local_rank:
|
35 |
weight_decay: 0.1
|
36 |
+
eval_steps: 0.05
|
37 |
logging_steps: 1
|
examples/redpajama/config-3b.yml
CHANGED
@@ -27,7 +27,7 @@ wandb_log_model:
|
|
27 |
output_dir: ./redpajama-alpaca-3b
|
28 |
batch_size: 4
|
29 |
micro_batch_size: 1
|
30 |
-
num_epochs:
|
31 |
optimizer: adamw_bnb_8bit
|
32 |
torchdistx_path:
|
33 |
lr_scheduler: cosine
|
|
|
27 |
output_dir: ./redpajama-alpaca-3b
|
28 |
batch_size: 4
|
29 |
micro_batch_size: 1
|
30 |
+
num_epochs: 4
|
31 |
optimizer: adamw_bnb_8bit
|
32 |
torchdistx_path:
|
33 |
lr_scheduler: cosine
|
examples/replit-3b/config-lora.yml
CHANGED
@@ -26,7 +26,7 @@ wandb_log_model:
|
|
26 |
output_dir: ./lora-replit
|
27 |
batch_size: 8
|
28 |
micro_batch_size: 1
|
29 |
-
num_epochs:
|
30 |
optimizer:
|
31 |
torchdistx_path:
|
32 |
lr_scheduler:
|
|
|
26 |
output_dir: ./lora-replit
|
27 |
batch_size: 8
|
28 |
micro_batch_size: 1
|
29 |
+
num_epochs: 4
|
30 |
optimizer:
|
31 |
torchdistx_path:
|
32 |
lr_scheduler:
|
examples/xgen-7b/xgen-7b-8k-qlora.yml
CHANGED
@@ -51,7 +51,7 @@ output_dir: ./qlora-out
|
|
51 |
# decrease if OOM, increase for max VRAM utilization
|
52 |
micro_batch_size: 1
|
53 |
gradient_accumulation_steps: 1
|
54 |
-
num_epochs:
|
55 |
# Optimizer for QLoRA
|
56 |
optimizer: paged_adamw_32bit
|
57 |
torchdistx_path:
|
|
|
51 |
# decrease if OOM, increase for max VRAM utilization
|
52 |
micro_batch_size: 1
|
53 |
gradient_accumulation_steps: 1
|
54 |
+
num_epochs: 4
|
55 |
# Optimizer for QLoRA
|
56 |
optimizer: paged_adamw_32bit
|
57 |
torchdistx_path:
|