winglian commited on
Commit
8b79ff0
1 Parent(s): 0800885

fix eval_steps to be a sane default (#797)

Browse files

* fix eval_steps to be a sane default

* update docs for fractional eval_steps

README.md CHANGED
@@ -618,14 +618,14 @@ gradient_accumulation_steps: 1
618
  # The number of samples to include in each batch. This is the number of samples sent to each GPU.
619
  micro_batch_size: 2
620
  eval_batch_size:
621
- num_epochs: 3
622
  warmup_steps: 100
623
  learning_rate: 0.00003
624
  lr_quadratic_warmup:
625
  logging_steps:
626
  save_strategy: # Set to `no` to skip checkpoint saves
627
  save_steps: # Leave empty to save at each epoch
628
- eval_steps: # Leave empty to eval at each epoch
629
  save_total_limit: # Checkpoints saved at a time
630
  # Maximum number of iterations to train for. It precedes num_epochs which means that
631
  # if both are set, num_epochs will not be guaranteed.
 
618
  # The number of samples to include in each batch. This is the number of samples sent to each GPU.
619
  micro_batch_size: 2
620
  eval_batch_size:
621
+ num_epochs: 4
622
  warmup_steps: 100
623
  learning_rate: 0.00003
624
  lr_quadratic_warmup:
625
  logging_steps:
626
  save_strategy: # Set to `no` to skip checkpoint saves
627
  save_steps: # Leave empty to save at each epoch
628
+ eval_steps: # Leave empty to eval at each epoch, set an integer to eval every N steps, or a decimal (e.g. 0.05) for a fraction of total steps
629
  save_total_limit: # Checkpoints saved at a time
630
  # Maximum number of iterations to train for. It precedes num_epochs which means that
631
  # if both are set, num_epochs will not be guaranteed.
examples/cerebras/qlora.yml CHANGED
@@ -49,7 +49,7 @@ flash_attention:
49
  gptq_groupsize:
50
  gptq_model_v1:
51
  warmup_steps: 10
52
- eval_steps: 20
53
  save_steps:
54
  debug:
55
  deepspeed:
 
49
  gptq_groupsize:
50
  gptq_model_v1:
51
  warmup_steps: 10
52
+ eval_steps: 0.05
53
  save_steps:
54
  debug:
55
  deepspeed:
examples/code-llama/13b/lora.yml CHANGED
@@ -34,7 +34,7 @@ wandb_log_model:
34
 
35
  gradient_accumulation_steps: 4
36
  micro_batch_size: 2
37
- num_epochs: 3
38
  optimizer: adamw_bnb_8bit
39
  lr_scheduler: cosine
40
  learning_rate: 0.0002
@@ -54,7 +54,7 @@ xformers_attention:
54
  flash_attention: true
55
 
56
  warmup_steps: 10
57
- eval_steps: 20
58
  save_steps:
59
  debug:
60
  deepspeed:
 
34
 
35
  gradient_accumulation_steps: 4
36
  micro_batch_size: 2
37
+ num_epochs: 4
38
  optimizer: adamw_bnb_8bit
39
  lr_scheduler: cosine
40
  learning_rate: 0.0002
 
54
  flash_attention: true
55
 
56
  warmup_steps: 10
57
+ eval_steps: 0.05
58
  save_steps:
59
  debug:
60
  deepspeed:
examples/code-llama/13b/qlora.yml CHANGED
@@ -36,7 +36,7 @@ wandb_log_model:
36
 
37
  gradient_accumulation_steps: 4
38
  micro_batch_size: 2
39
- num_epochs: 3
40
  optimizer: paged_adamw_32bit
41
  lr_scheduler: cosine
42
  learning_rate: 0.0002
@@ -56,7 +56,7 @@ xformers_attention:
56
  flash_attention: true
57
 
58
  warmup_steps: 10
59
- eval_steps: 20
60
  save_steps:
61
  debug:
62
  deepspeed:
 
36
 
37
  gradient_accumulation_steps: 4
38
  micro_batch_size: 2
39
+ num_epochs: 4
40
  optimizer: paged_adamw_32bit
41
  lr_scheduler: cosine
42
  learning_rate: 0.0002
 
56
  flash_attention: true
57
 
58
  warmup_steps: 10
59
+ eval_steps: 0.05
60
  save_steps:
61
  debug:
62
  deepspeed:
examples/code-llama/34b/lora.yml CHANGED
@@ -34,7 +34,7 @@ wandb_log_model:
34
 
35
  gradient_accumulation_steps: 4
36
  micro_batch_size: 2
37
- num_epochs: 3
38
  optimizer: adamw_bnb_8bit
39
  lr_scheduler: cosine
40
  learning_rate: 0.0002
@@ -54,7 +54,7 @@ xformers_attention:
54
  flash_attention: true
55
 
56
  warmup_steps: 10
57
- eval_steps: 20
58
  save_steps:
59
  debug:
60
  deepspeed:
 
34
 
35
  gradient_accumulation_steps: 4
36
  micro_batch_size: 2
37
+ num_epochs: 4
38
  optimizer: adamw_bnb_8bit
39
  lr_scheduler: cosine
40
  learning_rate: 0.0002
 
54
  flash_attention: true
55
 
56
  warmup_steps: 10
57
+ eval_steps: 0.05
58
  save_steps:
59
  debug:
60
  deepspeed:
examples/code-llama/34b/qlora.yml CHANGED
@@ -36,7 +36,7 @@ wandb_log_model:
36
 
37
  gradient_accumulation_steps: 4
38
  micro_batch_size: 2
39
- num_epochs: 3
40
  optimizer: paged_adamw_32bit
41
  lr_scheduler: cosine
42
  learning_rate: 0.0002
@@ -56,7 +56,7 @@ xformers_attention:
56
  flash_attention: true
57
 
58
  warmup_steps: 10
59
- eval_steps: 20
60
  save_steps:
61
  debug:
62
  deepspeed:
 
36
 
37
  gradient_accumulation_steps: 4
38
  micro_batch_size: 2
39
+ num_epochs: 4
40
  optimizer: paged_adamw_32bit
41
  lr_scheduler: cosine
42
  learning_rate: 0.0002
 
56
  flash_attention: true
57
 
58
  warmup_steps: 10
59
+ eval_steps: 0.05
60
  save_steps:
61
  debug:
62
  deepspeed:
examples/code-llama/7b/lora.yml CHANGED
@@ -34,7 +34,7 @@ wandb_log_model:
34
 
35
  gradient_accumulation_steps: 4
36
  micro_batch_size: 2
37
- num_epochs: 3
38
  optimizer: adamw_bnb_8bit
39
  lr_scheduler: cosine
40
  learning_rate: 0.0002
@@ -54,7 +54,7 @@ xformers_attention:
54
  flash_attention: true
55
 
56
  warmup_steps: 10
57
- eval_steps: 20
58
  save_steps:
59
  debug:
60
  deepspeed:
 
34
 
35
  gradient_accumulation_steps: 4
36
  micro_batch_size: 2
37
+ num_epochs: 4
38
  optimizer: adamw_bnb_8bit
39
  lr_scheduler: cosine
40
  learning_rate: 0.0002
 
54
  flash_attention: true
55
 
56
  warmup_steps: 10
57
+ eval_steps: 0.05
58
  save_steps:
59
  debug:
60
  deepspeed:
examples/code-llama/7b/qlora.yml CHANGED
@@ -36,7 +36,7 @@ wandb_log_model:
36
 
37
  gradient_accumulation_steps: 4
38
  micro_batch_size: 2
39
- num_epochs: 3
40
  optimizer: paged_adamw_32bit
41
  lr_scheduler: cosine
42
  learning_rate: 0.0002
@@ -56,7 +56,7 @@ xformers_attention:
56
  flash_attention: true
57
 
58
  warmup_steps: 10
59
- eval_steps: 20
60
  save_steps:
61
  debug:
62
  deepspeed:
 
36
 
37
  gradient_accumulation_steps: 4
38
  micro_batch_size: 2
39
+ num_epochs: 4
40
  optimizer: paged_adamw_32bit
41
  lr_scheduler: cosine
42
  learning_rate: 0.0002
 
56
  flash_attention: true
57
 
58
  warmup_steps: 10
59
+ eval_steps: 0.05
60
  save_steps:
61
  debug:
62
  deepspeed:
examples/falcon/config-7b-qlora.yml CHANGED
@@ -53,7 +53,7 @@ output_dir: ./qlora-out
53
  # decrease if OOM, increase for max VRAM utilization
54
  micro_batch_size: 1
55
  gradient_accumulation_steps: 2
56
- num_epochs: 3
57
  # Optimizer for QLoRA
58
  optimizer: paged_adamw_32bit
59
  torchdistx_path:
 
53
  # decrease if OOM, increase for max VRAM utilization
54
  micro_batch_size: 1
55
  gradient_accumulation_steps: 2
56
+ num_epochs: 4
57
  # Optimizer for QLoRA
58
  optimizer: paged_adamw_32bit
59
  torchdistx_path:
examples/gptj/qlora.yml CHANGED
@@ -46,7 +46,7 @@ flash_attention:
46
  gptq_groupsize:
47
  gptq_model_v1:
48
  warmup_steps: 10
49
- eval_steps: 20
50
  save_steps:
51
  debug:
52
  deepspeed:
 
46
  gptq_groupsize:
47
  gptq_model_v1:
48
  warmup_steps: 10
49
+ eval_steps: 0.05
50
  save_steps:
51
  debug:
52
  deepspeed:
examples/jeopardy-bot/config.yml CHANGED
@@ -24,7 +24,7 @@ wandb_log_model:
24
  output_dir: ./jeopardy-bot-7b
25
  gradient_accumulation_steps: 1
26
  micro_batch_size: 1
27
- num_epochs: 3
28
  optimizer: adamw_bnb_8bit
29
  torchdistx_path:
30
  lr_scheduler: cosine
 
24
  output_dir: ./jeopardy-bot-7b
25
  gradient_accumulation_steps: 1
26
  micro_batch_size: 1
27
+ num_epochs: 4
28
  optimizer: adamw_bnb_8bit
29
  torchdistx_path:
30
  lr_scheduler: cosine
examples/llama-2/gptq-lora.yml CHANGED
@@ -37,7 +37,7 @@ wandb_log_model:
37
  output_dir: ./model-out
38
  gradient_accumulation_steps: 1
39
  micro_batch_size: 1
40
- num_epochs: 3
41
  optimizer: adamw_torch
42
  adam_beta2: 0.95
43
  adam_eps: 0.00001
 
37
  output_dir: ./model-out
38
  gradient_accumulation_steps: 1
39
  micro_batch_size: 1
40
+ num_epochs: 4
41
  optimizer: adamw_torch
42
  adam_beta2: 0.95
43
  adam_eps: 0.00001
examples/llama-2/lora.yml CHANGED
@@ -34,7 +34,7 @@ wandb_log_model:
34
 
35
  gradient_accumulation_steps: 4
36
  micro_batch_size: 2
37
- num_epochs: 3
38
  optimizer: adamw_bnb_8bit
39
  lr_scheduler: cosine
40
  learning_rate: 0.0002
@@ -54,7 +54,7 @@ xformers_attention:
54
  flash_attention: true
55
 
56
  warmup_steps: 10
57
- eval_steps: 20
58
  eval_table_size:
59
  eval_table_max_new_tokens: 128
60
  save_steps:
 
34
 
35
  gradient_accumulation_steps: 4
36
  micro_batch_size: 2
37
+ num_epochs: 4
38
  optimizer: adamw_bnb_8bit
39
  lr_scheduler: cosine
40
  learning_rate: 0.0002
 
54
  flash_attention: true
55
 
56
  warmup_steps: 10
57
+ eval_steps: 0.05
58
  eval_table_size:
59
  eval_table_max_new_tokens: 128
60
  save_steps:
examples/llama-2/qlora.yml CHANGED
@@ -36,7 +36,7 @@ wandb_log_model:
36
 
37
  gradient_accumulation_steps: 4
38
  micro_batch_size: 2
39
- num_epochs: 3
40
  optimizer: paged_adamw_32bit
41
  lr_scheduler: cosine
42
  learning_rate: 0.0002
@@ -56,7 +56,7 @@ xformers_attention:
56
  flash_attention: true
57
 
58
  warmup_steps: 10
59
- eval_steps: 20
60
  eval_table_size:
61
  save_steps:
62
  debug:
 
36
 
37
  gradient_accumulation_steps: 4
38
  micro_batch_size: 2
39
+ num_epochs: 4
40
  optimizer: paged_adamw_32bit
41
  lr_scheduler: cosine
42
  learning_rate: 0.0002
 
56
  flash_attention: true
57
 
58
  warmup_steps: 10
59
+ eval_steps: 0.05
60
  eval_table_size:
61
  save_steps:
62
  debug:
examples/llama-2/relora.yml CHANGED
@@ -40,7 +40,7 @@ wandb_log_model:
40
 
41
  gradient_accumulation_steps: 4
42
  micro_batch_size: 4
43
- num_epochs: 3
44
  optimizer: adamw_bnb_8bit
45
  lr_scheduler: cosine
46
  learning_rate: 0.0002
@@ -60,7 +60,7 @@ xformers_attention:
60
  flash_attention: true
61
 
62
  warmup_steps: 10
63
- eval_steps: 20
64
  save_steps: 50
65
  debug:
66
  deepspeed:
 
40
 
41
  gradient_accumulation_steps: 4
42
  micro_batch_size: 4
43
+ num_epochs: 4
44
  optimizer: adamw_bnb_8bit
45
  lr_scheduler: cosine
46
  learning_rate: 0.0002
 
60
  flash_attention: true
61
 
62
  warmup_steps: 10
63
+ eval_steps: 0.05
64
  save_steps: 50
65
  debug:
66
  deepspeed:
examples/llama-2/tiny-llama.yml CHANGED
@@ -34,7 +34,7 @@ wandb_log_model:
34
 
35
  gradient_accumulation_steps: 4
36
  micro_batch_size: 2
37
- num_epochs: 3
38
  optimizer: adamw_bnb_8bit
39
  lr_scheduler: cosine
40
  learning_rate: 0.0002
@@ -54,7 +54,7 @@ xformers_attention:
54
  flash_attention: true
55
 
56
  warmup_steps: 10
57
- eval_steps: 20
58
  eval_table_size:
59
  save_steps:
60
  debug:
 
34
 
35
  gradient_accumulation_steps: 4
36
  micro_batch_size: 2
37
+ num_epochs: 4
38
  optimizer: adamw_bnb_8bit
39
  lr_scheduler: cosine
40
  learning_rate: 0.0002
 
54
  flash_attention: true
55
 
56
  warmup_steps: 10
57
+ eval_steps: 0.05
58
  eval_table_size:
59
  save_steps:
60
  debug:
examples/mistral/config.yml CHANGED
@@ -26,7 +26,7 @@ wandb_log_model:
26
 
27
  gradient_accumulation_steps: 4
28
  micro_batch_size: 2
29
- num_epochs: 3
30
  optimizer: adamw_bnb_8bit
31
  lr_scheduler: cosine
32
  learning_rate: 0.000005
@@ -46,7 +46,7 @@ xformers_attention:
46
  flash_attention: true
47
 
48
  warmup_steps: 10
49
- eval_steps: 20
50
  eval_table_size:
51
  eval_table_max_new_tokens: 128
52
  save_steps:
 
26
 
27
  gradient_accumulation_steps: 4
28
  micro_batch_size: 2
29
+ num_epochs: 4
30
  optimizer: adamw_bnb_8bit
31
  lr_scheduler: cosine
32
  learning_rate: 0.000005
 
46
  flash_attention: true
47
 
48
  warmup_steps: 10
49
+ eval_steps: 0.05
50
  eval_table_size:
51
  eval_table_max_new_tokens: 128
52
  save_steps:
examples/mistral/qlora.yml CHANGED
@@ -63,7 +63,7 @@ xformers_attention:
63
  flash_attention: true
64
 
65
  warmup_steps: 10
66
- eval_steps: 20
67
  eval_table_size:
68
  eval_table_max_new_tokens: 128
69
  save_steps:
 
63
  flash_attention: true
64
 
65
  warmup_steps: 10
66
+ eval_steps: 0.05
67
  eval_table_size:
68
  eval_table_max_new_tokens: 128
69
  save_steps:
examples/mpt-7b/config.yml CHANGED
@@ -26,7 +26,7 @@ wandb_log_model:
26
  output_dir: ./mpt-alpaca-7b
27
  gradient_accumulation_steps: 1
28
  micro_batch_size: 1
29
- num_epochs: 3
30
  optimizer: adamw_bnb_8bit
31
  torchdistx_path:
32
  lr_scheduler: cosine
 
26
  output_dir: ./mpt-alpaca-7b
27
  gradient_accumulation_steps: 1
28
  micro_batch_size: 1
29
+ num_epochs: 4
30
  optimizer: adamw_bnb_8bit
31
  torchdistx_path:
32
  lr_scheduler: cosine
examples/pythia/lora.yml CHANGED
@@ -23,7 +23,7 @@ wandb_log_model:
23
  output_dir: ./lora-alpaca-pythia
24
  gradient_accumulation_steps: 1
25
  micro_batch_size: 4
26
- num_epochs: 3
27
  learning_rate: 0.00001
28
  train_on_inputs: false
29
  group_by_length: false
@@ -33,5 +33,5 @@ early_stopping_patience:
33
  resume_from_checkpoint:
34
  local_rank:
35
  weight_decay: 0.1
36
- eval_steps: 20
37
  logging_steps: 1
 
23
  output_dir: ./lora-alpaca-pythia
24
  gradient_accumulation_steps: 1
25
  micro_batch_size: 4
26
+ num_epochs: 4
27
  learning_rate: 0.00001
28
  train_on_inputs: false
29
  group_by_length: false
 
33
  resume_from_checkpoint:
34
  local_rank:
35
  weight_decay: 0.1
36
+ eval_steps: 0.05
37
  logging_steps: 1
examples/redpajama/config-3b.yml CHANGED
@@ -27,7 +27,7 @@ wandb_log_model:
27
  output_dir: ./redpajama-alpaca-3b
28
  batch_size: 4
29
  micro_batch_size: 1
30
- num_epochs: 3
31
  optimizer: adamw_bnb_8bit
32
  torchdistx_path:
33
  lr_scheduler: cosine
 
27
  output_dir: ./redpajama-alpaca-3b
28
  batch_size: 4
29
  micro_batch_size: 1
30
+ num_epochs: 4
31
  optimizer: adamw_bnb_8bit
32
  torchdistx_path:
33
  lr_scheduler: cosine
examples/replit-3b/config-lora.yml CHANGED
@@ -26,7 +26,7 @@ wandb_log_model:
26
  output_dir: ./lora-replit
27
  batch_size: 8
28
  micro_batch_size: 1
29
- num_epochs: 3
30
  optimizer:
31
  torchdistx_path:
32
  lr_scheduler:
 
26
  output_dir: ./lora-replit
27
  batch_size: 8
28
  micro_batch_size: 1
29
+ num_epochs: 4
30
  optimizer:
31
  torchdistx_path:
32
  lr_scheduler:
examples/xgen-7b/xgen-7b-8k-qlora.yml CHANGED
@@ -51,7 +51,7 @@ output_dir: ./qlora-out
51
  # decrease if OOM, increase for max VRAM utilization
52
  micro_batch_size: 1
53
  gradient_accumulation_steps: 1
54
- num_epochs: 3
55
  # Optimizer for QLoRA
56
  optimizer: paged_adamw_32bit
57
  torchdistx_path:
 
51
  # decrease if OOM, increase for max VRAM utilization
52
  micro_batch_size: 1
53
  gradient_accumulation_steps: 1
54
+ num_epochs: 4
55
  # Optimizer for QLoRA
56
  optimizer: paged_adamw_32bit
57
  torchdistx_path: