winglian committed
Commit 674c576
1 parent: 1eebbd0

more sane defaults for openllama 3b used for quickstarts (#602)


* more sane defaults for openllama 3b used for quickstarts

* don't use bf16 for quickstart to simplify gpu compatibility

* use the updated openlm-research/open_llama_3b_v2 models
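
Since these three files back the quickstart examples, one quick way to confirm the new defaults is to load a config and print the keys this commit touches. A minimal sketch, assuming PyYAML is installed and the axolotl repository root is the working directory (the key list is just the fields changed here):

# Sanity-check sketch (not part of this PR): print the quickstart defaults
# that this commit changes. Assumes PyYAML and the repo root as cwd.
import yaml

with open("examples/openllama-3b/lora.yml") as f:
    cfg = yaml.safe_load(f)

for key in ("base_model", "sequence_len", "sample_packing",
            "gradient_accumulation_steps", "micro_batch_size", "num_epochs",
            "flash_attention", "warmup_steps", "eval_steps", "weight_decay"):
    print(f"{key}: {cfg.get(key)}")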

examples/openllama-3b/config.yml CHANGED
@@ -1,5 +1,5 @@
-base_model: openlm-research/open_llama_3b
-base_model_config: openlm-research/open_llama_3b
+base_model: openlm-research/open_llama_3b_v2
+base_model_config: openlm-research/open_llama_3b_v2
 model_type: LlamaForCausalLM
 tokenizer_type: LlamaTokenizer
 load_in_8bit: false
@@ -13,8 +13,8 @@ dataset_prepared_path: last_run_prepared
 val_set_size: 0.02
 adapter:
 lora_model_dir:
-sequence_len: 256
-max_packed_sequence_len:
+sequence_len: 1024
+sample_packing: true
 lora_r:
 lora_alpha:
 lora_dropout:
@@ -29,11 +29,11 @@ wandb_log_model:
 output_dir: ./openllama-out
 gradient_accumulation_steps: 1
 micro_batch_size: 1
-num_epochs: 3
+num_epochs: 4
 optimizer: adamw_bnb_8bit
 torchdistx_path:
 lr_scheduler: cosine
-learning_rate: 0.00001
+learning_rate: 0.000003
 train_on_inputs: false
 group_by_length: false
 float16: true
@@ -45,12 +45,12 @@ early_stopping_patience:
 resume_from_checkpoint:
 local_rank:
 logging_steps: 1
-xformers_attention: true
-flash_attention:
+xformers_attention:
+flash_attention: true
 gptq_groupsize:
 gptq_model_v1:
-warmup_steps: 10
-eval_steps: 50
+warmup_steps: 20
+eval_steps: 0.05
 save_steps:
 debug:
 deepspeed:
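
In all three configs, the fixed max_packed_sequence_len knob gives way to sample_packing: true with sequence_len: 1024, and eval_steps: 0.05 switches evaluation to a fraction of total training steps (every 5%, per the underlying HF Trainer semantics). The packing idea, as a rough illustration only (this is not axolotl's actual packer): concatenate tokenized examples into bins of at most sequence_len tokens so short quickstart samples waste less compute on padding.

# Rough illustration of the idea behind sample_packing, not axolotl's
# implementation: greedily pack tokenized examples into bins of at most
# sequence_len tokens so little of each 1024-token sequence is padding.
from typing import List

def pack_examples(tokenized: List[List[int]], sequence_len: int = 1024) -> List[List[int]]:
    bins: List[List[int]] = []
    current: List[int] = []
    for ids in tokenized:
        ids = ids[:sequence_len]            # truncate anything longer than one bin
        if len(current) + len(ids) > sequence_len:
            bins.append(current)
            current = []
        current.extend(ids)
    if current:
        bins.append(current)
    return bins

# e.g. ten 300-token examples fit into four 1024-token bins instead of ten
print(len(pack_examples([[0] * 300] * 10)))  # -> 4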
examples/openllama-3b/lora.yml CHANGED
@@ -1,5 +1,5 @@
-base_model: openlm-research/open_llama_3b
-base_model_config: openlm-research/open_llama_3b
+base_model: openlm-research/open_llama_3b_v2
+base_model_config: openlm-research/open_llama_3b_v2
 model_type: LlamaForCausalLM
 tokenizer_type: LlamaTokenizer
 load_in_8bit: true
@@ -13,8 +13,8 @@ dataset_prepared_path: last_run_prepared
 val_set_size: 0.02
 adapter: lora
 lora_model_dir:
-sequence_len: 256
-max_packed_sequence_len:
+sequence_len: 1024
+sample_packing: true
 lora_r: 8
 lora_alpha: 16
 lora_dropout: 0.0
@@ -33,9 +33,9 @@ wandb_watch:
 wandb_run_id:
 wandb_log_model:
 output_dir: ./lora-out
-batch_size: 16
-micro_batch_size: 4
-num_epochs: 3
+gradient_accumulation_steps: 1
+micro_batch_size: 2
+num_epochs: 4
 optimizer: adamw_bnb_8bit
 torchdistx_path:
 lr_scheduler: cosine
@@ -50,16 +50,16 @@ early_stopping_patience:
 resume_from_checkpoint:
 local_rank:
 logging_steps: 1
-xformers_attention: true
-flash_attention:
+xformers_attention:
+flash_attention: true
 gptq_groupsize:
 gptq_model_v1:
-warmup_steps: 10
-eval_steps: 50
+warmup_steps: 20
+eval_steps: 0.05
 save_steps:
 debug:
 deepspeed:
-weight_decay: 0.0
+weight_decay: 0.1
 fsdp:
 fsdp_config:
 special_tokens:
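
lora.yml (like qlora.yml below) also drops the ambiguous top-level batch_size in favor of an explicit micro_batch_size and gradient_accumulation_steps; the effective batch size is simply their product times the number of GPUs. A small arithmetic sketch, where the single-GPU world size is an assumption rather than something the config pins down:

# Effective batch size under the new lora.yml defaults.
micro_batch_size = 2              # per-device batch from the config
gradient_accumulation_steps = 1   # from the config
world_size = 1                    # assumed single-GPU quickstart

effective_batch_size = micro_batch_size * gradient_accumulation_steps * world_size
print(effective_batch_size)  # -> 2 samples per optimizer step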
examples/openllama-3b/qlora.yml CHANGED
@@ -1,5 +1,5 @@
-base_model: openlm-research/open_llama_3b
-base_model_config: openlm-research/open_llama_3b
+base_model: openlm-research/open_llama_3b_v2
+base_model_config: openlm-research/open_llama_3b_v2
 model_type: LlamaForCausalLM
 tokenizer_type: LlamaTokenizer
 load_in_8bit: false
@@ -13,8 +13,8 @@ dataset_prepared_path: last_run_prepared
 val_set_size: 0.01
 adapter: qlora
 lora_model_dir:
-sequence_len: 2048
-max_packed_sequence_len: 2048
+sequence_len: 1024
+sample_packing: true
 lora_r: 8
 lora_alpha: 32
 lora_dropout: 0.05
@@ -27,33 +27,33 @@ wandb_watch:
 wandb_run_id:
 wandb_log_model:
 output_dir: ./qlora-out
-batch_size: 4
-micro_batch_size: 4
-num_epochs: 2
+gradient_accumulation_steps: 1
+micro_batch_size: 2
+num_epochs: 4
 optimizer: paged_adamw_32bit
 torchdistx_path:
 lr_scheduler: cosine
 learning_rate: 0.0002
 train_on_inputs: false
 group_by_length: false
-bf16: true
-fp16: false
-tf32: true
+bf16: false
+fp16: true
+tf32: false
 gradient_checkpointing: true
 early_stopping_patience:
 resume_from_checkpoint:
 local_rank:
 logging_steps: 1
-xformers_attention: true
-flash_attention:
+xformers_attention:
+flash_attention: true
 gptq_groupsize:
 gptq_model_v1:
-warmup_steps: 10
-eval_steps: 20
+warmup_steps: 20
+eval_steps: 0.05
 save_steps:
 debug:
 deepspeed:
-weight_decay: 0.0
+weight_decay: 0.1
 fsdp:
 fsdp_config:
 special_tokens:
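
qlora.yml additionally flips bf16 and tf32 off and fp16 on, matching the commit note about simplifying GPU compatibility: bfloat16 and TF32 need Ampere-class or newer hardware, while fp16 mixed precision also runs on older cards. A small check along those lines, as a sketch only (axolotl reads these flags from the YAML rather than detecting them; torch.cuda.is_bf16_supported() is a standard PyTorch call):

# Pick a mixed-precision setting the local GPU actually supports (sketch only).
import torch

if torch.cuda.is_available() and torch.cuda.is_bf16_supported():
    precision = {"bf16": True, "fp16": False, "tf32": True}   # Ampere or newer
else:
    precision = {"bf16": False, "fp16": True, "tf32": False}  # the quickstart default here
print(precision)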