yentinglin committed
Commit 8f21f38
1 Parent(s): 6f6db45

Update mixtral-zhtw.yml

Files changed (1)
  1. mixtral-zhtw.yml +18 -30
mixtral-zhtw.yml CHANGED
@@ -1,6 +1,7 @@
 base_model: mistralai/Mixtral-8x7B-Instruct-v0.1
-model_type: AutoModelForCausalLM
+model_type: MixtralForCausalLM
 tokenizer_type: LlamaTokenizer
+is_mistral_derived_model: false
 trust_remote_code: true

 load_in_8bit: false
@@ -10,24 +11,15 @@ strict: false
 datasets:
   - path: yentinglin/v1
     type: sharegpt
-    conversation: mistral
-dataset_prepared_path: last_run_prepared
-val_set_size: 0.0
-output_dir: ./qlora-out-3e
-
-## You can optionally freeze the entire model and unfreeze a subset of parameters
-unfrozen_parameters:
-# - lm_head.*
-# - model.embed_tokens.*
-# - model.layers.2[0-9]+.block_sparse_moe.gate.*
-# - model.layers.2[0-9]+.block_sparse_moe.experts.*
-# - model.layers.3[0-9]+.block_sparse_moe.gate.*
-# - model.layers.3[0-9]+.block_sparse_moe.experts.*
+    conversation: chatml
+dataset_prepared_path:
+val_set_size: 0.005
+output_dir: ./Taiwan-LLM-MoE-pilot-v0.2

 adapter: qlora
 lora_model_dir:

-sequence_len: 4096
+sequence_len: 16384
 sample_packing: true
 pad_to_sequence_len: true

@@ -37,17 +29,12 @@ model_config:
 lora_r: 32
 lora_alpha: 16
 lora_dropout: 0.05
+lora_target_modules:
 lora_target_linear: true
 lora_fan_in_fan_out:
-#lora_target_modules:
-# - gate
-# - q_proj
-# - k_proj
-# - v_proj
-# - o_proj
-# - w1
-# - w2
-# - w3
+lora_modules_to_save:
+  - embed_tokens
+  - lm_head

 hub_model_id: yentinglin/Taiwan-LLM-MoE-chat-alpha-3e
 hub_strategy: end
@@ -57,10 +44,10 @@ wandb_watch:
 wandb_name:
 wandb_log_model:

-gradient_accumulation_steps: 4
-micro_batch_size: 1
+gradient_accumulation_steps: 2
+micro_batch_size: 3
 num_epochs: 3
-optimizer: adamw_bnb_8bit
+optimizer: paged_adamw_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002

@@ -78,9 +65,6 @@ logging_steps: 1
 xformers_attention:
 flash_attention: true

-loss_watchdog_threshold: 5.0
-loss_watchdog_patience: 3
-
 warmup_steps: 10
 evals_per_epoch: 4
 eval_table_size:
@@ -92,6 +76,10 @@ weight_decay: 0.0
 fsdp:
 fsdp_config:
 special_tokens:
+  eos_token: "<|im_end|>"
+tokens:
+  - "<|im_start|>"
+trust_remote_code: true

 ddp_timeout: 8640000000
 dataset_processes: 16
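
The updated config switches the chat template to ChatML: conversation: chatml for the dataset, "<|im_start|>" registered as an added token, and "<|im_end|>" set as the EOS token. Because this grows the vocabulary, the embedding and output layers are kept trainable via lora_modules_to_save. The Python sketch below shows roughly what those token settings amount to, using the standard Hugging Face transformers API rather than axolotl's internals; the loading calls are illustrative only (in practice the model is loaded in 4-bit for QLoRA).

# Minimal sketch (assumption: plain transformers API, not axolotl internals)
# of what the ChatML token settings in the new config amount to.
from transformers import AutoTokenizer, AutoModelForCausalLM

base = "mistralai/Mixtral-8x7B-Instruct-v0.1"
tokenizer = AutoTokenizer.from_pretrained(base)
model = AutoModelForCausalLM.from_pretrained(base)  # illustrative; QLoRA would load this quantized

# tokens: ["<|im_start|>"]  -> added to the vocabulary as a new token
tokenizer.add_tokens(["<|im_start|>"])

# special_tokens: eos_token: "<|im_end|>"  -> the ChatML turn terminator becomes the EOS token
tokenizer.add_special_tokens({"eos_token": "<|im_end|>"})

# The vocabulary grew, so embed_tokens and lm_head must be resized and trained,
# which is why they appear under lora_modules_to_save in the config.
model.resize_token_embeddings(len(tokenizer))

As a side note on the optimizer settings, gradient_accumulation_steps: 2 with micro_batch_size: 3 gives an effective batch of 6 packed sequences per optimizer step per GPU (times the number of GPUs in the run).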