Atlas committed on
Commit
0eadfc8
1 Parent(s): bcaa923

Create mixtral_22.yml (#1514) [skip ci]


Code sourced from here:

https://twitter.com/mattshumer_/status/1778135774887567712

Files changed (1)
  1. examples/mistral/mixtral_22.yml +59 -0
examples/mistral/mixtral_22.yml ADDED
@@ -0,0 +1,59 @@
+ base_model: mistral-community/Mixtral-8x22B-v0.1
+ model_type: AutoModelForCausalLM
+ tokenizer_type: LlamaTokenizer
+ trust_remote_code: true
+
+ load_in_8bit: false
+ load_in_4bit: false
+ strict: false
+
+ unfrozen_parameters:
+   - ^lm_head.weight$
+   - ^model.embed_tokens.weight$
+   - model.layers.4[4-9]+.block_sparse_moe.gate
+   - model.layers.4[4-9]+.block_sparse_moe.experts
+   - model.layers.5[0-5]+.block_sparse_moe.gate
+   - model.layers.5[0-5]+.block_sparse_moe.experts
+
+ model_config:
+   output_router_logits: true
+
+ DATA_STUFF_HERE
+ output_dir: ./out
+
+ sequence_len: 8000
+ sample_packing: true
+ pad_to_sequence_len: true
+
+ gradient_accumulation_steps: 1
+ micro_batch_size: 1
+ num_epochs: 3
+ optimizer: adamw_bnb_8bit
+ lr_scheduler: cosine
+ learning_rate: 0.0001
+
+ train_on_inputs: false
+ group_by_length: false
+ bf16: auto
+ fp16:
+ tf32: false
+
+ gradient_checkpointing: true
+ early_stopping_patience:
+ resume_from_checkpoint:
+ local_rank:
+ logging_steps: 1
+ xformers_attention:
+ flash_attention: true
+
+ save_total_limit: 1
+ save_steps:
+ debug:
+ deepspeed: deepspeed_configs/zero3_bf16_cpuoffload_all.json
+ weight_decay: 0.0
+ fsdp:
+ fsdp_config:
+ special_tokens:
+   eos_token: "<|im_end|>"
+ tokens:
+   - "<|im_start|>"
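Note: the DATA_STUFF_HERE line above is a placeholder left by the original author and is kept as-is. As a rough sketch only (the dataset path and settings below are illustrative assumptions, not part of the source), a typical axolotl datasets block for that spot might look like:

# Hypothetical datasets section; replace DATA_STUFF_HERE with your own data config.
datasets:
  - path: my_org/my_chatml_dataset    # placeholder dataset name, not from the source
    type: sharegpt                    # assumes ChatML-style data, matching the <|im_start|>/<|im_end|> tokens above
dataset_prepared_path: last_run_prepared
val_set_size: 0.05

With the config filled in, training is typically launched with axolotl's CLI, e.g. `accelerate launch -m axolotl.cli.train examples/mistral/mixtral_22.yml`.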