model:
  max_token_len: 1024  # should be larger than the sequence length
  #vocab_size: 32000
  n_layer: 24
  n_head: 4
  n_kv_head: 2  # number of KV heads for grouped-/multi-query attention
  dim: 128
  #dim_qk_head: 32  # usually set to dim // n_head, but can be different
  #hidden_dim:  # 768*4; width of the MLP after the attention layer
  #multiple_of: 64  # round hidden_dim up to a multiple of this number; because SiLU (Swish) gating is used, the hidden width is adjusted
  dropout_rate: 0.05  # for the attention map
  #layer_init_factor: 0.1  # default: (n_layer * 8) ** -0.5, per the Microsoft DeepNet paper; the default should be used
  #residual_factor: 2  # default: (2 * n_layer) ** 0.5; the default should be used
  attn_window_size: 512
  front_window_size: 0
  use_rotary: True
  use_alibi: False
  mimic_attn_layer: 21  # replace this layer with a training target that mimics the teacher's attention; its settings should be similar to the teacher's
  mimic_n_head: 16
  mimic_n_kv_head: 16
  #mimic_sliding_window_size: 1024
  mimic_attn_dropout: 0.0
  mimic_dim_qk_head: 16
  mimic_use_rotary: True
  mimic_use_alibi: False

opt:
  gradient_clip: 1.0
  lr: 1
  beta1: 0.9
  beta2: 0.99
  weight_decay: 0.2
  opt_name: sophia

loss:
  soft_loss_weight: 0.0
  hard_loss_weight: 1.0
  mimic_loss_weight: 0.0
  virtual_v_head_num: 16  # based on MiniLMv2: like attention, but uses only V for self-attention, making the student's x_v resemble the teacher's x_v
  loss_soft_temperature: 1  # temperature for the soft loss; higher values smooth the softmax and increase sensitivity to small logits

scheduler:
  slr_seg:
    # - [0.0000001, 0.0005, 300]
    # - [0.0005, 0.0005, 2000]
    - [0.0005, 0.00025, 1000]
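    # Note: each slr_seg entry appears to encode [start_lr, end_lr, num_steps]
    # (an assumption based on the commented-out segments above, not confirmed by
    # this config alone). A minimal sketch of how such a segmented linear schedule
    # could be evaluated, kept as a comment so the YAML stays valid:
    #
    #   def lr_at(step, segments):
    #       for start_lr, end_lr, n in segments:
    #           if step < n:
    #               # linear interpolation within the active segment
    #               return start_lr + (end_lr - start_lr) * step / n
    #           step -= n
    #       return segments[-1][1]  # hold the final LR once all segments are consumed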