model:
  max_token_len: 1024  # should be larger than the sequence length
  #vocab_size: 32000
  n_layer: 24
  n_head: 4
  n_kv_head: 2  # number of KV heads for grouped-/multi-query attention
  dim: 128
  #dim_qk_head: 32  # usually set to dim // n_head, but can be different
  #hidden_dim:  # 768*4; width of the MLP after the attention layer
  #multiple_of: 64  # round hidden_dim up to a multiple of this number; because SiLU (Swish) gating is used, the hidden width is adjusted
  dropout_rate: 0.05  # for the attention map
  #layer_init_factor: 0.1  # default: (n_layer * 8) ** -0.5, per the Microsoft DeepNet paper; the default should be used
  #residual_factor: 2  # default: (2 * n_layer) ** 0.5; the default should be used
  attn_window_size: 512
  front_window_size: 0
  use_rotary: True
  use_alibi: False
  mimic_attn_layer: 21  # replace this layer with a training target that mimics the teacher's attention; its settings should be similar to the teacher's
  mimic_n_head: 16
  mimic_n_kv_head: 16
  #mimic_sliding_window_size: 1024
  mimic_attn_dropout: 0.0
  mimic_dim_qk_head: 16
  mimic_use_rotary: True
  mimic_use_alibi: False

opt:
  gradient_clip: 1.0
  lr: 1
  beta1: 0.9
  beta2: 0.99
  weight_decay: 0.2
  opt_name: sophia

loss:
  soft_loss_weight: 0.0
  hard_loss_weight: 1.0
  mimic_loss_weight: 0.0
  virtual_v_head_num: 16  # based on MiniLMv2: like attention, but uses only V for self-attention, making the student's x_v resemble the teacher's x_v
  loss_soft_temperature: 1  # temperature for the soft loss; higher values smooth the softmax and increase sensitivity to small logits

scheduler:
  slr_seg:
    # - [0.0000001, 0.0005, 300]
    # - [0.0005, 0.0005, 2000]
    - [0.0005, 0.00025, 1000]
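    # Note: each slr_seg entry appears to encode [start_lr, end_lr, num_steps]
    # (an assumption based on the commented-out segments above, not confirmed by
    # this config alone). A minimal sketch of how such a segmented linear schedule
    # could be evaluated, kept as a comment so the YAML stays valid:
    #
    #   def lr_at(step, segments):
    #       for start_lr, end_lr, n in segments:
    #           if step < n:
    #               # linear interpolation within the active segment
    #               return start_lr + (end_lr - start_lr) * step / n
    #           step -= n
    #       return segments[-1][1]  # hold the final LR once all segments are consumed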