# dearth-tiny / ts100-re2-h1.yml
model:
max_token_len: 1024 # should be larger than the training sequence length
#vocab_size: 32000
n_layer: 24
n_head: 4
n_kv_head: 2 # grouped-query attention (n_kv_head < n_head; multi-query would be n_kv_head = 1)
dim: 128
#dim_qk_head: 32 # usually set to dim // n_head, but can be different
#hidden_dim: # width of the MLP after each attention layer, conventionally about 4 * dim (e.g. 768*4)
#multiple_of: 64 # round hidden_dim up to a multiple of this number; because SiLU (swish) gating is used, the hidden width gets adjusted
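#
# A minimal sketch (an assumption, not this repo's code) of the SwiGLU-style
# sizing that "multiple_of" usually implies, following the Llama convention:
#
#   def mlp_hidden_dim(dim: int, multiple_of: int = 64) -> int:
#       hidden = int(4 * dim * 2 / 3)   # shrink 4*dim because the gate adds a third matrix
#       return multiple_of * ((hidden + multiple_of - 1) // multiple_of)   # round up
#
#   mlp_hidden_dim(128)  # -> 384 with the values above
#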
dropout_rate: 0.05 # for the attention map
#layer_init_factor: 0.1 # defaults to (n_layer * 8) ** -0.5; keep the default, per Microsoft's DeepNet paper
#residual_factor: 2 # defaults to (2 * n_layer) ** 0.5; keep the default
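#
# Hedged sketch of DeepNorm-style residual scaling (DeepNet, Wang et al. 2022).
# The two formulas come from the comments above; the block composition and the
# torch-like names are illustrative, not read from this repo:
#
#   import torch.nn.functional as F
#
#   def deepnorm_block(x, sublayer, residual_factor):
#       # post-LN residual: LayerNorm(alpha * x + f(x)), alpha = residual_factor;
#       # residual-branch weights are initialized scaled by layer_init_factor
#       return F.layer_norm(residual_factor * x + sublayer(x), x.shape[-1:])
#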
attn_window_size: 512
front_window_size: 0
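#
# Hedged sketch of how attn_window_size / front_window_size plausibly shape the
# causal mask (sliding window plus a globally visible prefix); the semantics are
# an assumption, not read from this repo. With front_window_size = 0 the prefix
# term is inactive:
#
#   def visible(i: int, j: int, window: int = 512, front: int = 0) -> bool:
#       causal = j <= i
#       in_window = (i - j) < window
#       in_front = j < front        # e.g. attention-sink prefix tokens
#       return causal and (in_window or in_front)
#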
use_rotary: True
use_alibi: False
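#
# Minimal sketch of standard rotary position embeddings (RoPE, Su et al.),
# which use_rotary presumably enables; this is the textbook form, not this
# repo's implementation:
#
#   import torch
#
#   def rope(x, base: float = 10000.0):   # x: (seq, n_head, head_dim)
#       seq, _, hd = x.shape
#       inv = base ** (-torch.arange(0, hd, 2) / hd)
#       ang = torch.outer(torch.arange(seq), inv)          # (seq, hd/2)
#       cos, sin = ang.cos()[:, None, :], ang.sin()[:, None, :]
#       x1, x2 = x[..., 0::2], x[..., 1::2]
#       return torch.stack((x1 * cos - x2 * sin, x1 * sin + x2 * cos), dim=-1).flatten(-2)
#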
mimic_attn_layer: 21 # replace this layer with a distillation target that mimics the teacher's attention; this special layer should use settings similar to the teacher's
mimic_n_head: 16
mimic_n_kv_head: 16
#mimic_sliding_window_size: 1024
mimic_attn_dropout: 0.0
mimic_dim_qk_head: 16
mimic_use_rotary: True
mimic_use_alibi: False
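#
# Hedged sketch of the attention-mimic objective this block suggests: at
# mimic_attn_layer the student's attention map (with teacher-like head settings,
# e.g. 16 heads above) is trained toward the teacher's map. Placeholder names,
# not this repo's API:
#
#   import torch.nn.functional as F
#
#   def attn_mimic_loss(student_attn, teacher_attn):
#       # both (batch, n_head, seq, seq), rows already softmax-normalized
#       return F.kl_div(student_attn.clamp_min(1e-9).log(), teacher_attn,
#                       reduction="batchmean")
#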
opt:
gradient_clip: 1.0
lr: 1 # base value; presumably scaled by the scheduler's slr_seg segments below
beta1: 0.9
beta2: 0.99
weight_decay: 0.2
opt_name: sophia
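#
# Sketch of wiring this block to the SophiaG optimizer from the official Sophia
# repo (github.com/Liuhong99/Sophia); treating lr as a base that the scheduler
# scales is an assumption, and `model` stands for the network being trained:
#
#   import torch
#   from sophia import SophiaG
#
#   opt = SophiaG(model.parameters(), lr=1.0, betas=(0.9, 0.99), weight_decay=0.2)
#   torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)  # gradient_clip
#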
loss:
soft_loss_weight: 0.0
hard_loss_weight: 1.0
mimic_loss_weight: 0.0
virtual_v_head_num: 16 # value-relation distillation from MiniLMv2: like self-attention but computed from v alone, pulling the student's x_v toward the teacher's x_v
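#
# Hedged sketch of MiniLMv2-style value-relation distillation referenced above:
# both models' value vectors are re-split into virtual heads, a v-to-v relation
# matrix is formed, and the student's is pulled toward the teacher's. Placeholder
# names, not this repo's API:
#
#   import torch.nn.functional as F
#
#   def v_relation(x_v, n_virtual=16):   # x_v: (batch, seq, dim)
#       b, s, d = x_v.shape
#       v = x_v.view(b, s, n_virtual, d // n_virtual).transpose(1, 2)
#       return F.softmax(v @ v.transpose(-2, -1) / (d // n_virtual) ** 0.5, dim=-1)
#
#   def vv_loss(student_xv, teacher_xv):
#       s, t = v_relation(student_xv), v_relation(teacher_xv)
#       return F.kl_div(s.clamp_min(1e-9).log(), t, reduction="batchmean")
#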
loss_soft_temperature: 1 # softmax temperature for the soft loss; higher values smooth the distribution and make it more sensitive to small logits
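#
# Hedged sketch of how the three weights and the temperature conventionally
# combine (standard Hinton-style knowledge distillation; the exact composition
# in this repo is an assumption):
#
#   import torch.nn.functional as F
#
#   def total_loss(student_logits, teacher_logits, labels, mimic,
#                  T=1.0, w_soft=0.0, w_hard=1.0, w_mimic=0.0):
#       hard = F.cross_entropy(student_logits, labels)
#       soft = F.kl_div(F.log_softmax(student_logits / T, dim=-1),
#                       F.softmax(teacher_logits / T, dim=-1),
#                       reduction="batchmean") * T * T   # T^2 keeps gradient scale
#       return w_hard * hard + w_soft * soft + w_mimic * mimic
#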
scheduler:
slr_seg: # each segment appears to be [start_lr, end_lr, num_steps] (inferred from the values below)
# - [0.0000001, 0.0005, 300]
# - [0.0005, 0.0005, 2000]
- [0.0005, 0.00025, 1000]
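#
# Hedged sketch of piecewise-linear scheduling over [start_lr, end_lr, num_steps]
# segments (format inferred from the values above, not from this repo's code):
#
#   def lr_at(step, segs=((0.0005, 0.00025, 1000),)):
#       for start, end, n in segs:
#           if step < n:
#               return start + (end - start) * step / n
#           step -= n
#       return segs[-1][1]   # hold the final value afterwards
#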