model:
  max_token_len: 1024  # should be larger than the sequence length
  #vocab_size: 32000
  n_layer: 24
  n_head: 4
  n_kv_head: 2  # grouped-/multi-query attention: fewer KV heads than query heads
  dim: 128
  #dim_qk_head: 32  # usually set to dim // n_head, but can be different
  #hidden_dim:  # 768*4, the MLP after the attention layer
  #multiple_of: 64  # make sure hidden_dim is a multiple of this number; because SiLU (Swish) is used, the hidden layer size will be adjusted
  dropout_rate: 0.05  # for the attention map
  #layer_init_factor: 0.1  # defaults to (n_layer * 8) ** -0.5; use the default value, following the Microsoft DeepNet paper
  #residual_factor: 2  # defaults to (2 * n_layer) ** 0.5; use the default value
  attn_window_size: 512  # sliding attention window; see the sketch after the model block
  front_window_size: 0
  use_rotary: True
  use_alibi: False
  mimic_attn_layer: 21  # replace this layer with a training target that mimics the teacher's attention; this special layer should use settings similar to the teacher's
  mimic_n_head: 16
  mimic_n_kv_head: 16
  #mimic_sliding_window_size: 1024
  mimic_attn_dropout: 0.0
  mimic_dim_qk_head: 16
  mimic_use_rotary: True
  mimic_use_alibi: False
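The sketch below is a minimal illustration (not this repository's code) of how attn_window_size and front_window_size might be turned into an attention mask. It assumes attn_window_size limits how far back each position may attend and front_window_size keeps an always-visible prefix; both readings are assumptions based only on the field names.

import torch

def sliding_window_mask(seq_len: int, attn_window_size: int, front_window_size: int) -> torch.Tensor:
    """Boolean mask where True means attention is allowed (causal + sliding window + fixed prefix)."""
    pos = torch.arange(seq_len)
    q = pos.unsqueeze(1)                      # query positions, shape (seq_len, 1)
    k = pos.unsqueeze(0)                      # key positions,   shape (1, seq_len)
    causal = k <= q                           # never attend to future tokens
    in_window = (q - k) < attn_window_size    # only the most recent tokens
    in_front = k < front_window_size          # optional always-visible prefix
    return causal & (in_window | in_front)

# With the values above: a 1024-token sequence, a 512-token window, no front window.
mask = sliding_window_mask(seq_len=1024, attn_window_size=512, front_window_size=0)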
opt: | |
  gradient_clip: 1.0
  lr: 1
  beta1: 0.9
  beta2: 0.99
  weight_decay: 0.2
  opt_name: sophia
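A minimal sketch of how the opt block above might be consumed. The config names sophia as the optimizer, but its constructor is not shown in this file, so torch.optim.AdamW stands in purely for illustration; the lr of 1 may act together with the scheduler's slr_seg values, which is likewise an assumption.

import torch

def build_optimizer(params, cfg):
    # Stand-in for the configured "sophia" optimizer (its API is not shown in this file).
    return torch.optim.AdamW(
        params,
        lr=cfg["lr"],                          # may be rescaled by scheduler.slr_seg
        betas=(cfg["beta1"], cfg["beta2"]),
        weight_decay=cfg["weight_decay"],
    )

model = torch.nn.Linear(8, 8)                  # toy model, for illustration only
opt_cfg = {"lr": 1, "beta1": 0.9, "beta2": 0.99, "weight_decay": 0.2, "gradient_clip": 1.0}
optimizer = build_optimizer(model.parameters(), opt_cfg)

# gradient_clip is typically applied each step, between backward() and step():
# torch.nn.utils.clip_grad_norm_(model.parameters(), opt_cfg["gradient_clip"])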
loss: | |
  soft_loss_weight: 0.0
  hard_loss_weight: 1.0
  mimic_loss_weight: 0.0
  virtual_v_head_num: 16  # following MiniLM v2: like self-attention but computed only over v, so the student's value relations are pushed toward the teacher's
  loss_soft_temperature: 1  # temperature for the soft loss; smooths the softmax and makes it more sensitive to small logits (see the loss sketch after this block)
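A minimal sketch (not this repository's loss code) of how the three weights, the temperature, and virtual_v_head_num might combine: a hard cross-entropy term against the labels, a temperature-scaled KL term against the teacher's logits, and a MiniLM-v2-style value-value relation term split into virtual heads. All tensor names and shapes here are assumptions.

import torch
import torch.nn.functional as F

def value_relation(v, n_heads):
    # v: (batch, seq, dim) -> per-head value-value attention map, as log-probabilities
    b, s, d = v.shape
    h = v.view(b, s, n_heads, d // n_heads).transpose(1, 2)       # (batch, heads, seq, d_head)
    scores = h @ h.transpose(-2, -1) / (h.shape[-1] ** 0.5)       # (batch, heads, seq, seq)
    return F.log_softmax(scores, dim=-1)

def total_loss(student_logits, teacher_logits, labels, student_v, teacher_v, cfg):
    t = cfg["loss_soft_temperature"]
    hard = F.cross_entropy(student_logits, labels)                # vs. ground-truth tokens
    soft = F.kl_div(F.log_softmax(student_logits / t, dim=-1),    # vs. teacher logits
                    F.log_softmax(teacher_logits / t, dim=-1),
                    reduction="batchmean", log_target=True) * t * t
    mimic = F.kl_div(value_relation(student_v, cfg["virtual_v_head_num"]),
                     value_relation(teacher_v, cfg["virtual_v_head_num"]),
                     reduction="batchmean", log_target=True)      # value-relation transfer
    return (cfg["hard_loss_weight"] * hard
            + cfg["soft_loss_weight"] * soft
            + cfg["mimic_loss_weight"] * mimic)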
scheduler: | |
  slr_seg:  # learning-rate segments; see the sketch after this block for one possible reading
    # - [0.0000001, 0.0005, 300]
    # - [0.0005, 0.0005, 2000]
    - [0.0005, 0.00025, 1000]
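The sketch below shows one possible reading of slr_seg (an assumption, not confirmed by this file): each entry is [start_lr, end_lr, num_steps], segments run back to back with linear interpolation, and the last value is held afterwards. Under that reading the commented-out entries would describe a warmup and a plateau before the active decay segment.

def lr_at_step(step: int, slr_seg):
    # Piecewise-linear schedule over [start_lr, end_lr, num_steps] segments.
    for start_lr, end_lr, num_steps in slr_seg:
        if step < num_steps:
            frac = step / max(num_steps - 1, 1)
            return start_lr + frac * (end_lr - start_lr)
        step -= num_steps
    return slr_seg[-1][1]                      # hold the final learning rate

segments = [[0.0005, 0.00025, 1000]]           # the active segment above
print(lr_at_step(0, segments), lr_at_step(999, segments), lr_at_step(5000, segments))
# -> 0.0005 0.00025 0.00025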