model:
  max_token_len: 1024  # should be larger than the sequence length
  #vocab_size: 32000
  n_layer: 24
  n_head: 4
  n_kv_head: 2  # grouped-/multi-query attention: fewer KV heads than query heads
  dim: 128
  #dim_qk_head: 32  # usually set to dim // n_head, but can be different
  #hidden_dim:  # 768*4, the MLP after the attention layer
  #multiple_of: 64  # make sure hidden_dim is a multiple of this number; because SiLU (Swish) is used, the hidden layer size will be adjusted
  dropout_rate: 0.05  # for the attention map
  #layer_init_factor: 0.1  # defaults to (n_layer * 8) ** -0.5; use the default value, following the Microsoft DeepNet paper
  #residual_factor: 2  # defaults to (2 * n_layer) ** 0.5; use the default value
  attn_window_size: 512  # sliding attention window; see the sketch after the model block
  front_window_size: 0
  use_rotary: True
  use_alibi: False
  mimic_attn_layer: 21  # replace this layer with a training target that mimics the teacher's attention; this special layer should use settings similar to the teacher's
  mimic_n_head: 16
  mimic_n_kv_head: 16
  #mimic_sliding_window_size: 1024
  mimic_attn_dropout: 0.0
  mimic_dim_qk_head: 16
  mimic_use_rotary: True
  mimic_use_alibi: False
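The sketch below is a minimal illustration (not this repository's code) of how attn_window_size and front_window_size might be turned into an attention mask. It assumes attn_window_size limits how far back each position may attend and front_window_size keeps an always-visible prefix; both readings are assumptions based only on the field names.

import torch

def sliding_window_mask(seq_len: int, attn_window_size: int, front_window_size: int) -> torch.Tensor:
    """Boolean mask where True means attention is allowed (causal + sliding window + fixed prefix)."""
    pos = torch.arange(seq_len)
    q = pos.unsqueeze(1)                      # query positions, shape (seq_len, 1)
    k = pos.unsqueeze(0)                      # key positions,   shape (1, seq_len)
    causal = k <= q                           # never attend to future tokens
    in_window = (q - k) < attn_window_size    # only the most recent tokens
    in_front = k < front_window_size          # optional always-visible prefix
    return causal & (in_window | in_front)

# With the values above: a 1024-token sequence, a 512-token window, no front window.
mask = sliding_window_mask(seq_len=1024, attn_window_size=512, front_window_size=0)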
opt: | |
  gradient_clip: 1.0
  lr: 1
  beta1: 0.9
  beta2: 0.99
  weight_decay: 0.2
  opt_name: sophia
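A minimal sketch of how the opt block above might be consumed. The config names sophia as the optimizer, but its constructor is not shown in this file, so torch.optim.AdamW stands in purely for illustration; the lr of 1 may act together with the scheduler's slr_seg values, which is likewise an assumption.

import torch

def build_optimizer(params, cfg):
    # Stand-in for the configured "sophia" optimizer (its API is not shown in this file).
    return torch.optim.AdamW(
        params,
        lr=cfg["lr"],                          # may be rescaled by scheduler.slr_seg
        betas=(cfg["beta1"], cfg["beta2"]),
        weight_decay=cfg["weight_decay"],
    )

model = torch.nn.Linear(8, 8)                  # toy model, for illustration only
opt_cfg = {"lr": 1, "beta1": 0.9, "beta2": 0.99, "weight_decay": 0.2, "gradient_clip": 1.0}
optimizer = build_optimizer(model.parameters(), opt_cfg)

# gradient_clip is typically applied each step, between backward() and step():
# torch.nn.utils.clip_grad_norm_(model.parameters(), opt_cfg["gradient_clip"])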
loss: | |
  soft_loss_weight: 0.0
  hard_loss_weight: 1.0
  mimic_loss_weight: 0.0
  virtual_v_head_num: 16  # following MiniLM v2: like self-attention but computed only over v, so the student's value relations are pushed toward the teacher's
  loss_soft_temperature: 1  # temperature for the soft loss; smooths the softmax and makes it more sensitive to small logits (see the loss sketch after this block)
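A minimal sketch (not this repository's loss code) of how the three weights, the temperature, and virtual_v_head_num might combine: a hard cross-entropy term against the labels, a temperature-scaled KL term against the teacher's logits, and a MiniLM-v2-style value-value relation term split into virtual heads. All tensor names and shapes here are assumptions.

import torch
import torch.nn.functional as F

def value_relation(v, n_heads):
    # v: (batch, seq, dim) -> per-head value-value attention map, as log-probabilities
    b, s, d = v.shape
    h = v.view(b, s, n_heads, d // n_heads).transpose(1, 2)       # (batch, heads, seq, d_head)
    scores = h @ h.transpose(-2, -1) / (h.shape[-1] ** 0.5)       # (batch, heads, seq, seq)
    return F.log_softmax(scores, dim=-1)

def total_loss(student_logits, teacher_logits, labels, student_v, teacher_v, cfg):
    t = cfg["loss_soft_temperature"]
    hard = F.cross_entropy(student_logits, labels)                # vs. ground-truth tokens
    soft = F.kl_div(F.log_softmax(student_logits / t, dim=-1),    # vs. teacher logits
                    F.log_softmax(teacher_logits / t, dim=-1),
                    reduction="batchmean", log_target=True) * t * t
    mimic = F.kl_div(value_relation(student_v, cfg["virtual_v_head_num"]),
                     value_relation(teacher_v, cfg["virtual_v_head_num"]),
                     reduction="batchmean", log_target=True)      # value-relation transfer
    return (cfg["hard_loss_weight"] * hard
            + cfg["soft_loss_weight"] * soft
            + cfg["mimic_loss_weight"] * mimic)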
scheduler: | |
  slr_seg:  # learning-rate segments; see the sketch after this block for one possible reading
    # - [0.0000001, 0.0005, 300]
    # - [0.0005, 0.0005, 2000]
    - [0.0005, 0.00025, 1000]
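The sketch below shows one possible reading of slr_seg (an assumption, not confirmed by this file): each entry is [start_lr, end_lr, num_steps], segments run back to back with linear interpolation, and the last value is held afterwards. Under that reading the commented-out entries would describe a warmup and a plateau before the active decay segment.

def lr_at_step(step: int, slr_seg):
    # Piecewise-linear schedule over [start_lr, end_lr, num_steps] segments.
    for start_lr, end_lr, num_steps in slr_seg:
        if step < num_steps:
            frac = step / max(num_steps - 1, 1)
            return start_lr + frac * (end_lr - start_lr)
        step -= num_steps
    return slr_seg[-1][1]                      # hold the final learning rate

segments = [[0.0005, 0.00025, 1000]]           # the active segment above
print(lr_at_step(0, segments), lr_at_step(999, segments), lr_at_step(5000, segments))
# -> 0.0005 0.00025 0.00025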