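# NOTE: the three lines below are repository-page metadata that was captured
# together with this file; they are not configuration data and are kept only
# as comments so the document parses as YAML.
#   ylwt's picture
#   first commit
#   4d2400c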
# Model architecture specification.
#
# Every component is described by a mapping with two keys:
#   type: name of the component to build
#   args: keyword arguments for that component (may nest further *_config
#         component specs, or be null when the sub-component is absent)
#
# NOTE(review): the nesting below was restored from a copy whose indentation
# had been lost (all keys at column 0, which produced duplicate keys in one
# mapping). Levels were reconstructed from the type/args pattern and from the
# rule that a key may appear only once per mapping -- confirm against the
# code that consumes this file.
arch:
  type: TransformerLMHeadModel
  args:
    # Decoder-only transformer stack.
    transformer_config:
      type: TransformerDecoderOnlyModel
      args:
        # Input embeddings: token + learned positional; no type embedding
        # and no embedding layer norm (both null).
        embed_config:
          type: OPTEmbeddingBlock
          args:
            token_embed_config:
              type: TokenEmbedding
              args:
                n_embed: 768
                n_vocab: 50000
            pos_embed_config:
              type: OPTLearnedPositionalEmbedding
              args:
                n_pos: 2048
                n_embed: 768
                # presumably the OPT-style shift applied to position
                # indices -- TODO confirm against the implementation
                offset: 2
            type_embed_config: null
            ln_config: null
            p_drop_embed: 0.1
            concat_strategy: id_first

        # Specification of one decoder layer (see n_layer below).
        decoder_config:
          type: TransformerDecoderBlock
          args:
            attn_config:
              type: MultiHeadKeyValueAttention
              args:
                n_embed: 768
                n_pos: 2048
                # n_head * head_size = 12 * 64 = 768 = n_embed
                n_head: 12
                head_size: 64
                p_drop_attn: 0.1
                p_drop_resid: 0.1
                bias_attn: true
                bias_proj: true
                cross_attn: false
                scale_dot_product: false
                scale_layer_wise: false
                layer_idx: null
                perform_linear_bias: false
                perform_bloom_split_head: false
                perform_query_scaling: true
                attn_window_size: null
            mlp_config:
              type: TransformerMLP
              args:
                n_embed: 768
                # 3072 = 4 * n_embed
                n_inner: 3072
                act_fn_config:
                  type: NewGELUActivation
                  args: {}
                p_drop_mlp: 0.1
            ln_config:
              type: LayerNorm
              args:
                n_embed: 768
                ln_eps: 1.0e-05
            n_embed: 768
            post_norm: false
            add_cross_attn: false

        # Stack-level settings; these values match the per-layer
        # attention settings above (n_embed 768, n_head 12).
        n_embed: 768
        n_layer: 12
        n_head: 12
        ln_config:
          type: LayerNorm
          args:
            n_embed: 768
            ln_eps: 1.0e-05
        perform_linear_bias: false
        attn_window_size_loop_unit: null

    # Language-model head; n_vocab / n_embed match the token embedding
    # above (50000 / 768).
    lm_head_config:
      type: TransformerLMHead
      args:
        n_vocab: 50000
        n_embed: 768
        perform_transform: false
        act_fn_config: null
        ln_config: null