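# Decoder-only transformer LM: 6 parallel-residual decoder layers, 512-dim
# hidden size, 8 attention heads, GPT-NeoX-style attention with partial
# rotary position embeddings, and a 50304-token vocabulary.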
arch:
  type: TransformerLMHeadModel
  args:
    transformer_config:
      type: TransformerDecoderOnlyModel
      args:
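        # Embedding block: token embeddings only; pos_embed_config is null
        # since positional information comes from the rotary embeddings
        # applied inside each attention layer.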
        embed_config:
          type: TransformerEmbeddingBlock
          args:
            token_embed_config:
              type: TokenEmbedding
              args:
                n_embed: 512
                n_vocab: 50304
            pos_embed_config: null
            type_embed_config: null
            ln_config: null
            p_drop_embed: 0.0
            concat_strategy: id_first
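        # Parallel decoder block: the attention and MLP branches are computed
        # side by side and both added to the residual stream; with
        # share_layer_norm: false, each branch presumably has its own LayerNorm.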
        decoder_config:
          type: ParallelTransformerDecoderBlock
          args:
            attn_config:
              type: GPTNeoXAttention
              args:
                n_embed: 512
                n_pos: 2048
                n_head: 8
                n_key_value_head: 8
                head_size: 64
                p_drop_attn: 0.0
                p_drop_resid: 0.0
                bias_attn: true
                bias_proj: true
                cross_attn: false
                scale_dot_product: true
                scale_layer_wise: false
                layer_idx: null
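                # Partial rotary embeddings: 16 of the 64 dims per head are
                # rotated (rotary_pct = 0.25, the GPT-NeoX default).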
                rope_config:
                  type: MistralRotaryEmbedding
                  args:
                    rotary_head_size: 16
                    n_pos: 2048
                    base: 10000
                    scaling_type: null
                    scaling_factor: null
                perform_bloom_split_head: true
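            # Feed-forward branch: 4x expansion (512 -> 2048) with the
            # tanh-approximated "new" GELU activation.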
            mlp_config:
              type: TransformerMLP
              args:
                n_embed: 512
                n_inner: 2048
                act_fn_config:
                  type: NewGELUActivation
                  args: {}
                p_drop_mlp: 0.0
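            # Per-block LayerNorm, applied pre-norm (post_norm: false).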
            ln_config:
              type: LayerNorm
              args:
                n_embed: 512
                ln_eps: 1.0e-05
            n_embed: 512
            post_norm: false
            add_cross_attn: false
            share_layer_norm: false
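        # Stack-wide settings; this ln_config is likely the final LayerNorm
        # applied after the last decoder block, as usual in pre-norm stacks.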
        n_embed: 512
        n_layer: 6
        n_head: 8
        ln_config:
          type: LayerNorm
          args:
            n_embed: 512
            ln_eps: 1.0e-05
        perform_linear_bias: false
        attn_window_size_loop_unit: null
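    # LM head: maps 512-dim hidden states to 50304 vocabulary logits;
    # no bias and no intermediate transform (perform_transform: false).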
    lm_head_config:
      type: TransformerLMHead
      args:
        n_vocab: 50304
        n_embed: 512
        bias_lm_head: false
        perform_transform: false
        act_fn_config: null
        ln_config: null