{
  "arch": {
    "architectures": [
      "ScriptableMaskedLM"
    ],
    "attention": {
      "causal_attention": false,
      "dropout_prob": 0.1,
      "high_level_fusion": false,
      "low_level_fusion": true,
      "num_attention_heads": 12,
      "qkv_bias": false,
      "rotary_embedding": false,
      "seq_op_in_fp32": false,
      "sequence_op": "torch-softmax",
      "skip_output_projection": false,
      "type": "flash-attention-impl"
    },
    "classification_head": {
      "classifier_dropout": 0.1,
      "head_dim": 1024,
      "include_ff_layer": true,
      "nonlin": "Tanh",
      "pooler": "zero_index"
    },
    "decoder_bias": false,
    "deepnorm_scaling": false,
    "embedding": {
      "dropout_prob": 0.1,
      "embedding_dim": 768,
      "max_seq_length": 128,
      "normalization": true,
      "pad_token_id": 0,
      "pos_embedding": "learned",
      "vocab_size": 32768
    },
    "ffn_layer_frequency": 1,
    "final_norm": true,
    "gradient_checkpointing": false,
    "hidden_dropout_prob": 0.1,
    "hidden_size": 768,
    "init": {
      "std": 0.02,
      "type": "normal"
    },
    "intermed_size": 3072,
    "layer_drop_theta": null,
    "layer_fusion": true,
    "layer_macro_type": "transformer",
    "loss": "cross-entropy",
    "nonlin": "GELUglu",
    "norm": "LayerNorm",
    "norm_eps": 1e-12,
    "norm_scheme": "pre",
    "num_labels": null,
    "num_transformer_layers": 16,
    "recurrent_layers": null,
    "skip_head_transform": true,
    "sparse_prediction": true,
    "tie_weights": true,
    "use_bias": false,
    "z_loss_factor": 0
  },
  "architectures": [
    "ScriptableLMForPreTraining"
  ],
  "model_type": "crammedBERT",
  "torch_dtype": "float32",
  "transformers_version": "4.26.1"
}