{
  "architectures": [
    "ScriptableCrammedBERT"
  ],
  "num_transformer_layers": 16,
  "intermed_type": "fff",
  "hidden_size": 768,
  "intermed_size": 1,
  "intermed_depth": 11,
  "hidden_dropout_prob": 0.1,
  "norm": "LayerNorm",
  "norm_eps": 1e-12,
  "norm_scheme": "pre",
  "nonlin": "GELU",
  "tie_weights": true,
  "decoder_bias": false,
  "sparse_prediction": 0.15,
  "loss": "cross-entropy",
  "objective_layout": "MLM",
  "embedding": {
    "vocab_size": null,
    "pos_embedding": "scaled-sinusoidal",
    "dropout_prob": 0.1,
    "pad_token_id": 0,
    "max_seq_length": 128,
    "embedding_dim": 768,
    "normalization": true,
    "stable_low_precision": false
  },
  "attention": {
    "type": "self-attention",
    "causal_attention": false,
    "num_attention_heads": 12,
    "dropout_prob": 0.1,
    "skip_output_projection": false,
    "qkv_bias": false,
    "rotary_embedding": false,
    "seq_op_in_fp32": false,
    "sequence_op": "torch-softmax"
  },
  "init": {
    "type": "normal",
    "std": 0.02
  },
  "ffn_layer_frequency": 1,
  "skip_head_transform": true,
  "use_bias": false,
  "final_norm": true,
  "num_labels": null,
  "classification_head": {
    "pooler": "avg",
    "include_ff_layer": true,
    "head_dim": 1024,
    "nonlin": "Tanh",
    "classifier_dropout": 0.1
  }
}