UltraFastBERT-1x11-long / arch_budget_hours_24.json

Upload 11 files

8c43fc6 about 1 year ago

1.47 kB

	{
	"architectures": [
	"ScriptableCrammedBERT"
	],
	"num_transformer_layers": 16,
	"intermed_type": "fff",
	"hidden_size": 768,
	"intermed_size": 1,
	"intermed_depth": 11,
	"hidden_dropout_prob": 0.1,
	"norm": "LayerNorm",
	"norm_eps": 1e-12,
	"norm_scheme": "pre",
	"nonlin": "GELU",
	"tie_weights": true,
	"decoder_bias": false,
	"sparse_prediction": 0.15,
	"loss": "cross-entropy",
	"objective_layout": "MLM",
	"embedding": {
	"vocab_size": null,
	"pos_embedding": "scaled-sinusoidal",
	"dropout_prob": 0.1,
	"pad_token_id": 0,
	"max_seq_length": 128,
	"embedding_dim": 768,
	"normalization": true,
	"stable_low_precision": false
	},
	"attention": {
	"type": "self-attention",
	"causal_attention": false,
	"num_attention_heads": 12,
	"dropout_prob": 0.1,
	"skip_output_projection": false,
	"qkv_bias": false,
	"rotary_embedding": false,
	"seq_op_in_fp32": false,
	"sequence_op": "torch-softmax"
	},
	"init": {
	"type": "normal",
	"std": 0.02
	},
	"ffn_layer_frequency": 1,
	"skip_head_transform": true,
	"use_bias": false,
	"final_norm": true,
	"num_labels": null,
	"classification_head": {
	"pooler": "avg",
	"include_ff_layer": true,
	"head_dim": 1024,
	"nonlin": "Tanh",
	"classifier_dropout": 0.1
	}
	}