{ "activation": "relu", "bias": true, "d_model": 512, "dropout": 0.2, "hidden_dim": 2048, "mlp": "MLP", "num_heads": 32, "num_kv_heads": 0, "num_layers": 32, "seq_len": 256, "vocab_size": 50257, "weight_tying": false }