{
  "_name_or_path": "microsoft/Phi-3-medium-128k-instruct",
  "architectures": [
    "Phi3ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "configuration_phi3.Phi3Config",
    "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM"
  },
  "bos_token_id": 1,
  "embd_pdrop": 0.0,
  "eos_token_id": 32000,
  "hidden_act": "silu",
  "hidden_size": 5120,
  "initializer_range": 0.02,
  "intermediate_size": 17920,
  "max_position_embeddings": 131072,
  "model_type": "phi3",
  "num_attention_heads": 40,
  "num_experts": 2,
  "num_experts_per_tok": 1,
  "num_hidden_layers": 40,
  "num_key_value_heads": 10,
  "num_local_experts": 8,
  "original_max_position_embeddings": 4096,
  "output_router_logits": false,
  "resid_pdrop": 0.0,
  "rms_norm_eps": 1e-05,
  "rope_scaling": {
    "long_factor": [
      1.0,
      1.0,
      1.0,
      1.0,
      1.0,
      1.0,
      1.0,
      1.0,
      1.0,
      1.0,
      1.0,
      1.0,
      1.0,
      1.25,
      1.25,
      1.5,
      2.0,
      2.75,
      5.75,
      5.75,
      6.5,
      9.25,
      11.0,
      13.25,
      19.25,
      19.75,
      19.75,
      21.25,
      21.5,
      26.5,
      30.0,
      33.75,
      35.25,
      38.5,
      42.0,
      42.25,
      46.0,
      47.0,
      50.0,
      50.5,
      51.0,
      52.0,
      52.75,
      53.75,
      54.75,
      57.0,
      57.25,
      58.5,
      59.25,
      59.5,
      62.0,
      62.5,
      62.75,
      63.25,
      63.25,
      63.25,
      63.75,
      64.0,
      64.0,
      64.25,
      64.5,
      64.5,
      65.0,
      65.0
    ],
    "short_factor": [
      1.0,
      1.0,
      1.0,
      1.0,
      1.0,
      1.0,
      1.01,
      1.02,
      1.02,
      1.04,
      1.04,
      1.07,
      1.07,
      1.1,
      1.3000000000000003,
      1.3000000000000003,
      1.5000000000000004,
      1.5700000000000005,
      1.9000000000000008,
      2.3100000000000014,
      2.759999999999992,
      3.3899999999999784,
      3.9399999999999666,
      4.009999999999965,
      4.289999999999959,
      4.349999999999958,
      5.349999999999937,
      6.659999999999909,
      7.029999999999901,
      7.51999999999989,
      8.00999999999988,
      8.249999999999876,
      8.279999999999875,
      9.629999999999846,
      9.89999999999984,
      10.589999999999826,
      11.049999999999816,
      11.7899999999998,
      12.189999999999792,
      12.889999999999777,
      13.129999999999772,
      13.16999999999977,
      13.20999999999977,
      13.479999999999764,
      13.539999999999763,
      13.779999999999758,
      13.929999999999755,
      14.429999999999744,
      14.759999999999737,
      15.149999999999729,
      15.419999999999723,
      15.53999999999972,
      15.659999999999718,
      15.749999999999716,
      15.759999999999716,
      15.799999999999715,
      16.05999999999971,
      16.079999999999714,
      16.11999999999972,
      16.11999999999972,
      16.18999999999973,
      16.31999999999975,
      16.539999999999786,
      16.799999999999827
    ],
    "type": "su"
  },
  "rope_theta": 10000.0,
  "router_aux_loss_coef": 0.001,
  "router_jitter_noise": 0.0,
  "sliding_window": null,
  "tie_word_embeddings": false,
  "torch_dtype": "float16",
  "transformers_version": "4.41.1",
  "use_cache": true,
  "vocab_size": 32064
}
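For reference, a minimal loading sketch that consumes this config through the Hugging Face transformers API. The model id comes from "_name_or_path" above, and trust_remote_code lets the Auto* classes resolve the "auto_map" entries; the dtype and device placement shown are illustrative assumptions, not requirements.

```python
# Minimal sketch: loading the model this config describes, assuming
# transformers >= 4.41 (see "transformers_version" above) and enough
# memory for a ~14B-parameter model.
import torch
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

model_id = "microsoft/Phi-3-medium-128k-instruct"  # from "_name_or_path"

# trust_remote_code=True follows the "auto_map" entries to
# configuration_phi3.Phi3Config / modeling_phi3.Phi3ForCausalLM.
config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
assert config.hidden_size == 5120 and config.num_hidden_layers == 40

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,  # matches "torch_dtype" above
    device_map="auto",          # illustrative placement choice
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
```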
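The "su" (LongRoPE-style) rope_scaling block carries one factor per rotary frequency: head_dim = hidden_size / num_attention_heads = 5120 / 40 = 128, so each list has head_dim / 2 = 64 entries. "short_factor" applies at or below "original_max_position_embeddings" (4096 tokens) and "long_factor" beyond it. Below is a sketch of how these factors are typically applied, modeled on the pattern of the reference Phi-3 rotary-embedding code; treat the details as an assumption, with the shipped modeling_phi3.py being authoritative.

```python
# Sketch of "su"-type RoPE scaling under the assumptions stated above.
import math
import torch

head_dim = 5120 // 40   # hidden_size / num_attention_heads = 128
base = 10000.0          # rope_theta
max_pos = 131072        # max_position_embeddings
orig_max_pos = 4096     # original_max_position_embeddings

def scaled_inv_freq(factors: torch.Tensor) -> torch.Tensor:
    """Standard RoPE inverse frequencies, each divided by its per-dim factor.

    `factors` is the 64-entry long_factor or short_factor list above,
    chosen by whether the sequence length exceeds orig_max_pos.
    """
    exponents = torch.arange(0, head_dim, 2, dtype=torch.float32) / head_dim
    return 1.0 / (factors * base ** exponents)

# When the window is extended past the original 4096 positions, cos/sin are
# additionally rescaled by a fixed magnitude-correction factor:
scale = max_pos / orig_max_pos  # 32x extension
attn_factor = math.sqrt(1 + math.log(scale) / math.log(orig_max_pos))
print(round(attn_factor, 4))    # ~1.1902
```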