Upload folder using huggingface_hub
#1
by
Qubitium
- opened
- config.json +44 -0
- model-00001-of-00009.safetensors +3 -0
- model-00002-of-00009.safetensors +3 -0
- model-00003-of-00009.safetensors +3 -0
- model-00004-of-00009.safetensors +3 -0
- model-00005-of-00009.safetensors +3 -0
- model-00006-of-00009.safetensors +3 -0
- model-00007-of-00009.safetensors +3 -0
- model-00008-of-00009.safetensors +3 -0
- model-00009-of-00009.safetensors +3 -0
- model.safetensors.index.json +0 -0
- quant_log.json +1 -0
- quantize_config.json +17 -0
- special_tokens_map.json +23 -0
- tokenizer.json +0 -0
- tokenizer.model +3 -0
- tokenizer_config.json +0 -0
config.json
ADDED
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "/monster/data/model/Mistral-Large-Instruct-2407/",
|
3 |
+
"architectures": [
|
4 |
+
"MistralForCausalLM"
|
5 |
+
],
|
6 |
+
"attention_dropout": 0.0,
|
7 |
+
"bos_token_id": 1,
|
8 |
+
"eos_token_id": 2,
|
9 |
+
"head_dim": 128,
|
10 |
+
"hidden_act": "silu",
|
11 |
+
"hidden_size": 12288,
|
12 |
+
"initializer_range": 0.02,
|
13 |
+
"intermediate_size": 28672,
|
14 |
+
"max_position_embeddings": 131072,
|
15 |
+
"model_type": "mistral",
|
16 |
+
"num_attention_heads": 96,
|
17 |
+
"num_hidden_layers": 88,
|
18 |
+
"num_key_value_heads": 8,
|
19 |
+
"quantization_config": {
|
20 |
+
"bits": 4,
|
21 |
+
"checkpoint_format": "gptq",
|
22 |
+
"damp_percent": 0.0025,
|
23 |
+
"desc_act": true,
|
24 |
+
"group_size": 128,
|
25 |
+
"lm_head": false,
|
26 |
+
"meta": {
|
27 |
+
"quantizer": "gptqmodel:0.9.10-dev0"
|
28 |
+
},
|
29 |
+
"model_file_base_name": null,
|
30 |
+
"model_name_or_path": null,
|
31 |
+
"quant_method": "gptq",
|
32 |
+
"static_groups": false,
|
33 |
+
"sym": true,
|
34 |
+
"true_sequential": true
|
35 |
+
},
|
36 |
+
"rms_norm_eps": 1e-05,
|
37 |
+
"rope_theta": 1000000.0,
|
38 |
+
"sliding_window": null,
|
39 |
+
"tie_word_embeddings": false,
|
40 |
+
"torch_dtype": "bfloat16",
|
41 |
+
"transformers_version": "4.43.1",
|
42 |
+
"use_cache": true,
|
43 |
+
"vocab_size": 32768
|
44 |
+
}
|
model-00001-of-00009.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e58da05ab1f347c6c0b65ed97b73e765f88ee6c573fe0b46b86b95de6a4c2de2
|
3 |
+
size 7995196168
|
model-00002-of-00009.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b0a4609c8ded8058e08f8226203ce52836a51d8f1a19ebdde59a4f676d561710
|
3 |
+
size 7999280768
|
model-00003-of-00009.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:cc147c2af6b31b185cdea3c2be713a1c5def2eabce1e4df3951a5bfa211e6906
|
3 |
+
size 7928288896
|
model-00004-of-00009.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7eac92eccde5af9af22349442e378f9ddbb2f34c8b7c4bad132dc1f589466318
|
3 |
+
size 7993611456
|
model-00005-of-00009.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ef2f8d4852b58fc402239f21fb3eb9c102710d695703eb45ad0a2c4b954ed1f7
|
3 |
+
size 7915115256
|
model-00006-of-00009.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:bb88077a3a9010b03316dc2dcfb1abdb46441a834dd0c5218e7f79fa2f3d84ba
|
3 |
+
size 7915115256
|
model-00007-of-00009.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c181a5ced75a73ddc7101dd4570cedfcbd4330203fa68b5d1b21dfbe415cb320
|
3 |
+
size 7915115256
|
model-00008-of-00009.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8862bf39a221eac40bc8e47ad631397740698ff5c3cda4713a70c77976e12cd2
|
3 |
+
size 7915115256
|
model-00009-of-00009.safetensors
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b0f929facf5319f0084dc517ef182b7d4f7275b959eb863f1dcafd9eac4c71a9
|
3 |
+
size 1354721008
|
model.safetensors.index.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
quant_log.json
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
[{"layer": 1, "module": "self_attn.k_proj", "avg_loss": "0.0025", "time": "3.9430"}, {"layer": 1, "module": "self_attn.v_proj", "avg_loss": "0.0000", "time": "3.5821"}, {"layer": 1, "module": "self_attn.q_proj", "avg_loss": "0.0079", "time": "3.7697"}, {"layer": 1, "module": "self_attn.o_proj", "avg_loss": "0.0000", "time": "3.6587"}, {"layer": 1, "module": "mlp.up_proj", "avg_loss": "0.0026", "time": "3.9051"}, {"layer": 1, "module": "mlp.gate_proj", "avg_loss": "0.0027", "time": "3.9021"}, {"layer": 1, "module": "mlp.down_proj", "avg_loss": "0.0000", "time": "12.7383"}, {"layer": 2, "module": "self_attn.k_proj", "avg_loss": "0.0028", "time": "3.5138"}, {"layer": 2, "module": "self_attn.v_proj", "avg_loss": "0.0001", "time": "3.5689"}, {"layer": 2, "module": "self_attn.q_proj", "avg_loss": "0.0168", "time": "3.6371"}, {"layer": 2, "module": "self_attn.o_proj", "avg_loss": "0.0000", "time": "3.7407"}, {"layer": 2, "module": "mlp.up_proj", "avg_loss": "0.0170", "time": "4.0165"}, {"layer": 2, "module": "mlp.gate_proj", "avg_loss": "0.0177", "time": "3.8721"}, {"layer": 2, "module": "mlp.down_proj", "avg_loss": "0.0000", "time": "11.9172"}, {"layer": 3, "module": "self_attn.k_proj", "avg_loss": "0.0041", "time": "3.4972"}, {"layer": 3, "module": "self_attn.v_proj", "avg_loss": "0.0003", "time": "3.4600"}, {"layer": 3, "module": "self_attn.q_proj", "avg_loss": "0.0221", "time": "3.6098"}, {"layer": 3, "module": "self_attn.o_proj", "avg_loss": "0.0000", "time": "3.7080"}, {"layer": 3, "module": "mlp.up_proj", "avg_loss": "0.0487", "time": "3.8764"}, {"layer": 3, "module": "mlp.gate_proj", "avg_loss": "0.0517", "time": "3.9636"}, {"layer": 3, "module": "mlp.down_proj", "avg_loss": "0.0123", "time": "11.6276"}, {"layer": 4, "module": "self_attn.k_proj", "avg_loss": "0.0115", "time": "3.5182"}, {"layer": 4, "module": "self_attn.v_proj", "avg_loss": "0.0035", "time": "3.4788"}, {"layer": 4, "module": "self_attn.q_proj", "avg_loss": "0.0860", "time": "3.6308"}, {"layer": 4, "module": "self_attn.o_proj", "avg_loss": "0.0000", "time": "3.6321"}, {"layer": 4, "module": "mlp.up_proj", "avg_loss": "0.0856", "time": "3.8590"}, {"layer": 4, "module": "mlp.gate_proj", "avg_loss": "0.0931", "time": "3.8134"}, {"layer": 4, "module": "mlp.down_proj", "avg_loss": "0.0000", "time": "11.7583"}, {"layer": 5, "module": "self_attn.k_proj", "avg_loss": "0.0146", "time": "3.4929"}, {"layer": 5, "module": "self_attn.v_proj", "avg_loss": "0.0048", "time": "3.3852"}, {"layer": 5, "module": "self_attn.q_proj", "avg_loss": "0.1096", "time": "3.6727"}, {"layer": 5, "module": "self_attn.o_proj", "avg_loss": "0.0000", "time": "3.6602"}, {"layer": 5, "module": "mlp.up_proj", "avg_loss": "0.1057", "time": "3.8731"}, {"layer": 5, "module": "mlp.gate_proj", "avg_loss": "0.1108", "time": "3.9829"}, {"layer": 5, "module": "mlp.down_proj", "avg_loss": "0.0000", "time": "11.7367"}, {"layer": 6, "module": "self_attn.k_proj", "avg_loss": "0.0261", "time": "3.5531"}, {"layer": 6, "module": "self_attn.v_proj", "avg_loss": "0.0083", "time": "3.5269"}, {"layer": 6, "module": "self_attn.q_proj", "avg_loss": "0.1931", "time": "3.6740"}, {"layer": 6, "module": "self_attn.o_proj", "avg_loss": "0.0000", "time": "3.7331"}, {"layer": 6, "module": "mlp.up_proj", "avg_loss": "0.1370", "time": "3.9503"}, {"layer": 6, "module": "mlp.gate_proj", "avg_loss": "0.1435", "time": "3.9085"}, {"layer": 6, "module": "mlp.down_proj", "avg_loss": "0.0000", "time": "11.4534"}, {"layer": 7, "module": "self_attn.k_proj", "avg_loss": "0.0281", "time": "3.6149"}, {"layer": 7, "module": "self_attn.v_proj", "avg_loss": "0.0106", "time": "3.5840"}, {"layer": 7, "module": "self_attn.q_proj", "avg_loss": "0.2135", "time": "3.7308"}, {"layer": 7, "module": "self_attn.o_proj", "avg_loss": "0.0000", "time": "3.7764"}, {"layer": 7, "module": "mlp.up_proj", "avg_loss": "0.1612", "time": "4.0147"}, {"layer": 7, "module": "mlp.gate_proj", "avg_loss": "0.1687", "time": "3.9550"}, {"layer": 7, "module": "mlp.down_proj", "avg_loss": "0.0000", "time": "11.8158"}, {"layer": 8, "module": "self_attn.k_proj", "avg_loss": "0.0323", "time": "3.6679"}, {"layer": 8, "module": "self_attn.v_proj", "avg_loss": "0.0109", "time": "3.5058"}, {"layer": 8, "module": "self_attn.q_proj", "avg_loss": "0.2263", "time": "3.6912"}, {"layer": 8, "module": "self_attn.o_proj", "avg_loss": "0.0000", "time": "3.6699"}, {"layer": 8, "module": "mlp.up_proj", "avg_loss": "0.2145", "time": "4.0389"}, {"layer": 8, "module": "mlp.gate_proj", "avg_loss": "0.2268", "time": "3.8976"}, {"layer": 8, "module": "mlp.down_proj", "avg_loss": "0.0001", "time": "11.8285"}, {"layer": 9, "module": "self_attn.k_proj", "avg_loss": "0.0461", "time": "3.5866"}, {"layer": 9, "module": "self_attn.v_proj", "avg_loss": "0.0137", "time": "3.5351"}, {"layer": 9, "module": "self_attn.q_proj", "avg_loss": "0.3110", "time": "3.7307"}, {"layer": 9, "module": "self_attn.o_proj", "avg_loss": "0.0000", "time": "3.7076"}, {"layer": 9, "module": "mlp.up_proj", "avg_loss": "0.2721", "time": "3.9269"}, {"layer": 9, "module": "mlp.gate_proj", "avg_loss": "0.2882", "time": "3.8941"}, {"layer": 9, "module": "mlp.down_proj", "avg_loss": "0.0001", "time": "11.8393"}, {"layer": 10, "module": "self_attn.k_proj", "avg_loss": "0.0505", "time": "3.6592"}, {"layer": 10, "module": "self_attn.v_proj", "avg_loss": "0.0157", "time": "3.6651"}, {"layer": 10, "module": "self_attn.q_proj", "avg_loss": "0.3526", "time": "3.7701"}, {"layer": 10, "module": "self_attn.o_proj", "avg_loss": "0.0000", "time": "3.7747"}, {"layer": 10, "module": "mlp.up_proj", "avg_loss": "0.3309", "time": "4.0116"}, {"layer": 10, "module": "mlp.gate_proj", "avg_loss": "0.3639", "time": "3.8496"}, {"layer": 10, "module": "mlp.down_proj", "avg_loss": "0.0001", "time": "11.7583"}, {"layer": 11, "module": "self_attn.k_proj", "avg_loss": "0.0347", "time": "3.6636"}, {"layer": 11, "module": "self_attn.v_proj", "avg_loss": "0.0128", "time": "3.5234"}, {"layer": 11, "module": "self_attn.q_proj", "avg_loss": "0.2627", "time": "3.7812"}, {"layer": 11, "module": "self_attn.o_proj", "avg_loss": "0.0000", "time": "3.7428"}, {"layer": 11, "module": "mlp.up_proj", "avg_loss": "0.3212", "time": "3.8970"}, {"layer": 11, "module": "mlp.gate_proj", "avg_loss": "0.3520", "time": "3.8134"}, {"layer": 11, "module": "mlp.down_proj", "avg_loss": "0.0001", "time": "11.7106"}, {"layer": 12, "module": "self_attn.k_proj", "avg_loss": "0.0826", "time": "3.5860"}, {"layer": 12, "module": "self_attn.v_proj", "avg_loss": "0.0214", "time": "3.5148"}, {"layer": 12, "module": "self_attn.q_proj", "avg_loss": "0.5046", "time": "3.7010"}, {"layer": 12, "module": "self_attn.o_proj", "avg_loss": "0.0000", "time": "3.6910"}, {"layer": 12, "module": "mlp.up_proj", "avg_loss": "0.3613", "time": "3.9269"}, {"layer": 12, "module": "mlp.gate_proj", "avg_loss": "0.4272", "time": "3.8697"}, {"layer": 12, "module": "mlp.down_proj", "avg_loss": "0.0001", "time": "11.6295"}, {"layer": 13, "module": "self_attn.k_proj", "avg_loss": "0.0565", "time": "3.6458"}, {"layer": 13, "module": "self_attn.v_proj", "avg_loss": "0.0216", "time": "3.5546"}, {"layer": 13, "module": "self_attn.q_proj", "avg_loss": "0.4277", "time": "3.6714"}, {"layer": 13, "module": "self_attn.o_proj", "avg_loss": "0.0000", "time": "3.6126"}, {"layer": 13, "module": "mlp.up_proj", "avg_loss": "0.4043", "time": "4.0138"}, {"layer": 13, "module": "mlp.gate_proj", "avg_loss": "0.4465", "time": "3.9006"}, {"layer": 13, "module": "mlp.down_proj", "avg_loss": "0.0001", "time": "11.8862"}, {"layer": 14, "module": "self_attn.k_proj", "avg_loss": "0.0949", "time": "3.6098"}, {"layer": 14, "module": "self_attn.v_proj", "avg_loss": "0.0312", "time": "3.5785"}, {"layer": 14, "module": "self_attn.q_proj", "avg_loss": "0.6491", "time": "3.6542"}, {"layer": 14, "module": "self_attn.o_proj", "avg_loss": "0.0000", "time": "3.6990"}, {"layer": 14, "module": "mlp.up_proj", "avg_loss": "0.4534", "time": "3.9758"}, {"layer": 14, "module": "mlp.gate_proj", "avg_loss": "0.4915", "time": "3.9699"}, {"layer": 14, "module": "mlp.down_proj", "avg_loss": "0.0001", "time": "11.6193"}, {"layer": 15, "module": "self_attn.k_proj", "avg_loss": "0.0739", "time": "3.5077"}, {"layer": 15, "module": "self_attn.v_proj", "avg_loss": "0.0272", "time": "3.4824"}, {"layer": 15, "module": "self_attn.q_proj", "avg_loss": "0.5153", "time": "3.6961"}, {"layer": 15, "module": "self_attn.o_proj", "avg_loss": "0.0000", "time": "3.6840"}, {"layer": 15, "module": "mlp.up_proj", "avg_loss": "0.4634", "time": "3.8446"}, {"layer": 15, "module": "mlp.gate_proj", "avg_loss": "0.4965", "time": "3.8595"}, {"layer": 15, "module": "mlp.down_proj", "avg_loss": "0.0002", "time": "11.7244"}, {"layer": 16, "module": "self_attn.k_proj", "avg_loss": "0.0650", "time": "3.5181"}, {"layer": 16, "module": "self_attn.v_proj", "avg_loss": "0.0267", "time": "3.5330"}, {"layer": 16, "module": "self_attn.q_proj", "avg_loss": "0.4788", "time": "3.7050"}, {"layer": 16, "module": "self_attn.o_proj", "avg_loss": "0.0000", "time": "3.7908"}, {"layer": 16, "module": "mlp.up_proj", "avg_loss": "0.5263", "time": "3.9376"}, {"layer": 16, "module": "mlp.gate_proj", "avg_loss": "0.5800", "time": "4.0861"}, {"layer": 16, "module": "mlp.down_proj", "avg_loss": "0.0002", "time": "11.6976"}, {"layer": 17, "module": "self_attn.k_proj", "avg_loss": "0.1015", "time": "3.5787"}, {"layer": 17, "module": "self_attn.v_proj", "avg_loss": "0.0326", "time": "3.5316"}, {"layer": 17, "module": "self_attn.q_proj", "avg_loss": "0.6401", "time": "3.5908"}, {"layer": 17, "module": "self_attn.o_proj", "avg_loss": "0.0001", "time": "3.8021"}, {"layer": 17, "module": "mlp.up_proj", "avg_loss": "0.6475", "time": "4.1244"}, {"layer": 17, "module": "mlp.gate_proj", "avg_loss": "0.6981", "time": "3.9328"}, {"layer": 17, "module": "mlp.down_proj", "avg_loss": "0.0003", "time": "11.7255"}, {"layer": 18, "module": "self_attn.k_proj", "avg_loss": "0.1059", "time": "3.6481"}, {"layer": 18, "module": "self_attn.v_proj", "avg_loss": "0.0395", "time": "3.5398"}, {"layer": 18, "module": "self_attn.q_proj", "avg_loss": "0.7317", "time": "3.6736"}, {"layer": 18, "module": "self_attn.o_proj", "avg_loss": "0.0001", "time": "3.6573"}, {"layer": 18, "module": "mlp.up_proj", "avg_loss": "0.7629", "time": "3.9780"}, {"layer": 18, "module": "mlp.gate_proj", "avg_loss": "0.8354", "time": "3.9255"}, {"layer": 18, "module": "mlp.down_proj", "avg_loss": "0.0004", "time": "11.7254"}, {"layer": 19, "module": "self_attn.k_proj", "avg_loss": "0.1430", "time": "3.6487"}, {"layer": 19, "module": "self_attn.v_proj", "avg_loss": "0.0407", "time": "3.5973"}, {"layer": 19, "module": "self_attn.q_proj", "avg_loss": "0.8637", "time": "3.6511"}, {"layer": 19, "module": "self_attn.o_proj", "avg_loss": "0.0001", "time": "3.7463"}, {"layer": 19, "module": "mlp.up_proj", "avg_loss": "0.9320", "time": "4.0624"}, {"layer": 19, "module": "mlp.gate_proj", "avg_loss": "1.0366", "time": "4.0140"}, {"layer": 19, "module": "mlp.down_proj", "avg_loss": "0.0005", "time": "12.2197"}, {"layer": 20, "module": "self_attn.k_proj", "avg_loss": "0.1745", "time": "3.7125"}, {"layer": 20, "module": "self_attn.v_proj", "avg_loss": "0.0483", "time": "3.5988"}, {"layer": 20, "module": "self_attn.q_proj", "avg_loss": "0.8670", "time": "3.7990"}, {"layer": 20, "module": "self_attn.o_proj", "avg_loss": "0.0001", "time": "3.7153"}, {"layer": 20, "module": "mlp.up_proj", "avg_loss": "1.0832", "time": "3.9766"}, {"layer": 20, "module": "mlp.gate_proj", "avg_loss": "1.1906", "time": "3.9694"}, {"layer": 20, "module": "mlp.down_proj", "avg_loss": "0.0005", "time": "11.7890"}, {"layer": 21, "module": "self_attn.k_proj", "avg_loss": "0.1835", "time": "3.6087"}, {"layer": 21, "module": "self_attn.v_proj", "avg_loss": "0.0546", "time": "3.5776"}, {"layer": 21, "module": "self_attn.q_proj", "avg_loss": "0.9983", "time": "3.7362"}, {"layer": 21, "module": "self_attn.o_proj", "avg_loss": "0.0001", "time": "3.7335"}, {"layer": 21, "module": "mlp.up_proj", "avg_loss": "1.2314", "time": "3.9565"}, {"layer": 21, "module": "mlp.gate_proj", "avg_loss": "1.3457", "time": "3.9273"}, {"layer": 21, "module": "mlp.down_proj", "avg_loss": "0.0006", "time": "11.8717"}, {"layer": 22, "module": "self_attn.k_proj", "avg_loss": "0.2266", "time": "3.6553"}, {"layer": 22, "module": "self_attn.v_proj", "avg_loss": "0.0579", "time": "3.5986"}, {"layer": 22, "module": "self_attn.q_proj", "avg_loss": "1.1286", "time": "3.7292"}, {"layer": 22, "module": "self_attn.o_proj", "avg_loss": "0.0001", "time": "3.7200"}, {"layer": 22, "module": "mlp.up_proj", "avg_loss": "1.3790", "time": "3.9985"}, {"layer": 22, "module": "mlp.gate_proj", "avg_loss": "1.5122", "time": "3.8870"}, {"layer": 22, "module": "mlp.down_proj", "avg_loss": "0.0007", "time": "11.7676"}, {"layer": 23, "module": "self_attn.k_proj", "avg_loss": "0.2982", "time": "3.5428"}, {"layer": 23, "module": "self_attn.v_proj", "avg_loss": "0.0690", "time": "3.5895"}, {"layer": 23, "module": "self_attn.q_proj", "avg_loss": "1.2860", "time": "3.7150"}, {"layer": 23, "module": "self_attn.o_proj", "avg_loss": "0.0001", "time": "3.6924"}, {"layer": 23, "module": "mlp.up_proj", "avg_loss": "1.5459", "time": "3.9762"}, {"layer": 23, "module": "mlp.gate_proj", "avg_loss": "1.6938", "time": "3.8419"}, {"layer": 23, "module": "mlp.down_proj", "avg_loss": "0.0008", "time": "11.8584"}, {"layer": 24, "module": "self_attn.k_proj", "avg_loss": "0.2844", "time": "3.6613"}, {"layer": 24, "module": "self_attn.v_proj", "avg_loss": "0.0691", "time": "3.5066"}, {"layer": 24, "module": "self_attn.q_proj", "avg_loss": "1.2747", "time": "3.7642"}, {"layer": 24, "module": "self_attn.o_proj", "avg_loss": "0.0002", "time": "3.8202"}, {"layer": 24, "module": "mlp.up_proj", "avg_loss": "1.6841", "time": "4.0549"}, {"layer": 24, "module": "mlp.gate_proj", "avg_loss": "1.8360", "time": "3.9242"}, {"layer": 24, "module": "mlp.down_proj", "avg_loss": "0.0009", "time": "12.3400"}, {"layer": 25, "module": "self_attn.k_proj", "avg_loss": "0.2917", "time": "3.4928"}, {"layer": 25, "module": "self_attn.v_proj", "avg_loss": "0.0715", "time": "3.4684"}, {"layer": 25, "module": "self_attn.q_proj", "avg_loss": "1.3385", "time": "3.7036"}, {"layer": 25, "module": "self_attn.o_proj", "avg_loss": "0.0002", "time": "3.6985"}, {"layer": 25, "module": "mlp.up_proj", "avg_loss": "1.8068", "time": "3.9971"}, {"layer": 25, "module": "mlp.gate_proj", "avg_loss": "1.9682", "time": "3.9406"}, {"layer": 25, "module": "mlp.down_proj", "avg_loss": "0.0011", "time": "11.7741"}, {"layer": 26, "module": "self_attn.k_proj", "avg_loss": "0.3805", "time": "3.6929"}, {"layer": 26, "module": "self_attn.v_proj", "avg_loss": "0.0794", "time": "3.6030"}, {"layer": 26, "module": "self_attn.q_proj", "avg_loss": "1.6878", "time": "3.6637"}, {"layer": 26, "module": "self_attn.o_proj", "avg_loss": "0.0003", "time": "3.7181"}, {"layer": 26, "module": "mlp.up_proj", "avg_loss": "1.9025", "time": "3.9762"}, {"layer": 26, "module": "mlp.gate_proj", "avg_loss": "2.0636", "time": "3.9290"}, {"layer": 26, "module": "mlp.down_proj", "avg_loss": "0.0012", "time": "12.0235"}, {"layer": 27, "module": "self_attn.k_proj", "avg_loss": "0.3337", "time": "3.5478"}, {"layer": 27, "module": "self_attn.v_proj", "avg_loss": "0.0553", "time": "3.5094"}, {"layer": 27, "module": "self_attn.q_proj", "avg_loss": "1.3671", "time": "3.6409"}, {"layer": 27, "module": "self_attn.o_proj", "avg_loss": "0.0003", "time": "3.7515"}, {"layer": 27, "module": "mlp.up_proj", "avg_loss": "2.0408", "time": "3.9865"}, {"layer": 27, "module": "mlp.gate_proj", "avg_loss": "2.2171", "time": "3.8970"}, {"layer": 27, "module": "mlp.down_proj", "avg_loss": "0.0014", "time": "11.8036"}, {"layer": 28, "module": "self_attn.k_proj", "avg_loss": "0.3945", "time": "3.6690"}, {"layer": 28, "module": "self_attn.v_proj", "avg_loss": "0.0678", "time": "3.5868"}, {"layer": 28, "module": "self_attn.q_proj", "avg_loss": "1.5244", "time": "3.7090"}, {"layer": 28, "module": "self_attn.o_proj", "avg_loss": "0.0003", "time": "3.7004"}, {"layer": 28, "module": "mlp.up_proj", "avg_loss": "2.1207", "time": "3.9975"}, {"layer": 28, "module": "mlp.gate_proj", "avg_loss": "2.2865", "time": "4.1420"}, {"layer": 28, "module": "mlp.down_proj", "avg_loss": "0.0016", "time": "11.8675"}, {"layer": 29, "module": "self_attn.k_proj", "avg_loss": "0.3457", "time": "3.6036"}, {"layer": 29, "module": "self_attn.v_proj", "avg_loss": "0.0690", "time": "3.6355"}, {"layer": 29, "module": "self_attn.q_proj", "avg_loss": "1.4106", "time": "3.6846"}, {"layer": 29, "module": "self_attn.o_proj", "avg_loss": "0.0004", "time": "4.5707"}, {"layer": 29, "module": "mlp.up_proj", "avg_loss": "2.1569", "time": "3.9368"}, {"layer": 29, "module": "mlp.gate_proj", "avg_loss": "2.3140", "time": "4.0804"}, {"layer": 29, "module": "mlp.down_proj", "avg_loss": "0.0018", "time": "12.0573"}, {"layer": 30, "module": "self_attn.k_proj", "avg_loss": "0.3163", "time": "3.5912"}, {"layer": 30, "module": "self_attn.v_proj", "avg_loss": "0.0520", "time": "3.5652"}, {"layer": 30, "module": "self_attn.q_proj", "avg_loss": "1.3983", "time": "3.6971"}, {"layer": 30, "module": "self_attn.o_proj", "avg_loss": "0.0005", "time": "3.7531"}, {"layer": 30, "module": "mlp.up_proj", "avg_loss": "2.0575", "time": "3.9372"}, {"layer": 30, "module": "mlp.gate_proj", "avg_loss": "2.1877", "time": "3.9539"}, {"layer": 30, "module": "mlp.down_proj", "avg_loss": "0.0020", "time": "11.8149"}, {"layer": 31, "module": "self_attn.k_proj", "avg_loss": "0.4881", "time": "3.6033"}, {"layer": 31, "module": "self_attn.v_proj", "avg_loss": "0.0568", "time": "3.5511"}, {"layer": 31, "module": "self_attn.q_proj", "avg_loss": "1.7629", "time": "3.7402"}, {"layer": 31, "module": "self_attn.o_proj", "avg_loss": "0.0009", "time": "3.7413"}, {"layer": 31, "module": "mlp.up_proj", "avg_loss": "2.1317", "time": "4.0000"}, {"layer": 31, "module": "mlp.gate_proj", "avg_loss": "2.2585", "time": "3.8814"}, {"layer": 31, "module": "mlp.down_proj", "avg_loss": "0.0022", "time": "11.8026"}, {"layer": 32, "module": "self_attn.k_proj", "avg_loss": "0.4483", "time": "3.6030"}, {"layer": 32, "module": "self_attn.v_proj", "avg_loss": "0.0727", "time": "3.5339"}, {"layer": 32, "module": "self_attn.q_proj", "avg_loss": "1.8765", "time": "3.7153"}, {"layer": 32, "module": "self_attn.o_proj", "avg_loss": "0.0008", "time": "3.8006"}, {"layer": 32, "module": "mlp.up_proj", "avg_loss": "2.3229", "time": "4.0355"}, {"layer": 32, "module": "mlp.gate_proj", "avg_loss": "2.4635", "time": "3.9204"}, {"layer": 32, "module": "mlp.down_proj", "avg_loss": "0.0024", "time": "11.8216"}, {"layer": 33, "module": "self_attn.k_proj", "avg_loss": "0.5678", "time": "3.5985"}, {"layer": 33, "module": "self_attn.v_proj", "avg_loss": "0.0776", "time": "3.4685"}, {"layer": 33, "module": "self_attn.q_proj", "avg_loss": "2.1782", "time": "3.6328"}, {"layer": 33, "module": "self_attn.o_proj", "avg_loss": "0.0011", "time": "3.7313"}, {"layer": 33, "module": "mlp.up_proj", "avg_loss": "2.4188", "time": "4.2000"}, {"layer": 33, "module": "mlp.gate_proj", "avg_loss": "2.5342", "time": "4.1106"}, {"layer": 33, "module": "mlp.down_proj", "avg_loss": "0.0026", "time": "13.9096"}, {"layer": 34, "module": "self_attn.k_proj", "avg_loss": "0.5044", "time": "3.6018"}, {"layer": 34, "module": "self_attn.v_proj", "avg_loss": "0.0982", "time": "3.5662"}, {"layer": 34, "module": "self_attn.q_proj", "avg_loss": "2.1173", "time": "3.6320"}, {"layer": 34, "module": "self_attn.o_proj", "avg_loss": "0.0011", "time": "3.8298"}, {"layer": 34, "module": "mlp.up_proj", "avg_loss": "2.5169", "time": "3.9818"}, {"layer": 34, "module": "mlp.gate_proj", "avg_loss": "2.6281", "time": "3.9163"}, {"layer": 34, "module": "mlp.down_proj", "avg_loss": "0.0026", "time": "11.9129"}, {"layer": 35, "module": "self_attn.k_proj", "avg_loss": "0.6986", "time": "3.6367"}, {"layer": 35, "module": "self_attn.v_proj", "avg_loss": "0.0888", "time": "3.6052"}, {"layer": 35, "module": "self_attn.q_proj", "avg_loss": "2.4523", "time": "3.7002"}, {"layer": 35, "module": "self_attn.o_proj", "avg_loss": "0.0010", "time": "3.7958"}, {"layer": 35, "module": "mlp.up_proj", "avg_loss": "2.6986", "time": "4.0462"}, {"layer": 35, "module": "mlp.gate_proj", "avg_loss": "2.8242", "time": "3.8969"}, {"layer": 35, "module": "mlp.down_proj", "avg_loss": "0.0029", "time": "11.8783"}, {"layer": 36, "module": "self_attn.k_proj", "avg_loss": "0.6310", "time": "3.5755"}, {"layer": 36, "module": "self_attn.v_proj", "avg_loss": "0.0801", "time": "3.5192"}, {"layer": 36, "module": "self_attn.q_proj", "avg_loss": "2.2166", "time": "3.6436"}, {"layer": 36, "module": "self_attn.o_proj", "avg_loss": "0.0011", "time": "3.7524"}, {"layer": 36, "module": "mlp.up_proj", "avg_loss": "2.7669", "time": "4.2172"}, {"layer": 36, "module": "mlp.gate_proj", "avg_loss": "2.8966", "time": "3.9530"}, {"layer": 36, "module": "mlp.down_proj", "avg_loss": "0.0030", "time": "11.8029"}, {"layer": 37, "module": "self_attn.k_proj", "avg_loss": "0.6704", "time": "3.6502"}, {"layer": 37, "module": "self_attn.v_proj", "avg_loss": "0.0927", "time": "3.5242"}, {"layer": 37, "module": "self_attn.q_proj", "avg_loss": "2.4231", "time": "3.6496"}, {"layer": 37, "module": "self_attn.o_proj", "avg_loss": "0.0013", "time": "3.8203"}, {"layer": 37, "module": "mlp.up_proj", "avg_loss": "2.7531", "time": "4.0241"}, {"layer": 37, "module": "mlp.gate_proj", "avg_loss": "2.8480", "time": "3.9566"}, {"layer": 37, "module": "mlp.down_proj", "avg_loss": "0.0032", "time": "11.9351"}, {"layer": 38, "module": "self_attn.k_proj", "avg_loss": "0.7999", "time": "3.7549"}, {"layer": 38, "module": "self_attn.v_proj", "avg_loss": "0.1089", "time": "3.5996"}, {"layer": 38, "module": "self_attn.q_proj", "avg_loss": "2.9220", "time": "3.7424"}, {"layer": 38, "module": "self_attn.o_proj", "avg_loss": "0.0014", "time": "4.4659"}, {"layer": 38, "module": "mlp.up_proj", "avg_loss": "2.9611", "time": "4.1480"}, {"layer": 38, "module": "mlp.gate_proj", "avg_loss": "3.0881", "time": "4.0150"}, {"layer": 38, "module": "mlp.down_proj", "avg_loss": "0.0033", "time": "11.8780"}, {"layer": 39, "module": "self_attn.k_proj", "avg_loss": "0.6765", "time": "3.6428"}, {"layer": 39, "module": "self_attn.v_proj", "avg_loss": "0.0777", "time": "3.6088"}, {"layer": 39, "module": "self_attn.q_proj", "avg_loss": "2.4000", "time": "3.6312"}, {"layer": 39, "module": "self_attn.o_proj", "avg_loss": "0.0011", "time": "3.6542"}, {"layer": 39, "module": "mlp.up_proj", "avg_loss": "2.9237", "time": "4.0044"}, {"layer": 39, "module": "mlp.gate_proj", "avg_loss": "3.0770", "time": "4.0230"}, {"layer": 39, "module": "mlp.down_proj", "avg_loss": "0.0033", "time": "12.1243"}, {"layer": 40, "module": "self_attn.k_proj", "avg_loss": "0.8040", "time": "3.5859"}, {"layer": 40, "module": "self_attn.v_proj", "avg_loss": "0.0909", "time": "3.6230"}, {"layer": 40, "module": "self_attn.q_proj", "avg_loss": "2.7060", "time": "3.6770"}, {"layer": 40, "module": "self_attn.o_proj", "avg_loss": "0.0011", "time": "3.7613"}, {"layer": 40, "module": "mlp.up_proj", "avg_loss": "2.9374", "time": "4.0460"}, {"layer": 40, "module": "mlp.gate_proj", "avg_loss": "3.1342", "time": "3.9735"}, {"layer": 40, "module": "mlp.down_proj", "avg_loss": "0.0033", "time": "11.7667"}, {"layer": 41, "module": "self_attn.k_proj", "avg_loss": "0.8877", "time": "3.6561"}, {"layer": 41, "module": "self_attn.v_proj", "avg_loss": "0.0950", "time": "3.6716"}, {"layer": 41, "module": "self_attn.q_proj", "avg_loss": "2.6594", "time": "3.8057"}, {"layer": 41, "module": "self_attn.o_proj", "avg_loss": "0.0012", "time": "3.9191"}, {"layer": 41, "module": "mlp.up_proj", "avg_loss": "2.8836", "time": "4.2024"}, {"layer": 41, "module": "mlp.gate_proj", "avg_loss": "3.1162", "time": "3.9432"}, {"layer": 41, "module": "mlp.down_proj", "avg_loss": "0.0033", "time": "11.7886"}, {"layer": 42, "module": "self_attn.k_proj", "avg_loss": "0.7239", "time": "3.6527"}, {"layer": 42, "module": "self_attn.v_proj", "avg_loss": "0.1037", "time": "3.6142"}, {"layer": 42, "module": "self_attn.q_proj", "avg_loss": "2.4116", "time": "3.6570"}, {"layer": 42, "module": "self_attn.o_proj", "avg_loss": "0.0009", "time": "3.7208"}, {"layer": 42, "module": "mlp.up_proj", "avg_loss": "2.9915", "time": "3.9793"}, {"layer": 42, "module": "mlp.gate_proj", "avg_loss": "3.3006", "time": "3.8994"}, {"layer": 42, "module": "mlp.down_proj", "avg_loss": "0.0034", "time": "13.6567"}, {"layer": 43, "module": "self_attn.k_proj", "avg_loss": "0.7917", "time": "3.6009"}, {"layer": 43, "module": "self_attn.v_proj", "avg_loss": "0.1152", "time": "3.5447"}, {"layer": 43, "module": "self_attn.q_proj", "avg_loss": "2.3214", "time": "3.6226"}, {"layer": 43, "module": "self_attn.o_proj", "avg_loss": "0.0006", "time": "3.6621"}, {"layer": 43, "module": "mlp.up_proj", "avg_loss": "3.1404", "time": "4.0371"}, {"layer": 43, "module": "mlp.gate_proj", "avg_loss": "3.5280", "time": "3.9244"}, {"layer": 43, "module": "mlp.down_proj", "avg_loss": "0.0033", "time": "11.8960"}, {"layer": 44, "module": "self_attn.k_proj", "avg_loss": "0.8937", "time": "3.6073"}, {"layer": 44, "module": "self_attn.v_proj", "avg_loss": "0.1286", "time": "3.5405"}, {"layer": 44, "module": "self_attn.q_proj", "avg_loss": "2.4856", "time": "3.6957"}, {"layer": 44, "module": "self_attn.o_proj", "avg_loss": "0.0006", "time": "3.7327"}, {"layer": 44, "module": "mlp.up_proj", "avg_loss": "3.3180", "time": "4.0490"}, {"layer": 44, "module": "mlp.gate_proj", "avg_loss": "3.7821", "time": "4.1017"}, {"layer": 44, "module": "mlp.down_proj", "avg_loss": "0.0033", "time": "11.8381"}, {"layer": 45, "module": "self_attn.k_proj", "avg_loss": "0.8555", "time": "3.5999"}, {"layer": 45, "module": "self_attn.v_proj", "avg_loss": "0.1253", "time": "3.5168"}, {"layer": 45, "module": "self_attn.q_proj", "avg_loss": "2.1693", "time": "3.6792"}, {"layer": 45, "module": "self_attn.o_proj", "avg_loss": "0.0004", "time": "3.7418"}, {"layer": 45, "module": "mlp.up_proj", "avg_loss": "3.4879", "time": "3.9804"}, {"layer": 45, "module": "mlp.gate_proj", "avg_loss": "4.0077", "time": "3.9104"}, {"layer": 45, "module": "mlp.down_proj", "avg_loss": "0.0032", "time": "12.1252"}, {"layer": 46, "module": "self_attn.k_proj", "avg_loss": "0.8914", "time": "3.5769"}, {"layer": 46, "module": "self_attn.v_proj", "avg_loss": "0.1480", "time": "3.5794"}, {"layer": 46, "module": "self_attn.q_proj", "avg_loss": "2.3671", "time": "3.7300"}, {"layer": 46, "module": "self_attn.o_proj", "avg_loss": "0.0004", "time": "3.7491"}, {"layer": 46, "module": "mlp.up_proj", "avg_loss": "3.6635", "time": "3.9687"}, {"layer": 46, "module": "mlp.gate_proj", "avg_loss": "4.1606", "time": "3.8363"}, {"layer": 46, "module": "mlp.down_proj", "avg_loss": "0.0032", "time": "11.9044"}, {"layer": 47, "module": "self_attn.k_proj", "avg_loss": "0.9606", "time": "3.5784"}, {"layer": 47, "module": "self_attn.v_proj", "avg_loss": "0.1288", "time": "3.6685"}, {"layer": 47, "module": "self_attn.q_proj", "avg_loss": "2.4137", "time": "3.8965"}, {"layer": 47, "module": "self_attn.o_proj", "avg_loss": "0.0005", "time": "4.0686"}, {"layer": 47, "module": "mlp.up_proj", "avg_loss": "3.7951", "time": "4.0023"}, {"layer": 47, "module": "mlp.gate_proj", "avg_loss": "4.3118", "time": "3.8637"}, {"layer": 47, "module": "mlp.down_proj", "avg_loss": "0.0032", "time": "11.9658"}, {"layer": 48, "module": "self_attn.k_proj", "avg_loss": "1.0697", "time": "3.6270"}, {"layer": 48, "module": "self_attn.v_proj", "avg_loss": "0.1546", "time": "3.5347"}, {"layer": 48, "module": "self_attn.q_proj", "avg_loss": "2.9056", "time": "3.7976"}, {"layer": 48, "module": "self_attn.o_proj", "avg_loss": "0.0006", "time": "3.7911"}, {"layer": 48, "module": "mlp.up_proj", "avg_loss": "3.9273", "time": "4.0797"}, {"layer": 48, "module": "mlp.gate_proj", "avg_loss": "4.4460", "time": "3.9236"}, {"layer": 48, "module": "mlp.down_proj", "avg_loss": "0.0034", "time": "11.8109"}, {"layer": 49, "module": "self_attn.k_proj", "avg_loss": "0.9970", "time": "3.6171"}, {"layer": 49, "module": "self_attn.v_proj", "avg_loss": "0.1636", "time": "3.5113"}, {"layer": 49, "module": "self_attn.q_proj", "avg_loss": "2.3955", "time": "3.7774"}, {"layer": 49, "module": "self_attn.o_proj", "avg_loss": "0.0005", "time": "3.8233"}, {"layer": 49, "module": "mlp.up_proj", "avg_loss": "4.0716", "time": "3.9465"}, {"layer": 49, "module": "mlp.gate_proj", "avg_loss": "4.6030", "time": "3.9485"}, {"layer": 49, "module": "mlp.down_proj", "avg_loss": "0.0034", "time": "11.7469"}, {"layer": 50, "module": "self_attn.k_proj", "avg_loss": "1.0779", "time": "3.6123"}, {"layer": 50, "module": "self_attn.v_proj", "avg_loss": "0.1898", "time": "3.5563"}, {"layer": 50, "module": "self_attn.q_proj", "avg_loss": "2.6592", "time": "3.6779"}, {"layer": 50, "module": "self_attn.o_proj", "avg_loss": "0.0005", "time": "3.7540"}, {"layer": 50, "module": "mlp.up_proj", "avg_loss": "4.2222", "time": "4.0438"}, {"layer": 50, "module": "mlp.gate_proj", "avg_loss": "4.7642", "time": "3.9324"}, {"layer": 50, "module": "mlp.down_proj", "avg_loss": "0.0035", "time": "11.7959"}, {"layer": 51, "module": "self_attn.k_proj", "avg_loss": "0.9110", "time": "3.6481"}, {"layer": 51, "module": "self_attn.v_proj", "avg_loss": "0.1447", "time": "3.6143"}, {"layer": 51, "module": "self_attn.q_proj", "avg_loss": "2.0845", "time": "3.6991"}, {"layer": 51, "module": "self_attn.o_proj", "avg_loss": "0.0005", "time": "3.7687"}, {"layer": 51, "module": "mlp.up_proj", "avg_loss": "4.3734", "time": "4.0391"}, {"layer": 51, "module": "mlp.gate_proj", "avg_loss": "4.9299", "time": "3.9584"}, {"layer": 51, "module": "mlp.down_proj", "avg_loss": "0.0036", "time": "13.9056"}, {"layer": 52, "module": "self_attn.k_proj", "avg_loss": "0.8645", "time": "3.6196"}, {"layer": 52, "module": "self_attn.v_proj", "avg_loss": "0.1576", "time": "3.5362"}, {"layer": 52, "module": "self_attn.q_proj", "avg_loss": "2.0036", "time": "3.5851"}, {"layer": 52, "module": "self_attn.o_proj", "avg_loss": "0.0005", "time": "3.7652"}, {"layer": 52, "module": "mlp.up_proj", "avg_loss": "4.5005", "time": "3.9495"}, {"layer": 52, "module": "mlp.gate_proj", "avg_loss": "5.0637", "time": "3.8855"}, {"layer": 52, "module": "mlp.down_proj", "avg_loss": "0.0037", "time": "11.9277"}, {"layer": 53, "module": "self_attn.k_proj", "avg_loss": "0.8993", "time": "3.6531"}, {"layer": 53, "module": "self_attn.v_proj", "avg_loss": "0.1608", "time": "3.6273"}, {"layer": 53, "module": "self_attn.q_proj", "avg_loss": "1.9692", "time": "3.7483"}, {"layer": 53, "module": "self_attn.o_proj", "avg_loss": "0.0004", "time": "3.8052"}, {"layer": 53, "module": "mlp.up_proj", "avg_loss": "4.6704", "time": "4.0584"}, {"layer": 53, "module": "mlp.gate_proj", "avg_loss": "5.2544", "time": "3.9982"}, {"layer": 53, "module": "mlp.down_proj", "avg_loss": "0.0039", "time": "12.1397"}, {"layer": 54, "module": "self_attn.k_proj", "avg_loss": "0.9499", "time": "3.7097"}, {"layer": 54, "module": "self_attn.v_proj", "avg_loss": "0.1536", "time": "3.6516"}, {"layer": 54, "module": "self_attn.q_proj", "avg_loss": "2.2825", "time": "3.7814"}, {"layer": 54, "module": "self_attn.o_proj", "avg_loss": "0.0006", "time": "3.8075"}, {"layer": 54, "module": "mlp.up_proj", "avg_loss": "4.7929", "time": "3.9499"}, {"layer": 54, "module": "mlp.gate_proj", "avg_loss": "5.3582", "time": "3.9634"}, {"layer": 54, "module": "mlp.down_proj", "avg_loss": "0.0040", "time": "11.9171"}, {"layer": 55, "module": "self_attn.k_proj", "avg_loss": "0.9963", "time": "3.7170"}, {"layer": 55, "module": "self_attn.v_proj", "avg_loss": "0.1791", "time": "3.7634"}, {"layer": 55, "module": "self_attn.q_proj", "avg_loss": "2.6388", "time": "3.9354"}, {"layer": 55, "module": "self_attn.o_proj", "avg_loss": "0.0006", "time": "4.0424"}, {"layer": 55, "module": "mlp.up_proj", "avg_loss": "4.9179", "time": "4.0807"}, {"layer": 55, "module": "mlp.gate_proj", "avg_loss": "5.4607", "time": "4.1138"}, {"layer": 55, "module": "mlp.down_proj", "avg_loss": "0.0041", "time": "12.1341"}, {"layer": 56, "module": "self_attn.k_proj", "avg_loss": "0.8880", "time": "3.5580"}, {"layer": 56, "module": "self_attn.v_proj", "avg_loss": "0.1647", "time": "3.6216"}, {"layer": 56, "module": "self_attn.q_proj", "avg_loss": "2.0842", "time": "3.7906"}, {"layer": 56, "module": "self_attn.o_proj", "avg_loss": "0.0004", "time": "3.8185"}, {"layer": 56, "module": "mlp.up_proj", "avg_loss": "5.0468", "time": "3.9736"}, {"layer": 56, "module": "mlp.gate_proj", "avg_loss": "5.5880", "time": "3.9168"}, {"layer": 56, "module": "mlp.down_proj", "avg_loss": "0.0043", "time": "11.9298"}, {"layer": 57, "module": "self_attn.k_proj", "avg_loss": "1.0778", "time": "3.5114"}, {"layer": 57, "module": "self_attn.v_proj", "avg_loss": "0.1909", "time": "3.5606"}, {"layer": 57, "module": "self_attn.q_proj", "avg_loss": "2.8574", "time": "3.7472"}, {"layer": 57, "module": "self_attn.o_proj", "avg_loss": "0.0008", "time": "3.7566"}, {"layer": 57, "module": "mlp.up_proj", "avg_loss": "5.2040", "time": "3.9869"}, {"layer": 57, "module": "mlp.gate_proj", "avg_loss": "5.7768", "time": "3.8639"}, {"layer": 57, "module": "mlp.down_proj", "avg_loss": "0.0045", "time": "11.7866"}, {"layer": 58, "module": "self_attn.k_proj", "avg_loss": "0.9410", "time": "3.5534"}, {"layer": 58, "module": "self_attn.v_proj", "avg_loss": "0.1526", "time": "3.5091"}, {"layer": 58, "module": "self_attn.q_proj", "avg_loss": "2.4459", "time": "3.6895"}, {"layer": 58, "module": "self_attn.o_proj", "avg_loss": "0.0008", "time": "3.7585"}, {"layer": 58, "module": "mlp.up_proj", "avg_loss": "5.2746", "time": "3.9907"}, {"layer": 58, "module": "mlp.gate_proj", "avg_loss": "5.8565", "time": "3.8959"}, {"layer": 58, "module": "mlp.down_proj", "avg_loss": "0.0047", "time": "11.8353"}, {"layer": 59, "module": "self_attn.k_proj", "avg_loss": "1.0092", "time": "3.6749"}, {"layer": 59, "module": "self_attn.v_proj", "avg_loss": "0.1800", "time": "3.5995"}, {"layer": 59, "module": "self_attn.q_proj", "avg_loss": "2.4753", "time": "3.7784"}, {"layer": 59, "module": "self_attn.o_proj", "avg_loss": "0.0006", "time": "3.7685"}, {"layer": 59, "module": "mlp.up_proj", "avg_loss": "5.4385", "time": "4.1838"}, {"layer": 59, "module": "mlp.gate_proj", "avg_loss": "6.0128", "time": "4.1679"}, {"layer": 59, "module": "mlp.down_proj", "avg_loss": "0.0048", "time": "11.9641"}, {"layer": 60, "module": "self_attn.k_proj", "avg_loss": "0.9603", "time": "3.6656"}, {"layer": 60, "module": "self_attn.v_proj", "avg_loss": "0.2304", "time": "3.6013"}, {"layer": 60, "module": "self_attn.q_proj", "avg_loss": "2.7000", "time": "3.7967"}, {"layer": 60, "module": "self_attn.o_proj", "avg_loss": "0.0005", "time": "3.7872"}, {"layer": 60, "module": "mlp.up_proj", "avg_loss": "5.6130", "time": "3.9814"}, {"layer": 60, "module": "mlp.gate_proj", "avg_loss": "6.2268", "time": "3.9266"}, {"layer": 60, "module": "mlp.down_proj", "avg_loss": "0.0050", "time": "11.8381"}, {"layer": 61, "module": "self_attn.k_proj", "avg_loss": "1.1918", "time": "3.5366"}, {"layer": 61, "module": "self_attn.v_proj", "avg_loss": "0.2259", "time": "3.5180"}, {"layer": 61, "module": "self_attn.q_proj", "avg_loss": "3.3278", "time": "3.6514"}, {"layer": 61, "module": "self_attn.o_proj", "avg_loss": "0.0006", "time": "3.9203"}, {"layer": 61, "module": "mlp.up_proj", "avg_loss": "5.7760", "time": "3.9616"}, {"layer": 61, "module": "mlp.gate_proj", "avg_loss": "6.3744", "time": "3.9085"}, {"layer": 61, "module": "mlp.down_proj", "avg_loss": "0.0052", "time": "11.8419"}, {"layer": 62, "module": "self_attn.k_proj", "avg_loss": "0.9675", "time": "3.6616"}, {"layer": 62, "module": "self_attn.v_proj", "avg_loss": "0.1976", "time": "3.5889"}, {"layer": 62, "module": "self_attn.q_proj", "avg_loss": "2.6642", "time": "3.7647"}, {"layer": 62, "module": "self_attn.o_proj", "avg_loss": "0.0006", "time": "3.6845"}, {"layer": 62, "module": "mlp.up_proj", "avg_loss": "5.9024", "time": "3.9358"}, {"layer": 62, "module": "mlp.gate_proj", "avg_loss": "6.5311", "time": "3.9316"}, {"layer": 62, "module": "mlp.down_proj", "avg_loss": "0.0054", "time": "12.2295"}, {"layer": 63, "module": "self_attn.k_proj", "avg_loss": "1.1736", "time": "3.6830"}, {"layer": 63, "module": "self_attn.v_proj", "avg_loss": "0.2412", "time": "3.6188"}, {"layer": 63, "module": "self_attn.q_proj", "avg_loss": "3.3070", "time": "3.8136"}, {"layer": 63, "module": "self_attn.o_proj", "avg_loss": "0.0007", "time": "3.8055"}, {"layer": 63, "module": "mlp.up_proj", "avg_loss": "6.0964", "time": "4.0439"}, {"layer": 63, "module": "mlp.gate_proj", "avg_loss": "6.7347", "time": "3.9783"}, {"layer": 63, "module": "mlp.down_proj", "avg_loss": "0.0055", "time": "11.9519"}, {"layer": 64, "module": "self_attn.k_proj", "avg_loss": "1.2197", "time": "3.6612"}, {"layer": 64, "module": "self_attn.v_proj", "avg_loss": "0.2590", "time": "3.6102"}, {"layer": 64, "module": "self_attn.q_proj", "avg_loss": "3.5749", "time": "3.7886"}, {"layer": 64, "module": "self_attn.o_proj", "avg_loss": "0.0007", "time": "3.8238"}, {"layer": 64, "module": "mlp.up_proj", "avg_loss": "6.2931", "time": "3.9225"}, {"layer": 64, "module": "mlp.gate_proj", "avg_loss": "6.9756", "time": "3.9901"}, {"layer": 64, "module": "mlp.down_proj", "avg_loss": "0.0058", "time": "11.9221"}, {"layer": 65, "module": "self_attn.k_proj", "avg_loss": "1.3096", "time": "3.6243"}, {"layer": 65, "module": "self_attn.v_proj", "avg_loss": "0.2408", "time": "3.5356"}, {"layer": 65, "module": "self_attn.q_proj", "avg_loss": "3.5934", "time": "3.6890"}, {"layer": 65, "module": "self_attn.o_proj", "avg_loss": "0.0008", "time": "3.7581"}, {"layer": 65, "module": "mlp.up_proj", "avg_loss": "6.4612", "time": "4.0091"}, {"layer": 65, "module": "mlp.gate_proj", "avg_loss": "7.1544", "time": "3.8598"}, {"layer": 65, "module": "mlp.down_proj", "avg_loss": "0.0059", "time": "11.7037"}, {"layer": 66, "module": "self_attn.k_proj", "avg_loss": "1.2110", "time": "3.6307"}, {"layer": 66, "module": "self_attn.v_proj", "avg_loss": "0.2500", "time": "3.5985"}, {"layer": 66, "module": "self_attn.q_proj", "avg_loss": "3.3302", "time": "3.7208"}, {"layer": 66, "module": "self_attn.o_proj", "avg_loss": "0.0008", "time": "3.8707"}, {"layer": 66, "module": "mlp.up_proj", "avg_loss": "6.6390", "time": "4.1213"}, {"layer": 66, "module": "mlp.gate_proj", "avg_loss": "7.3525", "time": "3.9252"}, {"layer": 66, "module": "mlp.down_proj", "avg_loss": "0.0062", "time": "12.0034"}, {"layer": 67, "module": "self_attn.k_proj", "avg_loss": "1.1292", "time": "3.6527"}, {"layer": 67, "module": "self_attn.v_proj", "avg_loss": "0.2271", "time": "3.6036"}, {"layer": 67, "module": "self_attn.q_proj", "avg_loss": "2.8288", "time": "3.8208"}, {"layer": 67, "module": "self_attn.o_proj", "avg_loss": "0.0006", "time": "3.8349"}, {"layer": 67, "module": "mlp.up_proj", "avg_loss": "6.8246", "time": "4.1048"}, {"layer": 67, "module": "mlp.gate_proj", "avg_loss": "7.5065", "time": "3.9740"}, {"layer": 67, "module": "mlp.down_proj", "avg_loss": "0.0065", "time": "12.0016"}, {"layer": 68, "module": "self_attn.k_proj", "avg_loss": "1.0013", "time": "3.6289"}, {"layer": 68, "module": "self_attn.v_proj", "avg_loss": "0.2041", "time": "3.5846"}, {"layer": 68, "module": "self_attn.q_proj", "avg_loss": "2.3366", "time": "3.7436"}, {"layer": 68, "module": "self_attn.o_proj", "avg_loss": "0.0005", "time": "3.7452"}, {"layer": 68, "module": "mlp.up_proj", "avg_loss": "7.0602", "time": "4.0488"}, {"layer": 68, "module": "mlp.gate_proj", "avg_loss": "7.8033", "time": "3.9584"}, {"layer": 68, "module": "mlp.down_proj", "avg_loss": "0.0067", "time": "11.7866"}, {"layer": 69, "module": "self_attn.k_proj", "avg_loss": "1.0759", "time": "3.5552"}, {"layer": 69, "module": "self_attn.v_proj", "avg_loss": "0.2363", "time": "3.5249"}, {"layer": 69, "module": "self_attn.q_proj", "avg_loss": "2.6166", "time": "3.7058"}, {"layer": 69, "module": "self_attn.o_proj", "avg_loss": "0.0007", "time": "3.6614"}, {"layer": 69, "module": "mlp.up_proj", "avg_loss": "7.3135", "time": "4.0178"}, {"layer": 69, "module": "mlp.gate_proj", "avg_loss": "8.1067", "time": "3.9018"}, {"layer": 69, "module": "mlp.down_proj", "avg_loss": "0.0071", "time": "11.9415"}, {"layer": 70, "module": "self_attn.k_proj", "avg_loss": "1.0260", "time": "3.6120"}, {"layer": 70, "module": "self_attn.v_proj", "avg_loss": "0.2727", "time": "3.5708"}, {"layer": 70, "module": "self_attn.q_proj", "avg_loss": "2.9408", "time": "3.8807"}, {"layer": 70, "module": "self_attn.o_proj", "avg_loss": "0.0006", "time": "3.7959"}, {"layer": 70, "module": "mlp.up_proj", "avg_loss": "7.5817", "time": "4.0329"}, {"layer": 70, "module": "mlp.gate_proj", "avg_loss": "8.4192", "time": "3.9826"}, {"layer": 70, "module": "mlp.down_proj", "avg_loss": "0.0077", "time": "11.9067"}, {"layer": 71, "module": "self_attn.k_proj", "avg_loss": "0.9423", "time": "3.6236"}, {"layer": 71, "module": "self_attn.v_proj", "avg_loss": "0.1988", "time": "3.5677"}, {"layer": 71, "module": "self_attn.q_proj", "avg_loss": "2.2510", "time": "3.7313"}, {"layer": 71, "module": "self_attn.o_proj", "avg_loss": "0.0006", "time": "3.7299"}, {"layer": 71, "module": "mlp.up_proj", "avg_loss": "7.8468", "time": "3.9166"}, {"layer": 71, "module": "mlp.gate_proj", "avg_loss": "8.7427", "time": "3.9616"}, {"layer": 71, "module": "mlp.down_proj", "avg_loss": "0.0083", "time": "11.8346"}, {"layer": 72, "module": "self_attn.k_proj", "avg_loss": "1.0319", "time": "3.5089"}, {"layer": 72, "module": "self_attn.v_proj", "avg_loss": "0.2568", "time": "3.5787"}, {"layer": 72, "module": "self_attn.q_proj", "avg_loss": "2.8738", "time": "3.8718"}, {"layer": 72, "module": "self_attn.o_proj", "avg_loss": "0.0010", "time": "3.8125"}, {"layer": 72, "module": "mlp.up_proj", "avg_loss": "8.1786", "time": "3.9481"}, {"layer": 72, "module": "mlp.gate_proj", "avg_loss": "9.1846", "time": "3.9570"}, {"layer": 72, "module": "mlp.down_proj", "avg_loss": "0.0092", "time": "11.8155"}, {"layer": 73, "module": "self_attn.k_proj", "avg_loss": "1.0881", "time": "3.5724"}, {"layer": 73, "module": "self_attn.v_proj", "avg_loss": "0.2292", "time": "3.5698"}, {"layer": 73, "module": "self_attn.q_proj", "avg_loss": "2.9177", "time": "3.7221"}, {"layer": 73, "module": "self_attn.o_proj", "avg_loss": "0.0008", "time": "3.7822"}, {"layer": 73, "module": "mlp.up_proj", "avg_loss": "8.4576", "time": "4.0587"}, {"layer": 73, "module": "mlp.gate_proj", "avg_loss": "9.5120", "time": "3.9233"}, {"layer": 73, "module": "mlp.down_proj", "avg_loss": "0.0100", "time": "11.8303"}, {"layer": 74, "module": "self_attn.k_proj", "avg_loss": "1.2013", "time": "3.6497"}, {"layer": 74, "module": "self_attn.v_proj", "avg_loss": "0.2412", "time": "3.5369"}, {"layer": 74, "module": "self_attn.q_proj", "avg_loss": "2.8897", "time": "3.8182"}, {"layer": 74, "module": "self_attn.o_proj", "avg_loss": "0.0011", "time": "3.8046"}, {"layer": 74, "module": "mlp.up_proj", "avg_loss": "8.7546", "time": "4.0030"}, {"layer": 74, "module": "mlp.gate_proj", "avg_loss": "9.9366", "time": "3.8286"}, {"layer": 74, "module": "mlp.down_proj", "avg_loss": "0.0111", "time": "11.6892"}, {"layer": 75, "module": "self_attn.k_proj", "avg_loss": "1.0863", "time": "3.6169"}, {"layer": 75, "module": "self_attn.v_proj", "avg_loss": "0.3251", "time": "3.5636"}, {"layer": 75, "module": "self_attn.q_proj", "avg_loss": "3.2701", "time": "3.7531"}, {"layer": 75, "module": "self_attn.o_proj", "avg_loss": "0.0011", "time": "3.7622"}, {"layer": 75, "module": "mlp.up_proj", "avg_loss": "9.0605", "time": "3.9552"}, {"layer": 75, "module": "mlp.gate_proj", "avg_loss": "10.3254", "time": "3.8833"}, {"layer": 75, "module": "mlp.down_proj", "avg_loss": "0.0126", "time": "11.9040"}, {"layer": 76, "module": "self_attn.k_proj", "avg_loss": "1.0734", "time": "3.6568"}, {"layer": 76, "module": "self_attn.v_proj", "avg_loss": "0.2992", "time": "3.6260"}, {"layer": 76, "module": "self_attn.q_proj", "avg_loss": "3.4440", "time": "3.6789"}, {"layer": 76, "module": "self_attn.o_proj", "avg_loss": "0.0019", "time": "3.9001"}, {"layer": 76, "module": "mlp.up_proj", "avg_loss": "9.3262", "time": "4.0293"}, {"layer": 76, "module": "mlp.gate_proj", "avg_loss": "10.7493", "time": "3.9984"}, {"layer": 76, "module": "mlp.down_proj", "avg_loss": "0.0144", "time": "12.2011"}, {"layer": 77, "module": "self_attn.k_proj", "avg_loss": "0.8396", "time": "3.5907"}, {"layer": 77, "module": "self_attn.v_proj", "avg_loss": "0.2656", "time": "3.5766"}, {"layer": 77, "module": "self_attn.q_proj", "avg_loss": "3.6134", "time": "3.6346"}, {"layer": 77, "module": "self_attn.o_proj", "avg_loss": "0.0021", "time": "3.6877"}, {"layer": 77, "module": "mlp.up_proj", "avg_loss": "9.2740", "time": "4.0934"}, {"layer": 77, "module": "mlp.gate_proj", "avg_loss": "10.5867", "time": "4.0863"}, {"layer": 77, "module": "mlp.down_proj", "avg_loss": "0.0143", "time": "12.0220"}, {"layer": 78, "module": "self_attn.k_proj", "avg_loss": "0.9963", "time": "3.7160"}, {"layer": 78, "module": "self_attn.v_proj", "avg_loss": "0.2930", "time": "3.6547"}, {"layer": 78, "module": "self_attn.q_proj", "avg_loss": "3.8025", "time": "3.7370"}, {"layer": 78, "module": "self_attn.o_proj", "avg_loss": "0.0029", "time": "3.7142"}, {"layer": 78, "module": "mlp.up_proj", "avg_loss": "9.4593", "time": "4.0884"}, {"layer": 78, "module": "mlp.gate_proj", "avg_loss": "10.7965", "time": "4.0212"}, {"layer": 78, "module": "mlp.down_proj", "avg_loss": "0.0151", "time": "12.0228"}, {"layer": 79, "module": "self_attn.k_proj", "avg_loss": "0.9371", "time": "3.6828"}, {"layer": 79, "module": "self_attn.v_proj", "avg_loss": "0.3049", "time": "3.6302"}, {"layer": 79, "module": "self_attn.q_proj", "avg_loss": "3.5195", "time": "3.7535"}, {"layer": 79, "module": "self_attn.o_proj", "avg_loss": "0.0034", "time": "3.7349"}, {"layer": 79, "module": "mlp.up_proj", "avg_loss": "9.6700", "time": "4.0304"}, {"layer": 79, "module": "mlp.gate_proj", "avg_loss": "11.0421", "time": "3.9397"}, {"layer": 79, "module": "mlp.down_proj", "avg_loss": "0.0166", "time": "11.8862"}, {"layer": 80, "module": "self_attn.k_proj", "avg_loss": "1.0123", "time": "3.6652"}, {"layer": 80, "module": "self_attn.v_proj", "avg_loss": "0.3363", "time": "3.5501"}, {"layer": 80, "module": "self_attn.q_proj", "avg_loss": "3.8552", "time": "3.7215"}, {"layer": 80, "module": "self_attn.o_proj", "avg_loss": "0.0036", "time": "3.8266"}, {"layer": 80, "module": "mlp.up_proj", "avg_loss": "9.8459", "time": "4.0268"}, {"layer": 80, "module": "mlp.gate_proj", "avg_loss": "11.1943", "time": "4.0234"}, {"layer": 80, "module": "mlp.down_proj", "avg_loss": "0.0195", "time": "11.8995"}, {"layer": 81, "module": "self_attn.k_proj", "avg_loss": "0.9649", "time": "3.7145"}, {"layer": 81, "module": "self_attn.v_proj", "avg_loss": "0.3641", "time": "3.6642"}, {"layer": 81, "module": "self_attn.q_proj", "avg_loss": "3.4963", "time": "3.8552"}, {"layer": 81, "module": "self_attn.o_proj", "avg_loss": "0.0030", "time": "3.8592"}, {"layer": 81, "module": "mlp.up_proj", "avg_loss": "10.0585", "time": "4.0073"}, {"layer": 81, "module": "mlp.gate_proj", "avg_loss": "11.4395", "time": "3.9976"}, {"layer": 81, "module": "mlp.down_proj", "avg_loss": "0.0216", "time": "12.0885"}, {"layer": 82, "module": "self_attn.k_proj", "avg_loss": "0.9336", "time": "3.5824"}, {"layer": 82, "module": "self_attn.v_proj", "avg_loss": "0.3737", "time": "3.6121"}, {"layer": 82, "module": "self_attn.q_proj", "avg_loss": "3.8633", "time": "3.8937"}, {"layer": 82, "module": "self_attn.o_proj", "avg_loss": "0.0049", "time": "3.7321"}, {"layer": 82, "module": "mlp.up_proj", "avg_loss": "10.3276", "time": "3.9802"}, {"layer": 82, "module": "mlp.gate_proj", "avg_loss": "11.6951", "time": "3.9390"}, {"layer": 82, "module": "mlp.down_proj", "avg_loss": "0.0253", "time": "11.9801"}, {"layer": 83, "module": "self_attn.k_proj", "avg_loss": "0.9643", "time": "3.6343"}, {"layer": 83, "module": "self_attn.v_proj", "avg_loss": "0.4042", "time": "3.6475"}, {"layer": 83, "module": "self_attn.q_proj", "avg_loss": "3.8394", "time": "3.8081"}, {"layer": 83, "module": "self_attn.o_proj", "avg_loss": "0.0057", "time": "3.8287"}, {"layer": 83, "module": "mlp.up_proj", "avg_loss": "10.5981", "time": "4.0586"}, {"layer": 83, "module": "mlp.gate_proj", "avg_loss": "12.0066", "time": "3.9761"}, {"layer": 83, "module": "mlp.down_proj", "avg_loss": "0.0294", "time": "11.8259"}, {"layer": 84, "module": "self_attn.k_proj", "avg_loss": "0.8708", "time": "3.6880"}, {"layer": 84, "module": "self_attn.v_proj", "avg_loss": "0.5064", "time": "3.5529"}, {"layer": 84, "module": "self_attn.q_proj", "avg_loss": "4.1224", "time": "3.7348"}, {"layer": 84, "module": "self_attn.o_proj", "avg_loss": "0.0069", "time": "3.8581"}, {"layer": 84, "module": "mlp.up_proj", "avg_loss": "11.0361", "time": "4.0229"}, {"layer": 84, "module": "mlp.gate_proj", "avg_loss": "12.6110", "time": "3.9011"}, {"layer": 84, "module": "mlp.down_proj", "avg_loss": "0.0360", "time": "11.8198"}, {"layer": 85, "module": "self_attn.k_proj", "avg_loss": "0.8766", "time": "3.6604"}, {"layer": 85, "module": "self_attn.v_proj", "avg_loss": "0.4285", "time": "3.6006"}, {"layer": 85, "module": "self_attn.q_proj", "avg_loss": "4.0646", "time": "3.8021"}, {"layer": 85, "module": "self_attn.o_proj", "avg_loss": "0.0078", "time": "3.8102"}, {"layer": 85, "module": "mlp.up_proj", "avg_loss": "11.4223", "time": "4.0734"}, {"layer": 85, "module": "mlp.gate_proj", "avg_loss": "13.5067", "time": "3.9993"}, {"layer": 85, "module": "mlp.down_proj", "avg_loss": "0.0488", "time": "11.8960"}, {"layer": 86, "module": "self_attn.k_proj", "avg_loss": "0.9249", "time": "3.6291"}, {"layer": 86, "module": "self_attn.v_proj", "avg_loss": "0.6580", "time": "3.5443"}, {"layer": 86, "module": "self_attn.q_proj", "avg_loss": "4.1922", "time": "3.7226"}, {"layer": 86, "module": "self_attn.o_proj", "avg_loss": "0.0163", "time": "3.7462"}, {"layer": 86, "module": "mlp.up_proj", "avg_loss": "12.1530", "time": "3.9807"}, {"layer": 86, "module": "mlp.gate_proj", "avg_loss": "15.1818", "time": "3.9209"}, {"layer": 86, "module": "mlp.down_proj", "avg_loss": "0.0913", "time": "11.9119"}, {"layer": 87, "module": "self_attn.k_proj", "avg_loss": "0.8835", "time": "3.6829"}, {"layer": 87, "module": "self_attn.v_proj", "avg_loss": "0.5141", "time": "3.6417"}, {"layer": 87, "module": "self_attn.q_proj", "avg_loss": "3.7549", "time": "3.6958"}, {"layer": 87, "module": "self_attn.o_proj", "avg_loss": "0.0194", "time": "3.7708"}, {"layer": 87, "module": "mlp.up_proj", "avg_loss": "10.3101", "time": "4.0664"}, {"layer": 87, "module": "mlp.gate_proj", "avg_loss": "13.4449", "time": "4.0199"}, {"layer": 87, "module": "mlp.down_proj", "avg_loss": "0.2066", "time": "11.9694"}, {"layer": 88, "module": "self_attn.k_proj", "avg_loss": "0.2975", "time": "3.6723"}, {"layer": 88, "module": "self_attn.v_proj", "avg_loss": "0.1288", "time": "3.6419"}, {"layer": 88, "module": "self_attn.q_proj", "avg_loss": "1.2304", "time": "3.8193"}, {"layer": 88, "module": "self_attn.o_proj", "avg_loss": "0.0069", "time": "3.6703"}, {"layer": 88, "module": "mlp.up_proj", "avg_loss": "4.0579", "time": "4.0620"}, {"layer": 88, "module": "mlp.gate_proj", "avg_loss": "4.8354", "time": "4.0405"}, {"layer": 88, "module": "mlp.down_proj", "avg_loss": "0.0770", "time": "12.1413"}]
|
quantize_config.json
ADDED
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"bits": 4,
|
3 |
+
"group_size": 128,
|
4 |
+
"desc_act": true,
|
5 |
+
"static_groups": false,
|
6 |
+
"sym": true,
|
7 |
+
"lm_head": false,
|
8 |
+
"damp_percent": 0.0025,
|
9 |
+
"true_sequential": true,
|
10 |
+
"model_name_or_path": "/monster/data/model/Mistral-Large-Instruct-2407/gptq_4bit_07-25_19-06-37",
|
11 |
+
"model_file_base_name": "model",
|
12 |
+
"quant_method": "gptq",
|
13 |
+
"checkpoint_format": "gptq",
|
14 |
+
"meta": {
|
15 |
+
"quantizer": "gptqmodel:0.9.10-dev0"
|
16 |
+
}
|
17 |
+
}
|
special_tokens_map.json
ADDED
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"bos_token": {
|
3 |
+
"content": "<s>",
|
4 |
+
"lstrip": false,
|
5 |
+
"normalized": false,
|
6 |
+
"rstrip": false,
|
7 |
+
"single_word": false
|
8 |
+
},
|
9 |
+
"eos_token": {
|
10 |
+
"content": "</s>",
|
11 |
+
"lstrip": false,
|
12 |
+
"normalized": false,
|
13 |
+
"rstrip": false,
|
14 |
+
"single_word": false
|
15 |
+
},
|
16 |
+
"unk_token": {
|
17 |
+
"content": "<unk>",
|
18 |
+
"lstrip": false,
|
19 |
+
"normalized": false,
|
20 |
+
"rstrip": false,
|
21 |
+
"single_word": false
|
22 |
+
}
|
23 |
+
}
|
tokenizer.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
tokenizer.model
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:59f95e28944c062244741268596badc900df86c7f5ded05088d2da22a7379e06
|
3 |
+
size 587583
|
tokenizer_config.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|