Guo committed
Commit aaad630 · Parent(s): dd9f628
debug

Files changed: modeling_jetmoe.py (+2 -2)

modeling_jetmoe.py
CHANGED
@@ -654,7 +654,7 @@ class JetMoEAttention(nn.Module):
         self.num_heads = self.num_key_value_heads * self.top_k
         self.hidden_size_per_attention_head = config.kv_channels
 
-        self.experts =
+        self.experts = MoE(
             input_size=config.hidden_size,
             hidden_size=self.kv_projection_size,
             num_experts=config.moe_num_experts,
@@ -1072,7 +1072,7 @@ class JetMoEBlock(nn.Module):
         # moe_args.activation_fn = F.silu
         # moe_args.return_bias = False
         # self.mlp = megablocks.layers.dmoe.dMoE(moe_args)
-        self.mlp =
+        self.mlp = MoE(
             input_size=config.hidden_size,
             hidden_size=config.ffn_hidden_size,
             num_experts=config.moe_num_experts,
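Both hunks construct the same MoE module with input_size, hidden_size, and num_experts keyword arguments (the calls continue past the lines shown in the diff). For orientation only, below is a minimal sketch of a top-k routed MoE layer exposing that interface. It is an assumption for illustration, not the MoE class JetMoE actually uses; the top_k and activation parameters and the dense per-expert loop are hypothetical additions beyond what the diff shows.

# Hedged sketch: a minimal top-k routed MoE layer with the same constructor
# keywords seen in the diff (input_size, hidden_size, num_experts).
# NOT the JetMoE implementation; it only illustrates the interface.
import torch
import torch.nn as nn
import torch.nn.functional as F

class MoE(nn.Module):
    def __init__(self, input_size, hidden_size, num_experts, top_k=2, activation=F.silu):
        super().__init__()
        self.top_k = top_k
        self.activation = activation
        # Router scores every token against every expert.
        self.router = nn.Linear(input_size, num_experts, bias=False)
        # One two-layer FFN per expert (dense loop kept for clarity).
        self.w_in = nn.ModuleList(nn.Linear(input_size, hidden_size) for _ in range(num_experts))
        self.w_out = nn.ModuleList(nn.Linear(hidden_size, input_size) for _ in range(num_experts))

    def forward(self, x):
        # x: (batch, seq, input_size) -> flatten tokens for routing.
        tokens = x.reshape(-1, x.size(-1))
        logits = self.router(tokens)                       # (tokens, num_experts)
        weights, experts = torch.topk(logits, self.top_k)  # top-k experts per token
        weights = F.softmax(weights, dim=-1)

        out = torch.zeros_like(tokens)
        for k in range(self.top_k):
            for e, (up, down) in enumerate(zip(self.w_in, self.w_out)):
                mask = experts[:, k] == e
                if mask.any():
                    h = down(self.activation(up(tokens[mask])))
                    out[mask] += weights[mask, k].unsqueeze(-1) * h
        return out.reshape_as(x)

A dense per-expert loop like this is simple but slow; optimized MoE layers, such as the megablocks dMoE referenced in the commented-out lines of the second hunk, dispatch tokens to experts with grouped or sparse kernels instead.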