Guo committed
Commit aaad630
1 Parent(s): dd9f628
Files changed (1)
  1. modeling_jetmoe.py +2 -2
modeling_jetmoe.py CHANGED
@@ -654,7 +654,7 @@ class JetMoEAttention(nn.Module):
         self.num_heads = self.num_key_value_heads * self.top_k
         self.hidden_size_per_attention_head = config.kv_channels
 
-        self.experts = moe.MoE(
+        self.experts = MoE(
             input_size=config.hidden_size,
             hidden_size=self.kv_projection_size,
             num_experts=config.moe_num_experts,
@@ -1072,7 +1072,7 @@ class JetMoEBlock(nn.Module):
         # moe_args.activation_fn = F.silu
         # moe_args.return_bias = False
         # self.mlp = megablocks.layers.dmoe.dMoE(moe_args)
-        self.mlp = moe.MoE(
+        self.mlp = MoE(
             input_size=config.hidden_size,
             hidden_size=config.ffn_hidden_size,
             num_experts=config.moe_num_experts,
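
Note on why the unqualified calls resolve: since the commit changes only these two call sites (+2 -2), the name MoE must already be bound at module scope in modeling_jetmoe.py, e.g. by importing the class directly rather than referencing it through its containing module. A minimal, purely illustrative sketch of that assumption follows; the stub class stands in for JetMoE's actual MoE implementation, which is not part of this commit.

import torch.nn as nn

class MoE(nn.Module):
    # Stand-in for the MoE class assumed to be importable in modeling_jetmoe.py.
    # Only the keyword arguments visible in this diff (input_size, hidden_size,
    # num_experts) are modelled; routing, top_k, etc. are omitted.
    def __init__(self, input_size, hidden_size, num_experts, **kwargs):
        super().__init__()
        self.experts = nn.ModuleList(
            [nn.Linear(input_size, hidden_size) for _ in range(num_experts)]
        )

# With MoE bound at module scope, the post-commit call sites work as written:
experts = MoE(input_size=32, hidden_size=16, num_experts=4)
assert len(experts.experts) == 4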