jetmoe
/

jetmoe-8b-chat

@@ -1,100 +0,0 @@
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-class top_k_gating(nn.Module):
-    """
-    Router that scores every expert with a bias-free linear layer and keeps the
-    top-k experts per input row. In training mode the auxiliary loss of the last
-    forward call is stored in `self.loss`.
-    """
-    def __init__(
-        self,
-        input_size,
-        num_experts,
-        top_k,
-    ):
-        """
-        Initialize the top-k gating mechanism.
-        Args:
-            input_size (int): Size of the input.
-            num_experts (int): Number of experts.
-            top_k (int): Number of top experts to select (asserted to be <= num_experts).
-        """
-        super().__init__()
-        self.num_experts = num_experts
-        self.input_size = input_size
-        assert top_k <= num_experts
-        self.top_k = top_k
-        self.layer = nn.Linear(input_size, num_experts, bias=False)
-    def extra_repr(self):
-        """
-        Return extra representation string for the module.
-        """
-        return 'k={}, num_experts={}'.format(
-            self.top_k, self.num_experts)
-    def compute_aux_loss(self, probs, logits, gates):
-        """
-        Calculate and return the auxiliary loss for one batch of routing decisions.
-        The result is a load-balancing term (`switchloss`) plus 0.1 times a z-loss
-        (squared log-sum-exp of the logits, averaged over rows).
-        Args:
-            probs (torch.Tensor): Softmax of `logits` over all experts, one row per input.
-            logits (torch.Tensor): Router logits, one row per input.
-            gates (torch.Tensor): Gate values scattered into a [row, expert] tensor;
-                entries > 0 are counted as "expert selected".
-        Returns:
-            torch.Tensor: The calculated auxiliary loss.
-        """
-        count = logits.size(0)
-        probs = probs.sum(0)
-        freq = (gates > 0).float().sum(0)
-        # NOTE(review): exp() is applied to raw logits before the log, so very large
-        # logits can overflow to inf here — confirm whether logsumexp was intended.
-        lsesq = (torch.log(torch.exp(logits).sum(dim=-1)) ** 2).sum()
-        # Product of the L1-normalized probability mass and L1-normalized selection
-        # counts per expert, scaled by the number of experts.
-        switchloss =  self.num_experts * (
-            F.normalize(probs, p=1, dim=0) *
-            F.normalize(freq, p=1, dim=0)
-        ).sum()
-        zloss = lsesq / count
-        loss = switchloss + 0.1 * zloss
-        return loss
-    def forward(self, x):
-        """
-        Compute the top-k gating for the input.
-        See paper: https://arxiv.org/abs/1701.06538.
-        Args:
-            x (torch.Tensor): Input tensor with shape [batch_size, input_size].
-        Returns:
-            torch.Tensor: Top-k expert indices, shape [batch_size, top_k].
-            torch.Tensor: Top-k gating values (softmax over the selected logits only),
-                cast to the dtype of `x`.
-        Side effects:
-            Sets `self.loss` to the auxiliary loss in training mode and to 0 otherwise.
-        """
-        # Logits are computed in float32 regardless of the dtype of `x`.
-        logits = self.layer(x).float()
-        top_k_logits, top_k_indices = logits.topk(self.top_k, dim=1)
-        top_k_gates = torch.softmax(top_k_logits, dim=1).type_as(x)
-        if self.training:
-            probs = torch.softmax(logits, dim=1)
-            zeros = torch.zeros_like(probs)
-            zeros = zeros.to(top_k_gates.dtype)  # Convert zeros to match top_k_gates dtype
-            # Dense [batch_size, num_experts] gate matrix: zero everywhere except the selected experts.
-            gates = zeros.scatter(1, top_k_indices, top_k_gates)
-            self.loss = self.compute_aux_loss(probs, logits, gates)
-        else:
-            self.loss = 0
-        return top_k_indices, top_k_gates

modeling_jetmoe.py CHANGED Viewed

@@ -27,7 +27,7 @@ from transformers.utils import (
 from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask, _prepare_4d_causal_attention_mask_for_sdpa
 from transformers.cache_utils import Cache, DynamicCache
 from .configuration_jetmoe import JetMoEConfig
-from . import moe
 try:
     if is_flash_attn_2_available():
@@ -43,6 +43,369 @@ logger = logging.get_logger(__name__)
 _CHECKPOINT_FOR_DOC = "jetmoe"
 _CONFIG_FOR_DOC = "JetMoEConfig"
 @dataclass
 class JetMoEBaseModelOutputWithPast(BaseModelOutputWithPast):

 from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask, _prepare_4d_causal_attention_mask_for_sdpa
 from transformers.cache_utils import Cache, DynamicCache
 from .configuration_jetmoe import JetMoEConfig
+import scattermoe
 try:
     if is_flash_attn_2_available():
 _CHECKPOINT_FOR_DOC = "jetmoe"
 _CONFIG_FOR_DOC = "JetMoEConfig"
+class top_k_gating(nn.Module):
+    """
+    Router that scores every expert with a bias-free linear layer and keeps the
+    top-k experts per input row. In training mode the auxiliary loss of the last
+    forward call is stored in `self.loss`; otherwise `self.loss` is 0.
+    """
+    def __init__(
+        self,
+        input_size,
+        num_experts,
+        top_k,
+    ):
+        """
+        Initialize the top-k gating mechanism.
+        Args:
+            input_size (int): Size of the input.
+            num_experts (int): Number of experts.
+            top_k (int): Number of top experts to select (must be <= num_experts).
+        """
+        super().__init__()
+        self.num_experts = num_experts
+        self.input_size = input_size
+        assert top_k <= num_experts, 'top_k must not exceed num_experts'
+        self.top_k = top_k
+        self.layer = nn.Linear(input_size, num_experts, bias=False)
+        # Defined up front so `loss` can be read before the first forward call.
+        self.loss = 0
+    def extra_repr(self):
+        """
+        Return extra representation string for the module.
+        """
+        return 'k={}, num_experts={}'.format(
+            self.top_k, self.num_experts)
+    def compute_aux_loss(self, probs, logits, gates):
+        """
+        Calculate and return the auxiliary loss for one batch of routing decisions.
+        The result is a load-balancing term plus 0.1 times a z-loss (squared
+        log-sum-exp of the logits, averaged over rows).
+        Args:
+            probs (torch.Tensor): Softmax of `logits` over all experts, one row per input.
+            logits (torch.Tensor): Router logits, one row per input.
+            gates (torch.Tensor): Gate values scattered into a [row, expert] tensor;
+                entries > 0 are counted as "expert selected".
+        Returns:
+            torch.Tensor: The calculated auxiliary loss.
+        """
+        count = logits.size(0)
+        prob_mass = probs.sum(0)
+        freq = (gates > 0).float().sum(0)
+        # logsumexp is the overflow-safe form of log(sum(exp(logits))): the naive
+        # expression becomes inf as soon as one float32 logit exceeds ~88.
+        lsesq = (torch.logsumexp(logits, dim=-1) ** 2).sum()
+        # Product of the L1-normalized probability mass and L1-normalized selection
+        # counts per expert, scaled by the number of experts.
+        switchloss = self.num_experts * (
+            F.normalize(prob_mass, p=1, dim=0) *
+            F.normalize(freq, p=1, dim=0)
+        ).sum()
+        zloss = lsesq / count
+        return switchloss + 0.1 * zloss
+    def forward(self, x):
+        """
+        Compute the top-k gating for the input.
+        See paper: https://arxiv.org/abs/1701.06538.
+        Args:
+            x (torch.Tensor): Input tensor with shape [batch_size, input_size].
+        Returns:
+            torch.Tensor: Top-k expert indices, shape [batch_size, top_k].
+            torch.Tensor: Top-k gating values (softmax over the selected logits only),
+                cast to the dtype of `x`.
+        Side effects:
+            Sets `self.loss` to the auxiliary loss in training mode and to 0 otherwise.
+        """
+        # Logits are computed in float32 regardless of the dtype of `x`.
+        logits = self.layer(x).float()
+        top_k_logits, top_k_indices = logits.topk(self.top_k, dim=1)
+        top_k_gates = torch.softmax(top_k_logits, dim=1).type_as(x)
+        if self.training:
+            probs = torch.softmax(logits, dim=1)
+            # Dense [batch_size, num_experts] gate matrix in the gate dtype:
+            # zero everywhere except at the selected experts.
+            zeros = torch.zeros_like(probs, dtype=top_k_gates.dtype)
+            gates = zeros.scatter(1, top_k_indices, top_k_gates)
+            self.loss = self.compute_aux_loss(probs, logits, gates)
+        else:
+            self.loss = 0
+        return top_k_indices, top_k_gates
+class MoE(nn.Module):
+    """
+    A sparsely gated mixture of experts layer with feed-forward networks as experts.
+    Tokens are routed by a `top_k_gating` router. Expert projections are evaluated
+    through scattermoe `ParallelExperts`, except when the input holds exactly one
+    token; then the selected expert weights are applied directly with `F.linear`.
+    Args:
+        input_size: integer - size of the input (and of the layer output)
+        hidden_size: integer - hidden size of each expert
+        num_experts: an integer - number of experts
+        top_k: an integer - how many experts to use for each token (clamped to num_experts)
+        bias: a boolean - whether to add a learned output bias of size input_size
+        activation: a callable applied to the expert hidden states; must be set before calling forward
+        glu: a boolean - if True, the input projection is twice as wide and its second half
+            multiplies the activated first half
+    """
+    def __init__(
+        self,
+        input_size,
+        hidden_size,
+        num_experts,
+        top_k,
+        bias=True,
+        activation=None,
+        glu=True,
+        ):
+        super().__init__()
+        self.num_experts = num_experts
+        self.input_size = input_size
+        self.glu = glu
+        if bias:
+            self.bias = torch.nn.Parameter(torch.zeros(input_size))
+        else:
+            self.bias = None
+        projected_size = hidden_size * 2 if glu else hidden_size
+        self.input_linear = scattermoe.parallel_experts.ParallelExperts(num_experts, input_size, projected_size)
+        self.output_linear = scattermoe.parallel_experts.ParallelExperts(num_experts, hidden_size, input_size)
+        self.top_k = min(top_k, self.num_experts)
+        self.activation = activation
+        # The router gets the clamped value so that it always agrees with self.top_k;
+        # passing the raw top_k would trip the router's `top_k <= num_experts` check.
+        self.router = top_k_gating(
+            input_size=input_size,
+            num_experts=num_experts,
+            top_k=self.top_k,
+            )
+    def extra_repr(self):
+        """
+        Return extra representation string for the module.
+        """
+        return 'k={}, e={}'.format(
+            self.top_k, self.num_experts)
+    def get_aux_loss_and_clear(self):
+        """
+        Get the router's most recent auxiliary loss and clear it.
+        Returns:
+            The value of `self.router.loss` (0 if the router has not stored one).
+        """
+        # The router lives in `self.router` (there is no `self.gate`) and exposes its
+        # loss as a plain attribute, so read and reset that attribute directly.
+        loss = getattr(self.router, 'loss', 0)
+        self.router.loss = 0
+        return loss
+    def _activate(self, h):
+        """
+        Apply the expert non-linearity; in GLU mode the second half of `h` gates the
+        activated first half.
+        """
+        if self.glu:
+            h, g = h.chunk(2, dim=-1)
+            return self.activation(h) * g
+        return self.activation(h)
+    def compute_gate(self, x):
+        """
+        Run the router on flattened tokens and cache the routing state on `self`.
+        Stores `top_k_gates`, `sorted_expert_idxs`, `sorted_scattered_idxs`,
+        `padded_block_idxs` and `expert_offsets` for the following expert calls.
+        Args:
+            x (Tensor): Input tensor of shape [num_tokens, input_size].
+        Returns:
+            The router's auxiliary loss (`self.router.loss`).
+        """
+        top_k_indices, self.top_k_gates = self.router(x)
+        # Index bookkeeping only; no gradient needs to flow through it.
+        with torch.no_grad():
+            self.sorted_expert_idxs, self.sorted_scattered_idxs = scattermoe.kernels.ops.flatten_and_sort(top_k_indices)
+            self.padded_block_idxs, self.expert_offsets = scattermoe.kernels.ops.padded_block_indices(self.sorted_expert_idxs, self.num_experts)
+        return self.router.loss
+    def batch_forward(self, x):
+        """
+        Forward pass of the mixture of experts layer for more than one token.
+        Args:
+            x (Tensor): Input tensor of shape [bsz, length, emb_size].
+        Returns:
+            Tensor: Output tensor of shape [bsz, length, input_size].
+            The router's auxiliary loss.
+        """
+        bsz, length, emb_size = x.size()
+        x = x.reshape(-1, emb_size)
+        loss = self.compute_gate(x)
+        h = self.input_linear(
+            x, self.top_k,
+            self.sorted_expert_idxs, self.sorted_scattered_idxs,
+            self.padded_block_idxs, self.expert_offsets,
+            grouped_out=True
+        )
+        h = self._activate(h)
+        y = self.output_linear(
+            h, 1,
+            self.sorted_expert_idxs, self.sorted_scattered_idxs,
+            self.padded_block_idxs, self.expert_offsets,
+            grouped_in=True,
+            gates=self.top_k_gates,
+        )
+        y = y.view(bsz, length, self.input_size)
+        if self.bias is not None:
+            y = y + self.bias
+        return y, loss
+    def single_forward(self, x):
+        """
+        Forward pass for an input holding exactly one token (bsz * length == 1).
+        Each selected expert is applied with `F.linear` on its own weight slice and
+        the gate-weighted results are summed.
+        Args:
+            x (Tensor): Input tensor of shape [bsz, length, emb_size].
+        Returns:
+            Tensor: Output tensor of shape [bsz, length, input_size].
+            The router's auxiliary loss.
+        """
+        bsz, length, emb_size = x.size()
+        x = x.reshape(1, self.input_size)
+        top_k_indices, top_k_gates = self.router(x)
+        loss = self.router.loss
+        y_list = []
+        for i in range(self.top_k):
+            expert_idx = top_k_indices[0, i]
+            h = self._activate(F.linear(x, self.input_linear.weight[expert_idx]))
+            y = F.linear(h, self.output_linear.weight[expert_idx]) * top_k_gates[0, i]
+            y_list.append(y)
+        y = sum(y_list)
+        y = y.view(bsz, length, self.input_size)
+        if self.bias is not None:
+            y = y + self.bias
+        return y, loss
+    def forward(self, x):
+        """
+        Forward pass of the mixture of experts layer.
+        Dispatches to `single_forward` for a single token and to `batch_forward` otherwise.
+        Args:
+            x (Tensor): Input tensor of shape [bsz, length, emb_size].
+        Returns:
+            Tensor: Output tensor of shape [bsz, length, input_size].
+            The router's auxiliary loss.
+        """
+        bsz, length, emb_size = x.size()
+        if bsz * length == 1:
+            return self.single_forward(x)
+        return self.batch_forward(x)
+    def batch_map(self, x):
+        """
+        Map input through the experts' input projection only (more than one token).
+        The routing state is cached on `self` for a later `reduce` call.
+        Args:
+            x (Tensor): Input tensor of shape [bsz, length, emb_size].
+        Returns:
+            Tensor: Output tensor of shape [bsz, length, top_k, -1].
+            The router's auxiliary loss.
+        """
+        bsz, length, emb_size = x.size()
+        x = x.reshape(-1, emb_size)
+        loss = self.compute_gate(x)
+        y = self.input_linear(
+            x, self.top_k,
+            self.sorted_expert_idxs, self.sorted_scattered_idxs,
+            self.padded_block_idxs, self.expert_offsets,
+        )
+        y = y.view(bsz, length, self.top_k, -1)
+        return y, loss
+    def single_map(self, x):
+        """
+        Map a single token through the selected experts' input projections.
+        Stores `top_k_indices` and `top_k_gates` on `self` for `single_reduce`.
+        Args:
+            x (Tensor): Input tensor of shape [bsz, length, emb_size] with bsz * length == 1.
+        Returns:
+            Tensor: Output tensor of shape [bsz, length, top_k, -1].
+            The router's auxiliary loss.
+        """
+        bsz, length, emb_size = x.size()
+        x = x.reshape(1, self.input_size)
+        self.top_k_indices, self.top_k_gates = self.router(x)
+        loss = self.router.loss
+        y_list = [
+            F.linear(x, self.input_linear.weight[self.top_k_indices[0, i]])
+            for i in range(self.top_k)
+        ]
+        y = torch.cat(y_list, dim=0)
+        y = y.view(bsz, length, self.top_k, -1)
+        return y, loss
+    def map(self, x):
+        """
+        Map input through the mixture of experts layer.
+        Dispatches to `single_map` for a single token and to `batch_map` otherwise.
+        Args:
+            x (Tensor): Input tensor of shape [bsz, length, emb_size].
+        Returns:
+            Tensor: Output tensor of shape [bsz, length, top_k, -1].
+            The router's auxiliary loss.
+        """
+        bsz, length, emb_size = x.size()
+        if bsz * length == 1:
+            return self.single_map(x)
+        return self.batch_map(x)
+    def batch_reduce(self, x):
+        """
+        Reduce the mapped output (more than one token).
+        Uses the routing state cached by the preceding `compute_gate` call.
+        Args:
+            x (Tensor): Mapped output tensor of shape [bsz, length, top_k, emb_size].
+        Returns:
+            Tensor: Reduced output tensor of shape [bsz, length, input_size].
+        """
+        bsz, length, k, emb_size = x.size()
+        assert k == self.top_k
+        x = x.reshape(-1, emb_size)
+        y = self.output_linear(
+            x, 1,
+            self.sorted_expert_idxs, self.sorted_scattered_idxs,
+            self.padded_block_idxs, self.expert_offsets,
+            gates=self.top_k_gates,
+        )
+        y = y.view(bsz, length, self.input_size)
+        return y
+    def single_reduce(self, x):
+        """
+        Reduce the mapped output of a single token.
+        Uses `self.top_k_indices` and `self.top_k_gates` stored by `single_map`.
+        Args:
+            x (Tensor): Mapped output tensor of shape [bsz, length, top_k, emb_size].
+        Returns:
+            Tensor: Reduced output tensor of shape [bsz, length, input_size].
+        """
+        bsz, length, k, emb_size = x.size()
+        x = x.reshape(k, emb_size)
+        y_list = []
+        for i in range(self.top_k):
+            expert_idx = self.top_k_indices[0, i]
+            y = F.linear(x[i], self.output_linear.weight[expert_idx]) * self.top_k_gates[0, i]
+            y_list.append(y)
+        y = sum(y_list)
+        y = y.view(bsz, length, self.input_size)
+        return y
+    def reduce(self, x):
+        """
+        Reduce the mapped output.
+        Dispatches to `single_reduce` for a single token and to `batch_reduce` otherwise.
+        Args:
+            x (Tensor): Mapped output tensor of shape [bsz, length, top_k, emb_size].
+        Returns:
+            Tensor: Reduced output tensor of shape [bsz, length, input_size].
+        """
+        bsz, length, k, emb_size = x.size()
+        if bsz * length == 1:
+            return self.single_reduce(x)
+        return self.batch_reduce(x)
 @dataclass
 class JetMoEBaseModelOutputWithPast(BaseModelOutputWithPast):

moe.py DELETED Viewed

@@ -1,277 +0,0 @@
-import math
-from typing import List
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-import scattermoe
-from .gate import top_k_gating
-class MoE(nn.Module):
-    """
-    A Sparsely gated mixture of experts layer with 1-layer Feed-Forward networks as experts.
-    Args:
-        input_size: integer - size of the input (and of the layer output)
-        hidden_size: integer - hidden size of each expert
-        num_experts: an integer - number of experts
-        top_k: an integer - how many experts to use for each batch element
-        bias: a boolean - whether to add a learned output bias of size input_size
-        activation: an activation function applied to the experts' hidden states
-        glu: a boolean - if True, the input projection is twice as wide and its second
-            half multiplies the activated first half
-    """
-    def __init__(
-        self,
-        input_size,
-        hidden_size,
-        num_experts,
-        top_k,
-        bias=True,
-        activation=None,
-        glu=True,
-        ):
-        super(MoE, self).__init__()
-        self.num_experts = num_experts
-        self.input_size = input_size
-        self.glu = glu
-        if bias:
-            self.bias = torch.nn.Parameter(torch.empty(input_size))
-            torch.nn.init.zeros_(self.bias)
-        else:
-            self.bias = None
-        self.input_linear = scattermoe.parallel_experts.ParallelExperts(num_experts, input_size, hidden_size * 2 if glu else hidden_size)
-        self.output_linear = scattermoe.parallel_experts.ParallelExperts(num_experts, hidden_size, input_size)
-        self.top_k = min(top_k, self.num_experts)
-        self.activation = activation
-        # NOTE(review): the router receives the raw `top_k`, not the clamped
-        # `self.top_k` computed above — confirm this is intended.
-        self.router = top_k_gating(
-            input_size=input_size,
-            num_experts=num_experts,
-            top_k=top_k,
-            )
-    def extra_repr(self):
-        # Extra representation string for the module.
-        return 'k={}, e={}'.format(
-            self.top_k, self.num_experts)
-    def get_aux_loss_and_clear(self):
-        """
-        Get the accumulated auxiliary loss and clear it.
-        NOTE(review): `__init__` assigns `self.router`, not `self.gate`; this call
-        looks like it would raise AttributeError — confirm.
-        Returns:
-            float: Accumulated auxiliary loss.
-        """
-        return self.gate.get_aux_loss_and_clear()
-    def compute_gate(self, x):
-        # Runs the router and caches the routing state (gates plus sorted/padded
-        # index tensors) on `self` for the following expert calls; returns the
-        # router's auxiliary loss.
-        top_k_indices, self.top_k_gates = self.router(x)
-        with torch.no_grad():
-            self.sorted_expert_idxs, self.sorted_scattered_idxs = scattermoe.kernels.ops.flatten_and_sort(top_k_indices)
-            self.padded_block_idxs, self.expert_offsets = scattermoe.kernels.ops.padded_block_indices(self.sorted_expert_idxs, self.num_experts)
-        return self.router.loss
-    def batch_forward(self, x):
-        """
-        Forward pass of the mixture of experts layer for more than one token.
-        Args:
-            x (Tensor): Input tensor of shape [bsz, length, emb_size].
-        Returns:
-            Tensor: Output tensor of shape [bsz, length, input_size].
-            The router's auxiliary loss.
-        """
-        bsz, length, emb_size = x.size()
-        x = x.reshape(-1, emb_size)
-        loss = self.compute_gate(x)
-        h = self.input_linear(
-            x, self.top_k,
-            self.sorted_expert_idxs, self.sorted_scattered_idxs,
-            self.padded_block_idxs, self.expert_offsets,
-            grouped_out=True
-        )
-        if self.glu:
-            h, g = h.chunk(2, dim=-1)
-            h = self.activation(h) * g
-        else:
-            h = self.activation(h)
-        y = self.output_linear(
-            h, 1,
-            self.sorted_expert_idxs, self.sorted_scattered_idxs,
-            self.padded_block_idxs, self.expert_offsets,
-            grouped_in=True,
-            gates=self.top_k_gates,
-        )
-        y = y.view(bsz, length, self.input_size)
-        if self.bias is not None:
-            y = y + self.bias
-        return y, loss
-    def single_forward(self, x):
-        # Forward pass for exactly one token (bsz * length == 1): each selected
-        # expert is applied with F.linear on its own weight slice and the
-        # gate-weighted results are summed.
-        bsz, length, emb_size = x.size()
-        x = x.reshape(1, self.input_size)
-        top_k_indices, top_k_gates = self.router(x)
-        loss = self.router.loss
-        y_list = []
-        for i in range(self.top_k):
-            expert_idx = top_k_indices[0,i]
-            h = F.linear(x, self.input_linear.weight[expert_idx])
-            if self.glu:
-                h, g = h.chunk(2, dim=-1)
-                h = self.activation(h) * g
-            else:
-                h = self.activation(h)
-            y = F.linear(h, self.output_linear.weight[expert_idx]) * top_k_gates[0,i]
-            y_list.append(y)
-        y = sum(y_list)
-        y = y.view(bsz, length, self.input_size)
-        if self.bias is not None:
-            y = y + self.bias
-        return y, loss
-    def forward(self, x):
-        """
-        Forward pass of the mixture of experts layer.
-        Dispatches to `single_forward` for a single token and to `batch_forward` otherwise.
-        Args:
-            x (Tensor): Input tensor of shape [bsz, length, emb_size].
-        Returns:
-            Tensor: Output tensor of shape [bsz, length, input_size].
-            The router's auxiliary loss.
-        """
-        bsz, length, emb_size = x.size()
-        if bsz * length ==1:
-            return self.single_forward(x)
-        else:
-            return self.batch_forward(x)
-    def batch_map(self, x):
-        """
-        Map input through the experts' input projection only (more than one token).
-        Args:
-            x (Tensor): Input tensor of shape [bsz, length, emb_size].
-        Returns:
-            Tensor: Output tensor of shape [bsz, length, top_k, -1].
-            The router's auxiliary loss.
-        """
-        bsz, length, emb_size = x.size()
-        x = x.reshape(-1, emb_size)
-        loss = self.compute_gate(x)
-        y = self.input_linear(
-            x, self.top_k,
-            self.sorted_expert_idxs, self.sorted_scattered_idxs,
-            self.padded_block_idxs, self.expert_offsets,
-        )
-        y = y.view(bsz, length, self.top_k, -1)
-        return y, loss
-    def single_map(self, x):
-        # Single-token variant of batch_map; stores top_k_indices and top_k_gates
-        # on `self` so that single_reduce can use them afterwards.
-        bsz, length, emb_size = x.size()
-        x = x.reshape(1, self.input_size)
-        self.top_k_indices, self.top_k_gates = self.router(x)
-        loss = self.router.loss
-        y_list = []
-        for i in range(self.top_k):
-            expert_idx = self.top_k_indices[0,i]
-            y = F.linear(x, self.input_linear.weight[expert_idx])
-            y_list.append(y)
-        y = torch.cat(y_list, dim=0)
-        y = y.view(bsz, length, self.top_k, -1)
-        return y, loss
-    def map(self, x):
-        """
-        Map input through the mixture of experts layer.
-        Dispatches to `single_map` for a single token and to `batch_map` otherwise.
-        Args:
-            x (Tensor): Input tensor of shape [bsz, length, emb_size].
-        Returns:
-            Tensor: Output tensor of shape [bsz, length, top_k, -1].
-            The router's auxiliary loss.
-        """
-        bsz, length, emb_size = x.size()
-        if bsz * length ==1:
-            return self.single_map(x)
-        else:
-            return self.batch_map(x)
-    def batch_reduce(self, x):
-        """
-        Reduce the mapped output (more than one token).
-        Reads the routing state that `compute_gate` cached on `self`.
-        Args:
-            x (Tensor): Mapped output tensor of shape [bsz, length, top_k, emb_size].
-        Returns:
-            Tensor: Reduced output tensor of shape [bsz, length, input_size].
-        """
-        bsz, length, k, emb_size = x.size()
-        assert k == self.top_k
-        x = x.reshape(-1, emb_size)
-        y = self.output_linear(
-            x, 1,
-            self.sorted_expert_idxs, self.sorted_scattered_idxs,
-            self.padded_block_idxs, self.expert_offsets,
-            gates=self.top_k_gates,
-        )
-        y = y.view(bsz, length, self.input_size)
-        return y
-    def single_reduce(self, x):
-        # Single-token variant of batch_reduce; reads self.top_k_indices and
-        # self.top_k_gates, which are stored by single_map.
-        bsz, length, k, emb_size = x.size()
-        x = x.reshape(k, emb_size)
-        y_list = []
-        for i in range(self.top_k):
-            expert_idx = self.top_k_indices[0,i]
-            y = F.linear(x[i], self.output_linear.weight[expert_idx]) * self.top_k_gates[0,i]
-            y_list.append(y)
-        y = sum(y_list)
-        y = y.view(bsz, length, self.input_size)
-        return y
-    def reduce(self, x):
-        """
-        Reduce the mapped output.
-        Dispatches to `single_reduce` for a single token and to `batch_reduce` otherwise.
-        Args:
-            x (Tensor): Mapped output tensor of shape [bsz, length, top_k, emb_size].
-        Returns:
-            Tensor: Reduced output tensor of shape [bsz, length, input_size].
-        """
-        bsz, length, k, emb_size = x.size()
-        if bsz * length ==1:
-            return self.single_reduce(x)
-        else:
-            return self.batch_reduce(x)