Spaces: Running on Zero
alibabasglab committed: Upload 32 files
- models/mossformer2_sr/__init__.py +0 -0
- models/mossformer2_sr/__pycache__/__init__.cpython-312.pyc +0 -0
- models/mossformer2_sr/__pycache__/__init__.cpython-38.pyc +0 -0
- models/mossformer2_sr/__pycache__/conv_module.cpython-312.pyc +0 -0
- models/mossformer2_sr/__pycache__/conv_module.cpython-38.pyc +0 -0
- models/mossformer2_sr/__pycache__/env.cpython-312.pyc +0 -0
- models/mossformer2_sr/__pycache__/fsmn.cpython-312.pyc +0 -0
- models/mossformer2_sr/__pycache__/fsmn.cpython-38.pyc +0 -0
- models/mossformer2_sr/__pycache__/generator.cpython-312.pyc +0 -0
- models/mossformer2_sr/__pycache__/layer_norm.cpython-312.pyc +0 -0
- models/mossformer2_sr/__pycache__/layer_norm.cpython-38.pyc +0 -0
- models/mossformer2_sr/__pycache__/mossformer.cpython-38.pyc +0 -0
- models/mossformer2_sr/__pycache__/mossformer2.cpython-312.pyc +0 -0
- models/mossformer2_sr/__pycache__/mossformer2.cpython-38.pyc +0 -0
- models/mossformer2_sr/__pycache__/mossformer2_block.cpython-312.pyc +0 -0
- models/mossformer2_sr/__pycache__/mossformer2_block.cpython-38.pyc +0 -0
- models/mossformer2_sr/__pycache__/mossformer2_se_wrapper.cpython-312.pyc +0 -0
- models/mossformer2_sr/__pycache__/mossformer2_se_wrapper.cpython-38.pyc +0 -0
- models/mossformer2_sr/__pycache__/mossformer2_sr_wrapper.cpython-312.pyc +0 -0
- models/mossformer2_sr/__pycache__/mossformer_block.cpython-38.pyc +0 -0
- models/mossformer2_sr/__pycache__/snake.cpython-312.pyc +0 -0
- models/mossformer2_sr/__pycache__/utils.cpython-312.pyc +0 -0
- models/mossformer2_sr/conv_module.py +388 -0
- models/mossformer2_sr/env.py +15 -0
- models/mossformer2_sr/fsmn.py +214 -0
- models/mossformer2_sr/generator.py +448 -0
- models/mossformer2_sr/layer_norm.py +126 -0
- models/mossformer2_sr/mossformer2.py +711 -0
- models/mossformer2_sr/mossformer2_block.py +735 -0
- models/mossformer2_sr/mossformer2_sr_wrapper.py +52 -0
- models/mossformer2_sr/snake.py +33 -0
- models/mossformer2_sr/utils.py +37 -0
models/mossformer2_sr/__init__.py
ADDED
File without changes
models/mossformer2_sr/__pycache__/__init__.cpython-312.pyc
ADDED
Binary file (198 Bytes)

models/mossformer2_sr/__pycache__/__init__.cpython-38.pyc
ADDED
Binary file (193 Bytes)

models/mossformer2_sr/__pycache__/conv_module.cpython-312.pyc
ADDED
Binary file (20.4 kB)

models/mossformer2_sr/__pycache__/conv_module.cpython-38.pyc
ADDED
Binary file (13.9 kB)

models/mossformer2_sr/__pycache__/env.cpython-312.pyc
ADDED
Binary file (1.19 kB)

models/mossformer2_sr/__pycache__/fsmn.cpython-312.pyc
ADDED
Binary file (12.4 kB)

models/mossformer2_sr/__pycache__/fsmn.cpython-38.pyc
ADDED
Binary file (8.51 kB)

models/mossformer2_sr/__pycache__/generator.cpython-312.pyc
ADDED
Binary file (24.6 kB)

models/mossformer2_sr/__pycache__/layer_norm.cpython-312.pyc
ADDED
Binary file (6.59 kB)

models/mossformer2_sr/__pycache__/layer_norm.cpython-38.pyc
ADDED
Binary file (4.24 kB)

models/mossformer2_sr/__pycache__/mossformer.cpython-38.pyc
ADDED
Binary file (16.3 kB)

models/mossformer2_sr/__pycache__/mossformer2.cpython-312.pyc
ADDED
Binary file (22.8 kB)

models/mossformer2_sr/__pycache__/mossformer2.cpython-38.pyc
ADDED
Binary file (15.9 kB)

models/mossformer2_sr/__pycache__/mossformer2_block.cpython-312.pyc
ADDED
Binary file (30.8 kB)

models/mossformer2_sr/__pycache__/mossformer2_block.cpython-38.pyc
ADDED
Binary file (23.5 kB)

models/mossformer2_sr/__pycache__/mossformer2_se_wrapper.cpython-312.pyc
ADDED
Binary file (4.05 kB)

models/mossformer2_sr/__pycache__/mossformer2_se_wrapper.cpython-38.pyc
ADDED
Binary file (3.6 kB)

models/mossformer2_sr/__pycache__/mossformer2_sr_wrapper.cpython-312.pyc
ADDED
Binary file (2.36 kB)

models/mossformer2_sr/__pycache__/mossformer_block.cpython-38.pyc
ADDED
Binary file (21.2 kB)

models/mossformer2_sr/__pycache__/snake.cpython-312.pyc
ADDED
Binary file (2.24 kB)

models/mossformer2_sr/__pycache__/utils.cpython-312.pyc
ADDED
Binary file (2.46 kB)
models/mossformer2_sr/conv_module.py
ADDED
@@ -0,0 +1,388 @@
import torch
import torch.nn as nn
from torch import Tensor
import torch.nn.init as init
import torch.nn.functional as F

EPS = 1e-8

class GlobalLayerNorm(nn.Module):
    """Calculate Global Layer Normalization.

    Arguments
    ---------
    dim : (int or list or torch.Size)
        Input shape from an expected input of size.
    eps : float
        A value added to the denominator for numerical stability.
    elementwise_affine : bool
        A boolean value that when set to True,
        this module has learnable per-element affine parameters
        initialized to ones (for weights) and zeros (for biases).

    Example
    -------
    >>> x = torch.randn(5, 10, 20)
    >>> GLN = GlobalLayerNorm(10, 3)
    >>> x_norm = GLN(x)
    """

    def __init__(self, dim, shape, eps=1e-8, elementwise_affine=True):
        super(GlobalLayerNorm, self).__init__()
        self.dim = dim
        self.eps = eps
        self.elementwise_affine = elementwise_affine

        if self.elementwise_affine:
            if shape == 3:
                self.weight = nn.Parameter(torch.ones(self.dim, 1))
                self.bias = nn.Parameter(torch.zeros(self.dim, 1))
            if shape == 4:
                self.weight = nn.Parameter(torch.ones(self.dim, 1, 1))
                self.bias = nn.Parameter(torch.zeros(self.dim, 1, 1))
        else:
            self.register_parameter("weight", None)
            self.register_parameter("bias", None)

    def forward(self, x):
        """Returns the normalized tensor.

        Arguments
        ---------
        x : torch.Tensor
            Tensor of size [N, C, K, S] or [N, C, L].
        """
        # x = N x C x K x S or N x C x L
        # gln: mean, var are N x 1 x 1
        if x.dim() == 3:
            mean = torch.mean(x, (1, 2), keepdim=True)
            var = torch.mean((x - mean) ** 2, (1, 2), keepdim=True)
            if self.elementwise_affine:
                x = (
                    self.weight * (x - mean) / torch.sqrt(var + self.eps)
                    + self.bias
                )
            else:
                x = (x - mean) / torch.sqrt(var + self.eps)

        if x.dim() == 4:
            mean = torch.mean(x, (1, 2, 3), keepdim=True)
            var = torch.mean((x - mean) ** 2, (1, 2, 3), keepdim=True)
            if self.elementwise_affine:
                x = (
                    self.weight * (x - mean) / torch.sqrt(var + self.eps)
                    + self.bias
                )
            else:
                x = (x - mean) / torch.sqrt(var + self.eps)
        return x


class CumulativeLayerNorm(nn.LayerNorm):
    """Calculate Cumulative Layer Normalization.

    Arguments
    ---------
    dim : int
        Dimension that you want to normalize.
    elementwise_affine : bool
        Learnable per-element affine parameters.

    Example
    -------
    >>> x = torch.randn(5, 10, 20)
    >>> CLN = CumulativeLayerNorm(10)
    >>> x_norm = CLN(x)
    """

    def __init__(self, dim, elementwise_affine=True):
        super(CumulativeLayerNorm, self).__init__(
            dim, elementwise_affine=elementwise_affine, eps=1e-8
        )

    def forward(self, x):
        """Returns the normalized tensor.

        Arguments
        ---------
        x : torch.Tensor
            Tensor of size [N, C, K, S] or [N, C, L].
        """
        # x: N x C x K x S or N x C x L
        if x.dim() == 4:
            x = x.permute(0, 2, 3, 1).contiguous()
            # N x K x S x C == only channel norm
            x = super().forward(x)
            # N x C x K x S
            x = x.permute(0, 3, 1, 2).contiguous()
        if x.dim() == 3:
            x = torch.transpose(x, 1, 2)
            # N x L x C == only channel norm
            x = super().forward(x)
            # N x C x L
            x = torch.transpose(x, 1, 2)
        return x


def select_norm(norm, dim, shape):
    """Just a wrapper to select the normalization type."""
    if norm == "gln":
        return GlobalLayerNorm(dim, shape, elementwise_affine=True)
    if norm == "cln":
        return CumulativeLayerNorm(dim, elementwise_affine=True)
    if norm == "ln":
        return nn.GroupNorm(1, dim, eps=1e-8)
    else:
        return nn.BatchNorm1d(dim)


class Swish(nn.Module):
    """
    Swish is a smooth, non-monotonic function that consistently matches or outperforms ReLU on deep networks
    applied to a variety of challenging domains such as image classification and machine translation.
    """
    def __init__(self):
        super(Swish, self).__init__()

    def forward(self, inputs: Tensor) -> Tensor:
        return inputs * inputs.sigmoid()


class GLU(nn.Module):
    """
    The gating mechanism is called Gated Linear Units (GLU), first introduced for natural language processing
    in the paper "Language Modeling with Gated Convolutional Networks".
    """
    def __init__(self, dim: int) -> None:
        super(GLU, self).__init__()
        self.dim = dim

    def forward(self, inputs: Tensor) -> Tensor:
        outputs, gate = inputs.chunk(2, dim=self.dim)
        return outputs * gate.sigmoid()


class Transpose(nn.Module):
    """ Wrapper class of torch.transpose() for Sequential module. """
    def __init__(self, shape: tuple):
        super(Transpose, self).__init__()
        self.shape = shape

    def forward(self, x: Tensor) -> Tensor:
        return x.transpose(*self.shape)


class Linear(nn.Module):
    """
    Wrapper class of torch.nn.Linear.
    Weights are initialized with Xavier initialization and biases are initialized to zeros.
    """
    def __init__(self, in_features: int, out_features: int, bias: bool = True) -> None:
        super(Linear, self).__init__()
        self.linear = nn.Linear(in_features, out_features, bias=bias)
        init.xavier_uniform_(self.linear.weight)
        if bias:
            init.zeros_(self.linear.bias)

    def forward(self, x: Tensor) -> Tensor:
        return self.linear(x)


class DepthwiseConv1d(nn.Module):
    """
    When groups == in_channels and out_channels == K * in_channels, where K is a positive integer,
    this operation is termed in the literature as depthwise convolution.
    Args:
        in_channels (int): Number of channels in the input
        out_channels (int): Number of channels produced by the convolution
        kernel_size (int or tuple): Size of the convolving kernel
        stride (int, optional): Stride of the convolution. Default: 1
        padding (int or tuple, optional): Zero-padding added to both sides of the input. Default: 0
        bias (bool, optional): If True, adds a learnable bias to the output. Default: False
    Inputs: inputs
        - **inputs** (batch, in_channels, time): Tensor containing the input sequence
    Returns: outputs
        - **outputs** (batch, out_channels, time): Tensor produced by depthwise 1-D convolution.
    """
    def __init__(
            self,
            in_channels: int,
            out_channels: int,
            kernel_size: int,
            stride: int = 1,
            padding: int = 0,
            bias: bool = False,
    ) -> None:
        super(DepthwiseConv1d, self).__init__()
        assert out_channels % in_channels == 0, "out_channels should be constant multiple of in_channels"
        self.conv = nn.Conv1d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            groups=in_channels,
            stride=stride,
            padding=padding,
            bias=bias,
        )

    def forward(self, inputs: Tensor) -> Tensor:
        return self.conv(inputs)


class PointwiseConv1d(nn.Module):
    """
    A conv1d with kernel size == 1 is termed in the literature as pointwise convolution.
    This operation is often used to match dimensions.
    Args:
        in_channels (int): Number of channels in the input
        out_channels (int): Number of channels produced by the convolution
        stride (int, optional): Stride of the convolution. Default: 1
        padding (int or tuple, optional): Zero-padding added to both sides of the input. Default: 0
        bias (bool, optional): If True, adds a learnable bias to the output. Default: True
    Inputs: inputs
        - **inputs** (batch, in_channels, time): Tensor containing the input sequence
    Returns: outputs
        - **outputs** (batch, out_channels, time): Tensor produced by pointwise 1-D convolution.
    """
    def __init__(
            self,
            in_channels: int,
            out_channels: int,
            stride: int = 1,
            padding: int = 0,
            bias: bool = True,
    ) -> None:
        super(PointwiseConv1d, self).__init__()
        self.conv = nn.Conv1d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=1,
            stride=stride,
            padding=padding,
            bias=bias,
        )

    def forward(self, inputs: Tensor) -> Tensor:
        return self.conv(inputs)


class ConvModule(nn.Module):
    """
    Conformer convolution module starts with a pointwise convolution and a gated linear unit (GLU).
    This is followed by a single 1-D depthwise convolution layer. Batchnorm is deployed just after the
    convolution to aid training of deep models.
    Args:
        in_channels (int): Number of channels in the input
        kernel_size (int or tuple, optional): Size of the convolving kernel. Default: 17
        dropout_p (float, optional): probability of dropout
    Inputs: inputs
        inputs (batch, time, dim): Tensor containing the input sequence
    Outputs: outputs
        outputs (batch, time, dim): Tensor produced by the convolution module.
    """
    def __init__(
            self,
            in_channels: int,
            kernel_size: int = 17,
            expansion_factor: int = 2,
            dropout_p: float = 0.1,
    ) -> None:
        super(ConvModule, self).__init__()
        assert (kernel_size - 1) % 2 == 0, "kernel_size should be an odd number for 'SAME' padding"
        assert expansion_factor == 2, "Currently, Only Supports expansion_factor 2"

        self.sequential = nn.Sequential(
            Transpose(shape=(1, 2)),
            DepthwiseConv1d(in_channels, in_channels, kernel_size, stride=1, padding=(kernel_size - 1) // 2),
        )

    def forward(self, inputs: Tensor) -> Tensor:
        return inputs + self.sequential(inputs).transpose(1, 2)


class ConvModule_Dilated(nn.Module):
    """
    Conformer convolution module starts with a pointwise convolution and a gated linear unit (GLU).
    This is followed by a single 1-D depthwise convolution layer. Batchnorm is deployed just after the
    convolution to aid training of deep models.
    Args:
        in_channels (int): Number of channels in the input
        kernel_size (int or tuple, optional): Size of the convolving kernel. Default: 17
        dropout_p (float, optional): probability of dropout
    Inputs: inputs
        inputs (batch, time, dim): Tensor containing the input sequence
    Outputs: outputs
        outputs (batch, time, dim): Tensor produced by the convolution module.
    """
    def __init__(
            self,
            in_channels: int,
            kernel_size: int = 17,
            expansion_factor: int = 2,
            dropout_p: float = 0.1,
    ) -> None:
        super(ConvModule_Dilated, self).__init__()  # fixed: referenced the undefined name ConvModule_Gating
        assert (kernel_size - 1) % 2 == 0, "kernel_size should be an odd number for 'SAME' padding"
        assert expansion_factor == 2, "Currently, Only Supports expansion_factor 2"
        self.sequential = nn.Sequential(
            Transpose(shape=(1, 2)),
            DepthwiseConv1d(in_channels, in_channels, kernel_size, stride=1, padding=(kernel_size - 1) // 2),
        )

    def forward(self, inputs: Tensor) -> Tensor:
        return inputs + self.sequential(inputs).transpose(1, 2)


class DilatedDenseNet(nn.Module):
    def __init__(self, depth=4, lorder=20, in_channels=64):
        super(DilatedDenseNet, self).__init__()
        self.depth = depth
        self.in_channels = in_channels
        self.pad = nn.ConstantPad2d((1, 1, 1, 0), value=0.)
        self.twidth = lorder * 2 - 1
        self.kernel_size = (self.twidth, 1)
        for i in range(self.depth):
            dil = 2 ** i
            pad_length = lorder + (dil - 1) * (lorder - 1) - 1
            setattr(self, 'pad{}'.format(i + 1), nn.ConstantPad2d((0, 0, pad_length, pad_length), value=0.))
            setattr(self, 'conv{}'.format(i + 1),
                    nn.Conv2d(self.in_channels * (i + 1), self.in_channels, kernel_size=self.kernel_size,
                              dilation=(dil, 1), groups=self.in_channels, bias=False))
            setattr(self, 'norm{}'.format(i + 1), nn.InstanceNorm2d(in_channels, affine=True))
            setattr(self, 'prelu{}'.format(i + 1), nn.PReLU(self.in_channels))

    def forward(self, x):
        x = torch.unsqueeze(x, 1)
        x_per = x.permute(0, 3, 2, 1)
        skip = x_per
        for i in range(self.depth):
            out = getattr(self, 'pad{}'.format(i + 1))(skip)
            out = getattr(self, 'conv{}'.format(i + 1))(out)
            out = getattr(self, 'norm{}'.format(i + 1))(out)
            out = getattr(self, 'prelu{}'.format(i + 1))(out)
            skip = torch.cat([out, skip], dim=1)
        out1 = out.permute(0, 3, 2, 1)
        return out1.squeeze(1)


class FFConvM_Dilated(nn.Module):
    def __init__(
        self,
        dim_in,
        dim_out,
        norm_klass=nn.LayerNorm,
        dropout=0.1
    ):
        super().__init__()
        self.mdl = nn.Sequential(
            norm_klass(dim_in),
            nn.Linear(dim_in, dim_out),
            nn.SiLU(),
            DilatedDenseNet(depth=2, lorder=17, in_channels=dim_out),
            nn.Dropout(dropout)
        )

    def forward(
        self,
        x,
    ):
        output = self.mdl(x)
        return output
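The modules above are small building blocks reused by the MossFormer2 blocks. The following is a minimal shape-check sketch, assuming the repository root is on PYTHONPATH; it is not part of the commit, and the tensor sizes are illustrative only.

# Illustrative only: exercises a few modules from conv_module.py with made-up sizes.
import torch
from models.mossformer2_sr.conv_module import GlobalLayerNorm, select_norm, GLU, ConvModule

x = torch.randn(2, 64, 100)                  # [batch, channels, length]
gln = GlobalLayerNorm(64, shape=3)           # global layer norm for a 3-D tensor
print(gln(x).shape)                          # torch.Size([2, 64, 100])

cln = select_norm("cln", 64, 3)              # cumulative (channel-wise) layer norm
print(cln(x).shape)                          # torch.Size([2, 64, 100])

glu = GLU(dim=1)                             # gating halves the channel dimension
print(glu(x).shape)                          # torch.Size([2, 32, 100])

conv = ConvModule(in_channels=512, kernel_size=17)   # residual depthwise conv block
seq = torch.randn(2, 100, 512)               # [batch, time, dim]
print(conv(seq).shape)                       # torch.Size([2, 100, 512])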
models/mossformer2_sr/env.py
ADDED
@@ -0,0 +1,15 @@
import os
import shutil


class AttrDict(dict):
    def __init__(self, *args, **kwargs):
        super(AttrDict, self).__init__(*args, **kwargs)
        self.__dict__ = self


def build_env(config, config_name, path):
    t_path = os.path.join(path, config_name)
    if config != t_path:
        os.makedirs(path, exist_ok=True)
        shutil.copyfile(config, os.path.join(path, config_name))
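AttrDict is what lets the vocoder configuration be read with attribute access (for example h.upsample_initial_channel) in generator.py below, while build_env snapshots the config next to the checkpoints. A short usage sketch; the config file name, key, and output directory here are assumptions for illustration, not values from this commit.

# Illustrative only: load a JSON config into an AttrDict and copy it into a run directory.
import json
from models.mossformer2_sr.env import AttrDict, build_env

with open("config.json") as f:                        # hypothetical config file
    h = AttrDict(json.loads(f.read()))

print(h["upsample_initial_channel"] == h.upsample_initial_channel)  # dict- and attribute-style access agree
build_env("config.json", "config.json", "checkpoints/")             # copies config.json into checkpoints/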
models/mossformer2_sr/fsmn.py
ADDED
@@ -0,0 +1,214 @@
import torch.nn as nn
import torch.nn.functional as F
import torch as th
from torch.nn.parameter import Parameter
import numpy as np
import os

class UniDeepFsmn(nn.Module):
    """
    UniDeepFsmn is a neural network module that implements a single-deep feedforward sequence memory network (FSMN).

    Attributes:
        input_dim (int): Dimension of the input features.
        output_dim (int): Dimension of the output features.
        lorder (int): Length of the order for the convolution layers.
        hidden_size (int): Number of hidden units in the linear layer.
        linear (nn.Linear): Linear layer to project input features to hidden size.
        project (nn.Linear): Linear layer to project hidden features to output dimensions.
        conv1 (nn.Conv2d): Convolutional layer for processing the output in a grouped manner.
    """

    def __init__(self, input_dim, output_dim, lorder=None, hidden_size=None):
        super(UniDeepFsmn, self).__init__()

        self.input_dim = input_dim
        self.output_dim = output_dim
        if lorder is None:
            return
        self.lorder = lorder
        self.hidden_size = hidden_size

        # Initialize the layers
        self.linear = nn.Linear(input_dim, hidden_size)  # Linear transformation to hidden size
        self.project = nn.Linear(hidden_size, output_dim, bias=False)  # Project hidden size to output dimension
        self.conv1 = nn.Conv2d(output_dim, output_dim, [lorder + lorder - 1, 1], [1, 1], groups=output_dim, bias=False)  # Convolution layer

    def forward(self, input):
        """
        Forward pass for the UniDeepFsmn model.

        Args:
            input (torch.Tensor): Input tensor of shape (batch_size, input_dim).

        Returns:
            torch.Tensor: The output tensor of the same shape as input, enhanced by the network.
        """
        f1 = F.relu(self.linear(input))  # Apply linear layer followed by ReLU activation
        p1 = self.project(f1)  # Project to output dimension
        x = th.unsqueeze(p1, 1)  # Add a dimension for compatibility with Conv2d
        x_per = x.permute(0, 3, 2, 1)  # Permute dimensions for convolution
        y = F.pad(x_per, [0, 0, self.lorder - 1, self.lorder - 1])  # Pad for causal convolution
        out = x_per + self.conv1(y)  # Add original input to convolution output
        out1 = out.permute(0, 3, 2, 1)  # Permute back to original dimensions
        return input + out1.squeeze()  # Return enhanced input


class UniDeepFsmn_dual(nn.Module):
    """
    UniDeepFsmn_dual is a neural network module that implements a dual-deep feedforward sequence memory network (FSMN).

    This class extends the UniDeepFsmn by adding a second convolution layer for richer feature extraction.

    Attributes:
        input_dim (int): Dimension of the input features.
        output_dim (int): Dimension of the output features.
        lorder (int): Length of the order for the convolution layers.
        hidden_size (int): Number of hidden units in the linear layer.
        linear (nn.Linear): Linear layer to project input features to hidden size.
        project (nn.Linear): Linear layer to project hidden features to output dimensions.
        conv1 (nn.Conv2d): First convolutional layer for processing the output.
        conv2 (nn.Conv2d): Second convolutional layer for further processing the features.
    """

    def __init__(self, input_dim, output_dim, lorder=None, hidden_size=None):
        super(UniDeepFsmn_dual, self).__init__()

        self.input_dim = input_dim
        self.output_dim = output_dim
        if lorder is None:
            return
        self.lorder = lorder
        self.hidden_size = hidden_size

        # Initialize the layers
        self.linear = nn.Linear(input_dim, hidden_size)  # Linear transformation to hidden size
        self.project = nn.Linear(hidden_size, output_dim, bias=False)  # Project hidden size to output dimension
        self.conv1 = nn.Conv2d(output_dim, output_dim, [lorder + lorder - 1, 1], [1, 1], groups=output_dim, bias=False)  # First convolution layer
        self.conv2 = nn.Conv2d(output_dim, output_dim, [lorder + lorder - 1, 1], [1, 1], groups=output_dim // 4, bias=False)  # Second convolution layer

    def forward(self, input):
        """
        Forward pass for the UniDeepFsmn_dual model.

        Args:
            input (torch.Tensor): Input tensor of shape (batch_size, input_dim).

        Returns:
            torch.Tensor: The output tensor of the same shape as input, enhanced by the network.
        """
        f1 = F.relu(self.linear(input))  # Apply linear layer followed by ReLU activation
        p1 = self.project(f1)  # Project to output dimension
        x = th.unsqueeze(p1, 1)  # Add a dimension for compatibility with Conv2d
        x_per = x.permute(0, 3, 2, 1)  # Permute dimensions for convolution
        y = F.pad(x_per, [0, 0, self.lorder - 1, self.lorder - 1])  # Pad for causal convolution
        conv1_out = x_per + self.conv1(y)  # Add original input to first convolution output
        z = F.pad(conv1_out, [0, 0, self.lorder - 1, self.lorder - 1])  # Pad for second convolution
        out = conv1_out + self.conv2(z)  # Add output of second convolution
        out1 = out.permute(0, 3, 2, 1)  # Permute back to original dimensions
        return input + out1.squeeze()  # Return enhanced input


class DilatedDenseNet(nn.Module):
    """
    DilatedDenseNet implements a dense network structure with dilated convolutions.

    This architecture enables wider receptive fields while maintaining a lower number of parameters.
    It consists of multiple convolutional layers with dilation rates that increase at each layer.

    Attributes:
        depth (int): Number of convolutional layers in the network.
        in_channels (int): Number of input channels for the first layer.
        pad (nn.ConstantPad2d): Padding layer to maintain dimensions.
        twidth (int): Width of the kernel used in convolution.
        kernel_size (tuple): Kernel size for convolution operations.
    """

    def __init__(self, depth=4, lorder=20, in_channels=64):
        super(DilatedDenseNet, self).__init__()
        self.depth = depth
        self.in_channels = in_channels
        self.pad = nn.ConstantPad2d((1, 1, 1, 0), value=0.)  # Padding for the input
        self.twidth = lorder * 2 - 1  # Width of the kernel
        self.kernel_size = (self.twidth, 1)  # Kernel size for convolutions

        # Initialize layers dynamically based on depth
        for i in range(self.depth):
            dil = 2 ** i  # Calculate dilation rate
            pad_length = lorder + (dil - 1) * (lorder - 1) - 1  # Calculate padding length
            setattr(self, 'pad{}'.format(i + 1), nn.ConstantPad2d((0, 0, pad_length, pad_length), value=0.))  # Padding for dilation
            setattr(self, 'conv{}'.format(i + 1),
                    nn.Conv2d(self.in_channels * (i + 1), self.in_channels, kernel_size=self.kernel_size,
                              dilation=(dil, 1), groups=self.in_channels, bias=False))  # Convolution layer with dilation
            setattr(self, 'norm{}'.format(i + 1), nn.InstanceNorm2d(in_channels, affine=True))  # Normalization layer
            setattr(self, 'prelu{}'.format(i + 1), nn.PReLU(self.in_channels))  # Activation layer

    def forward(self, x):
        """
        Forward pass for the DilatedDenseNet model.

        Args:
            x (torch.Tensor): Input tensor of shape (batch_size, in_channels, height, width).

        Returns:
            torch.Tensor: Output tensor after applying dense layers.
        """
        skip = x  # Initialize skip connection
        for i in range(self.depth):
            out = getattr(self, 'pad{}'.format(i + 1))(skip)  # Apply padding
            out = getattr(self, 'conv{}'.format(i + 1))(out)  # Apply convolution
            out = getattr(self, 'norm{}'.format(i + 1))(out)  # Apply normalization
            out = getattr(self, 'prelu{}'.format(i + 1))(out)  # Apply PReLU activation
            skip = th.cat([out, skip], dim=1)  # Concatenate the output with the skip connection
        return out  # Return the final output


class UniDeepFsmn_dilated(nn.Module):
    """
    UniDeepFsmn_dilated combines the UniDeepFsmn architecture with a dilated dense network
    to enhance feature extraction while maintaining efficient computation.

    Attributes:
        input_dim (int): Dimension of the input features.
        output_dim (int): Dimension of the output features.
        depth (int): Depth of the dilated dense network.
        lorder (int): Length of the order for the convolution layers.
        hidden_size (int): Number of hidden units in the linear layer.
        linear (nn.Linear): Linear layer to project input features to hidden size.
        project (nn.Linear): Linear layer to project hidden features to output dimensions.
        conv (DilatedDenseNet): Instance of the DilatedDenseNet for feature extraction.
    """

    def __init__(self, input_dim, output_dim, lorder=None, hidden_size=None, depth=2):
        super(UniDeepFsmn_dilated, self).__init__()

        self.input_dim = input_dim
        self.output_dim = output_dim
        self.depth = depth
        if lorder is None:
            return
        self.lorder = lorder
        self.hidden_size = hidden_size

        # Initialize layers
        self.linear = nn.Linear(input_dim, hidden_size)  # Linear transformation to hidden size
        self.project = nn.Linear(hidden_size, output_dim, bias=False)  # Project hidden size to output dimension
        self.conv = DilatedDenseNet(depth=self.depth, lorder=lorder, in_channels=output_dim)  # Dilated dense network for feature extraction

    def forward(self, input):
        """
        Forward pass for the UniDeepFsmn_dilated model.

        Args:
            input (torch.Tensor): Input tensor of shape (batch_size, input_dim).

        Returns:
            torch.Tensor: The output tensor of the same shape as input, enhanced by the network.
        """
        f1 = F.relu(self.linear(input))  # Apply linear layer followed by ReLU activation
        p1 = self.project(f1)  # Project to output dimension
        x = th.unsqueeze(p1, 1)  # Add a dimension for compatibility with Conv2d
        x_per = x.permute(0, 3, 2, 1)  # Permute dimensions for convolution
        out = self.conv(x_per)  # Pass through the dilated dense network
        out1 = out.permute(0, 3, 2, 1)  # Permute back to original dimensions

        return input + out1.squeeze()  # Return enhanced input
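All of these FSMN variants are residual, so the output shape matches the input shape. A quick illustrative check, not part of the commit, with made-up sizes:

# Illustrative only: the FSMN blocks act as residual memory layers over [batch, time, features].
import torch
from models.mossformer2_sr.fsmn import UniDeepFsmn, UniDeepFsmn_dilated

x = torch.randn(2, 100, 256)                               # [batch, time, features]
fsmn = UniDeepFsmn(256, 256, lorder=20, hidden_size=512)
print(fsmn(x).shape)                                       # torch.Size([2, 100, 256])

fsmn_dil = UniDeepFsmn_dilated(256, 256, lorder=20, hidden_size=512, depth=2)
print(fsmn_dil(x).shape)                                    # torch.Size([2, 100, 256])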
models/mossformer2_sr/generator.py
ADDED
@@ -0,0 +1,448 @@
import torch
import torch.nn.functional as F
import torch.nn as nn
from torch.nn import Conv1d, ConvTranspose1d, AvgPool1d, Conv2d
from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
from models.mossformer2_sr.utils import init_weights, get_padding
from models.mossformer2_sr.mossformer2 import MossFormer_MaskNet
from models.mossformer2_sr.snake import Snake1d
from typing import Optional, List, Union, Dict, Tuple
from models.mossformer2_sr.env import AttrDict
import typing
from torchaudio.transforms import Spectrogram, Resample

LRELU_SLOPE = 0.1


class ResBlock1(torch.nn.Module):
    def __init__(self, h, channels, kernel_size=3, dilation=(1, 3, 5)):
        super(ResBlock1, self).__init__()
        self.h = h
        self.convs1 = nn.ModuleList([
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
                               padding=get_padding(kernel_size, dilation[0]))),
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
                               padding=get_padding(kernel_size, dilation[1]))),
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[2],
                               padding=get_padding(kernel_size, dilation[2])))
        ])
        self.convs1.apply(init_weights)
        self.convs1_activates = nn.ModuleList([
            Snake1d(channels),
            Snake1d(channels),
            Snake1d(channels)
        ])
        self.convs2 = nn.ModuleList([
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
                               padding=get_padding(kernel_size, 1))),
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
                               padding=get_padding(kernel_size, 1))),
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=1,
                               padding=get_padding(kernel_size, 1)))
        ])
        self.convs2.apply(init_weights)
        self.convs2_activates = nn.ModuleList([
            Snake1d(channels),
            Snake1d(channels),
            Snake1d(channels)
        ])

    def forward(self, x):
        for c1, c2, act1, act2 in zip(self.convs1, self.convs2, self.convs1_activates, self.convs2_activates):
            # Snake activations are used instead of F.leaky_relu(x, LRELU_SLOPE)
            xt = act1(x)
            xt = c1(xt)
            xt = act2(xt)
            xt = c2(xt)
            x = xt + x
        return x

    def remove_weight_norm(self):
        for l in self.convs1:
            remove_weight_norm(l)
        for l in self.convs2:
            remove_weight_norm(l)


class ResBlock2(torch.nn.Module):
    def __init__(self, h, channels, kernel_size=3, dilation=(1, 3)):
        super(ResBlock2, self).__init__()
        self.h = h
        self.convs = nn.ModuleList([
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[0],
                               padding=get_padding(kernel_size, dilation[0]))),
            weight_norm(Conv1d(channels, channels, kernel_size, 1, dilation=dilation[1],
                               padding=get_padding(kernel_size, dilation[1])))
        ])
        self.convs.apply(init_weights)
        self.convs_activates = nn.ModuleList([
            Snake1d(channels),
            Snake1d(channels)
        ])

    def forward(self, x):
        for c, act in zip(self.convs, self.convs_activates):
            # Snake activation replaces F.leaky_relu(x, LRELU_SLOPE)
            xt = act(x)
            xt = c(xt)
            x = xt + x
        return x

    def remove_weight_norm(self):
        for l in self.convs:
            remove_weight_norm(l)


class Generator(torch.nn.Module):
    def __init__(self, h):
        super(Generator, self).__init__()
        self.h = h
        self.num_kernels = len(h.resblock_kernel_sizes)
        self.num_upsamples = len(h.upsample_rates)
        self.conv_pre = weight_norm(Conv1d(80, h.upsample_initial_channel, 7, 1, padding=3))
        resblock = ResBlock1 if h.resblock == '1' else ResBlock2

        self.ups = nn.ModuleList()
        self.snakes = nn.ModuleList()
        for i, (u, k) in enumerate(zip(h.upsample_rates, h.upsample_kernel_sizes)):
            self.snakes.append(Snake1d(h.upsample_initial_channel//(2**i)))
            self.ups.append(weight_norm(
                ConvTranspose1d(h.upsample_initial_channel//(2**i), h.upsample_initial_channel//(2**(i+1)),
                                k, u, padding=(k-u)//2)))

        self.resblocks = nn.ModuleList()
        for i in range(len(self.ups)):
            ch = h.upsample_initial_channel//(2**(i+1))
            for j, (k, d) in enumerate(zip(h.resblock_kernel_sizes, h.resblock_dilation_sizes)):
                self.resblocks.append(resblock(h, ch, k, d))

        self.snake_post = Snake1d(ch)
        self.conv_post = weight_norm(Conv1d(ch, 1, 7, 1, padding=3))
        self.ups.apply(init_weights)
        self.conv_post.apply(init_weights)

    def forward(self, x):
        x = self.conv_pre(x)
        for i in range(self.num_upsamples):
            # Snake activation replaces F.leaky_relu(x, LRELU_SLOPE)
            x = self.snakes[i](x)
            x = self.ups[i](x)
            xs = None
            for j in range(self.num_kernels):
                if xs is None:
                    xs = self.resblocks[i*self.num_kernels+j](x)
                else:
                    xs += self.resblocks[i*self.num_kernels+j](x)
            x = xs / self.num_kernels
        x = self.snake_post(x)
        x = self.conv_post(x)
        x = torch.tanh(x)

        return x

    def remove_weight_norm(self):
        for l in self.ups:
            remove_weight_norm(l)
        for l in self.resblocks:
            l.remove_weight_norm()
        remove_weight_norm(self.conv_pre)
        remove_weight_norm(self.conv_post)


class DiscriminatorP(torch.nn.Module):
    def __init__(self, period, kernel_size=5, stride=3, use_spectral_norm=False):
        super(DiscriminatorP, self).__init__()
        self.period = period
        norm_f = weight_norm if use_spectral_norm == False else spectral_norm
        self.convs = nn.ModuleList([
            norm_f(Conv2d(1, 32, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
            norm_f(Conv2d(32, 128, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
            norm_f(Conv2d(128, 512, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
            norm_f(Conv2d(512, 1024, (kernel_size, 1), (stride, 1), padding=(get_padding(5, 1), 0))),
            norm_f(Conv2d(1024, 1024, (kernel_size, 1), 1, padding=(2, 0))),
        ])
        self.conv_post = norm_f(Conv2d(1024, 1, (3, 1), 1, padding=(1, 0)))

    def forward(self, x):
        fmap = []

        # 1d to 2d
        b, c, t = x.shape
        if t % self.period != 0:  # pad first
            n_pad = self.period - (t % self.period)
            x = F.pad(x, (0, n_pad), "reflect")
            t = t + n_pad
        x = x.view(b, c, t // self.period, self.period)

        for l in self.convs:
            x = l(x)
            x = F.leaky_relu(x, LRELU_SLOPE)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)
        x = torch.flatten(x, 1, -1)

        return x, fmap


class MultiPeriodDiscriminator(torch.nn.Module):
    def __init__(self):
        super(MultiPeriodDiscriminator, self).__init__()
        self.discriminators = nn.ModuleList([
            DiscriminatorP(2),
            DiscriminatorP(3),
            DiscriminatorP(5),
            DiscriminatorP(7),
            DiscriminatorP(11),
        ])

    def forward(self, y, y_hat):
        y_d_rs = []
        y_d_gs = []
        fmap_rs = []
        fmap_gs = []
        for i, d in enumerate(self.discriminators):
            y_d_r, fmap_r = d(y)
            y_d_g, fmap_g = d(y_hat)
            y_d_rs.append(y_d_r)
            fmap_rs.append(fmap_r)
            y_d_gs.append(y_d_g)
            fmap_gs.append(fmap_g)

        return y_d_rs, y_d_gs, fmap_rs, fmap_gs


class DiscriminatorS(torch.nn.Module):
    def __init__(self, use_spectral_norm=False):
        super(DiscriminatorS, self).__init__()
        norm_f = weight_norm if use_spectral_norm == False else spectral_norm
        self.convs = nn.ModuleList([
            norm_f(Conv1d(1, 128, 15, 1, padding=7)),
            norm_f(Conv1d(128, 128, 41, 2, groups=4, padding=20)),
            norm_f(Conv1d(128, 256, 41, 2, groups=16, padding=20)),
            norm_f(Conv1d(256, 512, 41, 4, groups=16, padding=20)),
            norm_f(Conv1d(512, 1024, 41, 4, groups=16, padding=20)),
            norm_f(Conv1d(1024, 1024, 41, 1, groups=16, padding=20)),
            norm_f(Conv1d(1024, 1024, 5, 1, padding=2)),
        ])
        self.conv_post = norm_f(Conv1d(1024, 1, 3, 1, padding=1))

    def forward(self, x):
        fmap = []
        for l in self.convs:
            x = l(x)
            x = F.leaky_relu(x, LRELU_SLOPE)
            fmap.append(x)
        x = self.conv_post(x)
        fmap.append(x)
        x = torch.flatten(x, 1, -1)

        return x, fmap


class MultiScaleDiscriminator(torch.nn.Module):
    def __init__(self):
        super(MultiScaleDiscriminator, self).__init__()
        self.discriminators = nn.ModuleList([
            DiscriminatorS(use_spectral_norm=True),
            DiscriminatorS(),
            DiscriminatorS(),
        ])
        self.meanpools = nn.ModuleList([
            AvgPool1d(4, 2, padding=2),
            AvgPool1d(4, 2, padding=2)
        ])

    def forward(self, y, y_hat):
        y_d_rs = []
        y_d_gs = []
        fmap_rs = []
        fmap_gs = []
        for i, d in enumerate(self.discriminators):
            if i != 0:
                y = self.meanpools[i-1](y)
                y_hat = self.meanpools[i-1](y_hat)
            y_d_r, fmap_r = d(y)
            y_d_g, fmap_g = d(y_hat)
            y_d_rs.append(y_d_r)
            fmap_rs.append(fmap_r)
            y_d_gs.append(y_d_g)
            fmap_gs.append(fmap_g)

        return y_d_rs, y_d_gs, fmap_rs, fmap_gs


# Method based on descript-audio-codec: https://github.com/descriptinc/descript-audio-codec
# Modified code adapted from https://github.com/gemelo-ai/vocos under the MIT license.
# LICENSE is in incl_licenses directory.
class DiscriminatorB(nn.Module):
    def __init__(
        self,
        window_length: int,
        channels: int = 32,
        hop_factor: float = 0.25,
        bands: Tuple[Tuple[float, float], ...] = (
            (0.0, 0.1),
            (0.1, 0.25),
            (0.25, 0.5),
            (0.5, 0.75),
            (0.75, 1.0),
        ),
    ):
        super().__init__()
        self.window_length = window_length
        self.hop_factor = hop_factor
        self.spec_fn = Spectrogram(
            n_fft=window_length,
            hop_length=int(window_length * hop_factor),
            win_length=window_length,
            power=None,
        )
        n_fft = window_length // 2 + 1
        bands = [(int(b[0] * n_fft), int(b[1] * n_fft)) for b in bands]
        self.bands = bands
        convs = lambda: nn.ModuleList(
            [
                weight_norm(nn.Conv2d(2, channels, (3, 9), (1, 1), padding=(1, 4))),
                weight_norm(
                    nn.Conv2d(channels, channels, (3, 9), (1, 2), padding=(1, 4))
                ),
                weight_norm(
                    nn.Conv2d(channels, channels, (3, 9), (1, 2), padding=(1, 4))
                ),
                weight_norm(
                    nn.Conv2d(channels, channels, (3, 9), (1, 2), padding=(1, 4))
                ),
                weight_norm(
                    nn.Conv2d(channels, channels, (3, 3), (1, 1), padding=(1, 1))
                ),
            ]
        )
        self.band_convs = nn.ModuleList([convs() for _ in range(len(self.bands))])

        self.conv_post = weight_norm(
            nn.Conv2d(channels, 1, (3, 3), (1, 1), padding=(1, 1))
        )

    def spectrogram(self, x: torch.Tensor) -> List[torch.Tensor]:
        # Remove DC offset
        x = x - x.mean(dim=-1, keepdims=True)
        # Peak normalize the volume of input audio
        x = 0.8 * x / (x.abs().max(dim=-1, keepdim=True)[0] + 1e-9)
        x = self.spec_fn(x)
        x = torch.view_as_real(x)
        x = x.permute(0, 3, 2, 1)  # [B, F, T, C] -> [B, C, T, F]
        # Split into bands
        x_bands = [x[..., b[0] : b[1]] for b in self.bands]
        return x_bands

    def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, List[torch.Tensor]]:
        x_bands = self.spectrogram(x.squeeze(1))
        fmap = []
        x = []

        for band, stack in zip(x_bands, self.band_convs):
            for i, layer in enumerate(stack):
                band = layer(band)
                band = torch.nn.functional.leaky_relu(band, 0.1)
                if i > 0:
                    fmap.append(band)
            x.append(band)

        x = torch.cat(x, dim=-1)
        x = self.conv_post(x)
        fmap.append(x)

        return x, fmap


# Method based on descript-audio-codec: https://github.com/descriptinc/descript-audio-codec
# Modified code adapted from https://github.com/gemelo-ai/vocos under the MIT license.
# LICENSE is in incl_licenses directory.
class MultiBandDiscriminator(nn.Module):
    def __init__(
        self,
        h,
    ):
        """
        Multi-band multi-scale STFT discriminator, with the architecture based on
        https://github.com/descriptinc/descript-audio-codec and the modified code
        adapted from https://github.com/gemelo-ai/vocos.
        """
        super().__init__()
        # fft_sizes (list[int]): Window lengths for the FFT. Defaults to [2048, 1024, 512] if not set in h.
        self.fft_sizes = h.get("mbd_fft_sizes", [2048, 1024, 512])
        self.discriminators = nn.ModuleList(
            [DiscriminatorB(window_length=w) for w in self.fft_sizes]
        )

    def forward(self, y: torch.Tensor, y_hat: torch.Tensor) -> Tuple[
        List[torch.Tensor],
        List[torch.Tensor],
        List[List[torch.Tensor]],
        List[List[torch.Tensor]],
    ]:

        y_d_rs = []
        y_d_gs = []
        fmap_rs = []
        fmap_gs = []

        for d in self.discriminators:
            y_d_r, fmap_r = d(x=y)
            y_d_g, fmap_g = d(x=y_hat)
            y_d_rs.append(y_d_r)
            fmap_rs.append(fmap_r)
            y_d_gs.append(y_d_g)
            fmap_gs.append(fmap_g)

        return y_d_rs, y_d_gs, fmap_rs, fmap_gs


def feature_loss(fmap_r, fmap_g):
    loss = 0
    for dr, dg in zip(fmap_r, fmap_g):
        for rl, gl in zip(dr, dg):
            loss += torch.mean(torch.abs(rl - gl))

    return loss*2


def discriminator_loss(disc_real_outputs, disc_generated_outputs):
    loss = 0
    r_losses = []
    g_losses = []
    for dr, dg in zip(disc_real_outputs, disc_generated_outputs):
        r_loss = torch.mean((1-dr)**2)
        g_loss = torch.mean(dg**2)
        loss += (r_loss + g_loss)
        r_losses.append(r_loss.item())
        g_losses.append(g_loss.item())

    return loss, r_losses, g_losses


def generator_loss(disc_outputs):
    loss = 0
    gen_losses = []
    for dg in disc_outputs:
        l = torch.mean((1-dg)**2)
        gen_losses.append(l)
        loss += l

    return loss, gen_losses


class Mossformer(nn.Module):

    def __init__(self):
        super(Mossformer, self).__init__()
        self.mossformer = MossFormer_MaskNet(in_channels=80, out_channels=512, out_channels_final=80)

    def forward(self, input):
        out = self.mossformer(input)
        return out
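Generator follows the HiFi-GAN recipe with Snake activations in place of LeakyReLU; its constructor reads resblock, upsample_rates, upsample_kernel_sizes, upsample_initial_channel, resblock_kernel_sizes, and resblock_dilation_sizes from the config object. The sketch below is illustrative only: the values are placeholders chosen so the shapes work out, not the configuration shipped with this Space.

# Illustrative only: placeholder HiFi-GAN-style config fed through AttrDict.
import torch
from models.mossformer2_sr.env import AttrDict
from models.mossformer2_sr.generator import Generator

h = AttrDict({
    "resblock": "1",                              # selects ResBlock1
    "upsample_rates": [8, 8, 4, 2],               # 512x total upsampling (assumed)
    "upsample_kernel_sizes": [16, 16, 8, 4],
    "upsample_initial_channel": 512,
    "resblock_kernel_sizes": [3, 7, 11],
    "resblock_dilation_sizes": [[1, 3, 5], [1, 3, 5], [1, 3, 5]],
})

generator = Generator(h)
mel = torch.randn(1, 80, 120)                     # [batch, n_mels, frames]
with torch.no_grad():
    wav = generator(mel)                          # [batch, 1, frames * 512]
print(wav.shape)                                  # torch.Size([1, 1, 61440])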
models/mossformer2_sr/layer_norm.py
ADDED
@@ -0,0 +1,126 @@
1 |
+
#!/usr/bin/env python -u
|
2 |
+
# -*- coding: utf-8 -*-
|
3 |
+
|
4 |
+
# Copyright 2018 Northwestern Polytechnical University (author: Ke Wang)
|
5 |
+
|
6 |
+
from __future__ import absolute_import
|
7 |
+
from __future__ import division
|
8 |
+
from __future__ import print_function
|
9 |
+
|
10 |
+
import torch
|
11 |
+
import torch.nn as nn
|
12 |
+
|
13 |
+
|
14 |
+
class CLayerNorm(nn.LayerNorm):
|
15 |
+
"""Channel-wise layer normalization."""
|
16 |
+
|
17 |
+
def __init__(self, *args, **kwargs):
|
18 |
+
super(CLayerNorm, self).__init__(*args, **kwargs)
|
19 |
+
|
20 |
+
    def forward(self, sample):
        """Forward function.

        Args:
            sample: [batch_size, channels, length]
        """
        if sample.dim() != 3:
            raise RuntimeError('{} only accepts 3-D tensor as input'.format(
                self.__class__.__name__))
        # [N, C, T] -> [N, T, C]
        sample = torch.transpose(sample, 1, 2)
        # LayerNorm
        sample = super().forward(sample)
        # [N, T, C] -> [N, C, T]
        sample = torch.transpose(sample, 1, 2)
        return sample


class ILayerNorm(nn.InstanceNorm1d):
    """Channel-wise layer normalization."""

    def __init__(self, *args, **kwargs):
        super(ILayerNorm, self).__init__(*args, **kwargs)

    def forward(self, sample):
        """Forward function.

        Args:
            sample: [batch_size, channels, length]
        """
        if sample.dim() != 3:
            raise RuntimeError('{} only accepts 3-D tensor as input'.format(
                self.__class__.__name__))
        # [N, C, T] -> [N, T, C]
        sample = torch.transpose(sample, 1, 2)
        # LayerNorm
        sample = super().forward(sample)
        # [N, T, C] -> [N, C, T]
        sample = torch.transpose(sample, 1, 2)
        return sample


class GLayerNorm(nn.Module):
    """Global Layer Normalization for TasNet."""

    def __init__(self, channels, eps=1e-5):
        super(GLayerNorm, self).__init__()
        self.eps = eps
        self.norm_dim = channels
        self.gamma = nn.Parameter(torch.Tensor(channels))
        self.beta = nn.Parameter(torch.Tensor(channels))
        self.reset_parameters()

    def reset_parameters(self):
        nn.init.ones_(self.gamma)
        nn.init.zeros_(self.beta)

    def forward(self, sample):
        """Forward function.

        Args:
            sample: [batch_size, channels, length]
        """
        if sample.dim() != 3:
            raise RuntimeError('{} only accepts 3-D tensor as input'.format(
                self.__class__.__name__))
        # [N, C, T] -> [N, T, C]
        sample = torch.transpose(sample, 1, 2)
        # Mean and variance [N, 1, 1]
        mean = torch.mean(sample, (1, 2), keepdim=True)
        var = torch.mean((sample - mean)**2, (1, 2), keepdim=True)
        sample = (sample - mean) / torch.sqrt(var + self.eps) * \
            self.gamma + self.beta
        # [N, T, C] -> [N, C, T]
        sample = torch.transpose(sample, 1, 2)
        return sample


class _LayerNorm(nn.Module):
    """Layer Normalization base class."""

    def __init__(self, channel_size):
        super(_LayerNorm, self).__init__()
        self.channel_size = channel_size
        self.gamma = nn.Parameter(torch.ones(channel_size),
                                  requires_grad=True)
        self.beta = nn.Parameter(torch.zeros(channel_size),
                                 requires_grad=True)

    def apply_gain_and_bias(self, normed_x):
        """ Assumes input of size `[batch, channel, *]`. """
        return (self.gamma * normed_x.transpose(1, -1) +
                self.beta).transpose(1, -1)


class GlobLayerNorm(_LayerNorm):
    """Global Layer Normalization (globLN)."""

    def forward(self, x):
        """ Applies forward pass.
        Works for any input size > 2D.
        Args:
            x (:class:`torch.Tensor`): Shape `[batch, chan, *]`
        Returns:
            :class:`torch.Tensor`: gLN_x `[batch, chan, *]`
        """
        dims = list(range(1, len(x.shape)))
        mean = x.mean(dim=dims, keepdim=True)
        var = torch.pow(x - mean, 2).mean(dim=dims, keepdim=True)
        return self.apply_gain_and_bias((x - mean) / (var + 1e-8).sqrt())
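A minimal usage sketch of the global-normalization variants defined above (not part of the uploaded file; it assumes the module is importable as models.mossformer2_sr.layer_norm, i.e. the file path shown in this commit):

import torch
from models.mossformer2_sr.layer_norm import GLayerNorm, GlobLayerNorm

x = torch.randn(4, 64, 1000)                # [batch, channels, length]
gln = GLayerNorm(channels=64)               # mean/var over (T, C), per-channel affine
glob_ln = GlobLayerNorm(channel_size=64)    # mean/var over all non-batch dims
print(gln(x).shape, glob_ln(x).shape)       # both keep [4, 64, 1000]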
models/mossformer2_sr/mossformer2.py
ADDED
@@ -0,0 +1,711 @@
"""
modified from https://github.com/speechbrain/speechbrain/blob/develop/speechbrain/lobes/models/dual_path.py
Author: Shengkui Zhao
"""

import math
import torch
import torch.nn as nn
import torch.nn.functional as F
import copy
from models.mossformer2_sr.mossformer2_block import ScaledSinuEmbedding, MossformerBlock_GFSMN, MossformerBlock


EPS = 1e-8


class GlobalLayerNorm(nn.Module):
    """Calculate Global Layer Normalization.

    Arguments
    ---------
    dim : (int or list or torch.Size)
        Input shape from an expected input of size.
    eps : float
        A value added to the denominator for numerical stability.
    elementwise_affine : bool
        A boolean value that when set to True,
        this module has learnable per-element affine parameters
        initialized to ones (for weights) and zeros (for biases).

    Example
    -------
    >>> x = torch.randn(5, 10, 20)
    >>> GLN = GlobalLayerNorm(10, 3)
    >>> x_norm = GLN(x)
    """

    def __init__(self, dim, shape, eps=1e-8, elementwise_affine=True):
        super(GlobalLayerNorm, self).__init__()
        self.dim = dim
        self.eps = eps
        self.elementwise_affine = elementwise_affine

        if self.elementwise_affine:
            if shape == 3:
                self.weight = nn.Parameter(torch.ones(self.dim, 1))
                self.bias = nn.Parameter(torch.zeros(self.dim, 1))
            if shape == 4:
                self.weight = nn.Parameter(torch.ones(self.dim, 1, 1))
                self.bias = nn.Parameter(torch.zeros(self.dim, 1, 1))
        else:
            self.register_parameter("weight", None)
            self.register_parameter("bias", None)

    def forward(self, x):
        """Returns the normalized tensor.

        Arguments
        ---------
        x : torch.Tensor
            Tensor of size [N, C, K, S] or [N, C, L].
        """
        # x = N x C x K x S or N x C x L
        # N x 1 x 1
        # cln: mean,var N x 1 x K x S
        # gln: mean,var N x 1 x 1
        if x.dim() == 3:
            mean = torch.mean(x, (1, 2), keepdim=True)
            var = torch.mean((x - mean) ** 2, (1, 2), keepdim=True)
            if self.elementwise_affine:
                x = (
                    self.weight * (x - mean) / torch.sqrt(var + self.eps)
                    + self.bias
                )
            else:
                x = (x - mean) / torch.sqrt(var + self.eps)

        if x.dim() == 4:
            mean = torch.mean(x, (1, 2, 3), keepdim=True)
            var = torch.mean((x - mean) ** 2, (1, 2, 3), keepdim=True)
            if self.elementwise_affine:
                x = (
                    self.weight * (x - mean) / torch.sqrt(var + self.eps)
                    + self.bias
                )
            else:
                x = (x - mean) / torch.sqrt(var + self.eps)
        return x


class CumulativeLayerNorm(nn.LayerNorm):
    """Calculate Cumulative Layer Normalization.

    Arguments
    ---------
    dim : int
        Dimension that you want to normalize.
    elementwise_affine : bool
        Learnable per-element affine parameters.

    Example
    -------
    >>> x = torch.randn(5, 10, 20)
    >>> CLN = CumulativeLayerNorm(10)
    >>> x_norm = CLN(x)
    """

    def __init__(self, dim, elementwise_affine=True):
        super(CumulativeLayerNorm, self).__init__(
            dim, elementwise_affine=elementwise_affine, eps=1e-8
        )

    def forward(self, x):
        """Returns the normalized tensor.

        Arguments
        ---------
        x : torch.Tensor
            Tensor size [N, C, K, S] or [N, C, L]
        """
        # x: N x C x K x S or N x C x L
        # N x K x S x C
        if x.dim() == 4:
            x = x.permute(0, 2, 3, 1).contiguous()
            # N x K x S x C == only channel norm
            x = super().forward(x)
            # N x C x K x S
            x = x.permute(0, 3, 1, 2).contiguous()
        if x.dim() == 3:
            x = torch.transpose(x, 1, 2)
            # N x L x C == only channel norm
            x = super().forward(x)
            # N x C x L
            x = torch.transpose(x, 1, 2)
        return x


def select_norm(norm, dim, shape):
    """Just a wrapper to select the normalization type."""

    if norm == "gln":
        return GlobalLayerNorm(dim, shape, elementwise_affine=True)
    if norm == "cln":
        return CumulativeLayerNorm(dim, elementwise_affine=True)
    if norm == "ln":
        return nn.GroupNorm(1, dim, eps=1e-8)
    else:
        return nn.BatchNorm1d(dim)


class Encoder(nn.Module):
    """Convolutional Encoder Layer.

    Arguments
    ---------
    kernel_size : int
        Length of filters.
    in_channels : int
        Number of input channels.
    out_channels : int
        Number of output channels.

    Example
    -------
    >>> x = torch.randn(2, 1000)
    >>> encoder = Encoder(kernel_size=4, out_channels=64)
    >>> h = encoder(x)
    >>> h.shape
    torch.Size([2, 64, 499])
    """

    def __init__(self, kernel_size=2, out_channels=64, in_channels=1):
        super(Encoder, self).__init__()
        self.conv1d = nn.Conv1d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=kernel_size // 2,
            groups=1,
            bias=False,
        )
        self.in_channels = in_channels

    def forward(self, x):
        """Return the encoded output.

        Arguments
        ---------
        x : torch.Tensor
            Input tensor with dimensionality [B, L].
        Return
        ------
        x : torch.Tensor
            Encoded tensor with dimensionality [B, N, T_out].

        where B = Batchsize
              L = Number of timepoints
              N = Number of filters
              T_out = Number of timepoints at the output of the encoder
        """
        # B x L -> B x 1 x L
        if self.in_channels == 1:
            x = torch.unsqueeze(x, dim=1)
        # B x 1 x L -> B x N x T_out
        x = self.conv1d(x)
        x = F.relu(x)

        return x


class Decoder(nn.ConvTranspose1d):
    """A decoder layer that consists of ConvTranspose1d.

    Arguments
    ---------
    kernel_size : int
        Length of filters.
    in_channels : int
        Number of input channels.
    out_channels : int
        Number of output channels.


    Example
    ---------
    >>> x = torch.randn(2, 100, 1000)
    >>> decoder = Decoder(kernel_size=4, in_channels=100, out_channels=1)
    >>> h = decoder(x)
    >>> h.shape
    torch.Size([2, 1003])
    """

    def __init__(self, *args, **kwargs):
        super(Decoder, self).__init__(*args, **kwargs)

    def forward(self, x):
        """Return the decoded output.

        Arguments
        ---------
        x : torch.Tensor
            Input tensor with dimensionality [B, N, L].
            where, B = Batchsize,
                   N = number of filters
                   L = time points
        """

        if x.dim() not in [2, 3]:
            raise RuntimeError(
                "{} only accepts 2/3-D tensor as input".format(self.__class__.__name__)
            )
        x = super().forward(x if x.dim() == 3 else torch.unsqueeze(x, 1))

        if torch.squeeze(x).dim() == 1:
            x = torch.squeeze(x, dim=1)
        else:
            x = torch.squeeze(x)
        return x


class IdentityBlock:
    """This block is used when we want to have identity transformation within the Dual_path block.

    Example
    -------
    >>> x = torch.randn(10, 100)
    >>> IB = IdentityBlock()
    >>> xhat = IB(x)
    """

    def __init__(self, **kwargs):
        pass

    def __call__(self, x):
        return x


class MossFormerM(nn.Module):
    """This class implements the transformer encoder.

    Arguments
    ---------
    num_blocks : int
        Number of mossformer blocks to include.
    d_model : int
        The dimension of the input embedding.
    attn_dropout : float
        Dropout for the self-attention (Optional).
    group_size : int
        The chunk size.
    query_key_dim : int
        The attention vector dimension.
    expansion_factor : int
        The expansion factor for the linear projection in the conv module.
    causal : bool
        True for causal / False for non-causal.

    Example
    -------
    >>> import torch
    >>> x = torch.rand((8, 60, 512))
    >>> net = MossFormerM(num_blocks=8, d_model=512)
    >>> output = net(x)
    >>> output.shape
    torch.Size([8, 60, 512])
    """
    def __init__(
        self,
        num_blocks,
        d_model=None,
        causal=False,
        group_size=256,
        query_key_dim=128,
        expansion_factor=4.,
        attn_dropout=0.1
    ):
        super().__init__()

        self.mossformerM = MossformerBlock_GFSMN(
            dim=d_model,
            depth=num_blocks,
            group_size=group_size,
            query_key_dim=query_key_dim,
            expansion_factor=expansion_factor,
            causal=causal,
            attn_dropout=attn_dropout
        )
        self.norm = nn.LayerNorm(d_model, eps=1e-6)

    def forward(
        self,
        src,
    ):
        """
        Arguments
        ----------
        src : torch.Tensor
            Tensor shape [B, L, N],
            where, B = Batchsize,
                   L = time points
                   N = number of filters
            The sequence to the encoder layer (required).
        src_mask : tensor
            The mask for the src sequence (optional).
        src_key_padding_mask : tensor
            The mask for the src keys per batch (optional).
        """
        output = self.mossformerM(src)
        output = self.norm(output)

        return output


class MossFormerM2(nn.Module):
    """This class implements the transformer encoder.

    Arguments
    ---------
    num_blocks : int
        Number of mossformer blocks to include.
    d_model : int
        The dimension of the input embedding.
    attn_dropout : float
        Dropout for the self-attention (Optional).
    group_size : int
        The chunk size.
    query_key_dim : int
        The attention vector dimension.
    expansion_factor : int
        The expansion factor for the linear projection in the conv module.
    causal : bool
        True for causal / False for non-causal.

    Example
    -------
    >>> import torch
    >>> x = torch.rand((8, 60, 512))
    >>> net = MossFormerM2(num_blocks=8, d_model=512)
    >>> output = net(x)
    >>> output.shape
    torch.Size([8, 60, 512])
    """
    def __init__(
        self,
        num_blocks,
        d_model=None,
        causal=False,
        group_size=256,
        query_key_dim=128,
        expansion_factor=4.,
        attn_dropout=0.1
    ):
        super().__init__()

        self.mossformerM = MossformerBlock(
            dim=d_model,
            depth=num_blocks,
            group_size=group_size,
            query_key_dim=query_key_dim,
            expansion_factor=expansion_factor,
            causal=causal,
            attn_dropout=attn_dropout
        )
        self.norm = nn.LayerNorm(d_model, eps=1e-6)

    def forward(
        self,
        src,
    ):
        """
        Arguments
        ----------
        src : torch.Tensor
            Tensor shape [B, L, N],
            where, B = Batchsize,
                   L = time points
                   N = number of filters
            The sequence to the encoder layer (required).
        src_mask : tensor
            The mask for the src sequence (optional).
        src_key_padding_mask : tensor
            The mask for the src keys per batch (optional).
        """
        output = self.mossformerM(src)
        output = self.norm(output)

        return output


class Computation_Block(nn.Module):
    """Computation block for dual-path processing.

    Arguments
    ---------
    num_blocks : int
        Number of mossformer blocks in the intra model.
    out_channels : int
        Dimensionality of the intra model.
    norm : str
        Normalization type.
    skip_around_intra : bool
        Skip connection around the intra layer.

    Example
    ---------
    >>> comp_block = Computation_Block(num_blocks=2, out_channels=64)
    >>> x = torch.randn(10, 64, 100)
    >>> x = comp_block(x)
    >>> x.shape
    torch.Size([10, 64, 100])
    """

    def __init__(
        self,
        num_blocks,
        out_channels,
        norm="ln",
        skip_around_intra=True,
    ):
        super(Computation_Block, self).__init__()

        ## MossFormerM: MossFormer with recurrence (Gated FSMN)
        self.intra_mdl = MossFormerM(num_blocks=num_blocks, d_model=out_channels)
        ## MossFormerM2: the original MossFormer
        #self.intra_mdl = MossFormerM2(num_blocks=num_blocks, d_model=out_channels)
        self.skip_around_intra = skip_around_intra

        # Norm
        self.norm = norm
        if norm is not None:
            self.intra_norm = select_norm(norm, out_channels, 3)

    def forward(self, x):
        """Returns the output tensor.

        Arguments
        ---------
        x : torch.Tensor
            Input tensor of dimension [B, N, S].


        Return
        ---------
        out: torch.Tensor
            Output tensor of dimension [B, N, S].
            where, B = Batchsize,
                   N = number of filters
                   S = sequence time index
        """
        B, N, S = x.shape
        # intra processing
        # [B, S, N]
        intra = x.permute(0, 2, 1).contiguous()  # .view(B, S, N)

        intra = self.intra_mdl(intra)

        # [B, N, S]
        intra = intra.permute(0, 2, 1).contiguous()
        if self.norm is not None:
            intra = self.intra_norm(intra)

        # [B, N, S]
        if self.skip_around_intra:
            intra = intra + x

        out = intra
        return out


class MossFormer_MaskNet(nn.Module):
    """The dual path model which is the basis for dualpathrnn, sepformer, dptnet.

    Arguments
    ---------
    in_channels : int
        Number of channels at the output of the encoder.
    out_channels : int
        Number of channels that would be inputted to the intra and inter blocks.
    out_channels_final : int
        Number of channels produced by the final 1x1 decoder convolution.
    num_blocks : int
        Number of layers of the Computation Block.
    norm : str
        Normalization type.
    num_spks : int
        Number of sources (speakers).
    skip_around_intra : bool
        Skip connection around intra.
    use_global_pos_enc : bool
        Global positional encodings.
    max_length : int
        Maximum sequence length.

    Example
    ---------
    >>> mossformer_masknet = MossFormer_MaskNet(64, 64, 64, num_spks=2)
    >>> x = torch.randn(10, 64, 2000)
    >>> x = mossformer_masknet(x)
    >>> x.shape
    torch.Size([10, 64, 2000])
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        out_channels_final,
        num_blocks=24,
        norm="ln",
        num_spks=1,
        skip_around_intra=True,
        use_global_pos_enc=True,
        max_length=20000,
    ):
        super(MossFormer_MaskNet, self).__init__()
        self.num_spks = num_spks
        self.num_blocks = num_blocks
        self.norm = select_norm(norm, in_channels, 3)
        self.conv1d_encoder = nn.Conv1d(in_channels, out_channels, 1, bias=False)
        self.use_global_pos_enc = use_global_pos_enc

        if self.use_global_pos_enc:
            self.pos_enc = ScaledSinuEmbedding(out_channels)

        self.mdl = Computation_Block(
            num_blocks,
            out_channels,
            norm,
            skip_around_intra=skip_around_intra,
        )

        self.conv1d_out = nn.Conv1d(
            out_channels, out_channels * num_spks, kernel_size=1
        )
        self.conv1_decoder = nn.Conv1d(out_channels, out_channels_final, 1, bias=False)
        self.prelu = nn.PReLU()
        self.activation = nn.ReLU()
        # gated output layer
        self.output = nn.Sequential(
            nn.Conv1d(out_channels, out_channels, 1), nn.Tanh()
        )
        self.output_gate = nn.Sequential(
            nn.Conv1d(out_channels, out_channels, 1), nn.Sigmoid()
        )

    def forward(self, x):
        """Returns the output tensor.

        Arguments
        ---------
        x : torch.Tensor
            Input tensor of dimension [B, N, S].

        Returns
        -------
        out : torch.Tensor
            Output tensor of dimension [B, N, S] (the first speaker),
            where, B = Batchsize,
                   N = number of filters
                   S = the number of time frames
        """

        # before each line we indicate the shape after executing the line

        # [B, N, L]
        x = self.norm(x)

        # [B, N, L]
        x = self.conv1d_encoder(x)
        if self.use_global_pos_enc:
            #x = self.pos_enc(x.transpose(1, -1)).transpose(1, -1) + x * (
            #    x.size(1) ** 0.5)
            base = x
            x = x.transpose(1, -1)
            emb = self.pos_enc(x)
            emb = emb.transpose(0, -1)
            #print('base: {}, emb: {}'.format(base.shape, emb.shape))
            x = base + emb

        # [B, N, S]
        #for i in range(self.num_modules):
        #    x = self.dual_mdl[i](x)
        x = self.mdl(x)
        x = self.prelu(x)

        # [B, N*spks, S]
        x = self.conv1d_out(x)
        B, _, S = x.shape

        # [B*spks, N, S]
        x = x.view(B * self.num_spks, -1, S)

        # [B*spks, N, S]
        x = self.output(x) * self.output_gate(x)

        # [B*spks, N, S]
        x = self.conv1_decoder(x)

        # [B, spks, N, S]
        _, N, L = x.shape
        x = x.view(B, self.num_spks, N, L)
        x = self.activation(x)

        # [spks, B, N, S]
        x = x.transpose(0, 1)

        return x[0]


class MossFormer(nn.Module):
    def __init__(
        self,
        in_channels=512,
        out_channels=512,
        num_blocks=24,
        kernel_size=16,
        norm="ln",
        num_spks=2,
        skip_around_intra=True,
        use_global_pos_enc=True,
        max_length=20000,
    ):
        super(MossFormer, self).__init__()
        self.num_spks = num_spks
        self.enc = Encoder(kernel_size=kernel_size, out_channels=in_channels, in_channels=180)
        self.mask_net = MossFormer_MaskNet(
            in_channels=in_channels,
            out_channels=out_channels,
            num_blocks=num_blocks,
            norm=norm,
            num_spks=num_spks,
            skip_around_intra=skip_around_intra,
            use_global_pos_enc=use_global_pos_enc,
            max_length=max_length,
        )
        self.dec = Decoder(
            in_channels=out_channels,
            out_channels=1,
            kernel_size=kernel_size,
            stride=kernel_size // 2,
            bias=False
        )

    def forward(self, input):
        x = self.enc(input)
        mask = self.mask_net(x)
        x = torch.stack([x] * self.num_spks)
        sep_x = x * mask

        # Decoding
        est_source = torch.cat(
            [
                self.dec(sep_x[i]).unsqueeze(-1)
                for i in range(self.num_spks)
            ],
            dim=-1,
        )
        T_origin = input.size(1)
        T_est = est_source.size(1)
        if T_origin > T_est:
            est_source = F.pad(est_source, (0, 0, 0, T_origin - T_est))
        else:
            est_source = est_source[:, :T_origin, :]

        out = []
        for spk in range(self.num_spks):
            out.append(est_source[:, :, spk])
        return out
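A rough shape-check sketch for the encoder and mask-net defined above (not part of the upload; it assumes the sibling models.mossformer2_se helpers and rotary_embedding_torch that the block module imports are installed, and the out_channels_final/num_blocks values below are illustrative only):

import torch
from models.mossformer2_sr.mossformer2 import Encoder, MossFormer_MaskNet

enc = Encoder(kernel_size=16, out_channels=512, in_channels=180)
masknet = MossFormer_MaskNet(in_channels=512, out_channels=512,
                             out_channels_final=256, num_blocks=2)

mel = torch.randn(1, 180, 200)   # [B, mel_bins, frames]
feats = enc(mel)                 # [B, 512, T]
out = masknet(feats)             # [B, 256, T] after the final 1x1 conv
print(feats.shape, out.shape)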
models/mossformer2_sr/mossformer2_block.py
ADDED
@@ -0,0 +1,735 @@
1 |
+
"""
|
2 |
+
This source code is modified by Shengkui Zhao based on https://github.com/lucidrains/FLASH-pytorch
|
3 |
+
"""
|
4 |
+
|
5 |
+
import math
|
6 |
+
import torch
|
7 |
+
import torch.nn.functional as F
|
8 |
+
from torch import nn, einsum
|
9 |
+
from einops import rearrange
|
10 |
+
from rotary_embedding_torch import RotaryEmbedding
|
11 |
+
from models.mossformer2_se.conv_module import ConvModule, GLU, FFConvM_Dilated
|
12 |
+
from models.mossformer2_se.fsmn import UniDeepFsmn, UniDeepFsmn_dilated
|
13 |
+
from torchinfo import summary
|
14 |
+
from models.mossformer2_se.layer_norm import CLayerNorm, GLayerNorm, GlobLayerNorm, ILayerNorm
|
15 |
+
|
16 |
+
# Helper functions
|
17 |
+
|
18 |
+
def identity(t, *args, **kwargs):
|
19 |
+
"""
|
20 |
+
Returns the input tensor unchanged.
|
21 |
+
|
22 |
+
Args:
|
23 |
+
t (torch.Tensor): Input tensor.
|
24 |
+
*args: Additional arguments (ignored).
|
25 |
+
**kwargs: Additional keyword arguments (ignored).
|
26 |
+
|
27 |
+
Returns:
|
28 |
+
torch.Tensor: The input tensor.
|
29 |
+
"""
|
30 |
+
return t
|
31 |
+
|
32 |
+
def append_dims(x, num_dims):
|
33 |
+
"""
|
34 |
+
Adds additional dimensions to the input tensor.
|
35 |
+
|
36 |
+
Args:
|
37 |
+
x (torch.Tensor): Input tensor.
|
38 |
+
num_dims (int): Number of dimensions to append.
|
39 |
+
|
40 |
+
Returns:
|
41 |
+
torch.Tensor: Tensor with appended dimensions.
|
42 |
+
"""
|
43 |
+
if num_dims <= 0:
|
44 |
+
return x
|
45 |
+
return x.view(*x.shape, *((1,) * num_dims)) # Reshape to append dimensions
|
46 |
+
|
47 |
+
def exists(val):
|
48 |
+
"""
|
49 |
+
Checks if a value exists (is not None).
|
50 |
+
|
51 |
+
Args:
|
52 |
+
val: The value to check.
|
53 |
+
|
54 |
+
Returns:
|
55 |
+
bool: True if value exists, False otherwise.
|
56 |
+
"""
|
57 |
+
return val is not None
|
58 |
+
|
59 |
+
def default(val, d):
|
60 |
+
"""
|
61 |
+
Returns a default value if the given value does not exist.
|
62 |
+
|
63 |
+
Args:
|
64 |
+
val: The value to check.
|
65 |
+
d: Default value to return if val does not exist.
|
66 |
+
|
67 |
+
Returns:
|
68 |
+
The original value if it exists, otherwise the default value.
|
69 |
+
"""
|
70 |
+
return val if exists(val) else d
|
71 |
+
|
72 |
+
def padding_to_multiple_of(n, mult):
|
73 |
+
"""
|
74 |
+
Calculates the amount of padding needed to make a number a multiple of another.
|
75 |
+
|
76 |
+
Args:
|
77 |
+
n (int): The number to pad.
|
78 |
+
mult (int): The multiple to match.
|
79 |
+
|
80 |
+
Returns:
|
81 |
+
int: The padding amount required to make n a multiple of mult.
|
82 |
+
"""
|
83 |
+
remainder = n % mult
|
84 |
+
if remainder == 0:
|
85 |
+
return 0
|
86 |
+
return mult - remainder # Return the required padding
|
87 |
+
|
88 |
+
# Scale Normalization class
|
89 |
+
|
90 |
+
class ScaleNorm(nn.Module):
|
91 |
+
"""
|
92 |
+
ScaleNorm implements a scaled normalization technique for neural network layers.
|
93 |
+
|
94 |
+
Attributes:
|
95 |
+
dim (int): Dimension of the input features.
|
96 |
+
eps (float): Small value to prevent division by zero.
|
97 |
+
g (nn.Parameter): Learnable parameter for scaling.
|
98 |
+
"""
|
99 |
+
|
100 |
+
def __init__(self, dim, eps=1e-5):
|
101 |
+
super().__init__()
|
102 |
+
self.scale = dim ** -0.5 # Calculate scale factor
|
103 |
+
self.eps = eps # Set epsilon
|
104 |
+
self.g = nn.Parameter(torch.ones(1)) # Initialize scaling parameter
|
105 |
+
|
106 |
+
def forward(self, x):
|
107 |
+
"""
|
108 |
+
Forward pass for the ScaleNorm layer.
|
109 |
+
|
110 |
+
Args:
|
111 |
+
x (torch.Tensor): Input tensor.
|
112 |
+
|
113 |
+
Returns:
|
114 |
+
torch.Tensor: Scaled and normalized output tensor.
|
115 |
+
"""
|
116 |
+
norm = torch.norm(x, dim=-1, keepdim=True) * self.scale # Compute norm
|
117 |
+
return x / norm.clamp(min=self.eps) * self.g # Normalize and scale
|
118 |
+
|
119 |
+
# Absolute positional encodings class
|
120 |
+
|
121 |
+
class ScaledSinuEmbedding(nn.Module):
|
122 |
+
"""
|
123 |
+
ScaledSinuEmbedding provides sinusoidal positional encodings for inputs.
|
124 |
+
|
125 |
+
Attributes:
|
126 |
+
scale (nn.Parameter): Learnable scale factor for the embeddings.
|
127 |
+
inv_freq (torch.Tensor): Inverse frequency used for sine and cosine calculations.
|
128 |
+
"""
|
129 |
+
|
130 |
+
def __init__(self, dim):
|
131 |
+
super().__init__()
|
132 |
+
self.scale = nn.Parameter(torch.ones(1,)) # Initialize scale
|
133 |
+
inv_freq = 1. / (10000 ** (torch.arange(0, dim, 2).float() / dim)) # Calculate inverse frequency
|
134 |
+
self.register_buffer('inv_freq', inv_freq) # Register as a buffer
|
135 |
+
|
136 |
+
def forward(self, x):
|
137 |
+
"""
|
138 |
+
Forward pass for the ScaledSinuEmbedding layer.
|
139 |
+
|
140 |
+
Args:
|
141 |
+
x (torch.Tensor): Input tensor of shape (batch_size, sequence_length).
|
142 |
+
|
143 |
+
Returns:
|
144 |
+
torch.Tensor: Positional encoding tensor of shape (batch_size, sequence_length, dim).
|
145 |
+
"""
|
146 |
+
n, device = x.shape[1], x.device # Extract sequence length and device
|
147 |
+
t = torch.arange(n, device=device).type_as(self.inv_freq) # Create time steps
|
148 |
+
sinu = einsum('i , j -> i j', t, self.inv_freq) # Calculate sine and cosine embeddings
|
149 |
+
emb = torch.cat((sinu.sin(), sinu.cos()), dim=-1) # Concatenate sine and cosine embeddings
|
150 |
+
return emb * self.scale # Scale the embeddings
|
151 |
+
|
152 |
+
class OffsetScale(nn.Module):
|
153 |
+
"""
|
154 |
+
OffsetScale applies learned offsets and scales to the input tensor.
|
155 |
+
|
156 |
+
Attributes:
|
157 |
+
gamma (nn.Parameter): Learnable scale parameter for each head.
|
158 |
+
beta (nn.Parameter): Learnable offset parameter for each head.
|
159 |
+
"""
|
160 |
+
|
161 |
+
def __init__(self, dim, heads=1):
|
162 |
+
super().__init__()
|
163 |
+
self.gamma = nn.Parameter(torch.ones(heads, dim)) # Initialize scale parameters
|
164 |
+
self.beta = nn.Parameter(torch.zeros(heads, dim)) # Initialize offset parameters
|
165 |
+
nn.init.normal_(self.gamma, std=0.02) # Normal initialization for gamma
|
166 |
+
|
167 |
+
def forward(self, x):
|
168 |
+
"""
|
169 |
+
Forward pass for the OffsetScale layer.
|
170 |
+
|
171 |
+
Args:
|
172 |
+
x (torch.Tensor): Input tensor.
|
173 |
+
|
174 |
+
Returns:
|
175 |
+
List[torch.Tensor]: A list of tensors with applied offsets and scales for each head.
|
176 |
+
"""
|
177 |
+
out = einsum('... d, h d -> ... h d', x, self.gamma) + self.beta # Apply scaling and offsets
|
178 |
+
return out.unbind(dim=-2) # Unbind heads into a list
|
179 |
+
|
180 |
+
# Feed-Forward Convolutional Module
|
181 |
+
|
182 |
+
class FFConvM(nn.Module):
|
183 |
+
"""
|
184 |
+
FFConvM is a feed-forward convolutional module with normalization and dropout.
|
185 |
+
|
186 |
+
Attributes:
|
187 |
+
dim_in (int): Input dimension of the features.
|
188 |
+
dim_out (int): Output dimension after processing.
|
189 |
+
norm_klass (nn.Module): Normalization class to be used.
|
190 |
+
dropout (float): Dropout probability.
|
191 |
+
"""
|
192 |
+
|
193 |
+
def __init__(
|
194 |
+
self,
|
195 |
+
dim_in,
|
196 |
+
dim_out,
|
197 |
+
norm_klass=nn.LayerNorm,
|
198 |
+
dropout=0.1
|
199 |
+
):
|
200 |
+
super().__init__()
|
201 |
+
self.mdl = nn.Sequential(
|
202 |
+
norm_klass(dim_in), # Normalize input
|
203 |
+
nn.Linear(dim_in, dim_out), # Linear transformation
|
204 |
+
nn.SiLU(), # Activation function
|
205 |
+
ConvModule(dim_out), # Convolution module
|
206 |
+
nn.Dropout(dropout) # Apply dropout
|
207 |
+
)
|
208 |
+
|
209 |
+
def forward(self, x):
|
210 |
+
"""
|
211 |
+
Forward pass for the FFConvM module.
|
212 |
+
|
213 |
+
Args:
|
214 |
+
x (torch.Tensor): Input tensor.
|
215 |
+
|
216 |
+
Returns:
|
217 |
+
torch.Tensor: Output tensor after processing.
|
218 |
+
"""
|
219 |
+
output = self.mdl(x) # Pass through the model
|
220 |
+
return output
|
221 |
+
|
222 |
+
class FFM(nn.Module):
|
223 |
+
"""
|
224 |
+
FFM is a feed-forward module with normalization and dropout.
|
225 |
+
|
226 |
+
Attributes:
|
227 |
+
dim_in (int): Input dimension of the features.
|
228 |
+
dim_out (int): Output dimension after processing.
|
229 |
+
norm_klass (nn.Module): Normalization class to be used.
|
230 |
+
dropout (float): Dropout probability.
|
231 |
+
"""
|
232 |
+
|
233 |
+
def __init__(
|
234 |
+
self,
|
235 |
+
dim_in,
|
236 |
+
dim_out,
|
237 |
+
norm_klass=nn.LayerNorm,
|
238 |
+
dropout=0.1
|
239 |
+
):
|
240 |
+
super().__init__()
|
241 |
+
self.mdl = nn.Sequential(
|
242 |
+
norm_klass(dim_in), # Normalize input
|
243 |
+
nn.Linear(dim_in, dim_out), # Linear transformation
|
244 |
+
nn.SiLU(), # Activation function
|
245 |
+
nn.Dropout(dropout) # Apply dropout
|
246 |
+
)
|
247 |
+
|
248 |
+
def forward(self, x):
|
249 |
+
"""
|
250 |
+
Forward pass for the FFM module.
|
251 |
+
|
252 |
+
Args:
|
253 |
+
x (torch.Tensor): Input tensor.
|
254 |
+
|
255 |
+
Returns:
|
256 |
+
torch.Tensor: Output tensor after processing.
|
257 |
+
"""
|
258 |
+
output = self.mdl(x) # Pass through the model
|
259 |
+
return output
|
260 |
+
|
261 |
+
class FLASH_ShareA_FFConvM(nn.Module):
|
262 |
+
"""
|
263 |
+
Fast Shared Dual Attention Mechanism with feed-forward convolutional blocks.
|
264 |
+
Published in paper: "MossFormer: Pushing the Performance Limit of Monaural Speech Separation
|
265 |
+
using Gated Single-Head Transformer with Convolution-Augmented Joint Self-Attentions", ICASSP 2023.
|
266 |
+
(https://arxiv.org/abs/2302.11824)
|
267 |
+
|
268 |
+
Args:
|
269 |
+
dim (int): Input dimension.
|
270 |
+
group_size (int, optional): Size of groups for processing. Defaults to 256.
|
271 |
+
query_key_dim (int, optional): Dimension of the query and key. Defaults to 128.
|
272 |
+
expansion_factor (float, optional): Factor to expand the hidden dimension. Defaults to 1.
|
273 |
+
causal (bool, optional): Whether to use causal masking. Defaults to False.
|
274 |
+
dropout (float, optional): Dropout rate. Defaults to 0.1.
|
275 |
+
rotary_pos_emb (optional): Rotary positional embeddings for attention. Defaults to None.
|
276 |
+
norm_klass (callable, optional): Normalization class to use. Defaults to nn.LayerNorm.
|
277 |
+
shift_tokens (bool, optional): Whether to shift tokens for attention calculation. Defaults to True.
|
278 |
+
"""
|
279 |
+
|
280 |
+
def __init__(
|
281 |
+
self,
|
282 |
+
*,
|
283 |
+
dim,
|
284 |
+
group_size=256,
|
285 |
+
query_key_dim=128,
|
286 |
+
expansion_factor=1.,
|
287 |
+
causal=False,
|
288 |
+
dropout=0.1,
|
289 |
+
rotary_pos_emb=None,
|
290 |
+
norm_klass=nn.LayerNorm,
|
291 |
+
shift_tokens=True
|
292 |
+
):
|
293 |
+
super().__init__()
|
294 |
+
hidden_dim = int(dim * expansion_factor)
|
295 |
+
self.group_size = group_size
|
296 |
+
self.causal = causal
|
297 |
+
self.shift_tokens = shift_tokens
|
298 |
+
|
299 |
+
# Initialize positional embeddings, dropout, and projections
|
300 |
+
self.rotary_pos_emb = rotary_pos_emb
|
301 |
+
self.dropout = nn.Dropout(dropout)
|
302 |
+
|
303 |
+
# Feed-forward layers
|
304 |
+
self.to_hidden = FFConvM(
|
305 |
+
dim_in=dim,
|
306 |
+
dim_out=hidden_dim,
|
307 |
+
norm_klass=norm_klass,
|
308 |
+
dropout=dropout,
|
309 |
+
)
|
310 |
+
self.to_qk = FFConvM(
|
311 |
+
dim_in=dim,
|
312 |
+
dim_out=query_key_dim,
|
313 |
+
norm_klass=norm_klass,
|
314 |
+
dropout=dropout,
|
315 |
+
)
|
316 |
+
|
317 |
+
# Offset and scale for query and key
|
318 |
+
self.qk_offset_scale = OffsetScale(query_key_dim, heads=4)
|
319 |
+
|
320 |
+
self.to_out = FFConvM(
|
321 |
+
dim_in=dim * 2,
|
322 |
+
dim_out=dim,
|
323 |
+
norm_klass=norm_klass,
|
324 |
+
dropout=dropout,
|
325 |
+
)
|
326 |
+
|
327 |
+
self.gateActivate = nn.Sigmoid()
|
328 |
+
|
329 |
+
def forward(self, x, *, mask=None):
|
330 |
+
"""
|
331 |
+
Forward pass for FLASH layer.
|
332 |
+
|
333 |
+
Args:
|
334 |
+
x (Tensor): Input tensor of shape (batch, seq_len, features).
|
335 |
+
mask (Tensor, optional): Mask for attention. Defaults to None.
|
336 |
+
|
337 |
+
Returns:
|
338 |
+
Tensor: Output tensor after applying attention and projections.
|
339 |
+
"""
|
340 |
+
|
341 |
+
# Pre-normalization step
|
342 |
+
normed_x = x
|
343 |
+
residual = x # Save residual for skip connection
|
344 |
+
|
345 |
+
# Token shifting if enabled
|
346 |
+
if self.shift_tokens:
|
347 |
+
x_shift, x_pass = normed_x.chunk(2, dim=-1)
|
348 |
+
x_shift = F.pad(x_shift, (0, 0, 1, -1), value=0.)
|
349 |
+
normed_x = torch.cat((x_shift, x_pass), dim=-1)
|
350 |
+
|
351 |
+
# Initial projections
|
352 |
+
v, u = self.to_hidden(normed_x).chunk(2, dim=-1)
|
353 |
+
qk = self.to_qk(normed_x)
|
354 |
+
|
355 |
+
# Offset and scale
|
356 |
+
quad_q, lin_q, quad_k, lin_k = self.qk_offset_scale(qk)
|
357 |
+
att_v, att_u = self.cal_attention(x, quad_q, lin_q, quad_k, lin_k, v, u)
|
358 |
+
|
359 |
+
# Output calculation with gating
|
360 |
+
out = (att_u * v) * self.gateActivate(att_v * u)
|
361 |
+
x = x + self.to_out(out) # Residual connection
|
362 |
+
return x
|
363 |
+
|
364 |
+
def cal_attention(self, x, quad_q, lin_q, quad_k, lin_k, v, u, mask=None):
|
365 |
+
"""
|
366 |
+
Calculate attention output using quadratic and linear attention mechanisms.
|
367 |
+
|
368 |
+
Args:
|
369 |
+
x (Tensor): Input tensor of shape (batch, seq_len, features).
|
370 |
+
quad_q (Tensor): Quadratic query representation.
|
371 |
+
lin_q (Tensor): Linear query representation.
|
372 |
+
quad_k (Tensor): Quadratic key representation.
|
373 |
+
lin_k (Tensor): Linear key representation.
|
374 |
+
v (Tensor): Value representation.
|
375 |
+
u (Tensor): Additional value representation.
|
376 |
+
mask (Tensor, optional): Mask for attention. Defaults to None.
|
377 |
+
|
378 |
+
Returns:
|
379 |
+
Tuple[Tensor, Tensor]: Attention outputs for v and u.
|
380 |
+
"""
|
381 |
+
b, n, device, g = x.shape[0], x.shape[-2], x.device, self.group_size
|
382 |
+
|
383 |
+
# Apply mask to linear keys if provided
|
384 |
+
if exists(mask):
|
385 |
+
lin_mask = rearrange(mask, '... -> ... 1')
|
386 |
+
lin_k = lin_k.masked_fill(~lin_mask, 0.)
|
387 |
+
|
388 |
+
# Rotate queries and keys with rotary positional embeddings
|
389 |
+
if exists(self.rotary_pos_emb):
|
390 |
+
quad_q, lin_q, quad_k, lin_k = map(self.rotary_pos_emb.rotate_queries_or_keys, (quad_q, lin_q, quad_k, lin_k))
|
391 |
+
|
392 |
+
# Padding for group processing
|
393 |
+
padding = padding_to_multiple_of(n, g)
|
394 |
+
if padding > 0:
|
395 |
+
quad_q, quad_k, lin_q, lin_k, v, u = map(lambda t: F.pad(t, (0, 0, 0, padding), value=0.), (quad_q, quad_k, lin_q, lin_k, v, u))
|
396 |
+
mask = default(mask, torch.ones((b, n), device=device, dtype=torch.bool))
|
397 |
+
mask = F.pad(mask, (0, padding), value=False)
|
398 |
+
|
399 |
+
# Group along sequence for attention
|
400 |
+
quad_q, quad_k, lin_q, lin_k, v, u = map(lambda t: rearrange(t, 'b (g n) d -> b g n d', n=self.group_size), (quad_q, quad_k, lin_q, lin_k, v, u))
|
401 |
+
|
402 |
+
if exists(mask):
|
403 |
+
mask = rearrange(mask, 'b (g j) -> b g 1 j', j=g)
|
404 |
+
|
405 |
+
# Calculate quadratic attention output
|
406 |
+
sim = einsum('... i d, ... j d -> ... i j', quad_q, quad_k) / g
|
407 |
+
attn = F.relu(sim) ** 2 # ReLU activation
|
408 |
+
attn = self.dropout(attn)
|
409 |
+
|
410 |
+
# Apply mask to attention if provided
|
411 |
+
if exists(mask):
|
412 |
+
attn = attn.masked_fill(~mask, 0.)
|
413 |
+
|
414 |
+
# Apply causal mask if needed
|
415 |
+
if self.causal:
|
416 |
+
causal_mask = torch.ones((g, g), dtype=torch.bool, device=device).triu(1)
|
417 |
+
attn = attn.masked_fill(causal_mask, 0.)
|
418 |
+
|
419 |
+
# Calculate output from attention
|
420 |
+
quad_out_v = einsum('... i j, ... j d -> ... i d', attn, v)
|
421 |
+
quad_out_u = einsum('... i j, ... j d -> ... i d', attn, u)
|
422 |
+
|
423 |
+
# Calculate linear attention output
|
424 |
+
if self.causal:
|
425 |
+
lin_kv = einsum('b g n d, b g n e -> b g d e', lin_k, v) / g
|
426 |
+
lin_kv = lin_kv.cumsum(dim=1) # Cumulative sum for linear attention
|
427 |
+
lin_kv = F.pad(lin_kv, (0, 0, 0, 0, 1, -1), value=0.)
|
428 |
+
lin_out_v = einsum('b g d e, b g n d -> b g n e', lin_kv, lin_q)
|
429 |
+
|
430 |
+
lin_ku = einsum('b g n d, b g n e -> b g d e', lin_k, u) / g
|
431 |
+
lin_ku = lin_ku.cumsum(dim=1) # Cumulative sum for linear attention
|
432 |
+
lin_ku = F.pad(lin_ku, (0, 0, 0, 0, 1, -1), value=0.)
|
433 |
+
lin_out_u = einsum('b g d e, b g n d -> b g n e', lin_ku, lin_q)
|
434 |
+
else:
|
435 |
+
lin_kv = einsum('b g n d, b g n e -> b d e', lin_k, v) / n
|
436 |
+
lin_out_v = einsum('b g n d, b d e -> b g n e', lin_q, lin_kv)
|
437 |
+
|
438 |
+
lin_ku = einsum('b g n d, b g n e -> b d e', lin_k, u) / n
|
439 |
+
lin_out_u = einsum('b g n d, b d e -> b g n e', lin_q, lin_ku)
|
440 |
+
|
441 |
+
# Reshape and remove padding from outputs
|
442 |
+
return map(lambda t: rearrange(t, 'b g n d -> b (g n) d')[:, :n], (quad_out_v + lin_out_v, quad_out_u + lin_out_u))
|
443 |
+
|
444 |
+
class Gated_FSMN(nn.Module):
|
445 |
+
"""
|
446 |
+
Gated Frequency Selective Memory Network (FSMN) class.
|
447 |
+
|
448 |
+
This class implements a gated FSMN that combines two feedforward
|
449 |
+
convolutional networks with a frequency selective memory module.
|
450 |
+
|
451 |
+
Args:
|
452 |
+
in_channels (int): Number of input channels.
|
453 |
+
out_channels (int): Number of output channels.
|
454 |
+
lorder (int): Order of the filter for FSMN.
|
455 |
+
hidden_size (int): Number of hidden units in the network.
|
456 |
+
"""
|
457 |
+
def __init__(self, in_channels, out_channels, lorder, hidden_size):
|
458 |
+
super().__init__()
|
459 |
+
# Feedforward network for the first branch (u)
|
460 |
+
self.to_u = FFConvM(
|
461 |
+
dim_in=in_channels,
|
462 |
+
dim_out=hidden_size,
|
463 |
+
norm_klass=nn.LayerNorm,
|
464 |
+
dropout=0.1,
|
465 |
+
)
|
466 |
+
# Feedforward network for the second branch (v)
|
467 |
+
self.to_v = FFConvM(
|
468 |
+
dim_in=in_channels,
|
469 |
+
dim_out=hidden_size,
|
470 |
+
norm_klass=nn.LayerNorm,
|
471 |
+
dropout=0.1,
|
472 |
+
)
|
473 |
+
# Frequency selective memory network
|
474 |
+
self.fsmn = UniDeepFsmn(in_channels, out_channels, lorder, hidden_size)
|
475 |
+
|
476 |
+
def forward(self, x):
|
477 |
+
"""
|
478 |
+
Forward pass for the Gated FSMN.
|
479 |
+
|
480 |
+
Args:
|
481 |
+
x (Tensor): Input tensor of shape (batch_size, in_channels, sequence_length).
|
482 |
+
|
483 |
+
Returns:
|
484 |
+
Tensor: Output tensor after applying gated FSMN operations.
|
485 |
+
"""
|
486 |
+
input = x
|
487 |
+
x_u = self.to_u(x) # Process input through the first branch
|
488 |
+
x_v = self.to_v(x) # Process input through the second branch
|
489 |
+
x_u = self.fsmn(x_u) # Apply FSMN to the output of the first branch
|
490 |
+
x = x_v * x_u + input # Combine outputs with the original input
|
491 |
+
return x
|
492 |
+
|
493 |
+
|
494 |
+
class Gated_FSMN_Block(nn.Module):
|
495 |
+
"""
|
496 |
+
A 1-D convolutional block that incorporates a gated FSMN.
|
497 |
+
|
498 |
+
This block consists of two convolutional layers, followed by a
|
499 |
+
gated FSMN and normalization layers.
|
500 |
+
|
501 |
+
Args:
|
502 |
+
dim (int): Dimensionality of the input.
|
503 |
+
inner_channels (int): Number of channels in the inner layers.
|
504 |
+
group_size (int): Size of the groups for normalization.
|
505 |
+
norm_type (str): Type of normalization to use ('scalenorm' or 'layernorm').
|
506 |
+
"""
|
507 |
+
def __init__(self, dim, inner_channels=256, group_size=256, norm_type='scalenorm'):
|
508 |
+
super(Gated_FSMN_Block, self).__init__()
|
509 |
+
# Choose normalization class based on the provided type
|
510 |
+
if norm_type == 'scalenorm':
|
511 |
+
norm_klass = ScaleNorm
|
512 |
+
elif norm_type == 'layernorm':
|
513 |
+
norm_klass = nn.LayerNorm
|
514 |
+
|
515 |
+
self.group_size = group_size
|
516 |
+
|
517 |
+
# First convolutional layer with PReLU activation
|
518 |
+
self.conv1 = nn.Sequential(
|
519 |
+
nn.Conv1d(dim, inner_channels, kernel_size=1),
|
520 |
+
nn.PReLU(),
|
521 |
+
)
|
522 |
+
self.norm1 = CLayerNorm(inner_channels) # Normalization after first convolution
|
523 |
+
self.gated_fsmn = Gated_FSMN(inner_channels, inner_channels, lorder=20, hidden_size=inner_channels) # Gated FSMN layer
|
524 |
+
self.norm2 = CLayerNorm(inner_channels) # Normalization after FSMN
|
525 |
+
self.conv2 = nn.Conv1d(inner_channels, dim, kernel_size=1) # Final convolutional layer
|
526 |
+
|
527 |
+
def forward(self, input):
|
528 |
+
"""
|
529 |
+
Forward pass for the Gated FSMN Block.
|
530 |
+
|
531 |
+
Args:
|
532 |
+
input (Tensor): Input tensor of shape (batch_size, dim, sequence_length).
|
533 |
+
|
534 |
+
Returns:
|
535 |
+
Tensor: Output tensor after processing through the block.
|
536 |
+
"""
|
537 |
+
conv1 = self.conv1(input.transpose(2, 1)) # Apply first convolution
|
538 |
+
norm1 = self.norm1(conv1) # Apply normalization
|
539 |
+
seq_out = self.gated_fsmn(norm1.transpose(2, 1)) # Apply gated FSMN
|
540 |
+
norm2 = self.norm2(seq_out.transpose(2, 1)) # Apply second normalization
|
541 |
+
conv2 = self.conv2(norm2) # Apply final convolution
|
542 |
+
return conv2.transpose(2, 1) + input # Residual connection
|
543 |
+
|
544 |
+
|
545 |
+
class MossformerBlock_GFSMN(nn.Module):
|
546 |
+
"""
|
547 |
+
Mossformer Block with Gated FSMN.
|
548 |
+
|
549 |
+
This block combines attention mechanisms and gated FSMN layers
|
550 |
+
to process input sequences.
|
551 |
+
|
552 |
+
Args:
|
553 |
+
dim (int): Dimensionality of the input.
|
554 |
+
depth (int): Number of layers in the block.
|
555 |
+
group_size (int): Size of the groups for normalization.
|
556 |
+
query_key_dim (int): Dimension of the query and key in attention.
|
557 |
+
expansion_factor (float): Expansion factor for feedforward layers.
|
558 |
+
causal (bool): If True, enables causal attention.
|
559 |
+
attn_dropout (float): Dropout rate for attention layers.
|
560 |
+
norm_type (str): Type of normalization to use ('scalenorm' or 'layernorm').
|
561 |
+
shift_tokens (bool): If True, shifts tokens in the attention layer.
|
562 |
+
"""
|
563 |
+
def __init__(self, *, dim, depth, group_size=256, query_key_dim=128, expansion_factor=4., causal=False, attn_dropout=0.1, norm_type='scalenorm', shift_tokens=True):
|
564 |
+
super().__init__()
|
565 |
+
assert norm_type in ('scalenorm', 'layernorm'), 'norm_type must be one of scalenorm or layernorm'
|
566 |
+
|
567 |
+
if norm_type == 'scalenorm':
|
568 |
+
norm_klass = ScaleNorm
|
569 |
+
elif norm_type == 'layernorm':
|
570 |
+
norm_klass = nn.LayerNorm
|
571 |
+
|
572 |
+
self.group_size = group_size
|
573 |
+
|
574 |
+
# Rotary positional embedding for attention
|
575 |
+
rotary_pos_emb = RotaryEmbedding(dim=min(32, query_key_dim))
|
576 |
+
|
577 |
+
# Create a list of Gated FSMN blocks
|
578 |
+
self.fsmn = nn.ModuleList([Gated_FSMN_Block(dim) for _ in range(depth)])
|
579 |
+
|
580 |
+
# Create a list of attention layers using FLASH_ShareA_FFConvM
|
581 |
+
self.layers = nn.ModuleList([
|
582 |
+
FLASH_ShareA_FFConvM(
|
583 |
+
dim=dim,
|
584 |
+
group_size=group_size,
|
585 |
+
query_key_dim=query_key_dim,
|
586 |
+
expansion_factor=expansion_factor,
|
587 |
+
causal=causal,
|
588 |
+
dropout=attn_dropout,
|
589 |
+
rotary_pos_emb=rotary_pos_emb,
|
590 |
+
norm_klass=norm_klass,
|
591 |
+
shift_tokens=shift_tokens
|
592 |
+
) for _ in range(depth)
|
593 |
+
])
|
594 |
+
|
595 |
+
def _build_repeats(self, in_channels, out_channels, lorder, hidden_size, repeats=1):
|
596 |
+
"""
|
597 |
+
Builds repeated UniDeep FSMN layers.
|
598 |
+
|
599 |
+
Args:
|
600 |
+
in_channels (int): Number of input channels.
|
601 |
+
out_channels (int): Number of output channels.
|
602 |
+
lorder (int): Order of the filter for FSMN.
|
603 |
+
hidden_size (int): Number of hidden units.
|
604 |
+
repeats (int): Number of repetitions.
|
605 |
+
|
606 |
+
Returns:
|
607 |
+
Sequential: A sequential container with repeated layers.
|
608 |
+
"""
|
609 |
+
repeats = [
|
610 |
+
UniDeepFsmn(in_channels, out_channels, lorder, hidden_size)
|
611 |
+
for i in range(repeats)
|
612 |
+
]
|
613 |
+
return nn.Sequential(*repeats)
|
614 |
+
|
615 |
+
def forward(self, x, *, mask=None):
|
616 |
+
"""
|
617 |
+
Forward pass for the Mossformer Block with Gated FSMN.
|
618 |
+
|
619 |
+
Args:
|
620 |
+
x (Tensor): Input tensor of shape (batch_size, dim, sequence_length).
|
621 |
+
mask (Tensor, optional): Mask tensor for attention operations.
|
622 |
+
|
623 |
+
Returns:
|
624 |
+
Tensor: Output tensor after processing through the block.
|
625 |
+
"""
|
626 |
+
ii = 0
|
627 |
+
for flash in self.layers: # Process through each layer
|
628 |
+
x = flash(x, mask=mask)
|
629 |
+
x = self.fsmn[ii](x) # Apply corresponding Gated FSMN block
|
630 |
+
ii += 1
|
631 |
+
|
632 |
+
return x
|
633 |
+
|
634 |
+
|
635 |
+
class MossformerBlock(nn.Module):
|
636 |
+
"""
|
637 |
+
Mossformer Block with attention mechanisms.
|
638 |
+
|
639 |
+
This block is designed to process input sequences using attention
|
640 |
+
layers and incorporates rotary positional embeddings. It allows
|
641 |
+
for configurable normalization types and can handle causal
|
642 |
+
attention.
|
643 |
+
|
644 |
+
Args:
|
645 |
+
dim (int): Dimensionality of the input.
|
646 |
+
        depth (int): Number of attention layers in the block.
        group_size (int, optional): Size of groups for normalization. Default is 256.
        query_key_dim (int, optional): Dimension of the query and key in attention. Default is 128.
        expansion_factor (float, optional): Expansion factor for feedforward layers. Default is 4.
        causal (bool, optional): If True, enables causal attention. Default is False.
        attn_dropout (float, optional): Dropout rate for attention layers. Default is 0.1.
        norm_type (str, optional): Type of normalization to use ('scalenorm' or 'layernorm'). Default is 'scalenorm'.
        shift_tokens (bool, optional): If True, shifts tokens in the attention layer. Default is True.
    """

    def __init__(
        self,
        *,
        dim,
        depth,
        group_size=256,
        query_key_dim=128,
        expansion_factor=4.0,
        causal=False,
        attn_dropout=0.1,
        norm_type='scalenorm',
        shift_tokens=True
    ):
        super().__init__()

        # Ensure normalization type is valid
        assert norm_type in ('scalenorm', 'layernorm'), 'norm_type must be one of scalenorm or layernorm'

        # Select normalization class based on the provided type
        if norm_type == 'scalenorm':
            norm_klass = ScaleNorm
        elif norm_type == 'layernorm':
            norm_klass = nn.LayerNorm

        self.group_size = group_size  # Group size for normalization

        # Rotary positional embedding for attention
        rotary_pos_emb = RotaryEmbedding(dim=min(32, query_key_dim))
        # Max rotary embedding dimension of 32, partial rotary embeddings, from Wang et al. (GPT-J)

        # Create a list of attention layers using FLASH_ShareA_FFConvM
        self.layers = nn.ModuleList([
            FLASH_ShareA_FFConvM(
                dim=dim,
                group_size=group_size,
                query_key_dim=query_key_dim,
                expansion_factor=expansion_factor,
                causal=causal,
                dropout=attn_dropout,
                rotary_pos_emb=rotary_pos_emb,
                norm_klass=norm_klass,
                shift_tokens=shift_tokens
            ) for _ in range(depth)
        ])

    def _build_repeats(self, in_channels, out_channels, lorder, hidden_size, repeats=1):
        """
        Builds repeated UniDeep FSMN layers.

        Args:
            in_channels (int): Number of input channels.
            out_channels (int): Number of output channels.
            lorder (int): Order of the filter for FSMN.
            hidden_size (int): Number of hidden units.
            repeats (int, optional): Number of repetitions. Default is 1.

        Returns:
            Sequential: A sequential container with repeated layers.
        """
        repeats = [
            UniDeepFsmn(in_channels, out_channels, lorder, hidden_size)
            for _ in range(repeats)
        ]
        return nn.Sequential(*repeats)

    def forward(self, x, *, mask=None):
        """
        Forward pass for the Mossformer Block.

        Args:
            x (Tensor): Input tensor of shape (batch_size, dim, sequence_length).
            mask (Tensor, optional): Mask tensor for attention operations.

        Returns:
            Tensor: Output tensor after processing through the block.
        """
        # Process input through each attention layer
        for flash in self.layers:
            x = flash(x, mask=mask)  # Apply attention layer with optional mask

        return x  # Return the final output tensor
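As a quick reference, here is a hedged usage sketch of this attention block. The class name `MossformerBlock` and the import path are assumptions for illustration (the actual class name is defined earlier in mossformer2_block.py); the tensor layout follows the docstring above.

    # Hypothetical usage sketch -- the class name and import are assumptions, not part of the diff above.
    import torch
    from models.mossformer2_sr.mossformer2_block import MossformerBlock  # assumed export name

    block = MossformerBlock(dim=512, depth=4)   # defaults: group_size=256, query_key_dim=128, scalenorm
    x = torch.randn(2, 512, 1000)               # (batch_size, dim, sequence_length) per the docstring
    y = block(x)                                # output tensor after processing through the block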
models/mossformer2_sr/mossformer2_sr_wrapper.py ADDED @@ -0,0 +1,52 @@
from models.mossformer2_sr.generator import Mossformer, Generator
import torch.nn as nn

class MossFormer2_SR_48K(nn.Module):
    """
    The MossFormer2_SR_48K model for speech super-resolution.

    This class encapsulates the functionality of the MossFormer2 and HiFi-GAN
    Generator within a higher-level model. It processes input audio data to produce
    higher-resolution outputs.

    Arguments
    ---------
    args : Namespace
        Configuration arguments that may include hyperparameters
        and model settings (not utilized in this implementation but
        can be extended for flexibility).

    Example
    ---------
    >>> model = MossFormer2_SR_48K(args)
    >>> x = torch.randn(10, 180, 2000)  # Example input
    >>> outputs = model(x)  # Forward pass
    >>> outputs.shape  # Check output shape
    """

    def __init__(self, args):
        super(MossFormer2_SR_48K, self).__init__()
        # Initialize the MossFormer2 backbone and the HiFi-GAN generator
        self.model_m = Mossformer()  # MossFormer2 feature-refinement network
        self.model_g = Generator(args)  # HiFi-GAN generator for waveform synthesis

    def forward(self, x):
        """
        Forward pass through the model.

        Arguments
        ---------
        x : torch.Tensor
            Input tensor of dimension [B, N, S], where B is the batch size,
            N is the number of mel bins, and S is the sequence length
            (e.g., time frames).

        Returns
        -------
        outputs : torch.Tensor
            Bandwidth-expanded audio output tensor from the model.
        """
        x = self.model_m(x)  # Refine features with the MossFormer2 backbone
        outputs = self.model_g(x)  # Synthesize the bandwidth-expanded waveform
        return outputs  # Return the outputs
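For completeness, a hedged inference sketch for this wrapper. The `args` namespace is a placeholder: the real Generator expects the HiFi-GAN-style configuration defined elsewhere in this repo, which is not reproduced here.

    # Hypothetical inference sketch -- "args" and the input sizes are placeholders.
    import torch
    from models.mossformer2_sr.mossformer2_sr_wrapper import MossFormer2_SR_48K

    model = MossFormer2_SR_48K(args).eval()   # args: configuration Namespace (assumed available)
    feats = torch.randn(1, 180, 2000)         # [B, N, S] input features, as in the docstring example
    with torch.no_grad():
        audio = model(feats)                  # bandwidth-expanded 48 kHz output from the Generator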
models/mossformer2_sr/snake.py ADDED @@ -0,0 +1,33 @@
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from einops import rearrange
from torch.nn.utils import weight_norm


def WNConv1d(*args, **kwargs):
    return weight_norm(nn.Conv1d(*args, **kwargs))


def WNConvTranspose1d(*args, **kwargs):
    return weight_norm(nn.ConvTranspose1d(*args, **kwargs))


# Scripting this brings model speed up 1.4x
@torch.jit.script
def snake(x, alpha):
    shape = x.shape
    x = x.reshape(shape[0], shape[1], -1)
    x = x + (alpha + 1e-9).reciprocal() * torch.sin(alpha * x).pow(2)
    x = x.reshape(shape)
    return x


class Snake1d(nn.Module):
    def __init__(self, channels):
        super().__init__()
        self.alpha = nn.Parameter(torch.ones(1, channels, 1))

    def forward(self, x):
        return snake(x, self.alpha)
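The `snake` function above implements the periodic Snake activation, snake(x) = x + (1/alpha) * sin^2(alpha * x), with the 1e-9 term guarding against division by zero, and `Snake1d` learns one alpha per channel. A minimal sanity-check sketch:

    # Minimal check of Snake1d on a (batch, channels, time) tensor.
    import torch
    from models.mossformer2_sr.snake import Snake1d

    act = Snake1d(channels=64)
    x = torch.randn(2, 64, 1000)
    y = act(x)                     # same shape; per-channel alpha controls the periodicity
    assert y.shape == x.shape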
models/mossformer2_sr/utils.py ADDED @@ -0,0 +1,37 @@
import glob
import os
import torch
from torch.nn.utils import weight_norm

def init_weights(m, mean=0.0, std=0.01):
    classname = m.__class__.__name__
    if classname.find("Conv") != -1:
        m.weight.data.normal_(mean, std)

def apply_weight_norm(m):
    classname = m.__class__.__name__
    if classname.find("Conv") != -1:
        weight_norm(m)

def get_padding(kernel_size, dilation=1):
    return int((kernel_size*dilation - dilation)/2)

def load_checkpoint(filepath, device):
    assert os.path.isfile(filepath)
    print("Loading '{}'".format(filepath))
    checkpoint_dict = torch.load(filepath, map_location=device)
    print("Complete.")
    return checkpoint_dict

def save_checkpoint(filepath, obj):
    print("Saving checkpoint to {}".format(filepath))
    torch.save(obj, filepath)
    print("Complete.")

def scan_checkpoint(cp_dir, prefix):
    pattern = os.path.join(cp_dir, prefix + '????????')
    cp_list = glob.glob(pattern)
    if len(cp_list) == 0:
        return None
    return sorted(cp_list)[-1]
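In these utilities, `get_padding` returns the padding that keeps a Conv1d output the same length as its input for odd kernel sizes (padding = (k*d - d) / 2), and `scan_checkpoint` returns the newest file whose name is the prefix followed by eight characters (e.g. g_00010000), or None if none exists. A brief usage sketch with placeholder paths:

    # Usage sketch -- the checkpoint directory and prefix are placeholders.
    import torch
    import torch.nn as nn
    from models.mossformer2_sr.utils import get_padding, scan_checkpoint, load_checkpoint

    conv = nn.Conv1d(80, 80, kernel_size=7, dilation=2, padding=get_padding(7, 2))
    x = torch.randn(1, 80, 2000)
    assert conv(x).shape[-1] == x.shape[-1]      # sequence length preserved

    cp = scan_checkpoint('checkpoints/', 'g_')   # newest checkpoint matching g_????????, or None
    if cp is not None:
        state = load_checkpoint(cp, torch.device('cpu'))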