Spaces:

alibabasglab
/

ClearVoice

Running on Zero

File size: 10,621 Bytes

8e8cd3e

import torch.nn as nn
import torch.nn.functional as F
import torch as th
from torch.nn.parameter import Parameter
import numpy as np
import os

class UniDeepFsmn(nn.Module):
    """
    Single-memory-block feedforward sequential memory network (FSMN) layer.

    The layer applies ``linear -> ReLU -> project`` to the last axis of the
    input, filters the projection along the sequence axis with a depthwise
    convolution (one filter per channel, kernel length ``2 * lorder - 1``),
    and adds the result back onto the input as a residual.

    Attributes:
        input_dim (int): Size of the last axis of the input.
        output_dim (int): Size of the projected features. Because the output
            is summed with the input, it should match ``input_dim``.
        lorder (int): Filter order; the memory kernel spans ``lorder - 1``
            frames on each side of the current frame.
        hidden_size (int): Number of hidden units in the linear layer.
        linear (nn.Linear): Maps ``input_dim`` to ``hidden_size``.
        project (nn.Linear): Maps ``hidden_size`` to ``output_dim`` (no bias).
        conv1 (nn.Conv2d): Depthwise memory filter (``groups=output_dim``).

    Note:
        When ``lorder`` is ``None`` the constructor returns early and creates
        no layers; such an instance cannot run ``forward``.
    """

    def __init__(self, input_dim, output_dim, lorder=None, hidden_size=None):
        super(UniDeepFsmn, self).__init__()

        self.input_dim = input_dim
        self.output_dim = output_dim
        if lorder is None:
            # Kept for backward compatibility: no layers are built in this case.
            return
        self.lorder = lorder
        self.hidden_size = hidden_size

        self.linear = nn.Linear(input_dim, hidden_size)
        self.project = nn.Linear(hidden_size, output_dim, bias=False)
        # Depthwise filter along the sequence axis; kernel width 1 on the dummy axis.
        self.conv1 = nn.Conv2d(output_dim, output_dim, [lorder + lorder - 1, 1], [1, 1],
                               groups=output_dim, bias=False)

    def forward(self, input):
        """
        Run the FSMN layer.

        Args:
            input (torch.Tensor): Tensor of shape (batch, seq_len, input_dim).

        Returns:
            torch.Tensor: ``input`` plus the memory-filtered projection, with
            shape (batch, seq_len, output_dim).
        """
        f1 = F.relu(self.linear(input))
        p1 = self.project(f1)                # (B, T, D)
        x = th.unsqueeze(p1, 1)              # (B, 1, T, D)
        x_per = x.permute(0, 3, 2, 1)        # (B, D, T, 1): channels first for Conv2d
        # Symmetric zero padding of lorder - 1 frames on both sides keeps the
        # sequence length unchanged; the filter sees past and future frames.
        y = F.pad(x_per, [0, 0, self.lorder - 1, self.lorder - 1])
        out = x_per + self.conv1(y)          # skip connection around the memory filter
        out1 = out.permute(0, 3, 2, 1)       # (B, 1, T, D)
        # Remove only the dummy axis. A bare squeeze() would also drop batch,
        # sequence or feature axes of size 1 and corrupt the residual sum
        # through unintended broadcasting.
        return input + out1.squeeze(1)


class UniDeepFsmn_dual(nn.Module):
    """
    FSMN layer with two stacked memory filters.

    The layer applies ``linear -> ReLU -> project`` to the last axis of the
    input, then two convolutions along the sequence axis, each wrapped in its
    own skip connection, and finally adds the result back onto the input.

    Attributes:
        input_dim (int): Size of the last axis of the input.
        output_dim (int): Size of the projected features. Because the output
            is summed with the input, it should match ``input_dim``. It is
            also used to derive ``groups=output_dim // 4`` for ``conv2``, so
            it must be at least 4 and compatible with that grouping.
        lorder (int): Filter order; both kernels have length ``2 * lorder - 1``.
        hidden_size (int): Number of hidden units in the linear layer.
        linear (nn.Linear): Maps ``input_dim`` to ``hidden_size``.
        project (nn.Linear): Maps ``hidden_size`` to ``output_dim`` (no bias).
        conv1 (nn.Conv2d): Depthwise memory filter (``groups=output_dim``).
        conv2 (nn.Conv2d): Grouped memory filter (``groups=output_dim // 4``),
            so each output channel mixes four input channels.

    Note:
        When ``lorder`` is ``None`` the constructor returns early and creates
        no layers; such an instance cannot run ``forward``.
    """

    def __init__(self, input_dim, output_dim, lorder=None, hidden_size=None):
        super(UniDeepFsmn_dual, self).__init__()

        self.input_dim = input_dim
        self.output_dim = output_dim
        if lorder is None:
            # Kept for backward compatibility: no layers are built in this case.
            return
        self.lorder = lorder
        self.hidden_size = hidden_size

        self.linear = nn.Linear(input_dim, hidden_size)
        self.project = nn.Linear(hidden_size, output_dim, bias=False)
        self.conv1 = nn.Conv2d(output_dim, output_dim, [lorder + lorder - 1, 1], [1, 1],
                               groups=output_dim, bias=False)
        self.conv2 = nn.Conv2d(output_dim, output_dim, [lorder + lorder - 1, 1], [1, 1],
                               groups=output_dim // 4, bias=False)

    def forward(self, input):
        """
        Run the dual-filter FSMN layer.

        Args:
            input (torch.Tensor): Tensor of shape (batch, seq_len, input_dim).

        Returns:
            torch.Tensor: ``input`` plus the filtered projection, with shape
            (batch, seq_len, output_dim).
        """
        f1 = F.relu(self.linear(input))
        p1 = self.project(f1)                # (B, T, D)
        x = th.unsqueeze(p1, 1)              # (B, 1, T, D)
        x_per = x.permute(0, 3, 2, 1)        # (B, D, T, 1): channels first for Conv2d
        # Symmetric zero padding (lorder - 1 frames per side) preserves the
        # sequence length for a kernel of 2 * lorder - 1 taps.
        y = F.pad(x_per, [0, 0, self.lorder - 1, self.lorder - 1])
        conv1_out = x_per + self.conv1(y)    # first memory filter with skip
        z = F.pad(conv1_out, [0, 0, self.lorder - 1, self.lorder - 1])
        out = conv1_out + self.conv2(z)      # second memory filter with skip
        out1 = out.permute(0, 3, 2, 1)       # (B, 1, T, D)
        # Remove only the dummy axis. A bare squeeze() would also drop batch,
        # sequence or feature axes of size 1 and corrupt the residual sum
        # through unintended broadcasting.
        return input + out1.squeeze(1)


class DilatedDenseNet(nn.Module):
    """
    Densely connected stack of dilated, grouped 2-D convolutions.

    Stage ``k`` (1-based) zero-pads its input along the height axis, applies a
    grouped convolution with dilation ``2 ** (k - 1)`` on that axis, then
    instance normalisation and PReLU. Each stage's output is concatenated in
    front of its input along the channel axis, so stage ``k`` receives
    ``k * in_channels`` channels. Only the last stage's output is returned.

    Attributes:
        depth (int): Number of stages.
        in_channels (int): Channel count of the input and of every stage output.
        pad (nn.ConstantPad2d): Padding module created in the constructor;
            ``forward`` does not use it.
        twidth (int): Kernel length along the height axis, ``2 * lorder - 1``.
        kernel_size (tuple): ``(twidth, 1)``.
        pad{k}, conv{k}, norm{k}, prelu{k}: Submodules of stage ``k``.
    """

    def __init__(self, depth=4, lorder=20, in_channels=64):
        super().__init__()
        self.depth = depth
        self.in_channels = in_channels
        self.pad = nn.ConstantPad2d((1, 1, 1, 0), value=0.)
        self.twidth = lorder * 2 - 1
        self.kernel_size = (self.twidth, 1)

        # Register the four submodules of every stage under numbered names.
        for stage in range(1, self.depth + 1):
            dilation = 2 ** (stage - 1)
            # Equals dilation * (lorder - 1): padding both sides by this amount
            # keeps the height unchanged for the dilated kernel.
            pad_length = lorder + (dilation - 1) * (lorder - 1) - 1
            self.add_module(
                f'pad{stage}',
                nn.ConstantPad2d((0, 0, pad_length, pad_length), value=0.),
            )
            self.add_module(
                f'conv{stage}',
                nn.Conv2d(
                    self.in_channels * stage,
                    self.in_channels,
                    kernel_size=self.kernel_size,
                    dilation=(dilation, 1),
                    groups=self.in_channels,
                    bias=False,
                ),
            )
            self.add_module(f'norm{stage}', nn.InstanceNorm2d(self.in_channels, affine=True))
            self.add_module(f'prelu{stage}', nn.PReLU(self.in_channels))

    def forward(self, x):
        """
        Run every stage in order and return the output of the last one.

        Args:
            x (torch.Tensor): Tensor of shape (batch, in_channels, height, width).

        Returns:
            torch.Tensor: Output of the final stage, with ``in_channels`` channels.
        """
        features = x  # grows along the channel axis after every stage
        for stage in range(1, self.depth + 1):
            padded = getattr(self, f'pad{stage}')(features)
            convolved = getattr(self, f'conv{stage}')(padded)
            normed = getattr(self, f'norm{stage}')(convolved)
            out = getattr(self, f'prelu{stage}')(normed)
            # The newest output goes first so the grouped conv sees a fixed layout.
            features = th.cat((out, features), dim=1)
        return out

class UniDeepFsmn_dilated(nn.Module):
    """
    FSMN layer whose memory block is a ``DilatedDenseNet``.

    The layer applies ``linear -> ReLU -> project`` to the last axis of the
    input, passes the projection through a ``DilatedDenseNet`` (defined in
    this file) operating along the sequence axis, and adds the result back
    onto the input as a residual.

    Attributes:
        input_dim (int): Size of the last axis of the input.
        output_dim (int): Size of the projected features and channel count of
            the dense network. Because the output is summed with the input,
            it should match ``input_dim``.
        depth (int): Number of stages in the dilated dense network.
        lorder (int): Filter order forwarded to the dilated dense network.
        hidden_size (int): Number of hidden units in the linear layer.
        linear (nn.Linear): Maps ``input_dim`` to ``hidden_size``.
        project (nn.Linear): Maps ``hidden_size`` to ``output_dim`` (no bias).
        conv (DilatedDenseNet): Memory block applied to the projection.

    Note:
        When ``lorder`` is ``None`` the constructor returns early and creates
        no layers; such an instance cannot run ``forward``.
    """

    def __init__(self, input_dim, output_dim, lorder=None, hidden_size=None, depth=2):
        super(UniDeepFsmn_dilated, self).__init__()

        self.input_dim = input_dim
        self.output_dim = output_dim
        self.depth = depth
        if lorder is None:
            # Kept for backward compatibility: no layers are built in this case.
            return
        self.lorder = lorder
        self.hidden_size = hidden_size

        self.linear = nn.Linear(input_dim, hidden_size)
        self.project = nn.Linear(hidden_size, output_dim, bias=False)
        self.conv = DilatedDenseNet(depth=self.depth, lorder=lorder, in_channels=output_dim)

    def forward(self, input):
        """
        Run the dilated FSMN layer.

        Args:
            input (torch.Tensor): Tensor of shape (batch, seq_len, input_dim).

        Returns:
            torch.Tensor: ``input`` plus the output of the dilated dense
            network, with shape (batch, seq_len, output_dim).
        """
        f1 = F.relu(self.linear(input))
        p1 = self.project(f1)                # (B, T, D)
        x = th.unsqueeze(p1, 1)              # (B, 1, T, D)
        x_per = x.permute(0, 3, 2, 1)        # (B, D, T, 1): channels first for Conv2d
        out = self.conv(x_per)               # (B, D, T, 1)
        out1 = out.permute(0, 3, 2, 1)       # (B, 1, T, D)

        # Remove only the dummy axis. A bare squeeze() would also drop any
        # other axis of size 1 (e.g. output_dim == 1) and corrupt the residual
        # sum through unintended broadcasting.
        return input + out1.squeeze(1)