# *****************************************************************************
# From PyTorch:
# Copyright (c) 2016- Facebook, Inc (Adam Paszke)
# Copyright (c) 2014- Facebook, Inc (Soumith Chintala)
# Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
# Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu)
# Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
# Copyright (c) 2011-2013 NYU (Clement Farabet)
# Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston)
# Copyright (c) 2006 Idiap Research Institute (Samy Bengio)
# Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz)
# From Caffe2:
# Copyright (c) 2016-present, Facebook Inc. All rights reserved.
# All contributions by Facebook:
# Copyright (c) 2016 Facebook Inc.
# All contributions by Google:
# Copyright (c) 2015 Google Inc.
# All rights reserved.
# All contributions by Yangqing Jia:
# Copyright (c) 2015 Yangqing Jia
# All rights reserved.
# All contributions by Kakao Brain:
# Copyright 2019-2020 Kakao Brain
# All contributions by Cruise LLC:
# Copyright (c) 2022 Cruise LLC.
# All rights reserved.
# All contributions from Caffe:
# Copyright(c) 2013, 2014, 2015, the respective contributors
# All rights reserved.
# All other contributions:
# Copyright(c) 2015, 2016 the respective contributors
# All rights reserved.
# Caffe2 uses a copyright model similar to Caffe: each contributor holds
# copyright over their contributions to Caffe2. The project versioning records
# all such contribution and copyright details. If a contributor wants to further
# mark their specific copyright on a particular contribution, they should
# indicate their copyright solely in the commit message of the change when it is
# committed.
# All rights reserved.
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# 3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America
# and IDIAP Research Institute nor the names of its contributors may be
# used to endorse or promote products derived from this software without
# specific prior written permission.
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
# *****************************************************************************
# *****************************************************************************
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of the NVIDIA CORPORATION nor the
# names of its contributors may be used to endorse or promote products
# derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# *****************************************************************************
from typing import Optional, Tuple

import torch
import torch.nn as nn
from torch import Tensor
from torchaudio.models.tacotron2 import _Decoder, _Encoder, _get_mask_from_lengths, _Postnet
# Modified version of torchaudio.models.Tacotron2 that adds optional
# multi-speaker conditioning via a learned speaker embedding.
class Tacotron2MS(nn.Module):
r"""Tacotron2 model from *Natural TTS Synthesis by Conditioning WaveNet on Mel Spectrogram Predictions*
:cite:`shen2018natural` based on the implementation from
`Nvidia Deep Learning Examples <https://github.com/NVIDIA/DeepLearningExamples/>`_.
See Also:
* :class:`torchaudio.pipelines.Tacotron2TTSBundle`: TTS pipeline with pretrained model.
Args:
mask_padding (bool, optional): Use mask padding (Default: ``False``).
n_mels (int, optional): Number of mel bins (Default: ``80``).
n_symbol (int, optional): Number of symbols for the input text (Default: ``148``).
        n_frames_per_step (int, optional): Number of frames processed per step, only 1 is supported (Default: ``1``).
        num_speakers (int, optional): Number of speakers; values greater than ``1`` enable the speaker embedding (Default: ``40``).
        speaker_embedding_dim (int, optional): Dimension of the learned speaker embedding (Default: ``128``).
symbol_embedding_dim (int, optional): Input embedding dimension (Default: ``512``).
encoder_n_convolution (int, optional): Number of encoder convolutions (Default: ``3``).
encoder_kernel_size (int, optional): Encoder kernel size (Default: ``5``).
encoder_embedding_dim (int, optional): Encoder embedding dimension (Default: ``512``).
decoder_rnn_dim (int, optional): Number of units in decoder LSTM (Default: ``1024``).
decoder_max_step (int, optional): Maximum number of output mel spectrograms (Default: ``2000``).
decoder_dropout (float, optional): Dropout probability for decoder LSTM (Default: ``0.1``).
        decoder_early_stopping (bool, optional): Stop decoding once all samples are finished (Default: ``True``).
attention_rnn_dim (int, optional): Number of units in attention LSTM (Default: ``1024``).
attention_hidden_dim (int, optional): Dimension of attention hidden representation (Default: ``128``).
attention_location_n_filter (int, optional): Number of filters for attention model (Default: ``32``).
attention_location_kernel_size (int, optional): Kernel size for attention model (Default: ``31``).
attention_dropout (float, optional): Dropout probability for attention LSTM (Default: ``0.1``).
prenet_dim (int, optional): Number of ReLU units in prenet layers (Default: ``256``).
postnet_n_convolution (int, optional): Number of postnet convolutions (Default: ``5``).
postnet_kernel_size (int, optional): Postnet kernel size (Default: ``5``).
postnet_embedding_dim (int, optional): Postnet embedding dimension (Default: ``512``).
gate_threshold (float, optional): Probability threshold for stop token (Default: ``0.5``).
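
    Example (an illustrative sketch; the speaker counts below are assumed, not fixed by this module)::

        >>> single = Tacotron2MS(num_speakers=1)   # no speaker embedding table is created
        >>> multi = Tacotron2MS(num_speakers=4)    # adds a 4 x 128 speaker embedding table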
"""
def __init__(
self,
mask_padding: bool = False,
n_mels: int = 80,
n_symbol: int = 148,
n_frames_per_step: int = 1,
        # Multi-speaker additions over the upstream torchaudio Tacotron2.
        num_speakers: int = 40,
        speaker_embedding_dim: int = 128,
symbol_embedding_dim: int = 512,
encoder_embedding_dim: int = 512,
encoder_n_convolution: int = 3,
encoder_kernel_size: int = 5,
decoder_rnn_dim: int = 1024,
decoder_max_step: int = 2000,
decoder_dropout: float = 0.1,
decoder_early_stopping: bool = True,
attention_rnn_dim: int = 1024,
attention_hidden_dim: int = 128,
attention_location_n_filter: int = 32,
attention_location_kernel_size: int = 31,
attention_dropout: float = 0.1,
prenet_dim: int = 256,
postnet_n_convolution: int = 5,
postnet_kernel_size: int = 5,
postnet_embedding_dim: int = 512,
gate_threshold: float = 0.5,
) -> None:
super().__init__()
self.mask_padding = mask_padding
self.n_mels = n_mels
self.n_frames_per_step = n_frames_per_step
self.embedding = nn.Embedding(n_symbol, symbol_embedding_dim)
torch.nn.init.xavier_uniform_(self.embedding.weight)
self.encoder = _Encoder(encoder_embedding_dim,
encoder_n_convolution, encoder_kernel_size)
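        # The decoder memory dimension grows by speaker_embedding_dim in the
        # multi-speaker case, since the speaker embedding is concatenated to
        # every encoder output frame before attention.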
self.decoder = _Decoder(
n_mels,
n_frames_per_step,
encoder_embedding_dim + (speaker_embedding_dim if num_speakers > 1 else 0),
decoder_rnn_dim,
decoder_max_step,
decoder_dropout,
decoder_early_stopping,
attention_rnn_dim,
attention_hidden_dim,
attention_location_n_filter,
attention_location_kernel_size,
attention_dropout,
prenet_dim,
gate_threshold,
)
self.postnet = _Postnet(
n_mels, postnet_embedding_dim, postnet_kernel_size, postnet_n_convolution)
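        # The speaker embedding table is only created in the multi-speaker
        # case; while it stays ``None``, forward/infer use the plain encoder
        # outputs.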
self.speaker_embedding = None
if num_speakers > 1:
self.speaker_embedding = nn.Embedding(
num_speakers, speaker_embedding_dim)
def forward(
self,
tokens: Tensor,
token_lengths: Tensor,
mel_specgram: Tensor,
mel_specgram_lengths: Tensor,
speaker_ids: Tensor,
) -> Tuple[Tensor, Tensor, Tensor, Tensor]:
r"""Pass the input through the Tacotron2 model. This is in teacher
forcing mode, which is generally used for training.
The input ``tokens`` should be padded with zeros to length max of ``token_lengths``.
The input ``mel_specgram`` should be padded with zeros to length max of ``mel_specgram_lengths``.
Args:
tokens (Tensor): The input tokens to Tacotron2 with shape `(n_batch, max of token_lengths)`.
token_lengths (Tensor): The valid length of each sample in ``tokens`` with shape `(n_batch, )`.
mel_specgram (Tensor): The target mel spectrogram
with shape `(n_batch, n_mels, max of mel_specgram_lengths)`.
            mel_specgram_lengths (Tensor): The length of each mel spectrogram with shape `(n_batch, )`.
            speaker_ids (Tensor): The speaker ID of each sample with shape `(n_batch, )`.
Returns:
[Tensor, Tensor, Tensor, Tensor]:
Tensor
Mel spectrogram before Postnet with shape `(n_batch, n_mels, max of mel_specgram_lengths)`.
Tensor
Mel spectrogram after Postnet with shape `(n_batch, n_mels, max of mel_specgram_lengths)`.
Tensor
The output for stop token at each time step with shape `(n_batch, max of mel_specgram_lengths)`.
Tensor
Sequence of attention weights from the decoder with
shape `(n_batch, max of mel_specgram_lengths, max of token_lengths)`.
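
        Example (an illustrative teacher-forcing call; all shapes below are
        assumed for demonstration)::

            >>> model = Tacotron2MS(n_symbol=148, num_speakers=4)
            >>> tokens = torch.randint(0, 148, (2, 50))
            >>> token_lengths = torch.tensor([50, 42])
            >>> mel_specgram = torch.randn(2, 80, 120)    # (n_batch, n_mels, max_mel_len)
            >>> mel_lengths = torch.tensor([120, 100])
            >>> speaker_ids = torch.tensor([0, 3])
            >>> outputs = model(tokens, token_lengths, mel_specgram, mel_lengths, speaker_ids)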
"""
embedded_inputs = self.embedding(tokens).transpose(1, 2)
embedded_text = self.encoder(embedded_inputs, token_lengths)
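        # Broadcast the per-utterance speaker embedding across the time axis
        # and concatenate it to every encoder frame.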
if self.speaker_embedding is not None:
embedded_speakers = self.speaker_embedding(speaker_ids).unsqueeze(1)
embedded_speakers = embedded_speakers.repeat(
1, embedded_text.size(1), 1)
encoder_outputs = torch.cat(
(embedded_text, embedded_speakers), dim=2)
else:
encoder_outputs = embedded_text
mel_specgram, gate_outputs, alignments = self.decoder(
encoder_outputs, mel_specgram, memory_lengths=token_lengths
)
mel_specgram_postnet = self.postnet(mel_specgram)
mel_specgram_postnet = mel_specgram + mel_specgram_postnet
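        # Zero out the padded frames and push the gate outputs toward "stop"
        # (a large positive logit) in the padded region.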
if self.mask_padding:
mask = _get_mask_from_lengths(mel_specgram_lengths)
mask = mask.expand(self.n_mels, mask.size(0), mask.size(1))
mask = mask.permute(1, 0, 2)
mel_specgram.masked_fill_(mask, 0.0)
mel_specgram_postnet.masked_fill_(mask, 0.0)
gate_outputs.masked_fill_(mask[:, 0, :], 1e3)
return mel_specgram, mel_specgram_postnet, gate_outputs, alignments
@torch.jit.export
    def infer(
        self,
        tokens: Tensor,
        speaker_ids: Optional[Tensor] = None,
        lengths: Optional[Tensor] = None,
    ) -> Tuple[Tensor, Tensor, Tensor]:
r"""Using Tacotron2 for inference. The input is a batch of encoded
sentences (``tokens``) and its corresponding lengths (``lengths``). The
output is the generated mel spectrograms, its corresponding lengths, and
the attention weights from the decoder.
The input `tokens` should be padded with zeros to length max of ``lengths``.
Args:
            tokens (Tensor): The input tokens to Tacotron2 with shape `(n_batch, max of lengths)`.
            speaker_ids (Tensor or None, optional):
                The speaker ID of each sample with shape `(n_batch, )`.
                If ``None``, speaker ``0`` is used for every sample. Default: ``None``
            lengths (Tensor or None, optional):
                The valid length of each sample in ``tokens`` with shape `(n_batch, )`.
                If ``None``, it is assumed that all the tokens are valid. Default: ``None``
Returns:
(Tensor, Tensor, Tensor):
Tensor
The predicted mel spectrogram with shape `(n_batch, n_mels, max of mel_specgram_lengths)`.
Tensor
The length of the predicted mel spectrogram with shape `(n_batch, )`.
Tensor
Sequence of attention weights from the decoder with shape
`(n_batch, max of mel_specgram_lengths, max of lengths)`.
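
        Example (an illustrative sketch; shapes are assumed)::

            >>> model = Tacotron2MS(n_symbol=148, num_speakers=4)
            >>> tokens = torch.randint(0, 148, (1, 30))
            >>> mel, mel_lengths, alignments = model.infer(tokens)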
"""
n_batch, max_length = tokens.shape
if lengths is None:
lengths = torch.tensor([max_length]).expand(
n_batch).to(tokens.device, tokens.dtype)
if speaker_ids is None:
speaker_ids = torch.zeros_like(lengths)
assert lengths is not None # For TorchScript compiler
embedded_inputs = self.embedding(tokens).transpose(1, 2)
embedded_text = self.encoder(embedded_inputs, lengths)
if self.speaker_embedding is not None:
embedded_speakers = self.speaker_embedding(speaker_ids).unsqueeze(1)
embedded_speakers = embedded_speakers.repeat(
1, embedded_text.size(1), 1)
encoder_outputs = torch.cat(
(embedded_text, embedded_speakers), dim=2)
else:
encoder_outputs = embedded_text
mel_specgram, mel_specgram_lengths, _, alignments = self.decoder.infer(
encoder_outputs, lengths)
mel_outputs_postnet = self.postnet(mel_specgram)
mel_outputs_postnet = mel_specgram + mel_outputs_postnet
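        # Reshape the stacked attention weights to
        # `(n_batch, max of mel_specgram_lengths, max of lengths)`, as
        # documented above.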
alignments = alignments.unfold(1, n_batch, n_batch).transpose(0, 2)
return mel_outputs_postnet, mel_specgram_lengths, alignments
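

# ---------------------------------------------------------------------------
# Minimal smoke-test sketch (an illustrative addition, not part of the
# original module): the hyper-parameters and shapes below are assumptions,
# chosen small so the script finishes quickly on CPU with randomly
# initialized weights.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    model = Tacotron2MS(n_symbol=148, num_speakers=4, decoder_max_step=100)
    model.eval()
    tokens = torch.randint(1, 148, (2, 32))  # (n_batch, max_token_len)
    lengths = torch.tensor([32, 24])         # sorted descending for pack_padded_sequence
    speaker_ids = torch.tensor([0, 3])
    with torch.no_grad():
        mel, mel_lengths, alignments = model.infer(tokens, speaker_ids, lengths)
    print("mel:", tuple(mel.shape),
          "lengths:", mel_lengths.tolist(),
          "alignments:", tuple(alignments.shape))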