# ***************************************************************************** # From PyTorch: # Copyright (c) 2016- Facebook, Inc (Adam Paszke) # Copyright (c) 2014- Facebook, Inc (Soumith Chintala) # Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert) # Copyright (c) 2012-2014 Deepmind Technologies (Koray Kavukcuoglu) # Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu) # Copyright (c) 2011-2013 NYU (Clement Farabet) # Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston) # Copyright (c) 2006 Idiap Research Institute (Samy Bengio) # Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz) # From Caffe2: # Copyright (c) 2016-present, Facebook Inc. All rights reserved. # All contributions by Facebook: # Copyright (c) 2016 Facebook Inc. # All contributions by Google: # Copyright (c) 2015 Google Inc. # All rights reserved. # All contributions by Yangqing Jia: # Copyright (c) 2015 Yangqing Jia # All rights reserved. # All contributions by Kakao Brain: # Copyright 2019-2020 Kakao Brain # All contributions by Cruise LLC: # Copyright (c) 2022 Cruise LLC. # All rights reserved. # All contributions from Caffe: # Copyright(c) 2013, 2014, 2015, the respective contributors # All rights reserved. # All other contributions: # Copyright(c) 2015, 2016 the respective contributors # All rights reserved. # Caffe2 uses a copyright model similar to Caffe: each contributor holds # copyright over their contributions to Caffe2. The project versioning records # all such contribution and copyright details. If a contributor wants to further # mark their specific copyright on a particular contribution, they should # indicate their copyright solely in the commit message of the change when it is # committed. # All rights reserved. 
# Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are met: # 1. Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in the # documentation and/or other materials provided with the distribution. # 3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America # and IDIAP Research Institute nor the names of its contributors may be # used to endorse or promote products derived from this software without # specific prior written permission. # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE # ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS # INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. # ***************************************************************************** # ***************************************************************************** # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#     * Redistributions of source code must retain the above copyright
#       notice, this list of conditions and the following disclaimer.
#     * Redistributions in binary form must reproduce the above copyright
#       notice, this list of conditions and the following disclaimer in the
#       documentation and/or other materials provided with the distribution.
#     * Neither the name of the NVIDIA CORPORATION nor the
#       names of its contributors may be used to endorse or promote products
#       derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# *****************************************************************************

from typing import Optional, Tuple

import torch
import torch.nn as nn
from torchaudio.models.tacotron2 import _Encoder, _Decoder, _Postnet, _get_mask_from_lengths
from torch import Tensor


# modified version of torchaudio.models.Tacotron2
class Tacotron2MS(nn.Module):
    r"""Multi-speaker Tacotron2 model from *Natural TTS Synthesis by Conditioning WaveNet on Mel
    Spectrogram Predictions* :cite:`shen2018natural` based on the implementation from
    `Nvidia Deep Learning Examples <https://github.com/NVIDIA/DeepLearningExamples/>`_.

    Compared with :class:`torchaudio.models.Tacotron2`, this variant optionally looks up a
    learned embedding for each speaker id and concatenates it to every encoder output frame
    before the decoder attends over it.

    See Also:
        * :class:`torchaudio.pipelines.Tacotron2TTSBundle`: TTS pipeline with pretrained model.

    Args:
        mask_padding (bool, optional): Use mask padding (Default: ``False``).
        n_mels (int, optional): Number of mel bins (Default: ``80``).
        n_symbol (int, optional): Number of symbols for the input text (Default: ``148``).
        n_frames_per_step (int, optional): Number of frames processed per step, only 1 is supported
            (Default: ``1``).
        num_speakers (int, optional): Number of speakers. A speaker embedding table is only
            created (and concatenated to the encoder output) when this is greater than 1
            (Default: ``40``).
        speaker_embedding_dim (int, optional): Speaker embedding dimension. Only used when
            ``num_speakers > 1`` (Default: ``128``).
        symbol_embedding_dim (int, optional): Input embedding dimension (Default: ``512``).
        encoder_n_convolution (int, optional): Number of encoder convolutions (Default: ``3``).
        encoder_kernel_size (int, optional): Encoder kernel size (Default: ``5``).
        encoder_embedding_dim (int, optional): Encoder embedding dimension (Default: ``512``).
        decoder_rnn_dim (int, optional): Number of units in decoder LSTM (Default: ``1024``).
        decoder_max_step (int, optional): Maximum number of output mel spectrograms (Default: ``2000``).
        decoder_dropout (float, optional): Dropout probability for decoder LSTM (Default: ``0.1``).
        decoder_early_stopping (bool, optional): Continue decoding after all samples are finished
            (Default: ``True``).
        attention_rnn_dim (int, optional): Number of units in attention LSTM (Default: ``1024``).
        attention_hidden_dim (int, optional): Dimension of attention hidden representation
            (Default: ``128``).
        attention_location_n_filter (int, optional): Number of filters for attention model
            (Default: ``32``).
        attention_location_kernel_size (int, optional): Kernel size for attention model
            (Default: ``31``).
        attention_dropout (float, optional): Dropout probability for attention LSTM (Default: ``0.1``).
        prenet_dim (int, optional): Number of ReLU units in prenet layers (Default: ``256``).
        postnet_n_convolution (int, optional): Number of postnet convolutions (Default: ``5``).
        postnet_kernel_size (int, optional): Postnet kernel size (Default: ``5``).
        postnet_embedding_dim (int, optional): Postnet embedding dimension (Default: ``512``).
        gate_threshold (float, optional): Probability threshold for stop token (Default: ``0.5``).
    """

    def __init__(
        self,
        mask_padding: bool = False,
        n_mels: int = 80,
        n_symbol: int = 148,
        n_frames_per_step: int = 1,
        # Multi-speaker extension (not present in torchaudio.models.Tacotron2).
        num_speakers: int = 40,
        speaker_embedding_dim: int = 128,
        symbol_embedding_dim: int = 512,
        encoder_embedding_dim: int = 512,
        encoder_n_convolution: int = 3,
        encoder_kernel_size: int = 5,
        decoder_rnn_dim: int = 1024,
        decoder_max_step: int = 2000,
        decoder_dropout: float = 0.1,
        decoder_early_stopping: bool = True,
        attention_rnn_dim: int = 1024,
        attention_hidden_dim: int = 128,
        attention_location_n_filter: int = 32,
        attention_location_kernel_size: int = 31,
        attention_dropout: float = 0.1,
        prenet_dim: int = 256,
        postnet_n_convolution: int = 5,
        postnet_kernel_size: int = 5,
        postnet_embedding_dim: int = 512,
        gate_threshold: float = 0.5,
    ) -> None:
        super().__init__()

        self.mask_padding = mask_padding
        self.n_mels = n_mels
        self.n_frames_per_step = n_frames_per_step

        self.embedding = nn.Embedding(n_symbol, symbol_embedding_dim)
        torch.nn.init.xavier_uniform_(self.embedding.weight)
        self.encoder = _Encoder(encoder_embedding_dim, encoder_n_convolution, encoder_kernel_size)

        # The decoder attends over the encoder output with the speaker embedding
        # appended along the feature axis, so its memory dimension grows accordingly.
        use_speaker_embedding = num_speakers > 1
        decoder_memory_dim = encoder_embedding_dim
        if use_speaker_embedding:
            decoder_memory_dim += speaker_embedding_dim

        self.decoder = _Decoder(
            n_mels,
            n_frames_per_step,
            decoder_memory_dim,
            decoder_rnn_dim,
            decoder_max_step,
            decoder_dropout,
            decoder_early_stopping,
            attention_rnn_dim,
            attention_hidden_dim,
            attention_location_n_filter,
            attention_location_kernel_size,
            attention_dropout,
            prenet_dim,
            gate_threshold,
        )
        self.postnet = _Postnet(n_mels, postnet_embedding_dim, postnet_kernel_size, postnet_n_convolution)

        # Created last so parameter registration order (and therefore seeded
        # initialisation order) stays the same as before.
        self.speaker_embedding = None
        if use_speaker_embedding:
            self.speaker_embedding = nn.Embedding(num_speakers, speaker_embedding_dim)

    def _append_speaker_embedding(self, embedded_text: Tensor, speaker_ids: Tensor) -> Tensor:
        r"""Concatenate the speaker embedding to every time step of the encoder output.

        Args:
            embedded_text (Tensor): Encoder output; indexed here as
                `(n_batch, n_steps, encoder_dim)`.
            speaker_ids (Tensor): Speaker index of each sample with shape `(n_batch, )`.

        Returns:
            Tensor: ``embedded_text`` unchanged when the model has no speaker embedding,
            otherwise ``embedded_text`` with the speaker embedding appended along the last
            dimension.
        """
        if self.speaker_embedding is not None:
            embedded_speakers = self.speaker_embedding(speaker_ids).unsqueeze(1)
            # ``expand`` broadcasts over the time axis as a view; ``torch.cat`` then makes
            # the only copy, so no intermediate repeated tensor is allocated.
            embedded_speakers = embedded_speakers.expand(-1, embedded_text.size(1), -1)
            encoder_outputs = torch.cat((embedded_text, embedded_speakers), dim=2)
        else:
            encoder_outputs = embedded_text
        return encoder_outputs

    def forward(
        self,
        tokens: Tensor,
        token_lengths: Tensor,
        mel_specgram: Tensor,
        mel_specgram_lengths: Tensor,
        speaker_ids: Tensor,
    ) -> Tuple[Tensor, Tensor, Tensor, Tensor]:
        r"""Pass the input through the Tacotron2 model. This is in teacher
        forcing mode, which is generally used for training.

        The input ``tokens`` should be padded with zeros to length max of ``token_lengths``.
        The input ``mel_specgram`` should be padded with zeros to length max of ``mel_specgram_lengths``.

        Args:
            tokens (Tensor): The input tokens to Tacotron2 with shape `(n_batch, max of token_lengths)`.
            token_lengths (Tensor): The valid length of each sample in ``tokens`` with shape `(n_batch, )`.
            mel_specgram (Tensor): The target mel spectrogram
                with shape `(n_batch, n_mels, max of mel_specgram_lengths)`.
            mel_specgram_lengths (Tensor): The length of each mel spectrogram with shape `(n_batch, )`.
            speaker_ids (Tensor): The speaker index of each sample with shape `(n_batch, )`.
                Ignored when the model was constructed with ``num_speakers <= 1``.

        Returns:
            [Tensor, Tensor, Tensor, Tensor]:
                Tensor
                    Mel spectrogram before Postnet with shape `(n_batch, n_mels, max of mel_specgram_lengths)`.
                Tensor
                    Mel spectrogram after Postnet with shape `(n_batch, n_mels, max of mel_specgram_lengths)`.
                Tensor
                    The output for stop token at each time step with shape `(n_batch, max of mel_specgram_lengths)`.
                Tensor
                    Sequence of attention weights from the decoder with
                    shape `(n_batch, max of mel_specgram_lengths, max of token_lengths)`.
        """
        embedded_inputs = self.embedding(tokens).transpose(1, 2)
        embedded_text = self.encoder(embedded_inputs, token_lengths)
        encoder_outputs = self._append_speaker_embedding(embedded_text, speaker_ids)

        mel_specgram, gate_outputs, alignments = self.decoder(
            encoder_outputs, mel_specgram, memory_lengths=token_lengths
        )

        # The postnet predicts a residual that is added to the decoder output.
        mel_specgram_postnet = self.postnet(mel_specgram)
        mel_specgram_postnet = mel_specgram + mel_specgram_postnet

        if self.mask_padding:
            mask = _get_mask_from_lengths(mel_specgram_lengths)
            mask = mask.expand(self.n_mels, mask.size(0), mask.size(1))
            mask = mask.permute(1, 0, 2)

            mel_specgram.masked_fill_(mask, 0.0)
            mel_specgram_postnet.masked_fill_(mask, 0.0)
            # Masked gate positions are filled with a large positive value.
            gate_outputs.masked_fill_(mask[:, 0, :], 1e3)

        return mel_specgram, mel_specgram_postnet, gate_outputs, alignments

    @torch.jit.export
    def infer(
        self,
        tokens: Tensor,
        speaker_ids: Optional[Tensor] = None,
        lengths: Optional[Tensor] = None,
    ) -> Tuple[Tensor, Tensor, Tensor]:
        r"""Using Tacotron2 for inference. The input is a batch of encoded
        sentences (``tokens``) and its corresponding lengths (``lengths``). The
        output is the generated mel spectrograms, its corresponding lengths, and
        the attention weights from the decoder.

        The input `tokens` should be padded with zeros to length max of ``lengths``.

        Args:
            tokens (Tensor): The input tokens to Tacotron2 with shape `(n_batch, max of lengths)`.
            speaker_ids (Tensor or None, optional): The speaker index of each sample with
                shape `(n_batch, )`. If ``None``, speaker index ``0`` is used for every sample.
                Ignored when the model was constructed with ``num_speakers <= 1``.
                Default: ``None``
            lengths (Tensor or None, optional):
                The valid length of each sample in ``tokens`` with shape `(n_batch, )`.
                If ``None``, it is assumed that the all the tokens are valid. Default: ``None``

        Returns:
            (Tensor, Tensor, Tensor):
                Tensor
                    The predicted mel spectrogram with shape `(n_batch, n_mels, max of mel_specgram_lengths)`.
                Tensor
                    The length of the predicted mel spectrogram with shape `(n_batch, )`.
                Tensor
                    Sequence of attention weights from the decoder with shape
                    `(n_batch, max of mel_specgram_lengths, max of lengths)`.
        """
        n_batch, max_length = tokens.shape
        if lengths is None:
            lengths = torch.tensor([max_length]).expand(n_batch).to(tokens.device, tokens.dtype)
        if speaker_ids is None:
            # Default to speaker 0 for the whole batch (same shape/dtype/device as lengths).
            speaker_ids = torch.zeros_like(lengths)

        assert lengths is not None  # For TorchScript compiler

        embedded_inputs = self.embedding(tokens).transpose(1, 2)
        embedded_text = self.encoder(embedded_inputs, lengths)
        encoder_outputs = self._append_speaker_embedding(embedded_text, speaker_ids)

        mel_specgram, mel_specgram_lengths, _, alignments = self.decoder.infer(encoder_outputs, lengths)

        mel_outputs_postnet = self.postnet(mel_specgram)
        mel_outputs_postnet = mel_specgram + mel_outputs_postnet

        alignments = alignments.unfold(1, n_batch, n_batch).transpose(0, 2)

        return mel_outputs_postnet, mel_specgram_lengths, alignments