File size: 3,008 Bytes
46a75d7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import os

import torch

from tests import get_tests_input_path, get_tests_output_path, get_tests_path
from TTS.config import BaseAudioConfig
from TTS.utils.audio import AudioProcessor
from TTS.vocoder.layers.losses import MelganFeatureLoss, MultiScaleSTFTLoss, STFTLoss, TorchSTFT

# Root path of the test suite as reported by the `tests` package helper.
# (Not referenced by the tests visible in this part of the file.)
TESTS_PATH = get_tests_path()

# Output directory "audio_tests" under the tests output path.
# It is created at import time; `exist_ok=True` makes repeated imports harmless.
OUT_PATH = os.path.join(get_tests_output_path(), "audio_tests")
os.makedirs(OUT_PATH, exist_ok=True)

# Waveform file "example_1.wav" under the tests input path, loaded by the STFT tests below.
WAV_FILE = os.path.join(get_tests_input_path(), "example_1.wav")

# Shared audio processor built from a default-constructed `BaseAudioConfig`;
# the tests read `fft_size`, `hop_length` and `win_length` from it.
ap = AudioProcessor(**BaseAudioConfig().to_dict())


def test_torch_stft():
    """Check the ``TorchSTFT`` magnitude output against the ``AudioProcessor`` STFT.

    Both transforms are configured with ``ap.fft_size``, ``ap.hop_length`` and
    ``ap.win_length`` and are applied to the waveform loaded from ``WAV_FILE``.
    """
    stft_layer = TorchSTFT(ap.fft_size, ap.hop_length, ap.win_length)

    # Reference: magnitude of the (librosa-backed) AudioProcessor STFT.
    waveform = ap.load_wav(WAV_FILE)
    reference_mag = abs(ap._stft(waveform))  # pylint: disable=protected-access

    # Torch side: add a leading axis, cast to float32 and run the layer.
    batched_wav = torch.from_numpy(waveform[None, :]).float()
    torch_mag = stft_layer(batched_wav)[0].data.numpy()

    # NOTE(review): the difference is signed, so this only bounds how far the
    # reference exceeds the torch output, not the other direction. Confirm
    # whether a two-sided (absolute) comparison is intended.
    signed_diff = reference_mag - torch_mag
    assert signed_diff.max() < 1e-5


def test_stft_loss():
    """Exercise ``STFTLoss`` on identical and on mismatched signals.

    The loss returns two values. For identical input and target their sum
    must be exactly zero. Against a uniformly random target the second value
    must stay below 1.0 and the sum must be strictly positive.
    """
    criterion = STFTLoss(ap.fft_size, ap.hop_length, ap.win_length)
    signal = torch.from_numpy(ap.load_wav(WAV_FILE)[None, :]).float()

    # Same tensor as prediction and target.
    term_m, term_sc = criterion(signal, signal)
    assert term_m + term_sc == 0

    # Random target with the same shape and dtype as the signal.
    noise = torch.rand_like(signal)
    term_m, term_sc = criterion(signal, noise)
    assert term_sc < 1.0
    assert term_m + term_sc > 0


def test_multiscale_stft_loss():
    """Exercise ``MultiScaleSTFTLoss`` on identical and on mismatched signals.

    Three resolutions are used: half, equal to and double the FFT size, hop
    length and window length of the shared ``AudioProcessor``. The assertions
    mirror the single-scale test: zero total for identical inputs, second
    value below 1.0 and positive total against a random target.
    """
    # One [half, base, double] list per STFT parameter, in constructor order.
    n_ffts, hop_lengths, win_lengths = (
        [base // 2, base, base * 2] for base in (ap.fft_size, ap.hop_length, ap.win_length)
    )
    criterion = MultiScaleSTFTLoss(n_ffts, hop_lengths, win_lengths)

    signal = torch.from_numpy(ap.load_wav(WAV_FILE)[None, :]).float()

    # Same tensor as prediction and target.
    term_m, term_sc = criterion(signal, signal)
    assert term_m + term_sc == 0

    # Random target with the same shape and dtype as the signal.
    noise = torch.rand_like(signal)
    term_m, term_sc = criterion(signal, noise)
    assert term_sc < 1.0
    assert term_m + term_sc > 0


def test_melgan_feature_loss():
    """Exercise ``MelganFeatureLoss`` on differing and on identical feature maps.

    Features are nested lists: 5 outer entries (scales), each holding
    4 tensors (layers) of shape ``[3, 5, 7]`` drawn with ``torch.rand``.
    Independent real/fake draws must give a loss of at most 1.0. Using the
    same tensors on both sides must give exactly zero.
    """
    n_scales, n_layers = 5, 4
    feat_shape = [3, 5, 7]

    # Case 1: every real/fake feature pair is sampled independently.
    feats_real, feats_fake = [], []
    for _ in range(n_scales):
        # Tuple elements are evaluated left to right: real first, then fake.
        pairs = [(torch.rand(feat_shape), torch.rand(feat_shape)) for _ in range(n_layers)]
        feats_real.append([real for real, _ in pairs])
        feats_fake.append([fake for _, fake in pairs])

    criterion = MelganFeatureLoss()
    loss = criterion(feats_fake, feats_real)
    assert loss.item() <= 1.0

    # Case 2: real and fake lists hold the very same tensors.
    feats_real, feats_fake = [], []
    for _ in range(n_scales):
        shared = [torch.rand(feat_shape) for _ in range(n_layers)]
        feats_real.append(shared)
        # Separate list object, same tensor elements.
        feats_fake.append(list(shared))

    criterion = MelganFeatureLoss()
    loss = criterion(feats_fake, feats_real)
    assert loss.item() == 0