voice-cloning-simple-with-gpu / TTS /bin /compute_statistics.py
Shadhil's picture
voice-clone with single audio sample input
9b2107c
raw
history blame
3.18 kB
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import argparse
import glob
import os
import numpy as np
from tqdm import tqdm
# from TTS.utils.io import load_config
from TTS.config import load_config
from TTS.tts.datasets import load_tts_samples
from TTS.utils.audio import AudioProcessor
def main():
"""Run preprocessing process."""
parser = argparse.ArgumentParser(description="Compute mean and variance of spectrogtram features.")
parser.add_argument("config_path", type=str, help="TTS config file path to define audio processin parameters.")
parser.add_argument("out_path", type=str, help="save path (directory and filename).")
parser.add_argument(
"--data_path",
type=str,
required=False,
help="folder including the target set of wavs overriding dataset config.",
)
args, overrides = parser.parse_known_args()
CONFIG = load_config(args.config_path)
CONFIG.parse_known_args(overrides, relaxed_parser=True)
# load config
CONFIG.audio.signal_norm = False # do not apply earlier normalization
CONFIG.audio.stats_path = None # discard pre-defined stats
# load audio processor
ap = AudioProcessor(**CONFIG.audio.to_dict())
# load the meta data of target dataset
if args.data_path:
dataset_items = glob.glob(os.path.join(args.data_path, "**", "*.wav"), recursive=True)
else:
dataset_items = load_tts_samples(CONFIG.datasets)[0] # take only train data
print(f" > There are {len(dataset_items)} files.")
mel_sum = 0
mel_square_sum = 0
linear_sum = 0
linear_square_sum = 0
N = 0
for item in tqdm(dataset_items):
# compute features
wav = ap.load_wav(item if isinstance(item, str) else item["audio_file"])
linear = ap.spectrogram(wav)
mel = ap.melspectrogram(wav)
# compute stats
N += mel.shape[1]
mel_sum += mel.sum(1)
linear_sum += linear.sum(1)
mel_square_sum += (mel**2).sum(axis=1)
linear_square_sum += (linear**2).sum(axis=1)
mel_mean = mel_sum / N
mel_scale = np.sqrt(mel_square_sum / N - mel_mean**2)
linear_mean = linear_sum / N
linear_scale = np.sqrt(linear_square_sum / N - linear_mean**2)
output_file_path = args.out_path
stats = {}
stats["mel_mean"] = mel_mean
stats["mel_std"] = mel_scale
stats["linear_mean"] = linear_mean
stats["linear_std"] = linear_scale
print(f" > Avg mel spec mean: {mel_mean.mean()}")
print(f" > Avg mel spec scale: {mel_scale.mean()}")
print(f" > Avg linear spec mean: {linear_mean.mean()}")
print(f" > Avg linear spec scale: {linear_scale.mean()}")
# set default config values for mean-var scaling
CONFIG.audio.stats_path = output_file_path
CONFIG.audio.signal_norm = True
# remove redundant values
del CONFIG.audio.max_norm
del CONFIG.audio.min_level_db
del CONFIG.audio.symmetric_norm
del CONFIG.audio.clip_norm
stats["audio_config"] = CONFIG.audio.to_dict()
np.save(output_file_path, stats, allow_pickle=True)
print(f" > stats saved to {output_file_path}")
if __name__ == "__main__":
main()