# File: Bark-Voice-Cloning/training/training_prepare.py
# Hosting-page residue (uploader avatar caption, commit message, commit hash):
#   kevinwang676's picture / Upload 4 files / 79cb6e1
import random
import uuid
import numpy
import os
import random
import fnmatch
from tqdm.auto import tqdm
from scipy.io import wavfile
from bark.generation import load_model, SAMPLE_RATE
from bark.api import semantic_to_waveform
from bark import text_to_semantic
from bark.generation import load_model
from training.data import load_books, random_split_chunk
# Directory where generated semantic arrays are saved as .npy files
# (written by prepare_semantics_from_text, read by prepare_wavs_from_semantics).
output = 'training/data/output'
# Directory where the .wav files rendered from those semantic arrays are written.
output_wav = 'training/data/output_wav'
def prepare_semantics_from_text(num_generations):
    """Generate semantic arrays from random text chunks and save them as ``.npy``.

    Loads the book data and the text model once, then repeatedly picks a
    non-empty chunk of text with ``random_split_chunk``, converts it with
    ``text_to_semantic`` at a random temperature between 0.6 and 0.8, and
    saves the result in the ``output`` directory under a random UUID-based
    file name.

    Args:
        num_generations: How many files to generate. Pass ``None`` to keep
            generating until the process is interrupted (the behaviour of the
            previous implementation, which ignored this argument entirely).
    """
    loaded_data = load_books(True)

    print('Loading semantics model')
    load_model(use_gpu=True, use_small=False, force_reload=False, model_type='text')

    # makedirs creates missing parent directories and does not fail when the
    # directory already exists, unlike an isdir() check followed by mkdir().
    os.makedirs(output, exist_ok=True)

    generation = 0
    # A plain comparison (rather than range()) also accepts non-integer
    # numeric counts, e.g. a float coming from a UI widget.
    while num_generations is None or generation < num_generations:
        generation += 1
        file_name = os.path.join(output, uuid.uuid4().hex + '.npy')

        # Keep drawing until the chunk is non-empty after stripping whitespace.
        text = ''
        while not text:
            text = random_split_chunk(loaded_data).strip()

        print(f'{generation} Generating semantics for text:', text)
        temperature = round(random.uniform(0.6, 0.8), ndigits=2)
        semantics = text_to_semantic(text, temp=temperature)
        numpy.save(file_name, semantics)
def prepare_wavs_from_semantics():
    """Render every saved semantic array in ``output`` to a ``.wav`` file.

    Loads the coarse and fine models, then for each ``*.npy`` file in the
    ``output`` directory calls ``semantic_to_waveform`` at a random
    temperature between 0.6 and 0.8 and writes the result to ``output_wav``
    under the same base name. Files whose ``.wav`` already exists are
    skipped, so an interrupted run can be resumed.

    Raises:
        FileNotFoundError: If the ``output`` directory does not exist.
    """
    if not os.path.isdir(output):
        # FileNotFoundError is still an Exception subclass, so existing
        # ``except Exception`` handlers keep working.
        raise FileNotFoundError('No \'output\' folder, make sure you run create_data.py first!')

    os.makedirs(output_wav, exist_ok=True)

    print('Loading coarse model')
    load_model(use_gpu=True, use_small=False, force_reload=False, model_type='coarse')
    print('Loading fine model')
    load_model(use_gpu=True, use_small=False, force_reload=False, model_type='fine')

    files = fnmatch.filter(os.listdir(output), '*.npy')
    total = len(files)

    for i, f in tqdm(enumerate(files), total=total):
        # Cut off the extension (everything after the last dot).
        real_name = f.rpartition('.')[0]
        file_name = os.path.join(output, f)
        out_file = os.path.join(output_wav, f'{real_name}.wav')

        # Skip inputs that were already rendered (or are not regular files),
        # so that a previous, partially finished run can be continued.
        if os.path.isfile(out_file) or not os.path.isfile(file_name):
            continue

        print(f'Processing ({i+1}/{total}) -> {f}')
        temperature = round(random.uniform(0.6, 0.8), ndigits=2)
        wav = semantic_to_waveform(numpy.load(file_name), temp=temperature)

        # NOTE: the array is written exactly as returned; a PCM16 conversion
        # (scaling by 32767 and casting to int16) was left disabled here.
        wavfile.write(out_file, SAMPLE_RATE, wav)

    print('Done!')