import os

# Force CPU-only inference and use the smaller Bark models.
# These environment variables must be set before importing bark.
os.environ["CUDA_VISIBLE_DEVICES"] = ""
os.environ["SUNO_USE_SMALL_MODELS"] = "1"

import time

import gradio as gr
import noisereduce as nr
import scipy.io.wavfile

import bark
from bark import generate_audio, preload_models, SAMPLE_RATE
from transformers import BertTokenizer

########################
##### Voice cloning functionality

# Make sure the BERT tokenizer used by Bark is downloaded
BertTokenizer.from_pretrained("bert-base-multilingual-cased")

# Cache the Bark models locally so they are not re-downloaded on every start
bark.generation.CACHE_DIR = "bark_models"

# Load the Bark models once at startup instead of on the first request
preload_models()


def generate_cloned_voice_audio(text_prompt):
    print("=" * 10)
    print("NOW READING:")
    print(text_prompt)
    print("=" * 10)

    # Path to the cloned voice file (Bark history prompt)
    history_prompt = "pm_voice.npz"

    # Keep track of how long generation takes
    t0 = time.time()

    # Generate audio in the cloned voice
    audio_array = generate_audio(
        text_prompt,
        history_prompt=history_prompt,
    )

    generation_duration_s = time.time() - t0
    audio_duration_s = audio_array.shape[0] / SAMPLE_RATE
    print(f"took {generation_duration_s:.0f}s to generate {audio_duration_s:.0f}s of audio")

    # Reduce background noise in the generated audio
    reduced_noise_audio_array = nr.reduce_noise(y=audio_array, sr=SAMPLE_RATE)

    # Write both the raw and the noise-reduced versions to disk
    audio_output_path = "output_audio.wav"
    noisereduced_audio_output_path = "output_noisereduced_audio.wav"
    scipy.io.wavfile.write(audio_output_path, rate=SAMPLE_RATE, data=audio_array)
    scipy.io.wavfile.write(
        noisereduced_audio_output_path, rate=SAMPLE_RATE, data=reduced_noise_audio_array
    )

    # Return (sample_rate, waveform) so Gradio's Audio component can play it
    return (SAMPLE_RATE, audio_array)

########################


def greet(name):
    # Simple sanity check (not wired into the interface below)
    if os.path.isfile("pm_voice.npz"):
        prefix = "Found the voice file"
    else:
        prefix = "Voice file not found"
    return "Hello " + name + "!! " + prefix


output_audio = gr.Audio(
    # format="ogg",
    label="My cloned voice reading your text",
)

iface = gr.Interface(
    fn=generate_cloned_voice_audio,
    inputs="text",
    outputs=output_audio,
)
iface.launch(share=True)