import gradio as gr
import librosa
import numpy as np
import torch
import torch.nn.functional as F
from pathlib import Path
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5ForSpeechToSpeech, SpeechT5HifiGan
from speechbrain.pretrained import EncoderClassifier

title = "SpeechT5: Voice Conversion"

description = """
This space can clone a voice and perform text-to-speech using SpeechT5.
Special credit goes to Microsoft and SpeechBrain, without whose models this space would not be possible. Enjoy!
"""

article = """
"""

device = "cuda" if torch.cuda.is_available() else "cpu"

# speech-to-speech (voice conversion) model
checkpoint = "microsoft/speecht5_vc"
processor_vc = SpeechT5Processor.from_pretrained(checkpoint)
model_vc = SpeechT5ForSpeechToSpeech.from_pretrained(checkpoint)

# text-to-speech model
checkpoint_tts = "microsoft/speecht5_tts"
processor_tts = SpeechT5Processor.from_pretrained(checkpoint_tts)
model_tts = SpeechT5ForTextToSpeech.from_pretrained(checkpoint_tts)

# vocoder shared by both models
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# speaker-embedding models and their embedding sizes
model_embed = {
    "speechbrain/spkrec-xvect-voxceleb": 512,
    "speechbrain/spkrec-ecapa-voxceleb": 192,
}
checkpoint_embed = "speechbrain/spkrec-xvect-voxceleb"
size_embed = model_embed[checkpoint_embed]
embedding_classifier = EncoderClassifier.from_hparams(
    source=checkpoint_embed, run_opts={"device": device}, savedir="/tmp/speaker_embed"
)

examples_pt = 'examples'
allowed_extensions = ['.mp3', '.wav']
examples = {f.name: f for f in Path(examples_pt).glob('*') if f.suffix in allowed_extensions}
default_voice = list(examples.keys())[0]

verse = """Hey, how are you doing today?"""


def process_audio(sampling_rate, waveform, target_sr=16000):
    # convert from int16 to floating point
    waveform = waveform / 32768.0
    # convert to mono if stereo
    if len(waveform.shape) > 1:
        waveform = librosa.to_mono(waveform.T)
    # resample to 16 kHz if necessary
    if sampling_rate != target_sr:
        waveform = librosa.resample(waveform, orig_sr=sampling_rate, target_sr=target_sr)
    # limit to 30 seconds
    waveform = waveform[:target_sr * 30]
    # make PyTorch tensor
    waveform = torch.tensor(waveform)
    return waveform


def f2embed(waveform, sz):
    # compute a normalized speaker embedding from a mono 16 kHz waveform
    with torch.no_grad():
        embeddings = embedding_classifier.encode_batch(waveform)
        embeddings = F.normalize(embeddings, dim=2)
        embeddings = embeddings.squeeze().cpu().numpy()
    assert embeddings.shape[0] == sz, embeddings.shape[0]
    return embeddings


def on_voicedropdown(x):
    return examples[x]


def on_voiceload(audio, sz=size_embed):
    print("on_voiceload")
    # audio = tuple (sample_rate, frames) or (sample_rate, (frames, channels))
    if audio is not None:
        sampling_rate, waveform = audio
    else:
        return np.zeros(sz)
    waveform = process_audio(sampling_rate, waveform)
    embed = f2embed(waveform, sz)
    print("Generated embedding", embed[:5])
    return embed


def voice_clone(audio, speaker_embedding, target_sr=16000):
    # audio = tuple (sample_rate, frames) or (sample_rate, (frames, channels))
    if audio is None or speaker_embedding is None:
        return (target_sr, np.zeros(0).astype(np.int16))
    else:
        sampling_rate, waveform = audio
    waveform = process_audio(sampling_rate, waveform)
    inputs = processor_vc(audio=waveform, sampling_rate=target_sr, return_tensors="pt")
    speaker_embedding = torch.tensor(speaker_embedding).unsqueeze(0)
    speech = model_vc.generate_speech(inputs["input_values"], speaker_embedding, vocoder=vocoder)
    speech = (speech.numpy() * 32767).astype(np.int16)
    return (target_sr, speech)


def text_to_speech(text, speaker_embedding, target_sr=16000):
    if len(text.strip()) == 0 or speaker_embedding is None:
        return (target_sr, np.zeros(0).astype(np.int16))
    inputs = processor_tts(text=text, return_tensors="pt")
    # limit input length
    input_ids = inputs["input_ids"]
    input_ids = input_ids[..., :model_tts.config.max_text_positions]
    speaker_embedding = torch.tensor(speaker_embedding).unsqueeze(0)
    speech = model_tts.generate_speech(input_ids, speaker_embedding, vocoder=vocoder)
    speech = (speech.numpy() * 32767).astype(np.int16)
    return (target_sr, speech)


theme = gr.themes.Monochrome()

with gr.Blocks() as demo:
    voice_embedding = gr.State(None)

    def activate(*args):
        return gr.update(interactive=True) if len(args) == 1 else [gr.update(interactive=True)] * len(args)

    def deactivate(*args):
        return gr.update(interactive=False) if len(args) == 1 else [gr.update(interactive=False)] * len(args)

    gr.Markdown(description)

    with gr.Accordion("Voice to clone", open=False) as accordion:
        gr.Markdown("Upload target voice...")
        with gr.Row(equal_height=True):
            voice_upload = gr.Audio(label="Upload target voice", source="upload", type="numpy")
            voice_dropdown = gr.Dropdown(examples, label='Examples', interactive=True)
            # TODO: couldn't catch microphone stop event
            # mic = gr.Audio(label="Record Speech", source="microphone", type="numpy")
            # mic.stop(fn=lambda x: print('mic stop'), inputs=None, outputs=None)

    with gr.Row(equal_height=True):
        with gr.Column(scale=2):
            with gr.Row(equal_height=True):
                text_to_convert = gr.Textbox(verse)
                voice_to_convert = gr.Audio(label="Upload voice to convert", source="upload", type="numpy")
            with gr.Row(equal_height=True):
                button_text = gr.Button("Text to speech", interactive=False)
                button_audio = gr.Button("Convert audio", interactive=False)
            with gr.Row(equal_height=True):
                speech = gr.Audio(label="Converted Speech", type="numpy", visible=True, interactive=False)

    # actions
    kwargs = dict(fn=on_voiceload, inputs=voice_upload, outputs=voice_embedding)
    voice_upload.upload(deactivate, [button_text, button_audio], [button_text, button_audio]).\
        then(**kwargs).then(activate, [button_text, button_audio], [button_text, button_audio])
    voice_dropdown.change(deactivate, [button_text, button_audio], [button_text, button_audio]).\
        then(fn=on_voicedropdown, inputs=voice_dropdown, outputs=voice_upload).\
        then(**kwargs).then(activate, [button_text, button_audio], [button_text, button_audio])

    button_text.click(deactivate, [button_text, button_audio], [button_text, button_audio]).\
        then(fn=text_to_speech, inputs=[text_to_convert, voice_embedding], outputs=speech).\
        then(activate, [button_text, button_audio], [button_text, button_audio])
    button_audio.click(deactivate, [button_text, button_audio], [button_text, button_audio]).\
        then(fn=voice_clone, inputs=[voice_to_convert, voice_embedding], outputs=speech).\
        then(activate, [button_text, button_audio], [button_text, button_audio])

    gr.HTML(article)

demo.launch(share=False)