Spaces:
Runtime error
Runtime error
File size: 3,943 Bytes
33d9042 4c1c145 33d9042 9488c79 33d9042 a71b09f 33d9042 9488c79 33d9042 e660ba9 33d9042 9488c79 33d9042 e660ba9 33d9042 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 |
import spaces
import tempfile
import gradio as gr
import os
from whisperspeech.pipeline import Pipeline
import torch
import soundfile as sf
import numpy as np
import torch.nn.functional as F
from whisperspeech.languages import LANGUAGES
from whisperspeech.pipeline import Pipeline
# Markdown banner rendered at the top of the demo page via gr.Markdown.
# The leading "# " (with a space) is required for the first line to render as
# a heading; raw HTML is used only for the self-contained "Duplicate Space" badge.
title = """# 🙋🏻♂️ Welcome to🌟Tonic's🌬️💬📝WhisperSpeech
You can use this ZeroGPU Space to test out the current model [🌬️💬📝collabora/whisperspeech](https://huggingface.co/collabora/whisperspeech). 🌬️💬📝collabora/whisperspeech is An Open Source text-to-speech system built by inverting Whisper. Previously known as spear-tts-pytorch. It's like Stable Diffusion but for speech – both powerful and easily customizable.
You can also use 🌬️💬📝WhisperSpeech by cloning this space. 🧬🔬🔍 Simply click here: <a style="display:inline-block" href="https://huggingface.co/spaces/Tonic/laion-whisper?duplicate=true"><img src="https://img.shields.io/badge/-Duplicate%20Space-blue?labelColor=white&style=flat&logo=&logoWidth=14" alt="Duplicate Space"></a>
Join us : 🌟TeamTonic🌟 is always making cool demos! Join our active builder's🛠️community 👻 [![Join us on Discord](https://img.shields.io/discord/1109943800132010065?label=Discord&logo=discord&style=flat-square)](https://discord.gg/GWpVpekp) On 🤗Huggingface: [TeamTonic](https://huggingface.co/TeamTonic) & [MultiTransformer](https://huggingface.co/MultiTransformer) On 🌐Github: [Polytonic](https://github.com/tonic-ai) & contribute to 🌟 [Poly](https://github.com/tonic-ai/poly) 🤗Big thanks to Yuvi Sharma and all the folks at huggingface for the community grant 🤗
"""
def _speaker_path(speaker_audio):
    """Return a filesystem path for the reference speaker clip, or None.

    Accepts a plain path (str / os.PathLike) or any object exposing a
    ``.name`` attribute (e.g. a tempfile wrapper).
    """
    if not speaker_audio:
        return None
    if isinstance(speaker_audio, (str, os.PathLike)):
        return os.fspath(speaker_audio)
    return getattr(speaker_audio, "name", None)


def _split_items(value):
    """Normalize ``value`` into a list of non-empty, stripped strings.

    ``None`` yields ``[]``; a string is split on commas; any other iterable
    is taken item by item.
    """
    if not value:
        return []
    items = value.split(",") if isinstance(value, str) else value
    return [str(item).strip() for item in items if str(item).strip()]


@spaces.GPU
def whisper_speech_demo(text, lang, speaker_audio=None, mix_lang=None, mix_text=None):
    """Synthesize speech for ``text`` and return the path of a WAV file.

    Args:
        text: Text to speak (first segment when mixing languages).
        lang: Language code for ``text``.
        speaker_audio: Optional reference clip for voice cloning; a path or
            an object with a ``.name`` attribute.
        mix_lang: Optional extra language codes, as a list or a
            comma-separated string.
        mix_text: Optional extra text segments, as a list or a
            comma-separated string; one segment per entry in ``mix_lang``.

    Returns:
        Path to a temporary ``.wav`` file holding the generated audio. The
        file is not deleted automatically.

    Raises:
        gr.Error: If ``text`` or ``lang`` is missing, or the number of mixed
            text segments does not match the number of languages.
    """
    if not text or not text.strip():
        raise gr.Error("Please enter some text to synthesize.")
    if not lang:
        raise gr.Error("Please select a language.")

    pipe = Pipeline()
    speaker_url = _speaker_path(speaker_audio)

    extra_langs = _split_items(mix_lang)
    extra_texts = _split_items(mix_text)

    if extra_langs and extra_texts:
        mixed_langs = _split_items(lang) + extra_langs
        mixed_texts = [text] + extra_texts
        if len(mixed_langs) != len(mixed_texts):
            raise gr.Error(
                f"Got {len(mixed_texts)} text segment(s) but {len(mixed_langs)} "
                "language(s); provide exactly one language per segment."
            )
        # pipe.generate() expects raw text, so semantic tokens produced by
        # t2s must be pushed through s2a and the vocoder explicitly.
        # NOTE(review): mirrors the mixed-language recipe from the
        # WhisperSpeech examples — confirm against the installed version.
        if speaker_url:
            speaker_emb = pipe.extract_spk_emb(speaker_url)
        else:
            speaker_emb = pipe.default_speaker
        stoks = pipe.t2s.generate(mixed_texts, lang=mixed_langs)[0]
        atoks = pipe.s2a.generate(stoks, speaker_emb.unsqueeze(0))
        audio_data = pipe.vocoder.decode(atoks)
    else:
        audio_data = pipe.generate(text, speaker_url, lang)

    # The tensor may live on the GPU and carry a leading batch/channel axis;
    # soundfile wants a CPU array shaped (frames,) or (frames, channels).
    waveform = audio_data.detach().cpu().float().squeeze()
    if waveform.dim() == 2:
        waveform = waveform.transpose(0, 1).contiguous()

    # NOTE(review): WhisperSpeech's vocoder is understood to emit 24 kHz
    # audio; writing at 22050 Hz would play back slow and low-pitched.
    sample_rate = 24000

    # Only reserve a unique file name here; write after the handle is closed
    # so the path can be reopened on every platform.
    with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as tmp_file:
        tmp_file_name = tmp_file.name
    sf.write(tmp_file_name, waveform.numpy(), sample_rate)
    return tmp_file_name
# Gradio front-end: collects the text, language, optional speaker clip and
# optional mixed-language inputs, then hands them to whisper_speech_demo.
with gr.Blocks() as demo:
    gr.Markdown(title)

    # Primary inputs: what to say, in which language, and (optionally) whose voice.
    with gr.Row():
        text_input = gr.Textbox(label="Enter text")
        lang_input = gr.Dropdown(choices=list(LANGUAGES.keys()), label="Language")
        speaker_input = gr.Audio(
            label="Upload or Record Speaker Audio (optional)",
            sources=["upload", "microphone"],
            type="filepath",
        )

    # Optional extra segments for mixed-language synthesis.
    with gr.Row():
        mix_lang_input = gr.CheckboxGroup(
            choices=list(LANGUAGES.keys()),
            label="Mixed Languages (optional)",
        )
        mix_text_input = gr.Textbox(
            label="Mixed Texts (optional, for mixed languages)",
            placeholder="e.g., Hello, Cześć",
        )

    with gr.Row():
        submit_button = gr.Button("Generate Speech")
        output_audio = gr.Audio(label="🌬️💬📝WhisperSpeech")

    # Input order must match whisper_speech_demo's positional parameters.
    submit_button.click(
        whisper_speech_demo,
        inputs=[text_input, lang_input, speaker_input, mix_lang_input, mix_text_input],
        outputs=output_audio,
    )

demo.launch()