WhisperSpeech

Runtime error

App Files Files Community

Tonic committed on Jan 25

Commit

c4d7f81

•

1 Parent(s): fa57d13

Update app.py

Browse files

Files changed (1) hide show

app.py +51 -62

app.py CHANGED Viewed

@@ -12,82 +12,71 @@ from whisperspeech.languages import LANGUAGES
 from whisperspeech.pipeline import Pipeline
 from whisperspeech.utils import resampler
-title = """# 🙋🏻‍♂️ Welcome to🌟Tonic's🌬️💬📝WhisperSpeech
-You can use this ZeroGPU Space to test out the current model [🌬️💬📝collabora/whisperspeech](https://huggingface.co/collabora/whisperspeech). 🌬️💬📝collabora/whisperspeech is An Open Source text-to-speech system built by inverting Whisper. Previously known as spear-tts-pytorch. It's like Stable Diffusion but for speech – both powerful and easily customizable.
 You can also use 🌬️💬📝WhisperSpeech by cloning this space. 🧬🔬🔍 Simply click here: <a style="display:inline-block" href="https://huggingface.co/spaces/Tonic/laion-whisper?duplicate=true"><img src="https://img.shields.io/badge/-Duplicate%20Space-blue?labelColor=white&style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABAAAAAQCAYAAAAf8/9hAAAAAXNSR0IArs4c6QAAAP5JREFUOE+lk7FqAkEURY+ltunEgFXS2sZGIbXfEPdLlnxJyDdYB62sbbUKpLbVNhyYFzbrrA74YJlh9r079973psed0cvUD4A+4HoCjsA85X0Dfn/RBLBgBDxnQPfAEJgBY+A9gALA4tcbamSzS4xq4FOQAJgCDwV2CPKV8tZAJcAjMMkUe1vX+U+SMhfAJEHasQIWmXNN3abzDwHUrgcRGmYcgKe0bxrblHEB4E/pndMazNpSZGcsZdBlYJcEL9Afo75molJyM2FxmPgmgPqlWNLGfwZGG6UiyEvLzHYDmoPkDDiNm9JR9uboiONcBXrpY1qmgs21x1QwyZcpvxt9NS09PlsPAAAAAElFTkSuQmCC&logoWidth=14" alt="Duplicate Space"></a></h3>
-Join us : 🌟TeamTonic🌟 is always making cool demos! Join our active builder's🛠️community 👻  [![Join us on Discord](https://img.shields.io/discord/1109943800132010065?label=Discord&logo=discord&style=flat-square)](https://discord.gg/GWpVpekp) On 🤗Huggingface: [TeamTonic](https://huggingface.co/TeamTonic) & [MultiTransformer](https://huggingface.co/MultiTransformer) On 🌐Github: [Polytonic](https://github.com/tonic-ai) & contribute to 🌟 [Poly](https://github.com/tonic-ai/poly) 🤗Big thanks to Yuvi Sharma and all the folks at huggingface for the community grant 🤗
 """
 @spaces.GPU
-def whisper_speech_demo(text, lang, speaker_audio, mix_lang, mix_text):
-    print(f"Text: {text}, Lang: {lang}, Speaker Audio: {speaker_audio}, Mix Lang: {mix_lang}, Mix Text: {mix_text}")
     pipe = Pipeline()
     speaker_url = speaker_audio if speaker_audio is not None else None
-    if isinstance(lang, list):
-        if not lang:
-            raise ValueError("Language list is empty.")
-        lang = lang[0]
-    elif not isinstance(lang, str):
-        raise ValueError("Language is not specified correctly.")
-    if mix_lang and mix_text:
-        mixed_langs = mix_lang.split(',') if isinstance(mix_lang, str) else mix_lang
-        mixed_texts = mix_text.split(',')
-        stoks = pipe.t2s.generate(mixed_texts, lang=mixed_langs)
-        audio_data = pipe.generate(stoks, speaker_url, lang=mixed_langs[0])
-    else:
-        audio_data = pipe.generate(text, speaker_url, lang)
-    resample_audio = resampler(newsr=24000)
-    audio_data_resampled = next(resample_audio([{'sample_rate': 24000, 'samples': audio_data.cpu()}]))['samples_24k']
-    audio_np = audio_data_resampled.cpu().numpy()
-    audio_np = audio_np / np.max(np.abs(audio_np))
-    audio_np = np.asarray(audio_np, dtype=np.float32)
-    audio_stereo = np.stack((audio_np, audio_np), axis=-1)
     audio_stereo = audio_stereo.reshape(-1, 2)
-    print("Audio Array Shape:", audio_stereo.shape)
-    print("Audio Array Dtype:", audio_stereo.dtype)
     with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp_file:
         sf.write(tmp_file.name, audio_stereo, 24000, format='WAV', subtype='PCM_16')
-    return tmp_file.name
 with gr.Blocks() as demo:
     gr.Markdown(title)
-    with gr.Tabs():
-        with gr.TabItem("🌬️💬📝Standard TTS"):
-            with gr.Row():
-                text_input_standard = gr.Textbox(label="Enter text")
-                lang_input_standard = gr.Dropdown(choices=list(LANGUAGES.keys()), label="Language")
-                speaker_input_standard = gr.Audio(label="Upload or Record Speaker Audio (optional)", sources=["upload", "microphone"], type="filepath")
-                placeholder_mix_lang = gr.Textbox(visible=False)
-                placeholder_mix_text = gr.Textbox(visible=False)
-                generate_button_standard = gr.Button("Generate Speech")
-            output_audio_standard = gr.Audio(label="🌬️💬📝WhisperSpeech")
-            generate_button_standard.click(
-                whisper_speech_demo,
-                inputs=[text_input_standard, lang_input_standard, speaker_input_standard, placeholder_mix_lang, placeholder_mix_text],
-                outputs=output_audio_standard
-            )
-        with gr.TabItem("🌬️💬📝Mixed Language TTS"):
-            with gr.Row():
-                placeholder_text_input = gr.Textbox(visible=False)
-                placeholder_lang_input = gr.Dropdown(choices=[], visible=False)
-                placeholder_speaker_input = gr.Audio(visible=False)
-                mix_lang_input_mixed = gr.CheckboxGroup(choices=list(LANGUAGES.keys()), label="Select Languages")
-                mix_text_input_mixed = gr.Textbox(label="Enter mixed language text", placeholder="e.g., Hello, Cześć")
-                generate_button_mixed = gr.Button("Generate Mixed Speech")
-            output_audio_mixed = gr.Audio(label="Mixed🌬️💬📝WhisperSpeech")
-            generate_button_mixed.click(
-                whisper_speech_demo,
-                inputs=[placeholder_text_input, placeholder_lang_input, placeholder_speaker_input, mix_lang_input_mixed, mix_text_input_mixed],
-                outputs=output_audio_mixed
-            )
 demo.launch()

 from whisperspeech.pipeline import Pipeline
 from whisperspeech.utils import resampler
+title = """# 🙋🏻‍♂️ Welcome to🌟Collabora🌬️💬📝WhisperSpeech
+You can use this ZeroGPU Space to test out the current model [🌬️💬📝collabora/whisperspeech](https://huggingface.co/collabora/whisperspeech). 🌬️💬📝collabora/whisperspeech is An Open Source text-to-speech system built by inverting Whisper. Install it and use your command line interface locally with `pip install whisperspeech`. It's like Stable Diffusion but for speech – both powerful and easily customizable : so you can use it programmatically in your own pipelines! [Contribute to whisperspeech here](https://github.com/collabora/WhisperSpeech)
 You can also use 🌬️💬📝WhisperSpeech by cloning this space. 🧬🔬🔍 Simply click here: <a style="display:inline-block" href="https://huggingface.co/spaces/Tonic/laion-whisper?duplicate=true"><img src="https://img.shields.io/badge/-Duplicate%20Space-blue?labelColor=white&style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABAAAAAQCAYAAAAf8/9hAAAAAXNSR0IArs4c6QAAAP5JREFUOE+lk7FqAkEURY+ltunEgFXS2sZGIbXfEPdLlnxJyDdYB62sbbUKpLbVNhyYFzbrrA74YJlh9r079973psed0cvUD4A+4HoCjsA85X0Dfn/RBLBgBDxnQPfAEJgBY+A9gALA4tcbamSzS4xq4FOQAJgCDwV2CPKV8tZAJcAjMMkUe1vX+U+SMhfAJEHasQIWmXNN3abzDwHUrgcRGmYcgKe0bxrblHEB4E/pndMazNpSZGcsZdBlYJcEL9Afo75molJyM2FxmPgmgPqlWNLGfwZGG6UiyEvLzHYDmoPkDDiNm9JR9uboiONcBXrpY1qmgs21x1QwyZcpvxt9NS09PlsPAAAAAElFTkSuQmCC&logoWidth=14" alt="Duplicate Space"></a></h3>
+We're **celebrating the release of the whisperspeech** at [the LAION community, if you love open source ai learn more here : https://laion.ai/](https://laion.ai/) big thanks to the folks at huggingface for the community grant 🤗
+### How to Use
+Input text with the language identifiers provided to create a multilingual speech. Optionally you can add an audiosample to make a voice print. Scroll down and try the api <3 Gradio.
 """
def parse_multilingual_text(input_text, languages=None):
    """Split language-tagged text into ``(lang, text)`` segments.

    The expected format is ``<lang> text <lang> text ...``, for example
    ``"<en> Hello <fr> Bonjour"``.

    Args:
        input_text: The tagged text. ``None`` or an empty string yields ``[]``.
        languages: Container of accepted language tags. Defaults to the
            module-level ``LANGUAGES`` table when omitted.

    Returns:
        A list of ``(lang, text)`` tuples in input order. Segments whose tag
        is not in ``languages`` or whose text is empty are dropped.
    """
    # Local import: the file's import block is not visible from this hunk,
    # so do not rely on ``re`` being imported at module level.
    import re

    if not input_text:
        return []
    if languages is None:
        languages = LANGUAGES

    # ``\s*`` (instead of a mandatory ``\s``) lets the last segment match even
    # when the input does not end with whitespace; ``\Z`` anchors at the true
    # end of the string, and DOTALL lets a segment span several lines.
    pattern = r"<(\w+)>\s*(.*?)\s*(?=<\w+>|\Z)"
    matches = re.findall(pattern, input_text, flags=re.DOTALL)

    parsed = []
    for lang, text in matches:
        cleaned = text.strip()
        if lang in languages and cleaned:
            parsed.append((lang, cleaned))
    return parsed
def generate_segment_audio(text, lang, speaker_url, pipe):
    """Synthesize a single language segment and return it as a NumPy array.

    Args:
        text: The text of this segment.
        lang: The language tag for this segment.
        speaker_url: Speaker reference forwarded to ``pipe.generate``
            (may be ``None``).
        pipe: The pipeline object providing ``t2s.generate`` and ``generate``.

    Returns:
        The ``'samples_24k'`` entry produced by the resampler, moved to the
        CPU and converted with ``.numpy()``.
    """
    tokens = pipe.t2s.generate([text], lang=[lang])
    # NOTE(review): the tokens are passed where the earlier single-language
    # path passed raw text -- confirm pipe.generate accepts them.
    waveform = pipe.generate(tokens, speaker_url, lang)

    to_24k = resampler(newsr=24000)
    record = {'sample_rate': 24000, 'samples': waveform.cpu()}
    resampled = next(to_24k([record]))
    return resampled['samples_24k'].cpu().numpy()
def concatenate_audio_segments(segments):
    """Join audio segments end to end and peak-normalize the result.

    Args:
        segments: Non-empty sequence of array-likes holding audio samples.
            Each segment is flattened to 1-D before joining, so segments
            shaped ``(N,)`` and ``(1, N)`` are both accepted.
            NOTE(review): flattening assumes mono audio -- confirm against
            what the pipeline actually returns.

    Returns:
        A 1-D ``float32`` array scaled so the largest absolute sample is 1.0.
        All-silent (or zero-length) audio is returned unscaled instead of
        being divided by zero.

    Raises:
        ValueError: If ``segments`` is empty (raised by ``np.concatenate``).
    """
    # Flatten first: 2-D segments of different lengths cannot be concatenated
    # along axis 0, which is what multi-segment input would otherwise hit.
    flattened = [np.ravel(np.asarray(segment)) for segment in segments]
    joined = np.concatenate(flattened, axis=0)

    peak = np.max(np.abs(joined)) if joined.size else 0
    if peak > 0:
        joined = joined / peak
    return np.asarray(joined, dtype=np.float32)
 @spaces.GPU
+def whisper_speech_demo(multilingual_text, speaker_audio):
+    segments = parse_multilingual_text(multilingual_text)
+    if not segments:
+        return None, "No valid language segments found. Please use the format: <lang> text"
     pipe = Pipeline()
     speaker_url = speaker_audio if speaker_audio is not None else None
+    audio_segments = []
+    for lang, text in segments:
+        audio_np = generate_segment_audio(text, lang, speaker_url, pipe)
+        audio_segments.append(audio_np)
+    concatenated_audio = concatenate_audio_segments(audio_segments)
+    audio_stereo = np.stack((concatenated_audio, concatenated_audio), axis=-1)
     audio_stereo = audio_stereo.reshape(-1, 2)
     with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp_file:
         sf.write(tmp_file.name, audio_stereo, 24000, format='WAV', subtype='PCM_16')
+        return tmp_file.name
 with gr.Blocks() as demo:
     gr.Markdown(title)
+    output_audio = gr.Audio(label="Generated Speech")
+    generate_button = gr.Button("Try 🌟Collabora🌬️💬📝WhisperSpeech")
+    with gr.Row():
+        text_input = gr.Textbox(label="Enter multilingual text", placeholder="e.g., <en> Hello <fr> Bonjour <es> Hola", examples=["<en> Hello, how are you? <fr> Bonjour, comment ça va?", "<de> Guten Tag <it> Buongiorno <jp> こんにちは"])
+        speaker_input = gr.Audio(label="Upload or Record Speaker Audio (optional)", sources=["upload", "microphone"], type="filepath", examples=["path/to/tonic.wav"])
+        with gr.Accordion("Available Languages and Their Tags"):
+            language_list = "\n".join([f"{lang}: {LANGUAGES[lang]}" for lang in LANGUAGES])
+            gr.Markdown(language_list)
+    generate_button.click(whisper_speech_demo, inputs=[text_input, speaker_input], outputs=output_audio)
 demo.launch()