Persian_Piper_TTS

Running

File size: 1,757 Bytes

f845b05
944dedf
 
 
 
4492d6d
19be65d
 
 
 
4eb15f6
944dedf
19be65d
 
 
 
 
a3cf651
 
944dedf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19be65d
944dedf
1782e10
 
 
 
 
 
 
 
19be65d
f845b05
944dedf
1782e10
19be65d

import gradio as gr
import wave
import numpy as np
from io import BytesIO
from huggingface_hub import hf_hub_download
from piper import PiperVoice 
from transformers import pipeline

# Text classifier used to screen input before synthesis. It is built once at
# import time so every request reuses the same pipeline object.
nsfw_detector = pipeline(
    task="text-classification",
    model="michellejieli/NSFW_text_classifier",
)

# Lazily-loaded Piper voice, shared across requests so the ONNX model is
# loaded once per process instead of on every button click.
_cached_voice = None


def _get_voice():
    """Return the shared PiperVoice instance, loading it on first use."""
    global _cached_voice
    if _cached_voice is None:
        model_path = hf_hub_download(repo_id="aigmixer/speaker_00", filename="speaker_00_model.onnx")
        config_path = hf_hub_download(repo_id="aigmixer/speaker_00", filename="speaker_00_model.onnx.json")
        _cached_voice = PiperVoice.load(model_path, config_path)
    return _cached_voice


def synthesize_speech(text):
    """Synthesize ``text`` into speech after screening it for NSFW content.

    Args:
        text: The text to speak.

    Returns:
        A 2-tuple ``(audio, message)`` matching the UI's output order
        (audio component first, text second):

        * ``audio`` is ``(sample_rate, samples)`` with ``samples`` a 1-D
          ``int16`` NumPy array, or ``None`` when nothing was synthesized.
        * ``message`` is a user-facing explanation when synthesis was
          refused, otherwise ``None``.
    """
    # Nothing to classify or speak for empty / whitespace-only input.
    if not text or not text.strip():
        return None, "Please enter some text to synthesize."

    # Check for NSFW content. The message goes in the *second* slot so it
    # lands in the text output rather than being handed to the audio output.
    nsfw_result = nsfw_detector(text)
    if nsfw_result[0]['label'] == 'NSFW':
        return None, "NSFW content detected. Cannot process."

    voice = _get_voice()

    # Render the speech into an in-memory WAV file.
    buffer = BytesIO()
    with wave.open(buffer, 'wb') as wav_file:
        wav_file.setframerate(voice.config.sample_rate)
        wav_file.setsampwidth(2)  # 16-bit
        wav_file.setnchannels(1)  # mono

        # Synthesize speech
        voice.synthesize(text, wav_file)

    # Re-open the buffer as WAV to pull out only the PCM frames. Reading the
    # raw buffer would also reinterpret the RIFF header bytes as samples.
    buffer.seek(0)
    with wave.open(buffer, 'rb') as wav_reader:
        sample_rate = wav_reader.getframerate()
        pcm_frames = wav_reader.readframes(wav_reader.getnframes())

    samples = np.frombuffer(pcm_frames, dtype=np.int16)

    # A Gradio Audio output with type="numpy" takes (sample_rate, ndarray),
    # not raw bytes.
    return (sample_rate, samples), None

# Using Gradio Blocks
with gr.Blocks(theme=gr.themes.Base()) as blocks:
    gr.Markdown("# Text to Speech Synthesizer")
    gr.Markdown("Enter text to synthesize it into speech using PiperVoice.")
    input_text = gr.Textbox(label="Input Text")
    output_audio = gr.Audio(label="Synthesized Speech", type="numpy")
    # Blocks event listeners need component instances as outputs; the string
    # shortcut "text" is not a component here, so a real Textbox receives the
    # handler's second return value.
    output_message = gr.Textbox(label="Status", interactive=False)
    submit_button = gr.Button("Synthesize")

    # Output order is (audio, text), the same order as before.
    submit_button.click(
        synthesize_speech,
        inputs=input_text,
        outputs=[output_audio, output_message],
    )

# Run the app
blocks.launch()