HAMMALE
/

speecht5-darija

TensorBoard

Safetensors

speecht5

Model card Files Files and versions

xet

Metrics Training metrics Community

HAMMALE committed on Apr 27

Commit

c2bc96d

verified ·

1 Parent(s): f49e9c9

Delete app.py

Browse files

Files changed (1) hide show

app.py +0 -255

app.py DELETED Viewed

@@ -1,255 +0,0 @@
-import torch
-import soundfile as sf
-import os
-import re
-from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
-from speechbrain.pretrained import EncoderClassifier
-# Define paths and device
-# model_path is passed to from_pretrained() below for both the processor and the TTS model.
-model_path = "HAMMALE/speecht5-darija"  # Path to your model on HF Hub
-# Use the GPU when torch reports CUDA as available; otherwise fall back to CPU.
-device = "cuda" if torch.cuda.is_available() else "cpu"
-print(f"Using device: {device}")
-# Load models
-# The processor and the text-to-speech model come from model_path; the vocoder is loaded
-# from "microsoft/speecht5_hifigan" rather than from model_path.
-processor = SpeechT5Processor.from_pretrained(model_path)
-model = SpeechT5ForTextToSpeech.from_pretrained(model_path).to(device)
-vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
-# Load speaker embedding model
-# NOTE(review): speaker_model is not referenced again anywhere in the visible code;
-# synthesis uses the embeddings loaded below instead. Confirm this load is still needed.
-speaker_model = EncoderClassifier.from_hparams(
-    source="speechbrain/spkrec-xvect-voxceleb",
-    run_opts={"device": device},
-    savedir=os.path.join("/tmp", "spkrec-xvect-voxceleb"),
-)
-# Load pre-computed speaker embeddings
-# Each embedding is read from a .pt file in the current working directory when it exists.
-# When the file is missing, a random tensor of shape (1, 512) is used instead; no seed is
-# set in the visible code, so the fallback values are re-drawn at every start-up.
-# NOTE(review): depending on the torch version, torch.load may unpickle arbitrary
-# objects - only load embedding files from a trusted source.
-male_embedding = torch.load("male_embedding.pt") if os.path.exists("male_embedding.pt") else torch.randn(1, 512)
-female_embedding = torch.load("female_embedding.pt") if os.path.exists("female_embedding.pt") else torch.randn(1, 512)
-# Text normalization function
-def normalize_text(text):
-    """Return *text* lower-cased, stripped of unsupported characters, and whitespace-collapsed.
-
-    Args:
-        text: The raw input string.
-
-    Returns:
-        The cleaned string: lower-cased, with every character removed that is not a
-        word character, whitespace, an apostrophe, or in the range U+0600-U+06FF, and
-        with each run of whitespace replaced by a single space (leading and trailing
-        whitespace is dropped).
-    """
-    text = text.lower()
-    # Delete everything except: \w (Unicode letters, digits and underscore for str
-    # patterns), whitespace, the apostrophe, and the whole U+0600-U+06FF range.
-    # The explicit range keeps characters in that block that \w alone would not
-    # match (presumably Arabic diacritics and punctuation - confirm this is intended).
-    text = re.sub(r'[^\w\s\'\u0600-\u06FF]', '', text)
-    # split() with no arguments splits on any whitespace run and drops empty pieces,
-    # so the join collapses whitespace and trims both ends.
-    text = ' '.join(text.split())
-    return text
-# Function to synthesize speech
-def synthesize_speech(text, voice_type="male", speed=1.0):
-    """Generate speech for *text* and write it to a WAV file.
-
-    Args:
-        text: Input text; it is passed through normalize_text() before tokenization.
-        voice_type: "male" selects male_embedding; any other value selects
-            female_embedding.
-        speed: Playback speed factor. 1.0 leaves the waveform untouched; any other
-            value resamples it to int(len / speed) samples.
-
-    Returns:
-        A 2-tuple. On success: (path of the written WAV file, None). On any
-        exception: (None, "Error generating speech: <message>"). Exceptions are
-        caught here and never propagate to the caller.
-    """
-    try:
-        # Select speaker embedding based on voice type
-        if voice_type == "male":
-            speaker_embeddings = male_embedding.to(device)
-        else:
-            speaker_embeddings = female_embedding.to(device)
-        # Normalize and tokenize input text
-        normalized_text = normalize_text(text)
-        inputs = processor(text=normalized_text, return_tensors="pt").to(device)
-        # Generate speech
-        # no_grad: inference only, so no gradients are tracked.
-        with torch.no_grad():
-            speech = model.generate_speech(
-                inputs["input_ids"],
-                speaker_embeddings,
-                vocoder=vocoder
-            )
-        # Convert to numpy array and adjust speed if needed
-        speech_np = speech.cpu().numpy()
-        # Apply speed adjustment (simple resampling)
-        if speed != 1.0:
-            # This is a simple approach - for production use a proper resampling library
-            # NOTE(review): `np` and `sample_rate` below are never used in this function;
-            # the file is written with the literal 16000 further down.
-            import numpy as np
-            from scipy import signal
-            sample_rate = 16000
-            # Fewer samples (speed > 1) or more samples (speed < 1) written at the same
-            # rate changes the duration; presumably this also shifts the pitch - confirm.
-            new_length = int(len(speech_np) / speed)
-            speech_np = signal.resample(speech_np, new_length)
-        # Save temporary audio file
-        # NOTE(review): the path is fixed and relative, so every call overwrites the
-        # same file in the current working directory.
-        output_file = "output_speech.wav"
-        sf.write(output_file, speech_np, 16000)
-        return output_file, None
-    except Exception as e:
-        # Any failure (including speed == 0) is reported as a message, not raised.
-        return None, f"Error generating speech: {str(e)}"
-# Gradio is imported here, after the synthesis helpers, for building the UI below
-import gradio as gr
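-# Custom CSS for the Gradio page: container width/font, gradient header, footer,
-# example header and info box. NOTE(review): .flag-icon, .voice-selector,
-# .voice-option and .slider-container are not used by the HTML visible in this file.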
-custom_css = """
-.gradio-container {
-    font-family: 'Poppins', 'Arial', sans-serif;
-    max-width: 750px;
-    margin: auto;
-}
-.main-header {
-    background: linear-gradient(90deg, #c31432, #240b36);
-    color: white;
-    padding: 1.5em;
-    border-radius: 10px;
-    text-align: center;
-    margin-bottom: 1em;
-    box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
-}
-.main-header h1 {
-    font-size: 2.2em;
-    margin-bottom: 0.3em;
-}
-.main-header p {
-    font-size: 1.1em;
-    opacity: 0.9;
-}
-footer {
-    text-align: center;
-    margin-top: 2em;
-    color: #555;
-    font-size: 0.9em;
-}
-.flag-icon {
-    width: 24px;
-    height: 24px;
-    vertical-align: middle;
-    margin-right: 8px;
-}
-.example-header {
-    font-weight: bold;
-    color: #c31432;
-    margin-top: 1em;
-}
-.info-box {
-    background-color: #f9f9f9;
-    border-left: 4px solid #c31432;
-    padding: 1em;
-    margin: 1em 0;
-    border-radius: 5px;
-}
-.voice-selector {
-    display: flex;
-    justify-content: center;
-    gap: 20px;
-    margin: 10px 0;
-}
-.voice-option {
-    border: 2px solid #ddd;
-    border-radius: 10px;
-    padding: 10px 15px;
-    transition: all 0.3s ease;
-    cursor: pointer;
-}
-.voice-option.selected {
-    border-color: #c31432;
-    background-color: #fff5f5;
-}
-.slider-container {
-    margin: 20px 0;
-}
-"""
-# Create Gradio interface with improved design
-# Layout: a header banner, then one row with two columns (inputs on the left,
-# outputs and clickable examples on the right), then a footer. The button click
-# is wired to synthesize_speech at the end of the block.
-with gr.Blocks(css=custom_css) as demo:
-    gr.HTML(
-        """
-        <div class="main-header">
-            <h1>🇲🇦 Moroccan Darija Text-to-Speech 🎧</h1>
-            <p>Convert Moroccan Arabic (Darija) text into natural-sounding speech</p>
-        </div>
-        """
-    )
-    with gr.Row():
-        # Left column: description, text box, voice choice, speed slider, button.
-        with gr.Column():
-            gr.HTML(
-                """
-                <div class="info-box">
-                    <p>This model was fine-tuned on the DODa audio dataset to produce high-quality
-                    Darija speech from text input. You can adjust the voice and speed below.</p>
-                </div>
-                """
-            )
-            text_input = gr.Textbox(
-                label="Enter Darija Text",
-                placeholder="Kteb chi jomla b darija hna...",
-                lines=3
-            )
-            with gr.Row():
-                # The choices match the voice_type values synthesize_speech checks for.
-                voice_type = gr.Radio(
-                    ["male", "female"],
-                    label="Voice Type",
-                    value="male"
-                )
-            # The slider range (0.5-2.0) keeps speed away from 0, which
-            # synthesize_speech divides by.
-            speed = gr.Slider(
-                minimum=0.5,
-                maximum=2.0,
-                value=1.0,
-                step=0.1,
-                label="Speech Speed"
-            )
-            generate_btn = gr.Button("Generate Speech", variant="primary")
-            # Static list of sample phrases (display only; the clickable examples
-            # are defined with gr.Examples in the right column).
-            gr.HTML(
-                """
-                <div class="example-header">Example phrases:</div>
-                <ul>
-                    <li>"Ana Nadi Bezzaaf hhh"</li>
-                    <li>"Lyoum ajwaa zwina bezzaf."</li>
-                    <li>"lmaghrib ahssan blad fi l3alam "</li>
-                </ul>
-                """
-            )
-        # Right column: audio player, error text box, and clickable examples.
-        with gr.Column():
-            audio_output = gr.Audio(label="Generated Speech")
-            # NOTE(review): created with visible=False and never made visible in the
-            # visible code, so the error message returned by synthesize_speech is
-            # not shown to the user - confirm this is intended.
-            error_output = gr.Textbox(label="Error (if any)", visible=False)
-            # Each example row is [text, voice_type, speed], matching `inputs`.
-            # NOTE(review): whether selecting an example also runs `fn` depends on
-            # Gradio's example-caching settings, which are not set here - confirm.
-            gr.Examples(
-                examples=[
-                    ["Ana Nadi Bezzaaf hhh", "male", 1.0],
-                    ["Lyoum ajwaa zwina bezzaf.", "female", 1.0],
-                    ["lmaghrib ahssan blad fi l3alam", "male", 1.0],
-                    ["Filistine hora mina lbar ila lbahr", "female", 0.8],
-                ],
-                inputs=[text_input, voice_type, speed],
-                outputs=[audio_output, error_output],
-                fn=synthesize_speech
-            )
-    gr.HTML(
-        """
-        <footer>
-            <p>Developed by HAMMALE | Powered by Microsoft SpeechT5 | Data: DODa</p>
-        </footer>
-        """
-    )
-    # Set button click action
-    # The two return values of synthesize_speech (file path, error message) map
-    # onto audio_output and error_output in that order.
-    generate_btn.click(
-        fn=synthesize_speech,
-        inputs=[text_input, voice_type, speed],
-        outputs=[audio_output, error_output]
-    )
-# Launch the demo
-# Only start the Gradio server when this file is run as a script, not when it is
-# imported; launch() is called with its default arguments.
-if __name__ == "__main__":
-    demo.launch()