HAMMALE committed on
Commit
c2bc96d
·
verified ·
1 Parent(s): f49e9c9

Delete app.py

Browse files
Files changed (1) hide show
  1. app.py +0 -255
app.py DELETED
@@ -1,255 +0,0 @@
1
-
2
- import torch
3
- import soundfile as sf
4
- import os
5
- import re
6
- from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
7
- from speechbrain.pretrained import EncoderClassifier
8
-
# --------------------------------------------------------------------------
# Model and speaker-embedding setup (runs once at import time).
# --------------------------------------------------------------------------
model_path = "HAMMALE/speecht5-darija"  # Path to your model on HF Hub
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Fine-tuned SpeechT5 text-to-speech model, its processor, and the
# HiFi-GAN vocoder that turns spectrograms into waveforms.
processor = SpeechT5Processor.from_pretrained(model_path)
model = SpeechT5ForTextToSpeech.from_pretrained(model_path).to(device)
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)

# x-vector speaker encoder (not used directly below, but kept loaded for
# computing new speaker embeddings).
speaker_model = EncoderClassifier.from_hparams(
    source="speechbrain/spkrec-xvect-voxceleb",
    run_opts={"device": device},
    savedir=os.path.join("/tmp", "spkrec-xvect-voxceleb"),
)


def _load_embedding(path):
    """Load a pre-computed speaker embedding from *path*; fall back to a
    random (1, 512) tensor when the file is missing."""
    if os.path.exists(path):
        return torch.load(path)
    return torch.randn(1, 512)


# Pre-computed speaker embeddings for the two available voices.
male_embedding = _load_embedding("male_embedding.pt")
female_embedding = _load_embedding("female_embedding.pt")
def normalize_text(text):
    """Normalize *text* for TTS processing.

    Lowercases the input, strips every character that is not a word
    character, whitespace, an apostrophe, or in the Arabic Unicode block
    (U+0600-U+06FF), then collapses runs of whitespace to single spaces.
    """
    lowered = text.lower()
    cleaned = re.sub(r'[^\w\s\'\u0600-\u06FF]', '', lowered)
    return ' '.join(cleaned.split())
# Function to synthesize speech
def synthesize_speech(text, voice_type="male", speed=1.0):
    """Generate speech from text using the specified voice type.

    Parameters
    ----------
    text : str
        Input Darija text; normalized via ``normalize_text`` before synthesis.
    voice_type : str
        ``"male"`` selects the male speaker embedding; any other value
        selects the female one.
    speed : float
        Playback-speed factor; values other than 1.0 trigger a simple
        resampling pass (which also shifts pitch).

    Returns
    -------
    tuple
        ``(output_wav_path, None)`` on success, or
        ``(None, error_message)`` on failure.
    """
    sample_rate = 16000  # SpeechT5/HiFi-GAN output rate (Hz)
    try:
        # Select speaker embedding based on voice type
        if voice_type == "male":
            speaker_embeddings = male_embedding.to(device)
        else:
            speaker_embeddings = female_embedding.to(device)

        # Normalize and tokenize input text
        normalized_text = normalize_text(text)
        inputs = processor(text=normalized_text, return_tensors="pt").to(device)

        # Generate speech; inference only, so no gradients needed.
        with torch.no_grad():
            speech = model.generate_speech(
                inputs["input_ids"],
                speaker_embeddings,
                vocoder=vocoder,
            )

        # Convert to numpy array and adjust speed if needed
        speech_np = speech.cpu().numpy()

        # Apply speed adjustment (simple resampling).  NOTE: this is a
        # naive approach that changes pitch along with speed — for
        # production use a proper time-stretching library.
        if speed != 1.0:
            from scipy import signal
            new_length = int(len(speech_np) / speed)
            speech_np = signal.resample(speech_np, new_length)

        # Save temporary audio file at the model's native sample rate.
        output_file = "output_speech.wav"
        sf.write(output_file, speech_np, sample_rate)

        return output_file, None

    except Exception as e:
        # Surface the error to the Gradio UI instead of crashing the app.
        return None, f"Error generating speech: {str(e)}"
# Gradio imports need to be added
import gradio as gr

# Custom CSS for better design (banner, info box, examples, footer styling)
custom_css = """
.gradio-container {
    font-family: 'Poppins', 'Arial', sans-serif;
    max-width: 750px;
    margin: auto;
}

.main-header {
    background: linear-gradient(90deg, #c31432, #240b36);
    color: white;
    padding: 1.5em;
    border-radius: 10px;
    text-align: center;
    margin-bottom: 1em;
    box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
}

.main-header h1 {
    font-size: 2.2em;
    margin-bottom: 0.3em;
}

.main-header p {
    font-size: 1.1em;
    opacity: 0.9;
}

footer {
    text-align: center;
    margin-top: 2em;
    color: #555;
    font-size: 0.9em;
}

.flag-icon {
    width: 24px;
    height: 24px;
    vertical-align: middle;
    margin-right: 8px;
}

.example-header {
    font-weight: bold;
    color: #c31432;
    margin-top: 1em;
}

.info-box {
    background-color: #f9f9f9;
    border-left: 4px solid #c31432;
    padding: 1em;
    margin: 1em 0;
    border-radius: 5px;
}

.voice-selector {
    display: flex;
    justify-content: center;
    gap: 20px;
    margin: 10px 0;
}

.voice-option {
    border: 2px solid #ddd;
    border-radius: 10px;
    padding: 10px 15px;
    transition: all 0.3s ease;
    cursor: pointer;
}

.voice-option.selected {
    border-color: #c31432;
    background-color: #fff5f5;
}

.slider-container {
    margin: 20px 0;
}
"""

# Create Gradio interface with improved design.
# Layout: banner, then a two-column row (inputs on the left, outputs and
# examples on the right), then a footer; the button wiring comes last.
with gr.Blocks(css=custom_css) as demo:
    # Page banner
    gr.HTML(
        """
        <div class="main-header">
            <h1>🇲🇦 Moroccan Darija Text-to-Speech 🎧</h1>
            <p>Convert Moroccan Arabic (Darija) text into natural-sounding speech</p>
        </div>
        """
    )

    with gr.Row():
        # Left column: text input and synthesis controls.
        with gr.Column():
            gr.HTML(
                """
                <div class="info-box">
                <p>This model was fine-tuned on the DODa audio dataset to produce high-quality
                Darija speech from text input. You can adjust the voice and speed below.</p>
                </div>
                """
            )

            text_input = gr.Textbox(
                label="Enter Darija Text",
                placeholder="Kteb chi jomla b darija hna...",
                lines=3
            )

            with gr.Row():
                # Voice choice maps to the male/female speaker embedding.
                voice_type = gr.Radio(
                    ["male", "female"],
                    label="Voice Type",
                    value="male"
                )

                # Speed factor passed straight to synthesize_speech.
                speed = gr.Slider(
                    minimum=0.5,
                    maximum=2.0,
                    value=1.0,
                    step=0.1,
                    label="Speech Speed"
                )

            generate_btn = gr.Button("Generate Speech", variant="primary")

            gr.HTML(
                """
                <div class="example-header">Example phrases:</div>
                <ul>
                    <li>"Ana Nadi Bezzaaf hhh"</li>
                    <li>"Lyoum ajwaa zwina bezzaf."</li>
                    <li>"lmaghrib ahssan blad fi l3alam "</li>
                </ul>
                """
            )

        # Right column: synthesized audio, error display, clickable examples.
        with gr.Column():
            audio_output = gr.Audio(label="Generated Speech")
            # Hidden unless synthesize_speech returns an error message.
            error_output = gr.Textbox(label="Error (if any)", visible=False)

            gr.Examples(
                examples=[
                    ["Ana Nadi Bezzaaf hhh", "male", 1.0],
                    ["Lyoum ajwaa zwina bezzaf.", "female", 1.0],
                    ["lmaghrib ahssan blad fi l3alam", "male", 1.0],
                    ["Filistine hora mina lbar ila lbahr", "female", 0.8],
                ],
                inputs=[text_input, voice_type, speed],
                outputs=[audio_output, error_output],
                fn=synthesize_speech
            )

    gr.HTML(
        """
        <footer>
            <p>Developed by HAMMALE | Powered by Microsoft SpeechT5 | Data: DODa</p>
        </footer>
        """
    )

    # Set button click action: synthesize_speech returns (file_path, error).
    generate_btn.click(
        fn=synthesize_speech,
        inputs=[text_input, voice_type, speed],
        outputs=[audio_output, error_output]
    )

# Launch the demo
if __name__ == "__main__":
    demo.launch()