SherryT997 committed on
Commit
1b0d691
·
verified ·
1 Parent(s): 60a97c6

Added temperature, top-k, and top-p sampling controls

Browse files
Files changed (1) hide show
  1. app.py +108 -54
app.py CHANGED
@@ -43,27 +43,42 @@ examples = [
43
  [
44
  "मुले बागेत खेळत आहेत आणि पक्षी किलबिलाट करत आहेत.",
45
  "Sunita speaks slowly in a calm, moderate-pitched voice, delivering the news with a neutral tone. The recording is very high quality with no background noise.",
46
- 3.0
 
 
 
47
  ],
48
  [
49
  "ಉದ್ಯಾನದಲ್ಲಿ ಮಕ್ಕಳ ಆಟವಾಡುತ್ತಿದ್ದಾರೆ ಮತ್ತು ಪಕ್ಷಿಗಳು ಚಿಲಿಪಿಲಿ ಮಾಡುತ್ತಿವೆ.",
50
  "Suresh speaks slowly in a low-pitched, calm voice, with a neutral tone, perfect for narration. The recording is very high quality with no background noise.",
51
- 3.0
 
 
 
52
  ],
53
  [
54
  "বাচ্চারা বাগানে খেলছে আর পাখি কিচিরমিচির করছে।",
55
  "Aditi speaks at a moderate pace and pitch, with a clear, neutral tone and no emotional emphasis. The recording is very high quality with no background noise.",
56
- 3.0
 
 
 
57
  ],
58
  [
59
  "పిల్లలు తోటలో ఆడుకుంటున్నారు, పక్షుల కిలకిలరావాలు.",
60
  "Prakash speaks slowly in a low-pitched, calm voice, with a neutral tone, perfect for narration. The recording is very high quality with no background noise.",
61
- 3.0
 
 
 
62
  ],
63
  [
64
  "పిల్లలు తోటలో ఆడుకుంటున్నారు, పక్షుల కిలకిలరావాలు.",
65
  "Prakash speaks slowly in a low-pitched, calm voice, with a neutral tone, perfect for narration. The recording is very high quality with no background noise.",
66
- 3.0
 
 
 
67
  ],
68
  [
69
  "This is the best time of my life, Bartley,' she said happily",
@@ -73,22 +88,34 @@ examples = [
73
  [
74
  "Montrose also, after having experienced still more variety of good and bad fortune, threw down his arms, and retired out of the kingdom.",
75
  "A female speaker with a slightly low-pitched, quite monotone voice speaks with an American accent at a slightly faster-than-average pace in a confined space with very clear audio.",
76
- 3.0
 
 
 
77
  ],
78
  [
79
  "बगीचे में बच्चे खेल रहे हैं और पक्षी चहचहा रहे हैं।",
80
  "Rohit speaks with a slightly high-pitched voice delivering his words at a slightly slow pace in a small, confined space with a touch of background noise and a quite monotone tone.",
81
- 3.0
 
 
 
82
  ],
83
  [
84
  "കുട്ടികൾ പൂന്തോട്ടത്തിൽ കളിക്കുന്നു, പക്ഷികൾ ചിലയ്ക്കുന്നു.",
85
  "Anjali speaks with a low-pitched voice delivering her words at a fast pace and an animated tone, in a very spacious environment, accompanied by noticeable background noise.",
86
- 3.0
 
 
 
87
  ],
88
  [
89
  "குழந்தைகள் தோட்டத்தில் விளையாடுகிறார்கள், பறவைகள் கிண்டல் செய்கின்றன.",
90
  "Jaya speaks with a slightly low-pitched, quite monotone voice at a slightly faster-than-average pace in a confined space with very clear audio.",
91
- 3.0
 
 
 
92
  ]
93
  ]
94
 
@@ -97,64 +124,91 @@ finetuned_examples = [
97
  [
98
  "मुले बागेत खेळत आहेत आणि पक्षी किलबिलाट करत आहेत.",
99
  "Sunita speaks slowly in a calm, moderate-pitched voice, delivering the news with a neutral tone. The recording is very high quality with no background noise.",
100
- 3.0
 
 
 
101
  ],
102
  [
103
  "ಉದ್ಯಾನದಲ್ಲಿ ಮಕ್ಕಳ ಆಟವಾಡುತ್ತಿದ್ದಾರೆ ಮತ್ತು ಪಕ್ಷಿಗಳು ಚಿಲಿಪಿಲಿ ಮಾಡುತ್ತಿವೆ.",
104
  "Suresh speaks slowly in a low-pitched, calm voice, with a neutral tone, perfect for narration. The recording is very high quality with no background noise.",
105
- 3.0
 
 
 
106
  ],
107
  [
108
  "বাচ্চারা বাগানে খেলছে আর পাখি কিচিরমিচির করছে।",
109
  "Aditi speaks at a moderate pace and pitch, with a clear, neutral tone and no emotional emphasis. The recording is very high quality with no background noise.",
110
- 3.0
 
 
 
111
  ],
112
  [
113
  "పిల్లలు తోటలో ఆడుకుంటున్నారు, పక్షుల కిలకిలరావాలు.",
114
  "Prakash speaks slowly in a low-pitched, calm voice, with a neutral tone, perfect for narration. The recording is very high quality with no background noise.",
115
- 3.0
 
 
 
116
  ],
117
  [
118
  "పిల్లలు తోటలో ఆడుకుంటున్నారు, పక్షుల కిలకిలరావాలు.",
119
  "Prakash speaks slowly in a low-pitched, calm voice, with a neutral tone, perfect for narration. The recording is very high quality with no background noise.",
120
- 3.0
 
 
 
121
  ],
122
  [
123
  "This is the best time of my life, Bartley,' she said happily",
124
  "A male speaker with a low-pitched voice speaks with a British accent at a fast pace in a small, confined space with very clear audio and an animated tone.",
125
- 3.0
 
 
 
126
  ],
127
  [
128
  "Montrose also, after having experienced still more variety of good and bad fortune, threw down his arms, and retired out of the kingdom.",
129
  "A female speaker with a slightly low-pitched, quite monotone voice speaks with an American accent at a slightly faster-than-average pace in a confined space with very clear audio.",
130
- 3.0
 
 
 
131
  ],
132
  [
133
  "बगीचे में बच्चे खेल रहे हैं और पक्षी चहचहा रहे हैं।",
134
  "Rohit speaks with a slightly high-pitched voice delivering his words at a slightly slow pace in a small, confined space with a touch of background noise and a quite monotone tone.",
135
- 3.0
 
 
 
136
  ],
137
  [
138
  "കുട്ടികൾ പൂന്തോട്ടത്തിൽ കളിക്കുന്നു, പക്ഷികൾ ചിലയ്ക്കുന്നു.",
139
  "Anjali speaks with a low-pitched voice delivering her words at a fast pace and an animated tone, in a very spacious environment, accompanied by noticeable background noise.",
140
- 3.0
 
 
 
141
  ],
142
  [
143
  "குழந்தைகள் தோட்டத்தில் விளையாடுகிறார்கள், பறவைகள் கிண்டல் செய்கின்றன.",
144
  "Jaya speaks with a slightly low-pitched, quite monotone voice at a slightly faster-than-average pace in a confined space with very clear audio.",
145
- 3.0
 
 
 
146
  ]
147
  ]
148
 
149
-
150
  def numpy_to_mp3(audio_array, sampling_rate):
151
- # Normalize audio_array if it's floating-point
152
  if np.issubdtype(audio_array.dtype, np.floating):
153
  max_val = np.max(np.abs(audio_array))
154
- audio_array = (audio_array / max_val) * 32767 # Normalize to 16-bit range
155
  audio_array = audio_array.astype(np.int16)
156
 
157
- # Create an audio segment from the numpy array
158
  audio_segment = AudioSegment(
159
  audio_array.tobytes(),
160
  frame_rate=sampling_rate,
@@ -162,11 +216,9 @@ def numpy_to_mp3(audio_array, sampling_rate):
162
  channels=1
163
  )
164
 
165
- # Export the audio segment to MP3 bytes - use a high bitrate to maximise quality
166
  mp3_io = io.BytesIO()
167
  audio_segment.export(mp3_io, format="mp3", bitrate="320k")
168
 
169
- # Get the MP3 bytes
170
  mp3_bytes = mp3_io.getvalue()
171
  mp3_io.close()
172
 
@@ -176,14 +228,12 @@ sampling_rate = model.audio_encoder.config.sampling_rate
176
  frame_rate = model.audio_encoder.config.frame_rate
177
 
178
  @spaces.GPU
179
- def generate_base(text, description,):
180
- # Initialize variables
181
- chunk_size = 25 # Process max 25 words or a sentence at a time
182
 
183
- # Tokenize the full text and description
184
  inputs = description_tokenizer(description, return_tensors="pt").to(device)
185
 
186
- sentences_text = nltk.sent_tokenize(text) # this gives us a list of sentences
187
  curr_sentence = ""
188
  chunks = []
189
  for sentence in sentences_text:
@@ -201,22 +251,21 @@ def generate_base(text, description,):
201
 
202
  all_audio = []
203
 
204
- # Process each chunk
205
  for chunk in chunks:
206
- # Tokenize the chunk
207
  prompt = tokenizer(chunk, return_tensors="pt").to(device)
208
 
209
- # Generate audio for the chunk
210
  generation = model.generate(
211
  input_ids=inputs.input_ids,
212
  attention_mask=inputs.attention_mask,
213
  prompt_input_ids=prompt.input_ids,
214
  prompt_attention_mask=prompt.attention_mask,
215
  do_sample=True,
 
 
 
216
  return_dict_in_generate=True
217
  )
218
 
219
- # Extract audio from generation
220
  if hasattr(generation, 'sequences') and hasattr(generation, 'audios_length'):
221
  audio = generation.sequences[0, :generation.audios_length[0]]
222
  audio_np = audio.to(torch.float32).cpu().numpy().squeeze()
@@ -224,23 +273,18 @@ def generate_base(text, description,):
224
  audio_np = audio_np.flatten()
225
  all_audio.append(audio_np)
226
 
227
- # Combine all audio chunks
228
  combined_audio = np.concatenate(all_audio)
229
 
230
- # Convert to expected format and yield
231
  print(f"Sample of length: {round(combined_audio.shape[0] / sampling_rate, 2)} seconds")
232
  yield numpy_to_mp3(combined_audio, sampling_rate=sampling_rate)
233
 
234
-
235
  @spaces.GPU
236
- def generate_finetuned(text, description):
237
- # Initialize variables
238
- chunk_size = 25 # Process max 25 words or a sentence at a time
239
 
240
- # Tokenize the full text and description
241
  inputs = description_tokenizer(description, return_tensors="pt").to(device)
242
 
243
- sentences_text = nltk.sent_tokenize(text) # this gives us a list of sentences
244
  curr_sentence = ""
245
  chunks = []
246
  for sentence in sentences_text:
@@ -258,22 +302,21 @@ def generate_finetuned(text, description):
258
 
259
  all_audio = []
260
 
261
- # Process each chunk
262
  for chunk in chunks:
263
- # Tokenize the chunk
264
  prompt = tokenizer(chunk, return_tensors="pt").to(device)
265
 
266
- # Generate audio for the chunk
267
  generation = finetuned_model.generate(
268
  input_ids=inputs.input_ids,
269
  attention_mask=inputs.attention_mask,
270
  prompt_input_ids=prompt.input_ids,
271
  prompt_attention_mask=prompt.attention_mask,
272
  do_sample=True,
 
 
 
273
  return_dict_in_generate=True
274
  )
275
 
276
- # Extract audio from generation
277
  if hasattr(generation, 'sequences') and hasattr(generation, 'audios_length'):
278
  audio = generation.sequences[0, :generation.audios_length[0]]
279
  audio_np = audio.to(torch.float32).cpu().numpy().squeeze()
@@ -281,14 +324,11 @@ def generate_finetuned(text, description):
281
  audio_np = audio_np.flatten()
282
  all_audio.append(audio_np)
283
 
284
- # Combine all audio chunks
285
  combined_audio = np.concatenate(all_audio)
286
 
287
- # Convert to expected format and yield
288
  print(f"Sample of length: {round(combined_audio.shape[0] / sampling_rate, 2)} seconds")
289
  yield numpy_to_mp3(combined_audio, sampling_rate=sampling_rate)
290
 
291
-
292
  css = """
293
  #share-btn-container {
294
  display: flex;
@@ -325,6 +365,7 @@ css = """
325
  display: none !important;
326
  }
327
  """
 
328
  with gr.Blocks(css=css) as block:
329
  gr.HTML(
330
  """
@@ -335,7 +376,7 @@ with gr.Blocks(css=css) as block:
335
  "
336
  >
337
  <h1 style="font-weight: 900; margin-bottom: 7px; line-height: normal;">
338
- Parler-TTS 🗣️
339
  </h1>
340
  </div>
341
  </div>
@@ -362,11 +403,18 @@ with gr.Blocks(css=css) as block:
362
  with gr.Column():
363
  input_text = gr.Textbox(label="Input Text", lines=2, value=finetuned_examples[0][0], elem_id="input_text")
364
  description = gr.Textbox(label="Description", lines=2, value=finetuned_examples[0][1], elem_id="input_description")
 
 
 
 
 
 
365
  run_button = gr.Button("Generate Audio", variant="primary")
 
366
  with gr.Column():
367
  audio_out = gr.Audio(label="Parler-TTS generation", format="mp3", elem_id="audio_out", autoplay=True)
368
 
369
- inputs = [input_text, description]
370
  outputs = [audio_out]
371
  gr.Examples(examples=finetuned_examples, fn=generate_finetuned, inputs=inputs, outputs=outputs, cache_examples=False)
372
  run_button.click(fn=generate_finetuned, inputs=inputs, outputs=outputs, queue=True)
@@ -376,20 +424,26 @@ with gr.Blocks(css=css) as block:
376
  with gr.Column():
377
  input_text = gr.Textbox(label="Input Text", lines=2, value=default_text, elem_id="input_text")
378
  description = gr.Textbox(label="Description", lines=2, value="", elem_id="input_description")
 
 
 
 
 
 
379
  run_button = gr.Button("Generate Audio", variant="primary")
 
380
  with gr.Column():
381
  audio_out = gr.Audio(label="Parler-TTS generation", format="mp3", elem_id="audio_out", autoplay=True)
382
 
383
- inputs = [input_text, description]
384
  outputs = [audio_out]
385
  gr.Examples(examples=examples, fn=generate_base, inputs=inputs, outputs=outputs, cache_examples=False)
386
  run_button.click(fn=generate_base, inputs=inputs, outputs=outputs, queue=True)
387
 
388
-
389
  gr.HTML(
390
  """
391
  If you'd like to learn more about how the model was trained or explore fine-tuning it yourself, visit the <a href="https://github.com/huggingface/parler-tts">Parler-TTS</a> repository on GitHub. The Parler-TTS codebase and associated checkpoints are licensed under the <a href="https://github.com/huggingface/parler-tts/blob/main/LICENSE">Apache 2.0 license</a>.</p>
392
- """
393
  )
394
 
395
  block.queue()
 
43
  [
44
  "मुले बागेत खेळत आहेत आणि पक्षी किलबिलाट करत आहेत.",
45
  "Sunita speaks slowly in a calm, moderate-pitched voice, delivering the news with a neutral tone. The recording is very high quality with no background noise.",
46
+ 3.0,
47
+ 0.8,
48
+ 0.9,
49
+ 50
50
  ],
51
  [
52
  "ಉದ್ಯಾನದಲ್ಲಿ ಮಕ್ಕಳ ಆಟವಾಡುತ್ತಿದ್ದಾರೆ ಮತ್ತು ಪಕ್ಷಿಗಳು ಚಿಲಿಪಿಲಿ ಮಾಡುತ್ತಿವೆ.",
53
  "Suresh speaks slowly in a low-pitched, calm voice, with a neutral tone, perfect for narration. The recording is very high quality with no background noise.",
54
+ 3.0,
55
+ 0.8,
56
+ 0.9,
57
+ 50
58
  ],
59
  [
60
  "বাচ্চারা বাগানে খেলছে আর পাখি কিচিরমিচির করছে।",
61
  "Aditi speaks at a moderate pace and pitch, with a clear, neutral tone and no emotional emphasis. The recording is very high quality with no background noise.",
62
+ 3.0,
63
+ 0.8,
64
+ 0.9,
65
+ 50
66
  ],
67
  [
68
  "పిల్లలు తోటలో ఆడుకుంటున్నారు, పక్షుల కిలకిలరావాలు.",
69
  "Prakash speaks slowly in a low-pitched, calm voice, with a neutral tone, perfect for narration. The recording is very high quality with no background noise.",
70
+ 3.0,
71
+ 0.8,
72
+ 0.9,
73
+ 50
74
  ],
75
  [
76
  "పిల్లలు తోటలో ఆడుకుంటున్నారు, పక్షుల కిలకిలరావాలు.",
77
  "Prakash speaks slowly in a low-pitched, calm voice, with a neutral tone, perfect for narration. The recording is very high quality with no background noise.",
78
+ 3.0,
79
+ 0.8,
80
+ 0.9,
81
+ 50
82
  ],
83
  [
84
  "This is the best time of my life, Bartley,' she said happily",
 
88
  [
89
  "Montrose also, after having experienced still more variety of good and bad fortune, threw down his arms, and retired out of the kingdom.",
90
  "A female speaker with a slightly low-pitched, quite monotone voice speaks with an American accent at a slightly faster-than-average pace in a confined space with very clear audio.",
91
+ 3.0,
92
+ 0.8,
93
+ 0.9,
94
+ 50
95
  ],
96
  [
97
  "बगीचे में बच्चे खेल रहे हैं और पक्षी चहचहा रहे हैं।",
98
  "Rohit speaks with a slightly high-pitched voice delivering his words at a slightly slow pace in a small, confined space with a touch of background noise and a quite monotone tone.",
99
+ 3.0,
100
+ 0.8,
101
+ 0.9,
102
+ 50
103
  ],
104
  [
105
  "കുട്ടികൾ പൂന്തോട്ടത്തിൽ കളിക്കുന്നു, പക്ഷികൾ ചിലയ്ക്കുന്നു.",
106
  "Anjali speaks with a low-pitched voice delivering her words at a fast pace and an animated tone, in a very spacious environment, accompanied by noticeable background noise.",
107
+ 3.0,
108
+ 0.8,
109
+ 0.9,
110
+ 50
111
  ],
112
  [
113
  "குழந்தைகள் தோட்டத்தில் விளையாடுகிறார்கள், பறவைகள் கிண்டல் செய்கின்றன.",
114
  "Jaya speaks with a slightly low-pitched, quite monotone voice at a slightly faster-than-average pace in a confined space with very clear audio.",
115
+ 3.0,
116
+ 0.8,
117
+ 0.9,
118
+ 50
119
  ]
120
  ]
121
 
 
124
  [
125
  "मुले बागेत खेळत आहेत आणि पक्षी किलबिलाट करत आहेत.",
126
  "Sunita speaks slowly in a calm, moderate-pitched voice, delivering the news with a neutral tone. The recording is very high quality with no background noise.",
127
+ 3.0,
128
+ 0.8,
129
+ 0.9,
130
+ 50
131
  ],
132
  [
133
  "ಉದ್ಯಾನದಲ್ಲಿ ಮಕ್ಕಳ ಆಟವಾಡುತ್ತಿದ್ದಾರೆ ಮತ್ತು ಪಕ್ಷಿಗಳು ಚಿಲಿಪಿಲಿ ಮಾಡುತ್ತಿವೆ.",
134
  "Suresh speaks slowly in a low-pitched, calm voice, with a neutral tone, perfect for narration. The recording is very high quality with no background noise.",
135
+ 3.0,
136
+ 0.8,
137
+ 0.9,
138
+ 50
139
  ],
140
  [
141
  "বাচ্চারা বাগানে খেলছে আর পাখি কিচিরমিচির করছে।",
142
  "Aditi speaks at a moderate pace and pitch, with a clear, neutral tone and no emotional emphasis. The recording is very high quality with no background noise.",
143
+ 3.0,
144
+ 0.8,
145
+ 0.9,
146
+ 50
147
  ],
148
  [
149
  "పిల్లలు తోటలో ఆడుకుంటున్నారు, పక్షుల కిలకిలరావాలు.",
150
  "Prakash speaks slowly in a low-pitched, calm voice, with a neutral tone, perfect for narration. The recording is very high quality with no background noise.",
151
+ 3.0,
152
+ 0.8,
153
+ 0.9,
154
+ 50
155
  ],
156
  [
157
  "పిల్లలు తోటలో ఆడుకుంటున్నారు, పక్షుల కిలకిలరావాలు.",
158
  "Prakash speaks slowly in a low-pitched, calm voice, with a neutral tone, perfect for narration. The recording is very high quality with no background noise.",
159
+ 3.0,
160
+ 0.8,
161
+ 0.9,
162
+ 50
163
  ],
164
  [
165
  "This is the best time of my life, Bartley,' she said happily",
166
  "A male speaker with a low-pitched voice speaks with a British accent at a fast pace in a small, confined space with very clear audio and an animated tone.",
167
+ 3.0,
168
+ 0.8,
169
+ 0.9,
170
+ 50
171
  ],
172
  [
173
  "Montrose also, after having experienced still more variety of good and bad fortune, threw down his arms, and retired out of the kingdom.",
174
  "A female speaker with a slightly low-pitched, quite monotone voice speaks with an American accent at a slightly faster-than-average pace in a confined space with very clear audio.",
175
+ 3.0,
176
+ 0.8,
177
+ 0.9,
178
+ 50
179
  ],
180
  [
181
  "बगीचे में बच्चे खेल रहे हैं और पक्षी चहचहा रहे हैं।",
182
  "Rohit speaks with a slightly high-pitched voice delivering his words at a slightly slow pace in a small, confined space with a touch of background noise and a quite monotone tone.",
183
+ 3.0,
184
+ 0.8,
185
+ 0.9,
186
+ 50
187
  ],
188
  [
189
  "കുട്ടികൾ പൂന്തോട്ടത്തിൽ കളിക്കുന്നു, പക്ഷികൾ ചിലയ്ക്കുന്നു.",
190
  "Anjali speaks with a low-pitched voice delivering her words at a fast pace and an animated tone, in a very spacious environment, accompanied by noticeable background noise.",
191
+ 3.0,
192
+ 0.8,
193
+ 0.9,
194
+ 50
195
  ],
196
  [
197
  "குழந்தைகள் தோட்டத்தில் விளையாடுகிறார்கள், பறவைகள் கிண்டல் செய்கின்றன.",
198
  "Jaya speaks with a slightly low-pitched, quite monotone voice at a slightly faster-than-average pace in a confined space with very clear audio.",
199
+ 3.0,
200
+ 0.8,
201
+ 0.9,
202
+ 50
203
  ]
204
  ]
205
 
 
206
  def numpy_to_mp3(audio_array, sampling_rate):
 
207
  if np.issubdtype(audio_array.dtype, np.floating):
208
  max_val = np.max(np.abs(audio_array))
209
+ audio_array = (audio_array / max_val) * 32767
210
  audio_array = audio_array.astype(np.int16)
211
 
 
212
  audio_segment = AudioSegment(
213
  audio_array.tobytes(),
214
  frame_rate=sampling_rate,
 
216
  channels=1
217
  )
218
 
 
219
  mp3_io = io.BytesIO()
220
  audio_segment.export(mp3_io, format="mp3", bitrate="320k")
221
 
 
222
  mp3_bytes = mp3_io.getvalue()
223
  mp3_io.close()
224
 
 
228
  frame_rate = model.audio_encoder.config.frame_rate
229
 
230
  @spaces.GPU
231
+ def generate_base(text, description, temperature, top_p, top_k):
232
+ chunk_size = 25
 
233
 
 
234
  inputs = description_tokenizer(description, return_tensors="pt").to(device)
235
 
236
+ sentences_text = nltk.sent_tokenize(text)
237
  curr_sentence = ""
238
  chunks = []
239
  for sentence in sentences_text:
 
251
 
252
  all_audio = []
253
 
 
254
  for chunk in chunks:
 
255
  prompt = tokenizer(chunk, return_tensors="pt").to(device)
256
 
 
257
  generation = model.generate(
258
  input_ids=inputs.input_ids,
259
  attention_mask=inputs.attention_mask,
260
  prompt_input_ids=prompt.input_ids,
261
  prompt_attention_mask=prompt.attention_mask,
262
  do_sample=True,
263
+ temperature=temperature,
264
+ top_p=top_p,
265
+ top_k=top_k,
266
  return_dict_in_generate=True
267
  )
268
 
 
269
  if hasattr(generation, 'sequences') and hasattr(generation, 'audios_length'):
270
  audio = generation.sequences[0, :generation.audios_length[0]]
271
  audio_np = audio.to(torch.float32).cpu().numpy().squeeze()
 
273
  audio_np = audio_np.flatten()
274
  all_audio.append(audio_np)
275
 
 
276
  combined_audio = np.concatenate(all_audio)
277
 
 
278
  print(f"Sample of length: {round(combined_audio.shape[0] / sampling_rate, 2)} seconds")
279
  yield numpy_to_mp3(combined_audio, sampling_rate=sampling_rate)
280
 
 
281
  @spaces.GPU
282
+ def generate_finetuned(text, description, temperature, top_p, top_k):
283
+ chunk_size = 25
 
284
 
 
285
  inputs = description_tokenizer(description, return_tensors="pt").to(device)
286
 
287
+ sentences_text = nltk.sent_tokenize(text)
288
  curr_sentence = ""
289
  chunks = []
290
  for sentence in sentences_text:
 
302
 
303
  all_audio = []
304
 
 
305
  for chunk in chunks:
 
306
  prompt = tokenizer(chunk, return_tensors="pt").to(device)
307
 
 
308
  generation = finetuned_model.generate(
309
  input_ids=inputs.input_ids,
310
  attention_mask=inputs.attention_mask,
311
  prompt_input_ids=prompt.input_ids,
312
  prompt_attention_mask=prompt.attention_mask,
313
  do_sample=True,
314
+ temperature=temperature,
315
+ top_p=top_p,
316
+ top_k=top_k,
317
  return_dict_in_generate=True
318
  )
319
 
 
320
  if hasattr(generation, 'sequences') and hasattr(generation, 'audios_length'):
321
  audio = generation.sequences[0, :generation.audios_length[0]]
322
  audio_np = audio.to(torch.float32).cpu().numpy().squeeze()
 
324
  audio_np = audio_np.flatten()
325
  all_audio.append(audio_np)
326
 
 
327
  combined_audio = np.concatenate(all_audio)
328
 
 
329
  print(f"Sample of length: {round(combined_audio.shape[0] / sampling_rate, 2)} seconds")
330
  yield numpy_to_mp3(combined_audio, sampling_rate=sampling_rate)
331
 
 
332
  css = """
333
  #share-btn-container {
334
  display: flex;
 
365
  display: none !important;
366
  }
367
  """
368
+
369
  with gr.Blocks(css=css) as block:
370
  gr.HTML(
371
  """
 
376
  "
377
  >
378
  <h1 style="font-weight: 900; margin-bottom: 7px; line-height: normal;">
379
+ Indic-Parler-TTS 🗣️
380
  </h1>
381
  </div>
382
  </div>
 
403
  with gr.Column():
404
  input_text = gr.Textbox(label="Input Text", lines=2, value=finetuned_examples[0][0], elem_id="input_text")
405
  description = gr.Textbox(label="Description", lines=2, value=finetuned_examples[0][1], elem_id="input_description")
406
+
407
+ with gr.Row():
408
+ temperature = gr.Slider(minimum=0.1, maximum=2.0, value=0.8, step=0.1, label="Temperature", info="Controls randomness in generation (higher = more random)")
409
+ top_p = gr.Slider(minimum=0.1, maximum=1.0, value=0.9, step=0.1, label="Top P", info="Nucleus sampling threshold")
410
+ top_k = gr.Slider(minimum=1, maximum=100, value=50, step=1, label="Top K", info="Number of highest probability tokens to consider")
411
+
412
  run_button = gr.Button("Generate Audio", variant="primary")
413
+
414
  with gr.Column():
415
  audio_out = gr.Audio(label="Parler-TTS generation", format="mp3", elem_id="audio_out", autoplay=True)
416
 
417
+ inputs = [input_text, description, temperature, top_p, top_k]
418
  outputs = [audio_out]
419
  gr.Examples(examples=finetuned_examples, fn=generate_finetuned, inputs=inputs, outputs=outputs, cache_examples=False)
420
  run_button.click(fn=generate_finetuned, inputs=inputs, outputs=outputs, queue=True)
 
424
  with gr.Column():
425
  input_text = gr.Textbox(label="Input Text", lines=2, value=default_text, elem_id="input_text")
426
  description = gr.Textbox(label="Description", lines=2, value="", elem_id="input_description")
427
+
428
+ with gr.Row():
429
+ temperature = gr.Slider(minimum=0.1, maximum=2.0, value=0.8, step=0.1, label="Temperature", info="Controls randomness in generation (higher = more random)")
430
+ top_p = gr.Slider(minimum=0.1, maximum=1.0, value=0.9, step=0.1, label="Top P", info="Nucleus sampling threshold")
431
+ top_k = gr.Slider(minimum=1, maximum=100, value=50, step=1, label="Top K", info="Number of highest probability tokens to consider")
432
+
433
  run_button = gr.Button("Generate Audio", variant="primary")
434
+
435
  with gr.Column():
436
  audio_out = gr.Audio(label="Parler-TTS generation", format="mp3", elem_id="audio_out", autoplay=True)
437
 
438
+ inputs = [input_text, description, temperature, top_p, top_k]
439
  outputs = [audio_out]
440
  gr.Examples(examples=examples, fn=generate_base, inputs=inputs, outputs=outputs, cache_examples=False)
441
  run_button.click(fn=generate_base, inputs=inputs, outputs=outputs, queue=True)
442
 
 
443
  gr.HTML(
444
  """
445
  If you'd like to learn more about how the model was trained or explore fine-tuning it yourself, visit the <a href="https://github.com/huggingface/parler-tts">Parler-TTS</a> repository on GitHub. The Parler-TTS codebase and associated checkpoints are licensed under the <a href="https://github.com/huggingface/parler-tts/blob/main/LICENSE">Apache 2.0 license</a>.</p>
446
+ """
447
  )
448
 
449
  block.queue()