freddyaboulton committed
Commit d86bc7f
1 Parent(s): 496bf8a
Files changed (2)
  1. app.py +37 -183
  2. requirements.txt +2 -0
app.py CHANGED
@@ -13,19 +13,18 @@ from parler_tts import ParlerTTSForConditionalGeneration
 from pydub import AudioSegment
 from transformers import AutoTokenizer, AutoFeatureExtractor, set_seed
 from transformers.generation.streamers import BaseStreamer
+from huggingface_hub import InferenceClient
 
 device = "cuda:0" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
 torch_dtype = torch.float16 if device != "cpu" else torch.float32
 
 repo_id = "parler-tts/parler_tts_mini_v0.1"
-jenny_repo_id = "ylacombe/parler-tts-mini-jenny-30H"
 
 model = ParlerTTSForConditionalGeneration.from_pretrained(
     repo_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True
 ).to(device)
-jenny_model = ParlerTTSForConditionalGeneration.from_pretrained(
-    jenny_repo_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True
-).to(device)
+
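+# Client for the Hugging Face serverless Inference API; with no arguments it uses the default endpoint and the Space's ambient HF token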
+client = InferenceClient()
 
 tokenizer = AutoTokenizer.from_pretrained(repo_id)
 feature_extractor = AutoFeatureExtractor.from_pretrained(repo_id)
@@ -33,53 +32,6 @@ feature_extractor = AutoFeatureExtractor.from_pretrained(repo_id)
 SAMPLE_RATE = feature_extractor.sampling_rate
 SEED = 42
 
-default_text = "Please surprise me and speak in whatever voice you enjoy."
-examples = [
-    [
-        "Remember - this is only the first iteration of the model! To improve the prosody and naturalness of the speech further, we're scaling up the amount of training data by a factor of five times.",
-        "A male speaker with a low-pitched voice delivering his words at a fast pace in a small, confined space with a very clear audio and an animated tone.",
-        3.0,
-    ],
-    [
-        "'This is the best time of my life, Bartley,' she said happily.",
-        "A female speaker with a slightly low-pitched, quite monotone voice delivers her words at a slightly faster-than-average pace in a confined space with very clear audio.",
-        3.0,
-    ],
-    [
-        "Montrose also, after having experienced still more variety of good and bad fortune, threw down his arms, and retired out of the kingdom.",
-        "A male speaker with a slightly high-pitched voice delivering his words at a slightly slow pace in a small, confined space with a touch of background noise and a quite monotone tone.",
-        3.0,
-    ],
-    [
-        "Montrose also, after having experienced still more variety of good and bad fortune, threw down his arms, and retired out of the kingdom.",
-        "A male speaker with a low-pitched voice delivers his words at a fast pace and an animated tone, in a very spacious environment, accompanied by noticeable background noise.",
-        3.0,
-    ],
-]
-
-jenny_examples = [
-    [
-        "Remember, this is only the first iteration of the model! To improve the prosody and naturalness of the speech further, we're scaling up the amount of training data by a factor of five times.",
-        "Jenny speaks at an average pace with a slightly animated delivery in a very confined sounding environment with clear audio quality.",
-        3.0,
-    ],
-    [
-        "'This is the best time of my life, Bartley,' she said happily.",
-        "Jenny speaks in quite a monotone voice at a slightly faster-than-average pace in a confined space with very clear audio.",
-        3.0,
-    ],
-    [
-        "Montrose also, after having experienced still more variety of good and bad fortune, threw down his arms, and retired out of the kingdom.",
-        "Jenny delivers her words at a slightly slow pace in a small, confined space with a touch of background noise and a quite monotone tone.",
-        3.0,
-    ],
-    [
-        "Montrose also, after having experienced still more variety of good and bad fortune, threw down his arms, and retired out of the kingdom.",
-        "Jenny delivers her words at a fast pace and an animated tone, in a very spacious environment, accompanied by noticeable background noise.",
-        3.0,
-    ],
-]
-
 
 class ParlerTTSStreamer(BaseStreamer):
     def __init__(
@@ -238,13 +190,28 @@ def numpy_to_mp3(audio_array, sampling_rate):
 sampling_rate = model.audio_encoder.config.sampling_rate
 frame_rate = model.audio_encoder.config.frame_rate
 
+import random
+
 @spaces.GPU
-def generate_base(text, description, play_steps_in_s=2.0):
+def generate_base(subject, setting):
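+    # Draft the story with a hosted chat model over the Inference API (none is pinned, so the client's recommended default is used); a fresh random seed varies the story on every run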
+    messages = [
+        {"role": "system", "content": ("You are an award-winning children's bedtime story author lauded for your inventive stories. "
+                                       "You want to write a bedtime story for your child. They will give you the subject and setting "
+                                       "and you will write the entire story. It should be targeted at children 5 and younger and take about "
+                                       "a minute to read.")},
+        {"role": "user", "content": f"Please tell me a story about a {subject} in {setting}"},
+    ]
+    gr.Info("Generating story", duration=3)
+    response = client.chat_completion(messages, max_tokens=2048, seed=random.randint(1, 5000))
+    gr.Info("Story Generated", duration=3)
+    story = response.choices[0].message.content
+
+    play_steps_in_s = 2.0
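+    # frame_rate is the audio codec's frames per second, so this converts the 2-second interval into decoder steps per streamed chunk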
     play_steps = int(frame_rate * play_steps_in_s)
     streamer = ParlerTTSStreamer(model, device=device, play_steps=play_steps)
 
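+    # A fixed narrator voice: this style description conditions Parler-TTS in place of the old user-supplied description box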
+    description = "A female speaker with a calm, warm, monotone voice delivers her words at a normal pace in a confined space with very clear audio."
     inputs = tokenizer(description, return_tensors="pt").to(device)
-    prompt = tokenizer(text, return_tensors="pt").to(device)
+    prompt = tokenizer(story, return_tensors="pt").to(device)
 
     generation_kwargs = dict(
         input_ids=inputs.input_ids,
@@ -259,145 +226,32 @@ def generate_base(text, description, play_steps_in_s=2.0):
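+    # Run generation on a background thread; ParlerTTSStreamer yields audio chunks as they finish decoding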
     thread = Thread(target=model.generate, kwargs=generation_kwargs)
     thread.start()
 
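+    # Emit the story text immediately (with no audio yet) so it can be read while the first chunk is synthesized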
+    yield story, None
+
+    gr.Info("Reading story", duration=3)
     for new_audio in streamer:
         print(f"Sample of length: {round(new_audio.shape[0] / sampling_rate, 2)} seconds")
-        yield numpy_to_mp3(new_audio, sampling_rate=sampling_rate)
+        yield story, numpy_to_mp3(new_audio, sampling_rate=sampling_rate)
 
-@spaces.GPU
-def generate_jenny(text, description, play_steps_in_s=2.0):
-    play_steps = int(frame_rate * play_steps_in_s)
-    streamer = ParlerTTSStreamer(model, device=device, play_steps=play_steps)
-
-    inputs = tokenizer(description, return_tensors="pt").to(device)
-    prompt = tokenizer(text, return_tensors="pt").to(device)
-
-    generation_kwargs = dict(
-        input_ids=inputs.input_ids,
-        prompt_input_ids=prompt.input_ids,
-        streamer=streamer,
-        do_sample=True,
-        temperature=1.0,
-        min_new_tokens=10,
-    )
-
-    set_seed(SEED)
-    thread = Thread(target=jenny_model.generate, kwargs=generation_kwargs)
-    thread.start()
-
-    for new_audio in streamer:
-        print(f"Sample of length: {round(new_audio.shape[0] / sampling_rate, 2)} seconds")
-        yield sampling_rate, new_audio
-
-
-css = """
-#share-btn-container {
-    display: flex;
-    padding-left: 0.5rem !important;
-    padding-right: 0.5rem !important;
-    background-color: #000000;
-    justify-content: center;
-    align-items: center;
-    border-radius: 9999px !important;
-    width: 13rem;
-    margin-top: 10px;
-    margin-left: auto;
-    flex: unset !important;
-}
-#share-btn {
-    all: initial;
-    color: #ffffff;
-    font-weight: 600;
-    cursor: pointer;
-    font-family: 'IBM Plex Sans', sans-serif;
-    margin-left: 0.5rem !important;
-    padding-top: 0.25rem !important;
-    padding-bottom: 0.25rem !important;
-    right:0;
-}
-#share-btn * {
-    all: unset !important;
-}
-#share-btn-container div:nth-child(-n+2){
-    width: auto !important;
-    min-height: 0px !important;
-}
-#share-btn-container .wrap {
-    display: none !important;
-}
-"""
-with gr.Blocks(css=css) as block:
+with gr.Blocks() as block:  # the css definition was removed above, so the kwarg is dropped to avoid a NameError
-    gr.HTML(
-        """
-        <div style="text-align: center; max-width: 700px; margin: 0 auto;">
-            <div
-                style="
-                    display: inline-flex; align-items: center; gap: 0.8rem; font-size: 1.75rem;
-                "
-            >
-                <h1 style="font-weight: 900; margin-bottom: 7px; line-height: normal;">
-                    Parler-TTS 🗣️
-                </h1>
-            </div>
-        </div>
-        """
-    )
     gr.HTML(
         f"""
-        <p><a href="https://github.com/huggingface/parler-tts"> Parler-TTS</a> is a training and inference library for
-        high-fidelity text-to-speech (TTS) models. Two models are demonstrated here: <a href="https://huggingface.co/parler-tts/parler_tts_mini_v0.1"> Parler-TTS Mini v0.1</a>,
-        the first iteration of the model, trained on 10k hours of narrated audiobooks, and <a href="https://huggingface.co/ylacombe/parler-tts-mini-jenny-30H"> Parler-TTS Jenny</a>,
-        a model fine-tuned on the <a href="https://huggingface.co/datasets/reach-vb/jenny_tts_dataset"> Jenny dataset</a>.
-        Both models generate high-quality speech with features that can be controlled using a simple text prompt (e.g. gender, background noise, speaking rate, pitch and reverberation).</p>
-
-        <p>Tips for ensuring good generation:
-        <ul>
-            <li>Include the term <b>"very clear audio"</b> to generate the highest quality audio, and "very noisy audio" for high levels of background noise</li>
-            <li>When using the fine-tuned model, include the term <b>"Jenny"</b> to pick out her voice</li>
-            <li>Punctuation can be used to control the prosody of the generations, e.g. use commas to add small breaks in speech</li>
-            <li>The remaining speech features (gender, speaking rate, pitch and reverberation) can be controlled directly through the prompt</li>
-        </ul>
-        </p>
-        """
-    )
-    with gr.Tab("Base"):
-        with gr.Row():
-            with gr.Column():
-                input_text = gr.Textbox(label="Input Text", lines=2, value=default_text, elem_id="input_text")
-                description = gr.Textbox(label="Description", lines=2, value="", elem_id="input_description")
-                play_seconds = gr.Slider(3.0, 7.0, value=3.0, step=2, label="Streaming interval in seconds", info="Lower = shorter chunks, lower latency, more codec steps")
-                run_button = gr.Button("Generate Audio", variant="primary")
-            with gr.Column():
-                audio_out = gr.Audio(label="Parler-TTS generation", format="mp3", elem_id="audio_out", streaming=True, autoplay=True)
-
-        inputs = [input_text, description, play_seconds]
-        outputs = [audio_out]
-        gr.Examples(examples=examples, fn=generate_base, inputs=inputs, outputs=outputs, cache_examples=False)
-        run_button.click(fn=generate_base, inputs=inputs, outputs=outputs, queue=True)
-
-    with gr.Tab("Jenny"):
-        with gr.Row():
-            with gr.Column():
-                input_text = gr.Textbox(label="Input Text", lines=2, value=jenny_examples[0][0], elem_id="input_text")
-                description = gr.Textbox(label="Description", lines=2, value=jenny_examples[0][1], elem_id="input_description")
-                play_seconds = gr.Slider(3.0, 7.0, value=jenny_examples[0][2], step=2, label="Streaming interval in seconds", info="Lower = shorter chunks, lower latency, more codec steps")
-                run_button = gr.Button("Generate Audio", variant="primary")
-            with gr.Column():
-                audio_out = gr.Audio(label="Parler-TTS generation", format="mp3", elem_id="audio_out", streaming=True, autoplay=True)
-
-        inputs = [input_text, description, play_seconds]
-        outputs = [audio_out]
-        gr.Examples(examples=jenny_examples, fn=generate_jenny, inputs=inputs, outputs=outputs, cache_examples=False)
-        run_button.click(fn=generate_jenny, inputs=inputs, outputs=outputs, queue=True)
-
-    gr.HTML(
-        """
-        <p>To improve the prosody and naturalness of the speech further, we're scaling up the amount of training data to 50k hours of speech.
-        The v1 release of the model will be trained on this data, along with inference optimisations such as flash attention
-        and torch compile that will improve the latency by 2-4x. If you want to find out more about how this model was trained and even fine-tune it yourself, check out the
-        <a href="https://github.com/huggingface/parler-tts"> Parler-TTS</a> repository on GitHub. The Parler-TTS codebase and its
-        associated checkpoints are licensed under <a href='https://github.com/huggingface/parler-tts?tab=Apache-2.0-1-ov-file#readme'> Apache 2.0</a>.</p>
+        <h1> Bedtime Story Reader 😴🔊 </h1>
+        <p> Powered by <a href="https://github.com/huggingface/parler-tts"> Parler-TTS</a></p>
         """
     )
+    with gr.Row():
+        subject = gr.Dropdown(value="Princess", choices=["Prince", "Princess", "Dog", "Cat"])
+        setting = gr.Dropdown(value="Forest", choices=["Forest", "Kingdom", "Jungle", "Underwater"])
+        run_button = gr.Button("Generate Story", variant="primary")  # assumed definition: the commit wires run_button up below but never creates it
+    with gr.Row():
+        with gr.Group():
+            audio_out = gr.Audio(label="Bedtime story", streaming=True, autoplay=True)
+            story = gr.Textbox(label="Story")
+
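+    # Each yield from generate_base refreshes the story textbox and streams the next audio chunk to the player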
+    inputs = [subject, setting]
+    outputs = [story, audio_out]  # reordered to match the (story, audio) tuples yielded by generate_base
+    run_button.click(fn=generate_base, inputs=inputs, outputs=outputs)
 
 block.queue()
 block.launch(share=True)
 
requirements.txt CHANGED
@@ -1,2 +1,4 @@
+"gradio-client @ git+https://github.com/gradio-app/gradio@bed454c3d22cfacedc047eb3b0ba987b485ac3fd#subdirectory=client/python"
+https://gradio-builds.s3.amazonaws.com/bed454c3d22cfacedc047eb3b0ba987b485ac3fd/gradio-4.40.0-py3-none-any.whl
 git+https://github.com/huggingface/parler-tts.git
 accelerate