gitgato committed on
Commit
087a581
·
verified ·
1 Parent(s): 444cb8b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +49 -159
app.py CHANGED
@@ -1,171 +1,61 @@
1
- import spaces
2
- import gradio as gr
3
  import torch
4
- from transformers.models.speecht5.number_normalizer import EnglishNumberNormalizer
5
- from string import punctuation
6
- import re
7
-
8
-
9
- from parler_tts import ParlerTTSForConditionalGeneration
10
- from transformers import AutoTokenizer, AutoFeatureExtractor, set_seed
11
-
12
- device = "cuda:0" if torch.cuda.is_available() else "cpu"
13
-
14
-
15
- repo_id = "gitgato/tr-xtts"
16
-
17
- model = ParlerTTSForConditionalGeneration.from_pretrained(repo_id).to(device)
18
- tokenizer = AutoTokenizer.from_pretrained(repo_id)
19
- feature_extractor = AutoFeatureExtractor.from_pretrained(repo_id)
20
-
 
 
 
 
 
 
 
 
 
21
 
22
- SAMPLE_RATE = feature_extractor.sampling_rate
23
- SEED = 42
24
 
25
- default_text = "Por favor sorprendeme y habla"
26
- examples = [
27
- [
28
- "Remember - this is only the first iteration of the model! To improve the prosody and naturalness of the speech further, we're scaling up the amount of training data by a factor of five times.",
29
- "A male speaker with a low-pitched voice delivering his words at a fast pace in a small, confined space with a very clear audio and an animated tone."
30
- ],
31
- [
32
- "'This is the best time of my life, Bartley,' she said happily.",
33
- "A female speaker with a slightly low-pitched, quite monotone voice delivers her words at a slightly faster-than-average pace in a confined space with very clear audio.",
34
- ],
35
- [
36
- "Montrose also, after having experienced still more variety of good and bad fortune, threw down his arms, and retired out of the kingdom.",
37
- "A male speaker with a slightly high-pitched voice delivering his words at a slightly slow pace in a small, confined space with a touch of background noise and a quite monotone tone.",
38
- ],
39
- [
40
- "Montrose also, after having experienced still more variety of good and bad fortune, threw down his arms, and retired out of the kingdom.",
41
- "A male speaker with a low-pitched voice delivers his words at a fast pace and an animated tone, in a very spacious environment, accompanied by noticeable background noise.",
42
- ],
43
- ]
44
 
45
- number_normalizer = EnglishNumberNormalizer()
46
 
47
- def preprocess(text):
48
- text = number_normalizer(text).strip()
49
- text = text.replace("-", " ")
50
- if text[-1] not in punctuation:
51
- text = f"{text}."
52
-
53
- abbreviations_pattern = r'\b[A-Z][A-Z\.]+\b'
54
-
55
- def separate_abb(chunk):
56
- chunk = chunk.replace(".","")
57
- print(chunk)
58
- return " ".join(chunk)
59
-
60
- abbreviations = re.findall(abbreviations_pattern, text)
61
- for abv in abbreviations:
62
- if abv in text:
63
- text = text.replace(abv, separate_abb(abv))
64
  return text
65
 
66
- @spaces.GPU
67
- def gen_tts(text, description):
68
- inputs = tokenizer(description, return_tensors="pt").to(device)
69
- prompt = tokenizer(preprocess(text), return_tensors="pt").to(device)
70
-
71
- set_seed(SEED)
72
- generation = model.generate(
73
- input_ids=inputs.input_ids, prompt_input_ids=prompt.input_ids, do_sample=True, temperature=1.0
74
- )
75
- audio_arr = generation.cpu().numpy().squeeze()
76
 
77
- return SAMPLE_RATE, audio_arr
78
 
 
79
 
80
- css = """
81
- #share-btn-container {
82
- display: flex;
83
- padding-left: 0.5rem !important;
84
- padding-right: 0.5rem !important;
85
- background-color: #000000;
86
- justify-content: center;
87
- align-items: center;
88
- border-radius: 9999px !important;
89
- width: 13rem;
90
- margin-top: 10px;
91
- margin-left: auto;
92
- flex: unset !important;
93
- }
94
- #share-btn {
95
- all: initial;
96
- color: #ffffff;
97
- font-weight: 600;
98
- cursor: pointer;
99
- font-family: 'IBM Plex Sans', sans-serif;
100
- margin-left: 0.5rem !important;
101
- padding-top: 0.25rem !important;
102
- padding-bottom: 0.25rem !important;
103
- right:0;
104
- }
105
- #share-btn * {
106
- all: unset !important;
107
- }
108
- #share-btn-container div:nth-child(-n+2){
109
- width: auto !important;
110
- min-height: 0px !important;
111
- }
112
- #share-btn-container .wrap {
113
- display: none !important;
114
- }
115
- """
116
- with gr.Blocks(css=css) as block:
117
- gr.HTML(
118
- """
119
- <div style="text-align: center; max-width: 700px; margin: 0 auto;">
120
- <div
121
- style="
122
- display: inline-flex; align-items: center; gap: 0.8rem; font-size: 1.75rem;
123
- "
124
- >
125
- <h1 style="font-weight: 900; margin-bottom: 7px; line-height: normal;">
126
- Parler-TTS 🗣️
127
- </h1>
128
- </div>
129
- </div>
130
- """
131
- )
132
- gr.HTML(
133
- f"""
134
- <p><a href="https://github.com/huggingface/parler-tts"> Parler-TTS</a> is a training and inference library for
135
- high-fidelity text-to-speech (TTS) models. The model demonstrated here, <a href="https://huggingface.co/gitgato/tr-XTTS"> Parler-TTS Mini v0.1</a>,
136
- is the first iteration model trained using 10k hours of narrated audiobooks. It generates high-quality speech
137
- with features that can be controlled using a simple text prompt (e.g. gender, background noise, speaking rate, pitch and reverberation).</p>
138
- <p>Tips for ensuring good generation:
139
- <ul>
140
- <li>Include the term "very clear audio" to generate the highest quality audio, and "very noisy audio" for high levels of background noise</li>
141
- <li>Punctuation can be used to control the prosody of the generations, e.g. use commas to add small breaks in speech</li>
142
- <li>The remaining speech features (gender, speaking rate, pitch and reverberation) can be controlled directly through the prompt</li>
143
- </ul>
144
- </p>
145
- """
146
- )
147
- with gr.Row():
148
- with gr.Column():
149
- input_text = gr.Textbox(label="Input Text", lines=2, value=default_text, elem_id="input_text")
150
- description = gr.Textbox(label="Description", lines=2, value="", elem_id="input_description")
151
- run_button = gr.Button("Generate Audio", variant="primary")
152
- with gr.Column():
153
- audio_out = gr.Audio(label="Parler-TTS generation", type="numpy", elem_id="audio_out")
154
 
155
- inputs = [input_text, description]
156
- outputs = [audio_out]
157
- gr.Examples(examples=examples, fn=gen_tts, inputs=inputs, outputs=outputs, cache_examples=True)
158
- run_button.click(fn=gen_tts, inputs=inputs, outputs=outputs, queue=True)
159
- gr.HTML(
160
- """
161
- <p>To improve the prosody and naturalness of the speech further, we're scaling up the amount of training data to 50k hours of speech.
162
- The v1 release of the model will be trained on this data, as well as inference optimisations, such as flash attention
163
- and torch compile, that will improve the latency by 2-4x. If you want to find out more about how this model was trained and even fine-tune it yourself, check-out the
164
- <a href="https://github.com/huggingface/parler-tts"> Parler-TTS</a> repository on GitHub.</p>
165
-
166
- <p>The Parler-TTS codebase and its associated checkpoints are licensed under <a href='https://github.com/huggingface/parler-tts?tab=Apache-2.0-1-ov-file#readme'> Apache 2.0</a>.</p>
167
- """
168
- )
169
 
170
- block.queue()
171
- block.launch(share=True)
 
1
+ import gradio as gr
 
2
  import torch
3
+ from datasets import load_dataset
4
+ from transformers import pipeline, SpeechT5Processor, SpeechT5HifiGan, SpeechT5ForTextToSpeech
5
+
6
+ model_id = "gitgato/tr-xtts" # update with your model id
7
+ # pipe = pipeline("automatic-speech-recognition", model=model_id)
8
+ model = SpeechT5ForTextToSpeech.from_pretrained(model_id)
9
+ vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
10
+ embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
11
+ speaker_embeddings = torch.tensor(embeddings_dataset[7440]["xvector"]).unsqueeze(0)
12
+
13
+ # checkpoint = "microsoft/speecht5_tts"
14
+ processor = SpeechT5Processor.from_pretrained(model_id)
15
+
16
+ replacements = [
17
+ ("à", "a"),
18
+ ("â", "a"),
19
+ ("ç", "c"),
20
+ ("è", "e"),
21
+ ("ë", "e"),
22
+ ("î", "i"),
23
+ ("ï", "i"),
24
+ ("ô", "o"),
25
+ ("ù", "u"),
26
+ ("û", "u"),
27
+ ("ü", "u"),
28
+ ]
29
 
 
 
30
 
31
+ title = "Text-to-Speech"
32
+ description = """
33
+ Demo for text-to-speech translation in French. Demo uses [Sandiago21/speecht5_finetuned_facebook_voxpopuli_french](https://huggingface.co/Sandiago21/speecht5_finetuned_facebook_voxpopuli_french) checkpoint, which is based on Microsoft's
34
+ [SpeechT5 TTS](https://huggingface.co/microsoft/speecht5_tts) model and is fine-tuned on a French audio dataset.
35
+ ![Text-to-Speech (TTS)](https://geekflare.com/wp-content/uploads/2021/07/texttospeech-1200x385.png "Diagram of Text-to-Speech (TTS)")
36
+ """
 
 
 
 
 
 
 
 
 
 
 
 
 
37
 
 
38
 
39
+ def cleanup_text(text):
40
+ for src, dst in replacements:
41
+ text = text.replace(src, dst)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
  return text
43
 
44
+ def synthesize_speech(text):
45
+ text = cleanup_text(text)
46
+ inputs = processor(text=text, return_tensors="pt")
 
 
 
 
 
 
 
47
 
48
+ speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
49
 
50
+ return gr.Audio.update(value=(16000, speech.cpu().numpy()))
51
 
52
+ syntesize_speech_gradio = gr.Interface(
53
+ synthesize_speech,
54
+ inputs = gr.Textbox(label="Text", placeholder="Type something here..."),
55
+ outputs=gr.Audio(),
56
+ examples=["Hola, probando audio."],
57
+ title=title,
58
+ description=description,
59
+ ).launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61