Kit-Lemonfoot committed on
Commit
20611fc
1 Parent(s): a7b6d59

(Hopefully) Addressed a long-standing issue when attempting to run the Space locally.
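The relevant changes are in load_voicedata() below: the model file path now falls back to a Windows-style, backslash-separated variant when the forward-slash path is missing on disk, and a voice whose cover image is absent now falls back to the new images/none.png placeholder. A minimal sketch of the probe-and-fall-back pattern (the helper name and example model folder here are hypothetical, for illustration only):

import os

def first_existing(*candidates: str) -> str:
    # Return the first candidate that exists on disk; fall back to the last one.
    for path in candidates:
        if os.path.exists(path):
            return path
    return candidates[-1]

model = "ExampleVoice"  # hypothetical model folder name
model_file = first_existing(
    f"model_assets/{model}/{model}.safetensors",    # forward-slash layout
    f"model_assets\\{model}\\{model}.safetensors",  # Windows-style fallback
)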

Files changed (3)
  1. app.py +378 -375
  2. images/none.png +0 -0
  3. requirements_local.txt +32 -0
app.py CHANGED
@@ -1,375 +1,378 @@
- print("Starting up. Please be patient...")
-
- import argparse
- import datetime
- import os
- import sys
- from typing import Optional
- import json
- import utils
-
- import gradio as gr
- import torch
- import yaml
-
- from common.constants import (
-     DEFAULT_ASSIST_TEXT_WEIGHT,
-     DEFAULT_LENGTH,
-     DEFAULT_LINE_SPLIT,
-     DEFAULT_NOISE,
-     DEFAULT_NOISEW,
-     DEFAULT_SDP_RATIO,
-     DEFAULT_SPLIT_INTERVAL,
-     DEFAULT_STYLE,
-     DEFAULT_STYLE_WEIGHT,
-     Languages,
- )
- from common.log import logger
- from common.tts_model import ModelHolder
- from infer import InvalidToneError
- from text.japanese import g2kata_tone, kata_tone2phone_tone, text_normalize
-
- is_hf_spaces = os.getenv("SYSTEM") == "spaces"
- limit = 150
-
- # Get path settings
- with open(os.path.join("configs", "paths.yml"), "r", encoding="utf-8") as f:
-     path_config: dict[str, str] = yaml.safe_load(f.read())
-     # dataset_root = path_config["dataset_root"]
-     assets_root = path_config["assets_root"]
-
- def tts_fn(
-     model_name,
-     model_path,
-     text,
-     language,
-     reference_audio_path,
-     sdp_ratio,
-     noise_scale,
-     noise_scale_w,
-     length_scale,
-     line_split,
-     split_interval,
-     assist_text,
-     assist_text_weight,
-     use_assist_text,
-     style,
-     style_weight,
-     kata_tone_json_str,
-     use_tone,
-     speaker,
- ):
-     if len(text) < 2:
-         return "Please enter some text.", None, kata_tone_json_str
-
-     if is_hf_spaces and len(text) > limit:
-         return f"Too long! There is a character limit of {limit} characters.", None, kata_tone_json_str
-
-     if not model_holder.current_model:
-         model_holder.load_model_gr(model_name, model_path)
-         logger.info(f"Loaded model '{model_name}'")
-     if model_holder.current_model.model_path != model_path:
-         model_holder.load_model_gr(model_name, model_path)
-         logger.info(f"Swapped to model '{model_name}'")
-     speaker_id = model_holder.current_model.spk2id[speaker]
-     start_time = datetime.datetime.now()
-
-     wrong_tone_message = ""
-     kata_tone: Optional[list[tuple[str, int]]] = None
-     if use_tone and kata_tone_json_str != "":
-         if language != "JP":
-             # logger.warning("Only Japanese is supported for tone generation.")
-             wrong_tone_message = "Accent specification is currently only supported for Japanese."
-         if line_split:
-             # logger.warning("Tone generation is not supported for line split.")
-             wrong_tone_message = (
-                 "Accent specification is only supported when line-split generation is disabled."
-             )
-         try:
-             kata_tone = []
-             json_data = json.loads(kata_tone_json_str)
-             # Convert the parsed entries to tuples
-             for kana, tone in json_data:
-                 assert isinstance(kana, str) and tone in (0, 1), f"{kana}, {tone}"
-                 kata_tone.append((kana, tone))
-         except Exception as e:
-             logger.warning(f"Error occurred when parsing kana_tone_json: {e}")
-             wrong_tone_message = f"Invalid accent specification: {e}"
-             kata_tone = None
-
-     # tone only becomes non-None when it is actually passed to synthesis
-     tone: Optional[list[int]] = None
-     if kata_tone is not None:
-         phone_tone = kata_tone2phone_tone(kata_tone)
-         tone = [t for _, t in phone_tone]
-
-     try:
-         sr, audio = model_holder.current_model.infer(
-             text=text,
-             language=language,
-             reference_audio_path=reference_audio_path,
-             sdp_ratio=sdp_ratio,
-             noise=noise_scale,
-             noisew=noise_scale_w,
-             length=length_scale,
-             line_split=line_split,
-             split_interval=split_interval,
-             assist_text=assist_text,
-             assist_text_weight=assist_text_weight,
-             use_assist_text=use_assist_text,
-             style=style,
-             style_weight=style_weight,
-             given_tone=tone,
-             sid=speaker_id,
-         )
-     except InvalidToneError as e:
-         logger.error(f"Tone error: {e}")
-         return f"Error: invalid accent specification:\n{e}", None, kata_tone_json_str
-     except ValueError as e:
-         logger.error(f"Value error: {e}")
-         return f"Error: {e}", None, kata_tone_json_str
-
-     end_time = datetime.datetime.now()
-     duration = (end_time - start_time).total_seconds()
-
-     if tone is None and language == "JP":
-         # Return the accent info so it can be reused for accent specification
-         norm_text = text_normalize(text)
-         kata_tone = g2kata_tone(norm_text)
-         kata_tone_json_str = json.dumps(kata_tone, ensure_ascii=False)
-     elif tone is None:
-         kata_tone_json_str = ""
-
-     if reference_audio_path:
-         style = "External Audio"
-     logger.info(f"Successful inference, took {duration}s | {speaker} | {language}/{sdp_ratio}/{noise_scale}/{noise_scale_w}/{length_scale}/{style}/{style_weight} | {text}")
-     message = f"Success, time: {duration} seconds."
-     if wrong_tone_message != "":
-         message = wrong_tone_message + "\n" + message
-     return message, (sr, audio), kata_tone_json_str
-
- def load_voicedata():
-     print("Loading voice data...")
-     # voices = []
-     envoices = []
-     jpvoices = []
-     styledict = {}
-     with open("voicelist.json", "r", encoding="utf-8") as f:
-         voc_info = json.load(f)
-     for name, info in voc_info.items():
-         if not info['enable']:
-             continue
-         model_path = info['model_path']
-         voice_name = info['title']
-         speakerid = info['speakerid']
-         datasetauthor = info['datasetauthor']
-         image = info['cover']
-         if model_path not in styledict.keys():
-             conf = f"model_assets/{model_path}/config.json"
-             hps = utils.get_hparams_from_file(conf)
-             s2id = hps.data.style2id
-             styledict[model_path] = s2id.keys()
-         print(f"Indexed voice {voice_name}")
-         if info['primarylang'] == "JP":
-             jpvoices.append((name, model_path, voice_name, speakerid, datasetauthor, image))
-         else:
-             envoices.append((name, model_path, voice_name, speakerid, datasetauthor, image))
-     return [envoices, jpvoices], styledict
-
-
- initial_text = "Hello there! This is test audio of a new Hololive text to speech tool."
-
- initial_md = """
- # Hololive [Style-Bert-VITS2](https://github.com/litagin02/Style-Bert-VITS2)
- ### Space by [Kit Lemonfoot](https://huggingface.co/Kit-Lemonfoot)/[Noel Shirogane's High Flying Birds](https://www.youtube.com/channel/UCG9A0OJsJTluLOXfMZjJ9xA)
- ### Based on code originally by [fishaudio](https://github.com/fishaudio) and [litagin02](https://github.com/litagin02)
-
- Do no evil.
-
- **Note:** Most of the models are a *work in progress.* They may not sound fully correct.
- """
-
- style_md = """
- - You can control things like voice tone, emotion, and reading style through presets or through voice files.
- - Neutral acts as an average across all speakers. Styling options act as an override to Neutral.
- - Setting the intensity too high will likely break the output.
- - The required intensity will depend on the speaker and the desired style.
- - If you're using preexisting audio data to style the output, try to use a voice that is similar to the desired speaker.
- """
-
- if __name__ == "__main__":
-     parser = argparse.ArgumentParser()
-     parser.add_argument("--cpu", action="store_true", help="Use CPU instead of GPU")
-     parser.add_argument(
-         "--dir", "-d", type=str, help="Model directory", default=assets_root
-     )
-     parser.add_argument(
-         "--share", action="store_true", help="Share this app publicly", default=False
-     )
-     parser.add_argument(
-         "--server-name",
-         type=str,
-         default=None,
-         help="Server name for Gradio app",
-     )
-     parser.add_argument(
-         "--no-autolaunch",
-         action="store_true",
-         default=False,
-         help="Do not launch app automatically",
-     )
-     args = parser.parse_args()
-     model_dir = args.dir
-
-     if args.cpu:
-         device = "cpu"
-     else:
-         device = "cuda" if torch.cuda.is_available() else "cpu"
-
-     model_holder = ModelHolder(model_dir, device)
-
-     languages = ["EN", "JP", "ZH"]
-     langnames = ["English", "Japanese"]
-
-     model_names = model_holder.model_names
-     if len(model_names) == 0:
-         logger.error(f"No models found. Please place the model in {model_dir}.")
-         sys.exit(1)
-     initial_id = 0
-     initial_pth_files = model_holder.model_files_dict[model_names[initial_id]]
-     # print(initial_pth_files)
-
-     voicedata, styledict = load_voicedata()
-
-     # Gradio preload
-     text_input = gr.TextArea(label="Text", value=initial_text)
-     line_split = gr.Checkbox(label="Divide text separately by line breaks", value=True)
-     split_interval = gr.Slider(
-         minimum=0.0,
-         maximum=2,
-         value=0.5,
-         step=0.1,
-         label="Length of division separation time (in seconds)",
-     )
-     language = gr.Dropdown(choices=languages, value="EN", label="Language")
-     sdp_ratio = gr.Slider(
-         minimum=0, maximum=1, value=0.2, step=0.1, label="SDP Ratio"
-     )
-     noise_scale = gr.Slider(
-         minimum=0.1, maximum=2, value=0.6, step=0.1, label="Noise"
-     )
-     noise_scale_w = gr.Slider(
-         minimum=0.1, maximum=2, value=0.8, step=0.1, label="Noise_W"
-     )
-     length_scale = gr.Slider(
-         minimum=0.1, maximum=2, value=1.0, step=0.1, label="Length"
-     )
-     use_style_text = gr.Checkbox(label="Use stylization text", value=False)
-     style_text = gr.Textbox(
-         label="Style text",
-         placeholder="Check the \"Use stylization text\" box to use this option!",
-         info="The voice will be similar in tone and emotion to the text; however, inflection and tempo may be worse as a result.",
-         visible=True,
-     )
-     style_text_weight = gr.Slider(
-         minimum=0,
-         maximum=1,
-         value=0.7,
-         step=0.1,
-         label="Text stylization strength",
-         visible=True,
-     )
-
-     with gr.Blocks(theme=gr.themes.Base(primary_hue="emerald", secondary_hue="green"), title="Hololive Style-Bert-VITS2") as app:
-         gr.Markdown(initial_md)
-
-         # NOT USED SINCE NONE OF MY MODELS ARE JPEXTRA.
-         # ONLY HERE FOR COMPATIBILITY WITH THE EXISTING INFER CODE.
-         # DO NOT RENDER OR MAKE VISIBLE
-         tone = gr.Textbox(
-             label="Accent adjustment (0 for low, 1 for high)",
-             info="This can only be used when not separated by line breaks. It is not universal.",
-             visible=False
-         )
-         use_tone = gr.Checkbox(label="Use accent adjustment", value=False, visible=False)
-
-         # for (name, model_path, voice_name, speakerid, datasetauthor, image) in voicedata:
-         for vi in range(len(voicedata)):
-             with gr.TabItem(langnames[vi]):
-                 for (name, model_path, voice_name, speakerid, datasetauthor, image) in voicedata[vi]:
-                     with gr.TabItem(name):
-                         mn = gr.Textbox(value=model_path, visible=False, interactive=False)
-                         mp = gr.Textbox(value=f"model_assets/{model_path}/{model_path}.safetensors", visible=False, interactive=False)
-                         spk = gr.Textbox(value=speakerid, visible=False, interactive=False)
-                         with gr.Row():
-                             with gr.Column():
-                                 gr.Markdown(f"**{voice_name}**\n\nModel name: {model_path} | Dataset author: {datasetauthor}")
-                                 gr.Image(f"images/{image}", label=None, show_label=False, width=300, show_download_button=False, container=False, show_share_button=False)
-                             with gr.Column():
-                                 with gr.TabItem("Style using a preset"):
-                                     style = gr.Dropdown(
-                                         label="Current style (Neutral is an average style)",
-                                         choices=styledict[model_path],
-                                         value="Neutral",
-                                     )
-                                 with gr.TabItem("Style using existing audio"):
-                                     ref_audio_path = gr.Audio(label="Reference Audio", type="filepath")
-                                 style_weight = gr.Slider(
-                                     minimum=0,
-                                     maximum=50,
-                                     value=5,
-                                     step=0.1,
-                                     label="Style strength",
-                                 )
-                             with gr.Column():
-                                 tts_button = gr.Button(
-                                     "Synthesize", variant="primary", interactive=True
-                                 )
-                                 text_output = gr.Textbox(label="Info")
-                                 audio_output = gr.Audio(label="Result")
-
-                         tts_button.click(
-                             tts_fn,
-                             inputs=[
-                                 mn,
-                                 mp,
-                                 text_input,
-                                 language,
-                                 ref_audio_path,
-                                 sdp_ratio,
-                                 noise_scale,
-                                 noise_scale_w,
-                                 length_scale,
-                                 line_split,
-                                 split_interval,
-                                 style_text,
-                                 style_text_weight,
-                                 use_style_text,
-                                 style,
-                                 style_weight,
-                                 tone,
-                                 use_tone,
-                                 spk,
-                             ],
-                             outputs=[text_output, audio_output, tone],
-                         )
-
-         with gr.Row():
-             with gr.Column():
-                 text_input.render()
-                 line_split.render()
-                 split_interval.render()
-                 language.render()
-             with gr.Column():
-                 sdp_ratio.render()
-                 noise_scale.render()
-                 noise_scale_w.render()
-                 length_scale.render()
-                 use_style_text.render()
-                 style_text.render()
-                 style_text_weight.render()
-
-         with gr.Accordion("Styling Guide", open=False):
-             gr.Markdown(style_md)
-
-     app.launch(allowed_paths=['/file/images/'])
 
 
 
 
+ print("Starting up. Please be patient...")
+
+ import argparse
+ import datetime
+ import os
+ import sys
+ from typing import Optional
+ import json
+ import utils
+
+ import gradio as gr
+ import torch
+ import yaml
+
+ from common.constants import (
+     DEFAULT_ASSIST_TEXT_WEIGHT,
+     DEFAULT_LENGTH,
+     DEFAULT_LINE_SPLIT,
+     DEFAULT_NOISE,
+     DEFAULT_NOISEW,
+     DEFAULT_SDP_RATIO,
+     DEFAULT_SPLIT_INTERVAL,
+     DEFAULT_STYLE,
+     DEFAULT_STYLE_WEIGHT,
+     Languages,
+ )
+ from common.log import logger
+ from common.tts_model import ModelHolder
+ from infer import InvalidToneError
+ from text.japanese import g2kata_tone, kata_tone2phone_tone, text_normalize
+
+ is_hf_spaces = os.getenv("SYSTEM") == "spaces"
+ limit = 150
+
+ # Get path settings
+ with open(os.path.join("configs", "paths.yml"), "r", encoding="utf-8") as f:
+     path_config: dict[str, str] = yaml.safe_load(f.read())
+     # dataset_root = path_config["dataset_root"]
+     assets_root = path_config["assets_root"]
+
+ def tts_fn(
+     model_name,
+     model_path,
+     text,
+     language,
+     reference_audio_path,
+     sdp_ratio,
+     noise_scale,
+     noise_scale_w,
+     length_scale,
+     line_split,
+     split_interval,
+     assist_text,
+     assist_text_weight,
+     use_assist_text,
+     style,
+     style_weight,
+     kata_tone_json_str,
+     use_tone,
+     speaker,
+ ):
+     if len(text) < 2:
+         return "Please enter some text.", None, kata_tone_json_str
+
+     if is_hf_spaces and len(text) > limit:
+         return f"Too long! There is a character limit of {limit} characters.", None, kata_tone_json_str
+
+     if not model_holder.current_model:
+         model_holder.load_model_gr(model_name, model_path)
+         logger.info(f"Loaded model '{model_name}'")
+     if model_holder.current_model.model_path != model_path:
+         model_holder.load_model_gr(model_name, model_path)
+         logger.info(f"Swapped to model '{model_name}'")
+     speaker_id = model_holder.current_model.spk2id[speaker]
+     start_time = datetime.datetime.now()
+
+     wrong_tone_message = ""
+     kata_tone: Optional[list[tuple[str, int]]] = None
+     if use_tone and kata_tone_json_str != "":
+         if language != "JP":
+             # logger.warning("Only Japanese is supported for tone generation.")
+             wrong_tone_message = "Accent specification is currently only supported for Japanese."
+         if line_split:
+             # logger.warning("Tone generation is not supported for line split.")
+             wrong_tone_message = (
+                 "Accent specification is only supported when line-split generation is disabled."
+             )
+         try:
+             kata_tone = []
+             json_data = json.loads(kata_tone_json_str)
+             # Convert the parsed entries to tuples
+             for kana, tone in json_data:
+                 assert isinstance(kana, str) and tone in (0, 1), f"{kana}, {tone}"
+                 kata_tone.append((kana, tone))
+         except Exception as e:
+             logger.warning(f"Error occurred when parsing kana_tone_json: {e}")
+             wrong_tone_message = f"Invalid accent specification: {e}"
+             kata_tone = None
+
+     # tone only becomes non-None when it is actually passed to synthesis
+     tone: Optional[list[int]] = None
+     if kata_tone is not None:
+         phone_tone = kata_tone2phone_tone(kata_tone)
+         tone = [t for _, t in phone_tone]
+
+     try:
+         sr, audio = model_holder.current_model.infer(
+             text=text,
+             language=language,
+             reference_audio_path=reference_audio_path,
+             sdp_ratio=sdp_ratio,
+             noise=noise_scale,
+             noisew=noise_scale_w,
+             length=length_scale,
+             line_split=line_split,
+             split_interval=split_interval,
+             assist_text=assist_text,
+             assist_text_weight=assist_text_weight,
+             use_assist_text=use_assist_text,
+             style=style,
+             style_weight=style_weight,
+             given_tone=tone,
+             sid=speaker_id,
+         )
+     except InvalidToneError as e:
+         logger.error(f"Tone error: {e}")
+         return f"Error: invalid accent specification:\n{e}", None, kata_tone_json_str
+     except ValueError as e:
+         logger.error(f"Value error: {e}")
+         return f"Error: {e}", None, kata_tone_json_str
+
+     end_time = datetime.datetime.now()
+     duration = (end_time - start_time).total_seconds()
+
+     if tone is None and language == "JP":
+         # Return the accent info so it can be reused for accent specification
+         norm_text = text_normalize(text)
+         kata_tone = g2kata_tone(norm_text)
+         kata_tone_json_str = json.dumps(kata_tone, ensure_ascii=False)
+     elif tone is None:
+         kata_tone_json_str = ""
+
+     if reference_audio_path:
+         style = "External Audio"
+     logger.info(f"Successful inference, took {duration}s | {speaker} | {language}/{sdp_ratio}/{noise_scale}/{noise_scale_w}/{length_scale}/{style}/{style_weight} | {text}")
+     message = f"Success, time: {duration} seconds."
+     if wrong_tone_message != "":
+         message = wrong_tone_message + "\n" + message
+     return message, (sr, audio), kata_tone_json_str
+
+ def load_voicedata():
+     print("Loading voice data...")
+     # voices = []
+     envoices = []
+     jpvoices = []
+     styledict = {}
+     with open("voicelist.json", "r", encoding="utf-8") as f:
+         voc_info = json.load(f)
+     for name, info in voc_info.items():
+         if not info['enable']:
+             continue
+         model_path = info['model_path']
+         model_path_full = f"model_assets/{model_path}/{model_path}.safetensors"
+         if not os.path.exists(model_path_full):
+             model_path_full = f"model_assets\\{model_path}\\{model_path}.safetensors"
+         voice_name = info['title']
+         speakerid = info['speakerid']
+         datasetauthor = info['datasetauthor']
+         image = info['cover']
+         if not os.path.exists(f"images/{image}"):
+             image = "none.png"
+         if model_path not in styledict.keys():
+             conf = f"model_assets/{model_path}/config.json"
+             hps = utils.get_hparams_from_file(conf)
+             s2id = hps.data.style2id
+             styledict[model_path] = s2id.keys()
+             print(f"Set up hyperparameters for model {model_path}")
+         if info['primarylang'] == "JP":
+             jpvoices.append((name, model_path, model_path_full, voice_name, speakerid, datasetauthor, image))
+         else:
+             envoices.append((name, model_path, model_path_full, voice_name, speakerid, datasetauthor, image))
+     return [envoices, jpvoices], styledict
+
+
+ initial_text = "Hello there! This is test audio of a new Hololive text to speech tool."
+
+ initial_md = """
+ # Hololive [Style-Bert-VITS2](https://github.com/litagin02/Style-Bert-VITS2)
+ ### Space by [Kit Lemonfoot](https://huggingface.co/Kit-Lemonfoot)/[Noel Shirogane's High Flying Birds](https://www.youtube.com/channel/UCG9A0OJsJTluLOXfMZjJ9xA)
+ ### Based on code originally by [fishaudio](https://github.com/fishaudio) and [litagin02](https://github.com/litagin02)
+
+ Do no evil.
+ """
+
+ style_md = """
+ - You can control things like voice tone, emotion, and reading style through presets or through voice files.
+ - Neutral acts as an average across all speakers. Styling options act as an override to Neutral.
+ - Setting the intensity too high will likely break the output.
+ - The required intensity will depend on the speaker and the desired style.
+ - If you're using preexisting audio data to style the output, try to use a voice that is similar to the desired speaker.
+ """
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--cpu", action="store_true", help="Use CPU instead of GPU")
+     parser.add_argument(
+         "--dir", "-d", type=str, help="Model directory", default=assets_root
+     )
+     parser.add_argument(
+         "--share", action="store_true", help="Share this app publicly", default=False
+     )
+     parser.add_argument(
+         "--server-name",
+         type=str,
+         default=None,
+         help="Server name for Gradio app",
+     )
+     parser.add_argument(
+         "--no-autolaunch",
+         action="store_true",
+         default=False,
+         help="Do not launch app automatically",
+     )
+     args = parser.parse_args()
+     model_dir = args.dir
+
+     if args.cpu:
+         device = "cpu"
+     else:
+         device = "cuda" if torch.cuda.is_available() else "cpu"
+
+     model_holder = ModelHolder(model_dir, device)
+
+     languages = ["EN", "JP", "ZH"]
+     langnames = ["English", "Japanese"]
+
+     model_names = model_holder.model_names
+     if len(model_names) == 0:
+         logger.error(f"No models found. Please place the model in {model_dir}.")
+         sys.exit(1)
+     initial_id = 0
+     initial_pth_files = model_holder.model_files_dict[model_names[initial_id]]
+     # print(initial_pth_files)
+
+     voicedata, styledict = load_voicedata()
+
+     # Gradio preload
+     text_input = gr.TextArea(label="Text", value=initial_text)
+     line_split = gr.Checkbox(label="Divide text separately by line breaks", value=True)
+     split_interval = gr.Slider(
+         minimum=0.0,
+         maximum=2,
+         value=0.5,
+         step=0.1,
+         label="Length of division separation time (in seconds)",
+     )
+     language = gr.Dropdown(choices=languages, value="EN", label="Language")
+     sdp_ratio = gr.Slider(
+         minimum=0, maximum=1, value=0.2, step=0.1, label="SDP Ratio"
+     )
+     noise_scale = gr.Slider(
+         minimum=0.1, maximum=2, value=0.6, step=0.1, label="Noise"
+     )
+     noise_scale_w = gr.Slider(
+         minimum=0.1, maximum=2, value=0.8, step=0.1, label="Noise_W"
+     )
+     length_scale = gr.Slider(
+         minimum=0.1, maximum=2, value=1.0, step=0.1, label="Length"
+     )
+     use_style_text = gr.Checkbox(label="Use stylization text", value=False)
+     style_text = gr.Textbox(
+         label="Style text",
+         placeholder="Check the \"Use stylization text\" box to use this option!",
+         info="The voice will be similar in tone and emotion to the text; however, inflection and tempo may be worse as a result.",
+         visible=True,
+     )
+     style_text_weight = gr.Slider(
+         minimum=0,
+         maximum=1,
+         value=0.7,
+         step=0.1,
+         label="Text stylization strength",
+         visible=True,
+     )
+
+     with gr.Blocks(theme=gr.themes.Base(primary_hue="emerald", secondary_hue="green"), title="Hololive Style-Bert-VITS2") as app:
+         gr.Markdown(initial_md)
+
+         # NOT USED SINCE NONE OF MY MODELS ARE JPEXTRA.
+         # ONLY HERE FOR COMPATIBILITY WITH THE EXISTING INFER CODE.
+         # DO NOT RENDER OR MAKE VISIBLE
+         tone = gr.Textbox(
+             label="Accent adjustment (0 for low, 1 for high)",
+             info="This can only be used when not separated by line breaks. It is not universal.",
+             visible=False
+         )
+         use_tone = gr.Checkbox(label="Use accent adjustment", value=False, visible=False)
+
+         # for (name, model_path, voice_name, speakerid, datasetauthor, image) in voicedata:
+         for vi in range(len(voicedata)):
+             with gr.TabItem(langnames[vi]):
+                 for (name, model_path, model_path_full, voice_name, speakerid, datasetauthor, image) in voicedata[vi]:
+                     with gr.TabItem(name):
+                         mn = gr.Textbox(value=model_path, visible=False, interactive=False)
+                         mp = gr.Textbox(value=model_path_full, visible=False, interactive=False)
+                         spk = gr.Textbox(value=speakerid, visible=False, interactive=False)
+                         with gr.Row():
+                             with gr.Column():
+                                 gr.Markdown(f"**{voice_name}**\n\nModel name: {model_path} | Dataset author: {datasetauthor}")
+                                 gr.Image(f"images/{image}", label=None, show_label=False, width=300, show_download_button=False, container=False, show_share_button=False)
+                             with gr.Column():
+                                 with gr.TabItem("Style using a preset"):
+                                     style = gr.Dropdown(
+                                         label="Current style (Neutral is an average style)",
+                                         choices=styledict[model_path],
+                                         value="Neutral",
+                                     )
+                                 with gr.TabItem("Style using existing audio"):
+                                     ref_audio_path = gr.Audio(label="Reference Audio", type="filepath")
+                                 style_weight = gr.Slider(
+                                     minimum=0,
+                                     maximum=50,
+                                     value=5,
+                                     step=0.1,
+                                     label="Style strength",
+                                 )
+                             with gr.Column():
+                                 tts_button = gr.Button(
+                                     "Synthesize", variant="primary", interactive=True
+                                 )
+                                 text_output = gr.Textbox(label="Info")
+                                 audio_output = gr.Audio(label="Result")
+
+                         tts_button.click(
+                             tts_fn,
+                             inputs=[
+                                 mn,
+                                 mp,
+                                 text_input,
+                                 language,
+                                 ref_audio_path,
+                                 sdp_ratio,
+                                 noise_scale,
+                                 noise_scale_w,
+                                 length_scale,
+                                 line_split,
+                                 split_interval,
+                                 style_text,
+                                 style_text_weight,
+                                 use_style_text,
+                                 style,
+                                 style_weight,
+                                 tone,
+                                 use_tone,
+                                 spk,
+                             ],
+                             outputs=[text_output, audio_output, tone],
+                         )
+
+         with gr.Row():
+             with gr.Column():
+                 text_input.render()
+                 line_split.render()
+                 split_interval.render()
+                 language.render()
+             with gr.Column():
+                 sdp_ratio.render()
+                 noise_scale.render()
+                 noise_scale_w.render()
+                 length_scale.render()
+                 use_style_text.render()
+                 style_text.render()
+                 style_text_weight.render()
+
+         with gr.Accordion("Styling Guide", open=False):
+             gr.Markdown(style_md)
+
+     app.launch(allowed_paths=['/file/images/'])
images/none.png ADDED
requirements_local.txt ADDED
@@ -0,0 +1,32 @@
+ cmudict
+ cn2an
+ g2p_en
+ GPUtil
+ gradio
+ jaconv
+ jieba
+ langid
+ librosa==0.9.2
+ loguru
+ matplotlib
+ mecab-python3
+ num2words
+ numba
+ numpy
+ psutil
+ pyannote.audio
+ pydantic>=2.0
+ pyloudnorm
+ # pyopenjtalk-prebuilt
+ pyopenjtalk-dict
+ pypinyin
+ pyworld-prebuilt
+ PyYAML
+ requests
+ safetensors
+ scipy
+ sentencepiece
+ tensorboard
+ torch>=2.1,<2.2 # For users without GPU or colab
+ transformers
+ umap-learn
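With these dependencies installed (pip install -r requirements_local.txt), app.py should presumably launch locally, and its --cpu flag covers machines without a CUDA-capable GPU; the torch>=2.1,<2.2 pin is commented above as targeting users without a GPU or on Colab.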