Artrajz committed on
Commit
b5830b6
·
1 Parent(s): c44addb
Files changed (5) hide show
  1. app.py +125 -65
  2. config.py +6 -0
  3. templates/index.html +258 -234
  4. utils/utils.py +4 -0
  5. voice.py +95 -46
app.py CHANGED
@@ -16,7 +16,8 @@ app.config.from_pyfile("config.py")
16
 
17
  scheduler = APScheduler()
18
  scheduler.init_app(app)
19
- scheduler.start()
 
20
 
21
  logzero.loglevel(logging.WARNING)
22
  logger = logging.getLogger("vits-simple-api")
@@ -53,7 +54,8 @@ def require_api_key(func):
53
  @app.route('/', methods=["GET", "POST"])
54
  def index():
55
  kwargs = {
56
- "speakers": tts.voice_speakers
 
57
  }
58
  return render_template("index.html", **kwargs)
59
 
@@ -77,6 +79,7 @@ def voice_vits_api():
77
  noise = float(request.args.get("noise", app.config.get("NOISE", 0.667)))
78
  noisew = float(request.args.get("noisew", app.config.get("NOISEW", 0.8)))
79
  max = int(request.args.get("max", app.config.get("MAX", 50)))
 
80
  elif request.method == "POST":
81
  content_type = request.headers.get('Content-Type')
82
  if content_type == 'application/json':
@@ -91,6 +94,7 @@ def voice_vits_api():
91
  noise = float(data.get("noise", app.config.get("NOISE", 0.667)))
92
  noisew = float(data.get("noisew", app.config.get("NOISEW", 0.8)))
93
  max = int(data.get("max", app.config.get("MAX", 50)))
 
94
  except Exception as e:
95
  logger.error(f"[VITS] {e}")
96
  return make_response("parameter error", 400)
@@ -120,23 +124,37 @@ def voice_vits_api():
120
  if app.config.get("LANGUAGE_AUTOMATIC_DETECT", []) != []:
121
  speaker_lang = app.config.get("LANGUAGE_AUTOMATIC_DETECT")
122
 
 
 
 
 
123
  fname = f"{str(uuid.uuid1())}.{format}"
124
  file_type = f"audio/{format}"
125
-
126
- t1 = time.time()
127
- output = tts.vits_infer({"text": text,
128
- "id": id,
129
- "format": format,
130
- "length": length,
131
- "noise": noise,
132
- "noisew": noisew,
133
- "max": max,
134
- "lang": lang,
135
- "speaker_lang": speaker_lang})
136
- t2 = time.time()
137
- logger.info(f"[VITS] finish in {(t2 - t1):.2f}s")
138
-
139
- return send_file(path_or_file=output, mimetype=file_type, download_name=fname)
 
 
 
 
 
 
 
 
 
 
140
 
141
 
142
  @app.route('/voice/hubert-vits', methods=["POST"])
@@ -150,6 +168,7 @@ def voice_hubert_api():
150
  length = float(request.form.get("length", app.config.get("LENGTH", 1)))
151
  noise = float(request.form.get("noise", app.config.get("NOISE", 0.667)))
152
  noisew = float(request.form.get("noisew", app.config.get("NOISEW", 0.8)))
 
153
  except Exception as e:
154
  logger.error(f"[hubert] {e}")
155
  return make_response("parameter error", 400)
@@ -168,18 +187,27 @@ def voice_hubert_api():
168
  return make_response(jsonify({"status": "error", "message": f"id {id} does not exist"}), 400)
169
 
170
  file_type = f"audio/{format}"
 
 
 
 
 
 
171
 
172
  t1 = time.time()
173
- output = tts.hubert_vits_infer({"id": id,
174
- "format": format,
175
- "length": length,
176
- "noise": noise,
177
- "noisew": noisew,
178
- "audio_path": os.path.join(app.config['UPLOAD_FOLDER'], fname)})
179
  t2 = time.time()
 
 
180
  logger.info(f"[hubert] finish in {(t2 - t1):.2f}s")
181
-
182
- return send_file(path_or_file=output, mimetype=file_type, download_name=fname)
 
 
 
 
 
 
183
 
184
 
185
  @app.route('/voice/w2v2-vits', methods=["GET", "POST"])
@@ -196,6 +224,7 @@ def voice_w2v2_api():
196
  noisew = float(request.args.get("noisew", app.config.get("NOISEW", 0.8)))
197
  max = int(request.args.get("max", app.config.get("MAX", 50)))
198
  emotion = int(request.args.get("emotion", app.config.get("EMOTION", 0)))
 
199
  elif request.method == "POST":
200
  content_type = request.headers.get('Content-Type')
201
  if content_type == 'application/json':
@@ -211,6 +240,7 @@ def voice_w2v2_api():
211
  noisew = float(data.get("noisew", app.config.get("NOISEW", 0.8)))
212
  max = int(data.get("max", app.config.get("MAX", 50)))
213
  emotion = int(data.get("emotion", app.config.get("EMOTION", 0)))
 
214
  except Exception as e:
215
  logger.error(f"[w2v2] {e}")
216
  return make_response(f"parameter error", 400)
@@ -241,24 +271,37 @@ def voice_w2v2_api():
241
  if app.config.get("LANGUAGE_AUTOMATIC_DETECT", []) != []:
242
  speaker_lang = app.config.get("LANGUAGE_AUTOMATIC_DETECT")
243
 
 
 
 
 
244
  fname = f"{str(uuid.uuid1())}.{format}"
245
  file_type = f"audio/{format}"
246
-
 
 
 
 
 
 
 
 
 
 
247
  t1 = time.time()
248
- output = tts.w2v2_vits_infer({"text": text,
249
- "id": id,
250
- "format": format,
251
- "length": length,
252
- "noise": noise,
253
- "noisew": noisew,
254
- "max": max,
255
- "lang": lang,
256
- "emotion": emotion,
257
- "speaker_lang": speaker_lang})
258
  t2 = time.time()
259
- logger.info(f"[w2v2] finish in {(t2 - t1):.2f}s")
260
-
261
- return send_file(path_or_file=output, mimetype=file_type, download_name=fname)
 
 
 
 
 
 
 
 
262
 
263
 
264
  @app.route('/voice/conversion', methods=["POST"])
@@ -271,29 +314,35 @@ def vits_voice_conversion_api():
271
  original_id = int(request.form["original_id"])
272
  target_id = int(request.form["target_id"])
273
  format = request.form.get("format", voice.filename.split(".")[1])
 
274
  except Exception as e:
275
  logger.error(f"[vits_voice_convertsion] {e}")
276
  return make_response("parameter error", 400)
277
 
 
278
  fname = secure_filename(str(uuid.uuid1()) + "." + voice.filename.split(".")[1])
279
  audio_path = os.path.join(app.config['UPLOAD_FOLDER'], fname)
280
  voice.save(audio_path)
281
  file_type = f"audio/{format}"
 
 
 
 
282
 
283
- logger.info(f"[vits_voice_convertsion] orginal_id:{original_id} target_id:{target_id}")
284
  t1 = time.time()
285
- try:
286
- output = tts.vits_voice_conversion({"audio_path": audio_path,
287
- "original_id": original_id,
288
- "target_id": target_id,
289
- "format": format})
290
- except Exception as e:
291
- logger.info(f"[vits_voice_convertsion] {e}")
292
- return make_response(jsonify({"status": "error", "message": f"synthesis failure"}), 400)
293
  t2 = time.time()
294
- logger.info(f"finish in {(t2 - t1):.2f}s")
295
-
296
- return send_file(path_or_file=output, mimetype=file_type, download_name=fname)
 
 
 
 
 
 
 
 
297
 
298
 
299
  @app.route('/voice/ssml', methods=["POST"])
@@ -312,20 +361,24 @@ def ssml():
312
 
313
  logger.debug(ssml)
314
 
315
- t1 = time.time()
316
- try:
317
- output, format = tts.create_ssml_infer_task(ssml)
318
- except Exception as e:
319
- logger.info(f"[ssml] {e}")
320
- return make_response(jsonify({"status": "error", "message": f"synthesis failure"}), 400)
321
- t2 = time.time()
322
-
323
  fname = f"{str(uuid.uuid1())}.{format}"
324
  file_type = f"audio/{format}"
325
 
 
 
 
 
 
326
  logger.info(f"[ssml] finish in {(t2 - t1):.2f}s")
327
 
328
- return send_file(path_or_file=output, mimetype=file_type, download_name=fname)
 
 
 
 
 
 
 
329
 
330
 
331
  @app.route('/voice/dimension-emotion', methods=["POST"])
@@ -333,6 +386,7 @@ def dimensional_emotion():
333
  if request.method == "POST":
334
  try:
335
  audio = request.files['upload']
 
336
  except Exception as e:
337
  logger.error(f"[dimensional_emotion] {e}")
338
  return make_response("parameter error", 400)
@@ -341,9 +395,15 @@ def dimensional_emotion():
341
 
342
  file_type = "application/octet-stream; charset=ascii"
343
  fname = os.path.splitext(audio.filename)[0] + ".npy"
344
- output = tts.get_dimensional_emotion_npy(content)
345
-
346
- return send_file(path_or_file=output, mimetype=file_type, download_name=fname)
 
 
 
 
 
 
347
 
348
 
349
  @app.route('/voice/check', methods=["GET", "POST"])
@@ -400,7 +460,8 @@ def check():
400
 
401
 
402
  # regular cleaning
403
- @scheduler.task('interval', id='clean_task', seconds=3600, misfire_grace_time=900)
 
404
  def clean_task():
405
  clean_folder(app.config["UPLOAD_FOLDER"])
406
  clean_folder(app.config["CACHE_PATH"])
@@ -409,4 +470,3 @@ def clean_task():
409
  if __name__ == '__main__':
410
  app.run(host='0.0.0.0', port=app.config.get("PORT", 23456), debug=app.config.get("DEBUG", False)) # 对外开放
411
  # app.run(host='127.0.0.1', port=app.config.get("PORT",23456), debug=True) # 本地运行、调试
412
-
 
16
 
17
  scheduler = APScheduler()
18
  scheduler.init_app(app)
19
+ if app.config.get("CLEAN_INTERVAL_SECONDS", 3600) > 0:
20
+ scheduler.start()
21
 
22
  logzero.loglevel(logging.WARNING)
23
  logger = logging.getLogger("vits-simple-api")
 
54
  @app.route('/', methods=["GET", "POST"])
55
  def index():
56
  kwargs = {
57
+ "speakers": tts.voice_speakers,
58
+ "speakers_count": tts.speakers_count
59
  }
60
  return render_template("index.html", **kwargs)
61
 
 
79
  noise = float(request.args.get("noise", app.config.get("NOISE", 0.667)))
80
  noisew = float(request.args.get("noisew", app.config.get("NOISEW", 0.8)))
81
  max = int(request.args.get("max", app.config.get("MAX", 50)))
82
+ use_streaming = request.args.get('streaming', False, type=bool)
83
  elif request.method == "POST":
84
  content_type = request.headers.get('Content-Type')
85
  if content_type == 'application/json':
 
94
  noise = float(data.get("noise", app.config.get("NOISE", 0.667)))
95
  noisew = float(data.get("noisew", app.config.get("NOISEW", 0.8)))
96
  max = int(data.get("max", app.config.get("MAX", 50)))
97
+ use_streaming = request.form.get('streaming', False, type=bool)
98
  except Exception as e:
99
  logger.error(f"[VITS] {e}")
100
  return make_response("parameter error", 400)
 
124
  if app.config.get("LANGUAGE_AUTOMATIC_DETECT", []) != []:
125
  speaker_lang = app.config.get("LANGUAGE_AUTOMATIC_DETECT")
126
 
127
+ if use_streaming and format.upper() != "MP3":
128
+ format = "mp3"
129
+ logger.warning("Streaming response only supports MP3 format.")
130
+
131
  fname = f"{str(uuid.uuid1())}.{format}"
132
  file_type = f"audio/{format}"
133
+ task = {"text": text,
134
+ "id": id,
135
+ "format": format,
136
+ "length": length,
137
+ "noise": noise,
138
+ "noisew": noisew,
139
+ "max": max,
140
+ "lang": lang,
141
+ "speaker_lang": speaker_lang}
142
+
143
+ if app.config.get("SAVE_AUDIO", False):
144
+ logger.debug(f"[VITS] {fname}")
145
+
146
+ if use_streaming:
147
+ audio = tts.stream_vits_infer(task, fname)
148
+ response = make_response(audio)
149
+ response.headers['Content-Disposition'] = f'attachment; filename={fname}'
150
+ response.headers['Content-Type'] = file_type
151
+ return response
152
+ else:
153
+ t1 = time.time()
154
+ audio = tts.vits_infer(task, fname)
155
+ t2 = time.time()
156
+ logger.info(f"[VITS] finish in {(t2 - t1):.2f}s")
157
+ return send_file(path_or_file=audio, mimetype=file_type, download_name=fname)
158
 
159
 
160
  @app.route('/voice/hubert-vits', methods=["POST"])
 
168
  length = float(request.form.get("length", app.config.get("LENGTH", 1)))
169
  noise = float(request.form.get("noise", app.config.get("NOISE", 0.667)))
170
  noisew = float(request.form.get("noisew", app.config.get("NOISEW", 0.8)))
171
+ use_streaming = request.form.get('streaming', False, type=bool)
172
  except Exception as e:
173
  logger.error(f"[hubert] {e}")
174
  return make_response("parameter error", 400)
 
187
  return make_response(jsonify({"status": "error", "message": f"id {id} does not exist"}), 400)
188
 
189
  file_type = f"audio/{format}"
190
+ task = {"id": id,
191
+ "format": format,
192
+ "length": length,
193
+ "noise": noise,
194
+ "noisew": noisew,
195
+ "audio_path": os.path.join(app.config['UPLOAD_FOLDER'], fname)}
196
 
197
  t1 = time.time()
198
+ audio = tts.hubert_vits_infer(task, fname)
 
 
 
 
 
199
  t2 = time.time()
200
+ if app.config.get("SAVE_AUDIO", False):
201
+ logger.debug(f"[hubert] {fname}")
202
  logger.info(f"[hubert] finish in {(t2 - t1):.2f}s")
203
+ if use_streaming:
204
+ audio = tts.generate_audio_chunks(audio)
205
+ response = make_response(audio)
206
+ response.headers['Content-Disposition'] = f'attachment; filename={fname}'
207
+ response.headers['Content-Type'] = file_type
208
+ return response
209
+ else:
210
+ return send_file(path_or_file=audio, mimetype=file_type, download_name=fname)
211
 
212
 
213
  @app.route('/voice/w2v2-vits', methods=["GET", "POST"])
 
224
  noisew = float(request.args.get("noisew", app.config.get("NOISEW", 0.8)))
225
  max = int(request.args.get("max", app.config.get("MAX", 50)))
226
  emotion = int(request.args.get("emotion", app.config.get("EMOTION", 0)))
227
+ use_streaming = request.args.get('streaming', False, type=bool)
228
  elif request.method == "POST":
229
  content_type = request.headers.get('Content-Type')
230
  if content_type == 'application/json':
 
240
  noisew = float(data.get("noisew", app.config.get("NOISEW", 0.8)))
241
  max = int(data.get("max", app.config.get("MAX", 50)))
242
  emotion = int(data.get("emotion", app.config.get("EMOTION", 0)))
243
+ use_streaming = request.form.get('streaming', False, type=bool)
244
  except Exception as e:
245
  logger.error(f"[w2v2] {e}")
246
  return make_response(f"parameter error", 400)
 
271
  if app.config.get("LANGUAGE_AUTOMATIC_DETECT", []) != []:
272
  speaker_lang = app.config.get("LANGUAGE_AUTOMATIC_DETECT")
273
 
274
+ if use_streaming and format.upper() != "MP3":
275
+ format = "mp3"
276
+ logger.warning("Streaming response only supports MP3 format.")
277
+
278
  fname = f"{str(uuid.uuid1())}.{format}"
279
  file_type = f"audio/{format}"
280
+ task = {"text": text,
281
+ "id": id,
282
+ "format": format,
283
+ "length": length,
284
+ "noise": noise,
285
+ "noisew": noisew,
286
+ "max": max,
287
+ "lang": lang,
288
+ "emotion": emotion,
289
+ "speaker_lang": speaker_lang}
290
+
291
  t1 = time.time()
292
+ audio = tts.w2v2_vits_infer(task, fname)
 
 
 
 
 
 
 
 
 
293
  t2 = time.time()
294
+ if app.config.get("SAVE_AUDIO", False):
295
+ logger.debug(f"[W2V2] {fname}")
296
+ if use_streaming:
297
+ audio = tts.generate_audio_chunks(audio)
298
+ response = make_response(audio)
299
+ response.headers['Content-Disposition'] = f'attachment; filename={fname}'
300
+ response.headers['Content-Type'] = file_type
301
+ return response
302
+ else:
303
+ logger.info(f"[w2v2] finish in {(t2 - t1):.2f}s")
304
+ return send_file(path_or_file=audio, mimetype=file_type, download_name=fname)
305
 
306
 
307
  @app.route('/voice/conversion', methods=["POST"])
 
314
  original_id = int(request.form["original_id"])
315
  target_id = int(request.form["target_id"])
316
  format = request.form.get("format", voice.filename.split(".")[1])
317
+ use_streaming = request.form.get('streaming', False, type=bool)
318
  except Exception as e:
319
  logger.error(f"[vits_voice_convertsion] {e}")
320
  return make_response("parameter error", 400)
321
 
322
+ logger.info(f"[vits_voice_convertsion] orginal_id:{original_id} target_id:{target_id}")
323
  fname = secure_filename(str(uuid.uuid1()) + "." + voice.filename.split(".")[1])
324
  audio_path = os.path.join(app.config['UPLOAD_FOLDER'], fname)
325
  voice.save(audio_path)
326
  file_type = f"audio/{format}"
327
+ task = {"audio_path": audio_path,
328
+ "original_id": original_id,
329
+ "target_id": target_id,
330
+ "format": format}
331
 
 
332
  t1 = time.time()
333
+ audio = tts.vits_voice_conversion(task, fname)
 
 
 
 
 
 
 
334
  t2 = time.time()
335
+ if app.config.get("SAVE_AUDIO", False):
336
+ logger.debug(f"[Voice conversion] {fname}")
337
+ logger.info(f"[Voice conversion] finish in {(t2 - t1):.2f}s")
338
+ if use_streaming:
339
+ audio = tts.generate_audio_chunks(audio)
340
+ response = make_response(audio)
341
+ response.headers['Content-Disposition'] = f'attachment; filename={fname}'
342
+ response.headers['Content-Type'] = file_type
343
+ return response
344
+ else:
345
+ return send_file(path_or_file=audio, mimetype=file_type, download_name=fname)
346
 
347
 
348
  @app.route('/voice/ssml', methods=["POST"])
 
361
 
362
  logger.debug(ssml)
363
 
 
 
 
 
 
 
 
 
364
  fname = f"{str(uuid.uuid1())}.{format}"
365
  file_type = f"audio/{format}"
366
 
367
+ t1 = time.time()
368
+ audio, format = tts.create_ssml_infer_task(ssml, fname)
369
+ t2 = time.time()
370
+ if app.config.get("SAVE_AUDIO", False):
371
+ logger.debug(f"[ssml] {fname}")
372
  logger.info(f"[ssml] finish in {(t2 - t1):.2f}s")
373
 
374
+ if eval(ssml.get('streaming', False)):
375
+ audio = tts.generate_audio_chunks(audio)
376
+ response = make_response(audio)
377
+ response.headers['Content-Disposition'] = f'attachment; filename={fname}'
378
+ response.headers['Content-Type'] = file_type
379
+ return response
380
+ else:
381
+ return send_file(path_or_file=audio, mimetype=file_type, download_name=fname)
382
 
383
 
384
  @app.route('/voice/dimension-emotion', methods=["POST"])
 
386
  if request.method == "POST":
387
  try:
388
  audio = request.files['upload']
389
+ use_streaming = request.form.get('streaming', False, type=bool)
390
  except Exception as e:
391
  logger.error(f"[dimensional_emotion] {e}")
392
  return make_response("parameter error", 400)
 
395
 
396
  file_type = "application/octet-stream; charset=ascii"
397
  fname = os.path.splitext(audio.filename)[0] + ".npy"
398
+ audio = tts.get_dimensional_emotion_npy(content)
399
+ if use_streaming:
400
+ audio = tts.generate_audio_chunks(audio)
401
+ response = make_response(audio)
402
+ response.headers['Content-Disposition'] = f'attachment; filename={fname}'
403
+ response.headers['Content-Type'] = file_type
404
+ return response
405
+ else:
406
+ return send_file(path_or_file=audio, mimetype=file_type, download_name=fname)
407
 
408
 
409
  @app.route('/voice/check', methods=["GET", "POST"])
 
460
 
461
 
462
  # regular cleaning
463
+ @scheduler.task('interval', id='clean_task', seconds=app.config.get("CLEAN_INTERVAL_SECONDS", 3600),
464
+ misfire_grace_time=900)
465
  def clean_task():
466
  clean_folder(app.config["UPLOAD_FOLDER"])
467
  clean_folder(app.config["CACHE_PATH"])
 
470
  if __name__ == '__main__':
471
  app.run(host='0.0.0.0', port=app.config.get("PORT", 23456), debug=app.config.get("DEBUG", False)) # 对外开放
472
  # app.run(host='127.0.0.1', port=app.config.get("PORT",23456), debug=True) # 本地运行、调试
 
config.py CHANGED
@@ -20,6 +20,12 @@ UPLOAD_FOLDER = ABS_PATH + "/upload"
20
  # Cache path
21
  CACHE_PATH = ABS_PATH + "/cache"
22
 
 
 
 
 
 
 
23
  # zh ja ko en... If it is empty, it will be read based on the text_cleaners specified in the config.json.
24
  LANGUAGE_AUTOMATIC_DETECT = []
25
 
 
20
  # Cache path
21
  CACHE_PATH = ABS_PATH + "/cache"
22
 
23
+ # If CLEAN_INTERVAL_SECONDS <= 0, the cleaning task will not be executed.
24
+ CLEAN_INTERVAL_SECONDS = 3600
25
+
26
+ # save audio to CACHE_PATH
27
+ SAVE_AUDIO = False
28
+
29
  # zh ja ko en... If it is empty, it will be read based on the text_cleaners specified in the config.json.
30
  LANGUAGE_AUTOMATIC_DETECT = []
31
 
templates/index.html CHANGED
@@ -1,237 +1,261 @@
1
  <!DOCTYPE html>
2
  <html lang="en">
3
- <head>
4
- <meta charset="UTF-8" />
5
- <meta name="viewport" content="width=device-width, initial-scale=1.0" />
6
- <title>vits-simple-api</title>
7
-
8
- <link rel="stylesheet" href="/static/css/bootstrap.min.css" />
9
- </head>
10
- <body>
11
- <main style="margin: 0 auto; width: 1024px">
12
- <h1>
13
- <a href="https://github.com/Artrajz/vits-simple-api" target="_blank" style="text-decoration: none; color: black"> vits-simple-api </a>
14
- </h1>
15
-
16
- <div>
17
- <label>文档:</label>
18
- <a href="https://github.com/Artrajz/vits-simple-api" target="_blank" style="text-decoration: none; color: black"> https://github.com/Artrajz/vits-simple-api </a>
19
- </div>
20
- <div>
21
- <label>返回speakers(json):</label>
22
- <a id="speakersLink" href="https://artrajz-vits-simple-api.hf.space/voice/speakers" target="_blank" style="text-decoration: none; color: black">
23
- https://artrajz-vits-simple-api.hf.space/voice/speakers
24
- </a>
25
- </div>
26
- <div>
27
- <label>简单调用api:</label>
28
- <a id="vitsLink" href="https://artrajz-vits-simple-api.hf.space/voice/vits?text=你好,こんにちは&id=164" style="text-decoration: none; color: black">
29
- https://artrajz-vits-simple-api.hf.space/voice/vits?text=你好,こんにちは&id=164
30
- </a>
31
- </div>
32
-
33
- <!-- <div style="display: flex; justify-content: center; align-items: center"> -->
34
- <div>
35
- <form>
36
- <div class="form-group">
37
- <label>text</label>
38
- <textarea class="form-control" id="inputText" rows="3" oninput="updateLink()">你好,こんにちは</textarea>
39
- </div>
40
- <div class="form-group">
41
- <label>id</label>
42
-
43
- <select class="form-control" id="inputId" oninput="updateLink()">
44
- <option value="164"></option>
45
- {% for speaker in speakers["VITS"] %}
46
- {% if speaker["name"] == "雷电将军(雷神)" %}
47
- <option value="{{speaker["id"]}}" selected>{{speaker["id"]}} | {{speaker["name"]}} | {{speaker["lang"]}}</option>
48
- {% else %}
49
- <option value="{{speaker["id"]}}">{{speaker["id"]}} | {{speaker["name"]}} | {{speaker["lang"]}}</option>
50
- {% endif %}
51
- {% endfor %}
52
- </select>
53
- </div>
54
- </form>
55
- </div>
56
- <p>
57
- <button class="btn btn-primary" type="button" data-toggle="collapse" data-target="#collapseExample" aria-expanded="false" aria-controls="collapseExample">
58
- Advanced
59
- </button>
60
- </p>
61
- <div class="collapse" id="collapseExample">
62
- <div class="card card-body">
63
- <form>
64
- <div class="form-group">
65
- <label>format</label>
66
- <select class="form-control" id="inputFormat" oninput="updateLink()">
67
- <option></option>
68
- <option>wav</option>
69
- <option>mp3</option>
70
- <option>ogg</option>
71
- <option>silk</option>
72
- </select>
73
- </div>
74
- <div class="form-group">
75
- <label>lang</label>
76
- <input type="text" class="form-control" id="inputLang" oninput="updateLink()" value="" placeholder="auto" />
77
- </div>
78
- <div class="form-group">
79
- <label>length</label>
80
- <input type="text" class="form-control" id="inputLength" oninput="updateLink()" value="" placeholder="1" />
81
- </div>
82
- <div class="form-group">
83
- <label>noise</label>
84
- <input type="text" class="form-control" id="inputNoise" oninput="updateLink()" value="" placeholder="0.33" />
85
- </div>
86
- <div class="form-group">
87
- <label>noisew</label>
88
- <input type="text" class="form-control" id="inputNoisew" oninput="updateLink()" value="" placeholder="0.4" />
89
- </div>
90
- <div class="form-group">
91
- <label>max</label>
92
- <input type="text" class="form-control" id="inputMax" oninput="updateLink()" value="" placeholder="50" />
93
- </div>
94
- </form>
95
- </div>
96
- </div>
97
-
98
- <div style="display: flex; justify-content: center; align-items: center; height: 80px; margin-top: 20px; margin-bottom: 20px; border: 1px solid rgba(0,0,0,.125); border-radius: 0.25rem;">
99
- <button type="button" class="btn btn-outline-secondary" id="getAudio" style="margin-right: 10px">播放器生成</button>
100
- <audio id="audioPlayer" controls>
101
- <source src="" type="audio/mp3" />
102
- Your browser does not support the audio element.
103
- </audio>
104
- </div>
105
- <div>自动识别语言:可识别的语言根据不同speaker而不同,方言无法自动识别</div>
106
- <div>方言模型需要手动指定语言,比如粤语Cantonese要指定参数lang=gd</div>
107
- <br />
108
-
109
- <h2>所有模型均为网络搜集,感谢模型原作者的付出!</h2>
110
- <p>
111
- Nene_Nanami_Rong_Tang:
112
- <a href="https://github.com/CjangCjengh/TTSModels" rel="noreferrer" target="_blank">CjangCjengh/TTSModels</a>
113
- </p>
114
- <p>
115
- louise:
116
- <a href="https://github.com/CjangCjengh/TTSModels" rel="noreferrer" target="_blank">CjangCjengh/TTSModels</a>
117
- </p>
118
- <p>
119
- Cantonese:
120
- <a href="https://github.com/CjangCjengh/TTSModels" rel="noreferrer" target="_blank">CjangCjengh/TTSModels</a>
121
- </p>
122
- <p>
123
- shanghainese:
124
- <a href="https://github.com/CjangCjengh/TTSModels" rel="noreferrer" target="_blank">CjangCjengh/TTSModels</a>
125
- </p>
126
- <p>
127
- w2v2-vits:
128
- <a href="https://github.com/CjangCjengh/TTSModels" rel="noreferrer" target="_blank">CjangCjengh/TTSModels</a>
129
- </p>
130
- <p>
131
- vctk:
132
- <a href="https://github.com/jaywalnut310/vits" rel="noreferrer" target="_blank">jaywalnut310/vits</a>
133
- </p>
134
- <p>
135
- Bishojo Mangekyo:
136
- <a href="https://github.com/Francis-Komizu/VITS" rel="noreferrer" target="_blank">Francis-Komizu/VITS</a>
137
- </p>
138
- <p>
139
- genshin:
140
- <a href="https://huggingface.co/spaces/zomehwh/vits-uma-genshin-honkai" rel="noreferrer" target="_blank">zomehwh/vits-uma-genshin-honkai</a>
141
- </p>
142
- <p>
143
- paimon:
144
- <a href="https://github.com/zixiiu/Digital_Life_Server" rel="noreferrer" target="_blank">zixiiu/Digital_Life_Server</a>
145
- </p>
146
- <p>
147
- vits_chinese:
148
- <a href="https://github.com/PlayVoice/vits_chinese" rel="noreferrer" target="_blank">PlayVoice/vits_chinese</a>
149
- </p>
150
- </main>
151
-
152
- <script src="/static/js/jquery.slim.min.js"></script>
153
- <script src="/static/js/bootstrap.bundle.min.js"></script>
154
-
155
- <script>
156
- function getProtocol(){
157
- return 'https:' == location.protocol ? "https://": "http://";
158
- }
159
-
160
- function getUrl(){
161
- var url = window.location.host;
162
- return url;
163
- }
164
-
165
- var baseUrl = getProtocol() + getUrl();
166
-
167
- setBaseUrl();
168
-
169
- function setBaseUrl(){
170
- var text = document.getElementById("inputText").value;
171
- var id = document.getElementById("inputId").value;
172
-
173
- var vitsLink = document.getElementById("vitsLink");
174
- var speakersLink = document.getElementById("speakersLink");
175
-
176
- var vitsUrl = baseUrl + "/voice/vits?text=" + text + "&id=" + id;
177
- var speakersUrl = baseUrl + "/voice/speakers";
178
-
179
- vitsLink.href = vitsUrl;
180
- vitsLink.textContent = vitsUrl;
181
-
182
- speakersLink.href = speakersUrl;
183
- speakersLink.textContent = speakersUrl;
184
- }
185
-
186
- function getLink() {
187
- var text = document.getElementById("inputText").value;
188
- var id = document.getElementById("inputId").value;
189
- var format = document.getElementById("inputFormat").value;
190
- var lang = document.getElementById("inputLang").value;
191
- var length = document.getElementById("inputLength").value;
192
- var noise = document.getElementById("inputNoise").value;
193
- var noisew = document.getElementById("inputNoisew").value;
194
- var max = document.getElementById("inputMax").value;
195
-
196
- var url = baseUrl + "/voice/vits?text=" + text + "&id=" + id;
197
- if (format != "") {
198
- url += "&format=" + format;
199
- }
200
- if (lang != "") {
201
- url += "&lang=" + lang;
202
- }
203
- if (length != "") {
204
- url += "&length=" + length;
205
- }
206
- if (noise != "") {
207
- url += "&noise=" + noise;
208
- }
209
- if (noisew != "") {
210
- url += "&noisew=" + noisew;
211
- }
212
- if (max != "") {
213
- url += "&max=" + max;
214
- }
215
- return url;
216
- }
217
-
218
- function updateLink() {
219
- var url = getLink();
220
- var link = document.getElementById("vitsLink");
221
- link.href = url;
222
- link.textContent = url;
223
- }
224
-
225
- function setAudioSource() {
226
- var url = getLink();
227
- var audioPlayer = document.getElementById("audioPlayer");
228
- audioPlayer.src = url;
229
- }
230
-
231
- var button = document.getElementById("getAudio");
232
- button.addEventListener("click", function () {
233
- setAudioSource();
234
- });
235
- </script>
236
- </body>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
237
  </html>
 
1
  <!DOCTYPE html>
2
  <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8"/>
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0"/>
6
+ <title>vits-simple-api</title>
7
+
8
+ <link rel="stylesheet" href="/static/css/bootstrap.min.css"/>
9
+ </head>
10
+ <body>
11
+ <main style="margin: 0 auto; width: 1024px">
12
+ <h1>
13
+ <a href="https://github.com/Artrajz/vits-simple-api" target="_blank"
14
+ style="text-decoration: none; color: black"> vits-simple-api </a>
15
+ </h1>
16
+
17
+ <div>
18
+ <label>文档:</label>
19
+ <a href="https://github.com/Artrajz/vits-simple-api" target="_blank"
20
+ style="text-decoration: none; color: black"> https://github.com/Artrajz/vits-simple-api </a>
21
+ </div>
22
+ <div>
23
+ <label>返回speakers(json):</label>
24
+ <a id="speakersLink" href="https://artrajz-vits-simple-api.hf.space/voice/speakers" target="_blank"
25
+ style="text-decoration: none; color: black">
26
+ https://artrajz-vits-simple-api.hf.space/voice/speakers
27
+ </a>
28
+ </div>
29
+ <div>
30
+ <label>简单调用api:</label>
31
+ <a id="vitsLink" href="https://artrajz-vits-simple-api.hf.space/voice/vits?text=你好,こんにちは&id=164"
32
+ style="text-decoration: none; color: black">
33
+ https://artrajz-vits-simple-api.hf.space/voice/vits?text=你好,こんにちは&id=164
34
+ </a>
35
+ </div>
36
+
37
+ <!-- <div style="display: flex; justify-content: center; align-items: center"> -->
38
+ <div>
39
+ <form>
40
+ <div class="form-group">
41
+ <label>text</label>
42
+ <textarea class="form-control" id="inputText" rows="3" oninput="updateLink()">你好,こんにちは</textarea>
43
+ </div>
44
+ <div class="form-group">
45
+ <label>id</label>
46
+ <select class="form-control" id="inputId" oninput="updateLink()">
47
+ {% for speaker in speakers["VITS"] %}
48
+ {% if speaker["name"] == "雷电将军(雷神)" %}
49
+ <option value="{{ speaker["id"] }}" selected>{{ speaker["id"] }} | {{ speaker["name"] }}
50
+ | {{ speaker["lang"] }}</option>
51
+ {% else %}
52
+ <option value="{{ speaker["id"] }}">{{ speaker["id"] }} | {{ speaker["name"] }}
53
+ | {{ speaker["lang"] }}</option>
54
+ {% endif %}
55
+ {% endfor %}
56
+ </select>
57
+ </div>
58
+ </form>
59
+ </div>
60
+ <p>
61
+ <button class="btn btn-primary" type="button" data-toggle="collapse" data-target="#collapseExample"
62
+ aria-expanded="false" aria-controls="collapseExample">
63
+ Advanced
64
+ </button>
65
+ {% if speakers_count == 0 %}
66
+ <div style="color: red;">未加载任何模型</div>
67
+ {% endif %}
68
+ </p>
69
+ <div class="collapse" id="collapseExample">
70
+ <div class="card card-body">
71
+ <form>
72
+ <div class="form-group">
73
+ <label>format</label>
74
+ <select class="form-control" id="inputFormat" oninput="updateLink()">
75
+ <option></option>
76
+ <option>wav</option>
77
+ <option>mp3</option>
78
+ <option>ogg</option>
79
+ <option>silk</option>
80
+ </select>
81
+ </div>
82
+ <div class="form-group">
83
+ <label>lang</label>
84
+ <input type="text" class="form-control" id="inputLang" oninput="updateLink()" value=""
85
+ placeholder="auto"/>
86
+ </div>
87
+ <div class="form-group">
88
+ <label>length</label>
89
+ <input type="text" class="form-control" id="inputLength" oninput="updateLink()" value=""
90
+ placeholder="1"/>
91
+ </div>
92
+ <div class="form-group">
93
+ <label>noise</label>
94
+ <input type="text" class="form-control" id="inputNoise" oninput="updateLink()" value=""
95
+ placeholder="0.33"/>
96
+ </div>
97
+ <div class="form-group">
98
+ <label>noisew</label>
99
+ <input type="text" class="form-control" id="inputNoisew" oninput="updateLink()" value=""
100
+ placeholder="0.4"/>
101
+ </div>
102
+ <div class="form-group">
103
+ <label>max</label>
104
+ <input type="text" class="form-control" id="inputMax" oninput="updateLink()" value=""
105
+ placeholder="50"/>
106
+ </div>
107
+ </form>
108
+ </div>
109
+ </div>
110
+
111
+ <div style="display: flex; justify-content: center; align-items: center; height: 80px; margin-top: 20px; margin-bottom: 20px; border: 1px solid rgba(0,0,0,.125); border-radius: 0.25rem;">
112
+ <button type="button" class="btn btn-outline-secondary" id="getAudio" style="margin-right: 10px">播放器生成</button>
113
+ <audio id="audioPlayer" controls>
114
+ <source src="" type="audio/mp3"/>
115
+ Your browser does not support the audio element.
116
+ </audio>
117
+ <div class="form-group form-check">
118
+ <input type="checkbox" id="streaming">
119
+ <label class="form-check-label">流式响应</label>
120
+ </div>
121
+ </div>
122
+ <div>自动识别语言:可识别的语言根据不同speaker而不同,方言无法自动识别</div>
123
+ <div>方言模型需要手动指定语言,比如粤语Cantonese要指定参数lang=gd</div>
124
+ <br/>
125
+
126
+ <h2>所有模型均为网络搜集,感谢模型原作者的付出!</h2>
127
+ <p>
128
+ Nene_Nanami_Rong_Tang:
129
+ <a href="https://github.com/CjangCjengh/TTSModels" rel="noreferrer" target="_blank">CjangCjengh/TTSModels</a>
130
+ </p>
131
+ <p>
132
+ louise:
133
+ <a href="https://github.com/CjangCjengh/TTSModels" rel="noreferrer" target="_blank">CjangCjengh/TTSModels</a>
134
+ </p>
135
+ <p>
136
+ Cantonese:
137
+ <a href="https://github.com/CjangCjengh/TTSModels" rel="noreferrer" target="_blank">CjangCjengh/TTSModels</a>
138
+ </p>
139
+ <p>
140
+ shanghainese:
141
+ <a href="https://github.com/CjangCjengh/TTSModels" rel="noreferrer" target="_blank">CjangCjengh/TTSModels</a>
142
+ </p>
143
+ <p>
144
+ w2v2-vits:
145
+ <a href="https://github.com/CjangCjengh/TTSModels" rel="noreferrer" target="_blank">CjangCjengh/TTSModels</a>
146
+ </p>
147
+ <p>
148
+ vctk:
149
+ <a href="https://github.com/jaywalnut310/vits" rel="noreferrer" target="_blank">jaywalnut310/vits</a>
150
+ </p>
151
+ <p>
152
+ Bishojo Mangekyo:
153
+ <a href="https://github.com/Francis-Komizu/VITS" rel="noreferrer" target="_blank">Francis-Komizu/VITS</a>
154
+ </p>
155
+ <p>
156
+ genshin:
157
+ <a href="https://huggingface.co/spaces/zomehwh/vits-uma-genshin-honkai" rel="noreferrer" target="_blank">zomehwh/vits-uma-genshin-honkai</a>
158
+ </p>
159
+ <p>
160
+ paimon:
161
+ <a href="https://github.com/zixiiu/Digital_Life_Server" rel="noreferrer" target="_blank">zixiiu/Digital_Life_Server</a>
162
+ </p>
163
+ <p>
164
+ vits_chinese:
165
+ <a href="https://github.com/PlayVoice/vits_chinese" rel="noreferrer" target="_blank">PlayVoice/vits_chinese</a>
166
+ </p>
167
+
168
+ </main>
169
+
170
+ <script src="/static/js/jquery.slim.min.js"></script>
171
+ <script src="/static/js/bootstrap.bundle.min.js"></script>
172
+
173
+ <script>
174
/**
 * Return the URL scheme prefix of the current page.
 * Anything other than HTTPS falls back to plain "http://".
 */
function getProtocol() {
    if (location.protocol == 'https:') {
        return "https://";
    }
    return "http://";
}
177
+
178
/**
 * Return the host part (hostname plus optional port) of the current page.
 */
function getUrl() {
    return window.location.host;
}
182
+
183
// Scheme + host of the page that served this document; every API link
// shown on the page is built on top of it.
var baseUrl = getProtocol() + getUrl();

// Fill in the example links once at load time. setBaseUrl is a function
// declaration, so it is hoisted and can be called before its definition.
setBaseUrl();
186
+
187
/**
 * Initialise the two example links (VITS synthesis and speaker list)
 * from the current values of the text and id inputs.
 *
 * The text and id are percent-encoded before being placed in the query
 * string; without that, characters such as "&", "#", "+" or spaces in the
 * text would truncate or corrupt the request parameters.
 */
function setBaseUrl() {
    var text = document.getElementById("inputText").value;
    var id = document.getElementById("inputId").value;

    var vitsLink = document.getElementById("vitsLink");
    var speakersLink = document.getElementById("speakersLink");

    var vitsUrl = baseUrl + "/voice/vits?text=" + encodeURIComponent(text) +
        "&id=" + encodeURIComponent(id);
    var speakersUrl = baseUrl + "/voice/speakers";

    vitsLink.href = vitsUrl;
    vitsLink.textContent = vitsUrl;

    speakersLink.href = speakersUrl;
    speakersLink.textContent = speakersUrl;
}
203
+
204
/**
 * Build the /voice/vits request URL from the form inputs.
 *
 * "text" and "id" are always sent. The remaining parameters (format, lang,
 * length, noise, noisew, max) are appended only when their input is
 * non-empty, so the server-side defaults apply otherwise.
 *
 * Every value is percent-encoded: the text is free-form user input, and an
 * unescaped "&", "#", "+" or space would otherwise break the query string.
 *
 * @returns {string} the complete request URL
 */
function getLink() {
    function fieldValue(elementId) {
        return document.getElementById(elementId).value;
    }

    var url = baseUrl + "/voice/vits?text=" + encodeURIComponent(fieldValue("inputText")) +
        "&id=" + encodeURIComponent(fieldValue("inputId"));

    // [query parameter name, id of the input element that supplies it],
    // in the order the parameters appear in the generated URL.
    var optionalFields = [
        ["format", "inputFormat"],
        ["lang", "inputLang"],
        ["length", "inputLength"],
        ["noise", "inputNoise"],
        ["noisew", "inputNoisew"],
        ["max", "inputMax"]
    ];

    optionalFields.forEach(function (field) {
        var value = fieldValue(field[1]);
        if (value != "") {
            url += "&" + field[0] + "=" + encodeURIComponent(value);
        }
    });

    return url;
}
235
+
236
/**
 * Refresh the displayed VITS link so that both its target and its visible
 * text match the current form values. Wired to the inputs' oninput handlers.
 */
function updateLink() {
    var currentUrl = getLink();
    var anchor = document.getElementById("vitsLink");
    anchor.textContent = currentUrl;
    anchor.href = currentUrl;
}
242
+
243
/**
 * Point the audio player at the URL for the current form values and start
 * playback. When the "streaming" checkbox is ticked, "&streaming=true" is
 * appended to the request.
 */
function setAudioSource() {
    var requestUrl = getLink();
    if (document.getElementById('streaming').checked) {
        requestUrl += '&streaming=true';
    }

    var player = document.getElementById("audioPlayer");
    player.src = requestUrl;
    player.play();
}
254
+
255
// Generate and play the audio when the "getAudio" button is clicked.
var button = document.getElementById("getAudio");
button.addEventListener("click", function () { setAudioSource(); });
259
+ </script>
260
+ </body>
261
  </html>
utils/utils.py CHANGED
@@ -89,3 +89,7 @@ def clean_folder(folder_path):
89
  # is none -> True, is not none -> False
90
  def check_is_none(s):
91
  return s is None or (isinstance(s, str) and str(s).isspace()) or str(s) == ""
 
 
 
 
 
89
  # is none -> True, is not none -> False
90
def check_is_none(s):
    """Return True when ``s`` carries no usable value.

    A value counts as "none" if it is ``None``, a string made up only of
    whitespace, or anything whose ``str()`` form is the empty string.
    """
    if s is None:
        return True
    if isinstance(s, str) and str(s).isspace():
        return True
    return str(s) == ""
92
+
93
def save_audio(audio, path):
    """Write the bytes ``audio`` to ``path``, replacing any existing file."""
    with open(path, "wb") as out_file:
        out_file.write(audio)
voice.py CHANGED
@@ -1,7 +1,6 @@
1
  import os
2
  import librosa
3
  import commons
4
- import sys
5
  import re
6
  import numpy as np
7
  import torch
@@ -156,7 +155,7 @@ class vits:
156
 
157
  return params
158
 
159
- def get_audio(self, voice, auto_break=False):
160
  text = voice.get("text", None)
161
  speaker_id = voice.get("id", 0)
162
  length = voice.get("length", 1)
@@ -171,47 +170,57 @@ class vits:
171
  # 去除所有多余的空白字符
172
  if text is not None: text = re.sub(r'\s+', ' ', text).strip()
173
 
174
- # 停顿0.75s,避免语音分段合成再拼接后的连接突兀
175
- brk = np.zeros(int(0.75 * 22050), dtype=np.int16)
176
-
177
  tasks = []
178
  if self.model_type == "vits":
179
  sentence_list = sentence_split(text, max, lang, speaker_lang)
180
  for sentence in sentence_list:
181
- tasks.append(
182
- self.get_infer_param(text=sentence, speaker_id=speaker_id, length_scale=length, noise_scale=noise,
183
- noise_scale_w=noisew))
184
-
185
- audios = []
186
- for task in tasks:
187
- audios.append(self.infer(task))
188
- if auto_break:
189
- audios.append(brk)
190
-
191
- audio = np.concatenate(audios, axis=0)
192
 
193
  elif self.model_type == "hubert":
194
  params = self.get_infer_param(speaker_id=speaker_id, length_scale=length, noise_scale=noise,
195
  noise_scale_w=noisew, audio_path=audio_path)
196
- audio = self.infer(params)
197
 
198
  elif self.model_type == "w2v2":
199
  sentence_list = sentence_split(text, max, lang, speaker_lang)
200
  for sentence in sentence_list:
201
- tasks.append(
202
- self.get_infer_param(text=sentence, speaker_id=speaker_id, length_scale=length, noise_scale=noise,
203
- noise_scale_w=noisew, emotion=emotion))
204
 
205
- audios = []
206
- for task in tasks:
207
- audios.append(self.infer(task))
208
- if auto_break:
209
- audios.append(brk)
210
 
211
- audio = np.concatenate(audios, axis=0)
 
 
 
 
 
 
 
 
 
 
 
212
 
 
213
  return audio
214
 
 
 
 
 
 
 
 
 
 
 
 
 
 
215
  def voice_conversion(self, voice):
216
  audio_path = voice.get("audio_path")
217
  original_id = voice.get("original_id")
@@ -330,6 +339,14 @@ class TTS:
330
  else:
331
  raise ValueError("Unsupported time unit: {}".format(time_unit))
332
 
 
 
 
 
 
 
 
 
333
  def parse_ssml(self, ssml):
334
  root = ET.fromstring(ssml)
335
  format = root.attrib.get("format", "wav")
@@ -403,7 +420,7 @@ class TTS:
403
 
404
  return voice_tasks, format
405
 
406
- def create_ssml_infer_task(self, ssml):
407
  voice_tasks, format = self.parse_ssml(ssml)
408
 
409
  audios = []
@@ -420,38 +437,66 @@ class TTS:
420
  audios.append(audio)
421
 
422
  audio = np.concatenate(audios, axis=0)
423
- output = self.encode(voice_obj.hps_ms.data.sampling_rate, audio, format)
424
-
425
- return output, format
 
 
426
 
427
- def vits_infer(self, voice):
428
  format = voice.get("format", "wav")
429
  voice_obj = self._voice_obj["VITS"][voice.get("id")][1]
430
  voice["id"] = self._voice_obj["VITS"][voice.get("id")][0]
 
431
  audio = voice_obj.get_audio(voice, auto_break=True)
432
- output = self.encode(voice_obj.hps_ms.data.sampling_rate, audio, format)
433
-
434
- return output
 
 
435
 
436
- def hubert_vits_infer(self, voice):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
437
  format = voice.get("format", "wav")
438
  voice_obj = self._voice_obj["HUBERT-VITS"][voice.get("id")][1]
439
  voice["id"] = self._voice_obj["HUBERT-VITS"][voice.get("id")][0]
 
440
  audio = voice_obj.get_audio(voice)
441
- output = self.encode(voice_obj.hps_ms.data.sampling_rate, audio, format)
442
-
443
- return output
 
 
444
 
445
- def w2v2_vits_infer(self, voice):
446
  format = voice.get("format", "wav")
447
  voice_obj = self._voice_obj["W2V2-VITS"][voice.get("id")][1]
448
  voice["id"] = self._voice_obj["W2V2-VITS"][voice.get("id")][0]
 
449
  audio = voice_obj.get_audio(voice, auto_break=True)
450
- output = self.encode(voice_obj.hps_ms.data.sampling_rate, audio, format)
 
 
 
 
451
 
452
- return output
453
-
454
- def vits_voice_conversion(self, voice):
455
  original_id = voice.get("original_id")
456
  target_id = voice.get("target_id")
457
  format = voice.get("format")
@@ -466,10 +511,14 @@ class TTS:
466
  voice["target_id"] = int(self._voice_obj["VITS"][target_id][0])
467
 
468
  voice_obj = self._voice_obj["VITS"][original_id][1]
469
- audio = voice_obj.voice_conversion(voice)
470
- output = self.encode(voice_obj.hps_ms.data.sampling_rate, audio, format)
471
 
472
- return output
 
 
 
 
 
473
 
474
  def get_dimensional_emotion_npy(self, audio):
475
  if self.dem is None:
 
1
  import os
2
  import librosa
3
  import commons
 
4
  import re
5
  import numpy as np
6
  import torch
 
155
 
156
  return params
157
 
158
+ def get_tasks(self, voice):
159
  text = voice.get("text", None)
160
  speaker_id = voice.get("id", 0)
161
  length = voice.get("length", 1)
 
170
  # 去除所有多余的空白字符
171
  if text is not None: text = re.sub(r'\s+', ' ', text).strip()
172
 
 
 
 
173
  tasks = []
174
  if self.model_type == "vits":
175
  sentence_list = sentence_split(text, max, lang, speaker_lang)
176
  for sentence in sentence_list:
177
+ params = self.get_infer_param(text=sentence, speaker_id=speaker_id, length_scale=length,
178
+ noise_scale=noise, noise_scale_w=noisew)
179
+ tasks.append(params)
 
 
 
 
 
 
 
 
180
 
181
  elif self.model_type == "hubert":
182
  params = self.get_infer_param(speaker_id=speaker_id, length_scale=length, noise_scale=noise,
183
  noise_scale_w=noisew, audio_path=audio_path)
184
+ tasks.append(params)
185
 
186
  elif self.model_type == "w2v2":
187
  sentence_list = sentence_split(text, max, lang, speaker_lang)
188
  for sentence in sentence_list:
189
+ params = self.get_infer_param(text=sentence, speaker_id=speaker_id, length_scale=length,
190
+ noise_scale=noise, noise_scale_w=noisew, emotion=emotion)
191
+ tasks.append(params)
192
 
193
+ return tasks
 
 
 
 
194
 
195
def get_audio(self, voice, auto_break=False):
    """Synthesize all tasks for ``voice`` and return them as one array.

    Args:
        voice: Request dict passed through to ``self.get_tasks``.
        auto_break: When true, a 0.75 s stretch of silence is appended
            after every synthesized segment.

    Returns:
        The segments (and pauses, if requested) concatenated along axis 0.

    Raises:
        ValueError: Raised by ``np.concatenate`` when ``get_tasks`` yields
            no task at all.
    """
    tasks = self.get_tasks(voice)

    # Pause for 0.75 s so that segments synthesized separately do not sound
    # abruptly joined after concatenation. The number of samples follows the
    # model's own sampling rate (the rate the result is later encoded with);
    # a fixed 22050 would give the wrong duration for any other rate.
    sampling_rate = self.hps_ms.data.sampling_rate
    pause = np.zeros(int(0.75 * sampling_rate), dtype=np.int16)

    segments = []
    for task in tasks:
        segment = self.infer(task)
        if auto_break:
            segment = np.concatenate((segment, pause), axis=0)
        segments.append(segment)

    return np.concatenate(segments, axis=0)
210
 
211
def get_stream_audio(self, voice, auto_break=False):
    """Synthesize the tasks for ``voice`` lazily, one segment at a time.

    Args:
        voice: Request dict passed through to ``self.get_tasks``.
        auto_break: When true, a 0.75 s stretch of silence is appended
            to every yielded segment.

    Yields:
        One array per task, in task order.
    """
    tasks = self.get_tasks(voice)

    # Size the 0.75 s pause from the model's own sampling rate rather than
    # a fixed 22050, so the pause keeps its duration on models that use a
    # different rate.
    sampling_rate = self.hps_ms.data.sampling_rate
    pause = np.zeros(int(0.75 * sampling_rate), dtype=np.int16)

    for task in tasks:
        segment = self.infer(task)
        if auto_break:
            segment = np.concatenate((segment, pause), axis=0)
        yield segment
223
+
224
  def voice_conversion(self, voice):
225
  audio_path = voice.get("audio_path")
226
  original_id = voice.get("original_id")
 
339
  else:
340
  raise ValueError("Unsupported time unit: {}".format(time_unit))
341
 
342
def generate_audio_chunks(self, audio):
    """Yield the contents of the file-like ``audio`` in blocks of up to 4096.

    Reading starts at the stream's current position and stops at the first
    empty read.
    """
    chunk_size = 4096
    block = audio.read(chunk_size)
    while block:
        yield block
        block = audio.read(chunk_size)
349
+
350
  def parse_ssml(self, ssml):
351
  root = ET.fromstring(ssml)
352
  format = root.attrib.get("format", "wav")
 
420
 
421
  return voice_tasks, format
422
 
423
+ def create_ssml_infer_task(self, ssml, fname):
424
  voice_tasks, format = self.parse_ssml(ssml)
425
 
426
  audios = []
 
437
  audios.append(audio)
438
 
439
  audio = np.concatenate(audios, axis=0)
440
+ encoded_audio = self.encode(voice_obj.hps_ms.data.sampling_rate, audio, format)
441
+ if config.SAVE_AUDIO:
442
+ path = f"{config.CACHE_PATH}/{fname}"
443
+ utils.save_audio(encoded_audio.getvalue(), path)
444
+ return encoded_audio, format
445
 
446
def vits_infer(self, voice, fname):
    """Run VITS synthesis for ``voice`` and return the encoded audio.

    The entry found under ``voice["id"]`` in ``self._voice_obj["VITS"]``
    supplies the model object (index 1) and the id that is written back
    into ``voice["id"]`` (index 0) before synthesis. When
    ``config.SAVE_AUDIO`` is set, the encoded bytes are also written to
    ``config.CACHE_PATH`` under ``fname``.
    """
    audio_format = voice.get("format", "wav")
    entry = self._voice_obj["VITS"][voice.get("id")]
    model = entry[1]
    voice["id"] = entry[0]
    rate = model.hps_ms.data.sampling_rate
    waveform = model.get_audio(voice, auto_break=True)
    encoded_audio = self.encode(rate, waveform, audio_format)
    if config.SAVE_AUDIO:
        utils.save_audio(encoded_audio.getvalue(), f"{config.CACHE_PATH}/{fname}")
    return encoded_audio
457
 
458
def stream_vits_infer(self, voice, fname):
    """Stream VITS synthesis for ``voice`` as encoded byte chunks.

    Each segment produced by the model's ``get_stream_audio`` is encoded on
    its own and handed out through ``self.generate_audio_chunks``. When
    ``config.SAVE_AUDIO`` is set, the encoded segments are accumulated and
    written to ``config.CACHE_PATH`` under ``fname`` once the stream has
    been fully consumed.

    This is a generator: nothing runs, including the update of
    ``voice["id"]``, until iteration starts.
    """
    audio_format = voice.get("format", "wav")
    entry = self._voice_obj["VITS"][voice.get("id")]
    model = entry[1]
    voice["id"] = entry[0]
    rate = model.hps_ms.data.sampling_rate
    segments = model.get_stream_audio(voice, auto_break=True)
    saved = BytesIO()
    for segment in segments:
        # NOTE(review): every segment is encoded independently, so the
        # stream may carry one container header per segment, depending on
        # what self.encode emits -- confirm against encode().
        encoded_audio = self.encode(rate, segment, audio_format)
        yield from self.generate_audio_chunks(encoded_audio)
        if config.SAVE_AUDIO:
            saved.write(encoded_audio.getvalue())
    if config.SAVE_AUDIO:
        utils.save_audio(saved.getvalue(), f"{config.CACHE_PATH}/{fname}")
474
+
475
def hubert_vits_infer(self, voice, fname):
    """Run HuBERT-VITS synthesis for ``voice`` and return the encoded audio.

    The entry found under ``voice["id"]`` in
    ``self._voice_obj["HUBERT-VITS"]`` supplies the model object (index 1)
    and the id that is written back into ``voice["id"]`` (index 0). No
    pause is inserted between segments. When ``config.SAVE_AUDIO`` is set,
    the encoded bytes are also written to ``config.CACHE_PATH`` under
    ``fname``.
    """
    audio_format = voice.get("format", "wav")
    entry = self._voice_obj["HUBERT-VITS"][voice.get("id")]
    model = entry[1]
    voice["id"] = entry[0]
    rate = model.hps_ms.data.sampling_rate
    waveform = model.get_audio(voice)
    encoded_audio = self.encode(rate, waveform, audio_format)
    if config.SAVE_AUDIO:
        utils.save_audio(encoded_audio.getvalue(), f"{config.CACHE_PATH}/{fname}")
    return encoded_audio
486
 
487
def w2v2_vits_infer(self, voice, fname):
    """Run W2V2-VITS synthesis for ``voice`` and return the encoded audio.

    The entry found under ``voice["id"]`` in
    ``self._voice_obj["W2V2-VITS"]`` supplies the model object (index 1)
    and the id that is written back into ``voice["id"]`` (index 0). When
    ``config.SAVE_AUDIO`` is set, the encoded bytes are also written to
    ``config.CACHE_PATH`` under ``fname``.
    """
    audio_format = voice.get("format", "wav")
    entry = self._voice_obj["W2V2-VITS"][voice.get("id")]
    model = entry[1]
    voice["id"] = entry[0]
    rate = model.hps_ms.data.sampling_rate
    waveform = model.get_audio(voice, auto_break=True)
    encoded_audio = self.encode(rate, waveform, audio_format)
    if config.SAVE_AUDIO:
        utils.save_audio(encoded_audio.getvalue(), f"{config.CACHE_PATH}/{fname}")
    return encoded_audio
498
 
499
+ def vits_voice_conversion(self, voice, fname):
 
 
500
  original_id = voice.get("original_id")
501
  target_id = voice.get("target_id")
502
  format = voice.get("format")
 
511
  voice["target_id"] = int(self._voice_obj["VITS"][target_id][0])
512
 
513
  voice_obj = self._voice_obj["VITS"][original_id][1]
514
+ sampling_rate = voice_obj.hps_ms.data.sampling_rate
 
515
 
516
+ audio = voice_obj.voice_conversion(voice)
517
+ encoded_audio = self.encode(sampling_rate, audio, format)
518
+ if config.SAVE_AUDIO:
519
+ path = f"{config.CACHE_PATH}/{fname}"
520
+ utils.save_audio(encoded_audio.getvalue(), path)
521
+ return encoded_audio
522
 
523
  def get_dimensional_emotion_npy(self, audio):
524
  if self.dem is None: