import os, logging, datetime, json, random
import gradio as gr
import numpy as np
import torch
import re_matching
import utils
from infer import infer, latest_version, get_net_g, infer_multilang
from config import config
from tools.webui import reload_javascript, get_character_html
from tools.sentence import split_by_language
logging.basicConfig(
    level=logging.INFO,
    format='[%(levelname)s|%(asctime)s]%(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
device = config.webui_config.device
if device == "mps":
    os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
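
# Load hyperparameters and the synthesizer network once at startup; speak_fn
# below reuses these module-level globals for every request.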
hps = utils.get_hparams_from_file(config.webui_config.config_path)
version = hps.version if hasattr(hps, "version") else latest_version
net_g = get_net_g(model_path=config.webui_config.model, version=version, device=device, hps=hps)
with open("./css/style.css", "r", encoding="utf-8") as f:
customCSS = f.read()
with open("./assets/lines.json", "r", encoding="utf-8") as f:
full_lines = json.load(f)
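# Note (assumption): lines.json is expected to map a category name to numbered
# sentences, e.g. {"Welcome": {"1": "...", "2": "..."}}, since get_sentence()
# below indexes it as full_lines[category][f"{index}"].
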
def speak_fn(
    text: str,
    exceed_flag,
    speaker="TalkFlower_CNzh",
    sdp_ratio=0.2,  # SDP/DP mix ratio
    noise_scale=0.6,  # emotional expressiveness
    noise_scale_w=0.6,  # phoneme length variation
    length_scale=0.9,  # speaking speed
    language="ZH",
    reference_audio=None,
    emotion=4,
    interval_between_para=0.2,  # pause between paragraphs
    interval_between_sent=1,  # pause between sentences
):
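    """Synthesize speech for `text` with the selected speaker.

    Returns a Gradio audio update, the character HTML for the spoken text,
    the (possibly toggled) exceed_flag, and an update that re-enables the
    submit button. Inputs longer than 512 characters are not synthesized;
    a canned "too long" line and a pre-recorded audio clip are returned
    instead.
    """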
    # Map the speaker name coming from the UI to an internal voice ID.
    if speaker == "Chinese":
        speaker = "TalkFlower_CNzh"
    elif speaker == "English":
        speaker = "TalkFlower_USen"
    elif speaker == "Japanese":
        speaker = "TalkFlower_JPja"
    else:
        speaker = "TalkFlower_CNzh"
    audio_list = []
    # Collapse repeated blank lines so paragraphs are separated by a single newline.
    while text.find("\n\n") != -1:
        text = text.replace("\n\n", "\n")
    if len(text) > 512:
        logging.info(f"Too Long Text: {text}")
        if speaker == "TalkFlower_CNzh":
            text = "这句太长了,憋坏我啦!"
            audio_value = "./assets/audios/overlength.wav"
        elif speaker == "TalkFlower_USen":
            text = "This sentence is too long!"
            audio_value = "./assets/audios/overlength_en.wav"
        elif speaker == "TalkFlower_JPja":
            text = "この文は長すぎます!"
            audio_value = "./assets/audios/overlength_ja.wav"
        # Toggle the flag, presumably so the front end can tell consecutive
        # over-length submissions apart.
        exceed_flag = not exceed_flag
    else:
        for idx, slice in enumerate(text.split("|")):
            if slice == "":
                continue
            skip_start = idx != 0
            skip_end = idx != len(text.split("|")) - 1
            sentences_list = split_by_language(
                slice, target_languages=["zh", "ja", "en"]
            )
            idx = 0
            while idx < len(sentences_list):
                text_to_generate = []
                lang_to_generate = []
                # Pack the detected sentences of this slice into one chunk,
                # keeping a language tag per sentence for infer_multilang.
                while True:
                    content, lang = sentences_list[idx]
                    temp_text = [content]
                    lang = lang.upper()
                    if lang == "JA":
                        lang = "JP"
                    if len(text_to_generate) > 0:
                        text_to_generate[-1] += [temp_text.pop(0)]
                        lang_to_generate[-1] += [lang]
                    if len(temp_text) > 0:
                        text_to_generate += [[i] for i in temp_text]
                        lang_to_generate += [[lang]] * len(temp_text)
                    if idx + 1 < len(sentences_list):
                        idx += 1
                    else:
                        break
                skip_start = (idx != 0) and skip_start
                skip_end = (idx != len(sentences_list) - 1) and skip_end
                logging.info(f"{speaker[-4:]}: {text_to_generate}{lang_to_generate}")
                # Run multilingual inference on each chunk and collect the
                # 16-bit audio segments.
                with torch.no_grad():
                    for i, piece in enumerate(text_to_generate):
                        skip_start = (i != 0) and skip_start
                        skip_end = (i != len(text_to_generate) - 1) and skip_end
                        audio = infer_multilang(
                            piece,
                            reference_audio=reference_audio,
                            emotion=emotion,
                            sdp_ratio=sdp_ratio,
                            noise_scale=noise_scale,
                            noise_scale_w=noise_scale_w,
                            length_scale=length_scale,
                            sid=speaker,
                            language=lang_to_generate[i],
                            hps=hps,
                            net_g=net_g,
                            device=device,
                            skip_start=skip_start,
                            skip_end=skip_end,
                        )
                        audio16bit = gr.processing_utils.convert_to_16_bit_wav(audio)
                        audio_list.append(audio16bit)
                idx += 1
    # Single-language inference (commented out):
    # if len(text) > 42:
    #     logging.info(f"Long Text: {text}")
    #     para_list = re_matching.cut_para(text)
    #     for p in para_list:
    #         audio_list_sent = []
    #         sent_list = re_matching.cut_sent(p)
    #         for s in sent_list:
    #             audio = infer(
    #                 s,
    #                 sdp_ratio=sdp_ratio,
    #                 noise_scale=noise_scale,
    #                 noise_scale_w=noise_scale_w,
    #                 length_scale=length_scale,
    #                 sid=speaker,
    #                 language=language,
    #                 hps=hps,
    #                 net_g=net_g,
    #                 device=device,
    #                 reference_audio=reference_audio,
    #                 emotion=emotion,
    #             )
    #             audio_list_sent.append(audio)
    #             silence = np.zeros((int)(44100 * interval_between_sent))
    #             audio_list_sent.append(silence)
    #         if (interval_between_para - interval_between_sent) > 0:
    #             silence = np.zeros((int)(44100 * (interval_between_para - interval_between_sent)))
    #             audio_list_sent.append(silence)
    #         audio16bit = gr.processing_utils.convert_to_16_bit_wav(np.concatenate(audio_list_sent))  # volume-normalize the full sentence
    #         audio_list.append(audio16bit)
    # else:
    #     logging.info(f"Short Text: {text}")
    #     silence = np.zeros(hps.data.sampling_rate // 2, dtype=np.int16)
    #     with torch.no_grad():
    #         for piece in text.split("|"):
    #             audio = infer(
    #                 piece,
    #                 sdp_ratio=sdp_ratio,
    #                 noise_scale=noise_scale,
    #                 noise_scale_w=noise_scale_w,
    #                 length_scale=length_scale,
    #                 sid=speaker,
    #                 language=language,
    #                 hps=hps,
    #                 net_g=net_g,
    #                 device=device,
    #                 reference_audio=reference_audio,
    #                 emotion=emotion,
    #             )
    #             audio16bit = gr.processing_utils.convert_to_16_bit_wav(audio)
    #             audio_list.append(audio16bit)
    #             audio_list.append(silence)  # append the silence to the list
        audio_concat = np.concatenate(audio_list)
        audio_value = (hps.data.sampling_rate, audio_concat)

    return gr.update(value=audio_value, autoplay=True), get_character_html(text), exceed_flag, gr.update(interactive=True)


def submit_lock_fn():
    # Disable the submit button while speak_fn is running; speak_fn's final
    # return value re-enables it.
    return gr.update(interactive=False)


def init_fn():
    gr.Info("2023-11-27: 支持多语言(中、英、日);支持更换音色! Support Chinese, English, Japanese; Support changing voices!")
    # gr.Info("2023-11-24: Improved long-sentence generation; added examples; updated some easter eggs; promised more features to come)")
    # gr.Info("Support languages: Chinese, English, Japanese. Suggestions are welcome in the Community tab~")
    index = random.randint(1, 7)
    welcome_text = get_sentence("Welcome", index)
    return get_character_html(welcome_text)  # gr.update(value=f"./assets/audios/Welcome{index}.wav", autoplay=False),


def get_sentence(category, index=-1):
    if index == -1:
        index = random.randint(1, len(full_lines[category]))
    return full_lines[category][f"{index}"]
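

# ---------------------------------------------------------------------------
# Illustrative sketch only (not part of the original file): one plausible way
# to wire the handlers above into a Gradio Blocks UI. All component names and
# the layout below are assumptions; the actual Space defines its own layout,
# custom JavaScript (see reload_javascript), and audio assets.
#
# reload_javascript()
# with gr.Blocks(css=customCSS) as demo:
#     exceed_state = gr.State(value=False)
#     character_area = gr.HTML()
#     input_text = gr.Textbox(label="Text", lines=3)
#     speaker_choice = gr.Radio(["Chinese", "English", "Japanese"], value="Chinese", label="Voice")
#     speak_button = gr.Button("Speak")
#     audio_output = gr.Audio(interactive=False)
#
#     # Lock the button, synthesize, then speak_fn's last output unlocks it.
#     speak_button.click(submit_lock_fn, outputs=speak_button).then(
#         speak_fn,
#         inputs=[input_text, exceed_state, speaker_choice],
#         outputs=[audio_output, character_area, exceed_state, speak_button],
#     )
#     demo.load(init_fn, outputs=character_area)
#
# demo.launch()
# ---------------------------------------------------------------------------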