zhzluke96
committed on
Commit
•
ebc4336
1
Parent(s):
21473c0
update
Browse files
- language/zh-CN.json +3 -2
- modules/api/impl/google_api.py +12 -6
- modules/api/impl/openai_api.py +5 -1
- modules/api/impl/refiner_api.py +5 -1
- modules/api/impl/ssml_api.py +15 -5
- modules/api/impl/tts_api.py +38 -1
- modules/generate_audio.py +5 -1
- modules/normalization.py +6 -0
- modules/ssml.py +0 -242
- modules/ssml_parser/SSMLParser.py +9 -3
- modules/utils/git.py +15 -0
- modules/utils/markdown.py +7 -0
- modules/webui/localization.py +9 -3
- modules/webui/speaker/speaker_editor.py +147 -0
- modules/webui/speaker_tab.py +3 -0
- modules/webui/ssml/podcast_tab.py +38 -17
- modules/webui/tts_tab.py +3 -1
- modules/webui/webui_utils.py +1 -3
- webui.py +1 -4
language/zh-CN.json
CHANGED
@@ -57,8 +57,8 @@
|
|
57 |
"🔊Generate speaker.pt": "🔊生成 speaker.pt",
|
58 |
"Save .pt file": "保存.pt文件",
|
59 |
"Save to File": "保存到文件",
|
60 |
-
"🎤Test voice": "
|
61 |
-
"Test Voice": "
|
62 |
"Current Seed": "当前种子",
|
63 |
"Output Audio": "输出音频",
|
64 |
"Merger": "融合",
|
@@ -79,6 +79,7 @@
|
|
79 |
"README": "README",
|
80 |
"readme": "readme",
|
81 |
"changelog": "changelog",
|
|
|
82 |
"TTS_STYLE_GUIDE": ["后缀为 _p 表示带prompt,效果更强但是影响质量"],
|
83 |
"SSML_SPLITER_GUIDE": [
|
84 |
"- 字数限制详见README,超过部分将截断",
|
|
|
57 |
"🔊Generate speaker.pt": "🔊生成 speaker.pt",
|
58 |
"Save .pt file": "保存.pt文件",
|
59 |
"Save to File": "保存到文件",
|
60 |
+
"🎤Test voice": "🎤试语",
|
61 |
+
"Test Voice": "试语",
|
62 |
"Current Seed": "当前种子",
|
63 |
"Output Audio": "输出音频",
|
64 |
"Merger": "融合",
|
|
|
79 |
"README": "README",
|
80 |
"readme": "readme",
|
81 |
"changelog": "changelog",
|
82 |
+
"💼Speaker file": "💼音色文件",
|
83 |
"TTS_STYLE_GUIDE": ["后缀为 _p 表示带prompt,效果更强但是影响质量"],
|
84 |
"SSML_SPLITER_GUIDE": [
|
85 |
"- 字数限制详见README,超过部分将截断",
|
modules/api/impl/google_api.py
CHANGED
@@ -14,7 +14,7 @@ from modules import generate_audio as generate
|
|
14 |
from modules.speaker import speaker_mgr
|
15 |
|
16 |
|
17 |
-
from modules.
|
18 |
from modules.SynthesizeSegments import (
|
19 |
SynthesizeSegments,
|
20 |
combine_audio_segments,
|
@@ -65,6 +65,8 @@ async def google_text_synthesize(request: GoogleTextSynthesizeRequest):
|
|
65 |
audioConfig = request.audioConfig
|
66 |
|
67 |
# 提取参数
|
|
|
|
|
68 |
language_code = voice.languageCode
|
69 |
voice_name = voice.name
|
70 |
infer_seed = voice.seed or 42
|
@@ -86,9 +88,8 @@ async def google_text_synthesize(request: GoogleTextSynthesizeRequest):
|
|
86 |
# TODO maybe need to change the sample rate
|
87 |
sample_rate = 24000
|
88 |
|
89 |
-
#
|
90 |
-
|
91 |
-
if spk is None:
|
92 |
raise HTTPException(
|
93 |
status_code=400, detail="The specified voice name is not supported."
|
94 |
)
|
@@ -120,7 +121,8 @@ async def google_text_synthesize(request: GoogleTextSynthesizeRequest):
|
|
120 |
|
121 |
elif input.ssml:
|
122 |
# 处理SSML合成逻辑
|
123 |
-
|
|
|
124 |
for seg in segments:
|
125 |
seg["text"] = text_normalize(seg["text"], is_end=True)
|
126 |
|
@@ -171,7 +173,11 @@ async def google_text_synthesize(request: GoogleTextSynthesizeRequest):
|
|
171 |
import logging
|
172 |
|
173 |
logging.exception(e)
|
174 |
-
|
|
|
|
|
|
|
|
|
175 |
|
176 |
|
177 |
def setup(app: APIManager):
|
|
|
14 |
from modules.speaker import speaker_mgr
|
15 |
|
16 |
|
17 |
+
from modules.ssml_parser.SSMLParser import create_ssml_parser
|
18 |
from modules.SynthesizeSegments import (
|
19 |
SynthesizeSegments,
|
20 |
combine_audio_segments,
|
|
|
65 |
audioConfig = request.audioConfig
|
66 |
|
67 |
# 提取参数
|
68 |
+
|
69 |
+
# TODO 这个也许应该传给 normalizer
|
70 |
language_code = voice.languageCode
|
71 |
voice_name = voice.name
|
72 |
infer_seed = voice.seed or 42
|
|
|
88 |
# TODO maybe need to change the sample rate
|
89 |
sample_rate = 24000
|
90 |
|
91 |
+
# 虽然 calc_spk_style 可以解析 seed 形式,但是这个接口只准备支持 speakers list 中存在的 speaker
|
92 |
+
if speaker_mgr.get_speaker(voice_name) is None:
|
|
|
93 |
raise HTTPException(
|
94 |
status_code=400, detail="The specified voice name is not supported."
|
95 |
)
|
|
|
121 |
|
122 |
elif input.ssml:
|
123 |
# 处理SSML合成逻辑
|
124 |
+
parser = create_ssml_parser()
|
125 |
+
segments = parser.parse(input.ssml)
|
126 |
for seg in segments:
|
127 |
seg["text"] = text_normalize(seg["text"], is_end=True)
|
128 |
|
|
|
173 |
import logging
|
174 |
|
175 |
logging.exception(e)
|
176 |
+
|
177 |
+
if isinstance(e, HTTPException):
|
178 |
+
raise e
|
179 |
+
else:
|
180 |
+
raise HTTPException(status_code=500, detail=str(e))
|
181 |
|
182 |
|
183 |
def setup(app: APIManager):
|
modules/api/impl/openai_api.py
CHANGED
@@ -115,7 +115,11 @@ async def openai_speech_api(
|
|
115 |
import logging
|
116 |
|
117 |
logging.exception(e)
|
118 |
-
|
|
|
|
|
|
|
|
|
119 |
|
120 |
|
121 |
class TranscribeSegment(BaseModel):
|
|
|
115 |
import logging
|
116 |
|
117 |
logging.exception(e)
|
118 |
+
|
119 |
+
if isinstance(e, HTTPException):
|
120 |
+
raise e
|
121 |
+
else:
|
122 |
+
raise HTTPException(status_code=500, detail=str(e))
|
123 |
|
124 |
|
125 |
class TranscribeSegment(BaseModel):
|
modules/api/impl/refiner_api.py
CHANGED
@@ -42,7 +42,11 @@ async def refiner_prompt_post(request: RefineTextRequest):
|
|
42 |
import logging
|
43 |
|
44 |
logging.exception(e)
|
45 |
-
|
|
|
|
|
|
|
|
|
46 |
|
47 |
|
48 |
def setup(api_manager: APIManager):
|
|
|
42 |
import logging
|
43 |
|
44 |
logging.exception(e)
|
45 |
+
|
46 |
+
if isinstance(e, HTTPException):
|
47 |
+
raise e
|
48 |
+
else:
|
49 |
+
raise HTTPException(status_code=500, detail=str(e))
|
50 |
|
51 |
|
52 |
def setup(api_manager: APIManager):
|
modules/api/impl/ssml_api.py
CHANGED
@@ -7,7 +7,7 @@ from fastapi.responses import FileResponse
|
|
7 |
|
8 |
|
9 |
from modules.normalization import text_normalize
|
10 |
-
from modules.
|
11 |
from modules.SynthesizeSegments import (
|
12 |
SynthesizeSegments,
|
13 |
combine_audio_segments,
|
@@ -34,7 +34,7 @@ async def synthesize_ssml(
|
|
34 |
):
|
35 |
try:
|
36 |
ssml = request.ssml
|
37 |
-
format = request.format
|
38 |
batch_size = request.batch_size
|
39 |
|
40 |
if batch_size < 1:
|
@@ -42,10 +42,16 @@ async def synthesize_ssml(
|
|
42 |
status_code=400, detail="Batch size must be greater than 0."
|
43 |
)
|
44 |
|
45 |
-
if not ssml:
|
46 |
raise HTTPException(status_code=400, detail="SSML content is required.")
|
47 |
|
48 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
49 |
for seg in segments:
|
50 |
seg["text"] = text_normalize(seg["text"], is_end=True)
|
51 |
|
@@ -63,7 +69,11 @@ async def synthesize_ssml(
|
|
63 |
import logging
|
64 |
|
65 |
logging.exception(e)
|
66 |
-
|
|
|
|
|
|
|
|
|
67 |
|
68 |
|
69 |
def setup(api_manager: APIManager):
|
|
|
7 |
|
8 |
|
9 |
from modules.normalization import text_normalize
|
10 |
+
from modules.ssml_parser.SSMLParser import create_ssml_parser
|
11 |
from modules.SynthesizeSegments import (
|
12 |
SynthesizeSegments,
|
13 |
combine_audio_segments,
|
|
|
34 |
):
|
35 |
try:
|
36 |
ssml = request.ssml
|
37 |
+
format = request.format.lower()
|
38 |
batch_size = request.batch_size
|
39 |
|
40 |
if batch_size < 1:
|
|
|
42 |
status_code=400, detail="Batch size must be greater than 0."
|
43 |
)
|
44 |
|
45 |
+
if not ssml or ssml == "":
|
46 |
raise HTTPException(status_code=400, detail="SSML content is required.")
|
47 |
|
48 |
+
if format not in ["mp3", "wav"]:
|
49 |
+
raise HTTPException(
|
50 |
+
status_code=400, detail="Format must be 'mp3' or 'wav'."
|
51 |
+
)
|
52 |
+
|
53 |
+
parser = create_ssml_parser()
|
54 |
+
segments = parser.parse(ssml)
|
55 |
for seg in segments:
|
56 |
seg["text"] = text_normalize(seg["text"], is_end=True)
|
57 |
|
|
|
69 |
import logging
|
70 |
|
71 |
logging.exception(e)
|
72 |
+
|
73 |
+
if isinstance(e, HTTPException):
|
74 |
+
raise e
|
75 |
+
else:
|
76 |
+
raise HTTPException(status_code=500, detail=str(e))
|
77 |
|
78 |
|
79 |
def setup(api_manager: APIManager):
|
modules/api/impl/tts_api.py
CHANGED
@@ -44,6 +44,39 @@ class TTSParams(BaseModel):
|
|
44 |
|
45 |
async def synthesize_tts(params: TTSParams = Depends()):
|
46 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
47 |
text = text_normalize(params.text, is_end=False)
|
48 |
|
49 |
calc_params = api_utils.calc_spk_style(spk=params.spk, style=params.style)
|
@@ -87,7 +120,11 @@ async def synthesize_tts(params: TTSParams = Depends()):
|
|
87 |
import logging
|
88 |
|
89 |
logging.exception(e)
|
90 |
-
|
|
|
|
|
|
|
|
|
91 |
|
92 |
|
93 |
def setup(api_manager: APIManager):
|
|
|
44 |
|
45 |
async def synthesize_tts(params: TTSParams = Depends()):
|
46 |
try:
|
47 |
+
# Validate text
|
48 |
+
if not params.text.strip():
|
49 |
+
raise HTTPException(
|
50 |
+
status_code=422, detail="Text parameter cannot be empty"
|
51 |
+
)
|
52 |
+
|
53 |
+
# Validate temperature
|
54 |
+
if not (0 <= params.temperature <= 1):
|
55 |
+
raise HTTPException(
|
56 |
+
status_code=422, detail="Temperature must be between 0 and 1"
|
57 |
+
)
|
58 |
+
|
59 |
+
# Validate top_P
|
60 |
+
if not (0 <= params.top_P <= 1):
|
61 |
+
raise HTTPException(status_code=422, detail="top_P must be between 0 and 1")
|
62 |
+
|
63 |
+
# Validate top_K
|
64 |
+
if params.top_K <= 0:
|
65 |
+
raise HTTPException(
|
66 |
+
status_code=422, detail="top_K must be a positive integer"
|
67 |
+
)
|
68 |
+
if params.top_K > 100:
|
69 |
+
raise HTTPException(
|
70 |
+
status_code=422, detail="top_K must be less than or equal to 100"
|
71 |
+
)
|
72 |
+
|
73 |
+
# Validate format
|
74 |
+
if params.format not in ["mp3", "wav"]:
|
75 |
+
raise HTTPException(
|
76 |
+
status_code=422,
|
77 |
+
detail="Invalid format. Supported formats are mp3 and wav",
|
78 |
+
)
|
79 |
+
|
80 |
text = text_normalize(params.text, is_end=False)
|
81 |
|
82 |
calc_params = api_utils.calc_spk_style(spk=params.spk, style=params.style)
|
|
|
120 |
import logging
|
121 |
|
122 |
logging.exception(e)
|
123 |
+
|
124 |
+
if isinstance(e, HTTPException):
|
125 |
+
raise e
|
126 |
+
else:
|
127 |
+
raise HTTPException(status_code=500, detail=str(e))
|
128 |
|
129 |
|
130 |
def setup(api_manager: APIManager):
|
modules/generate_audio.py
CHANGED
@@ -79,7 +79,11 @@ def generate_audio_batch(
|
|
79 |
params_infer_code["spk_emb"] = spk.emb
|
80 |
logger.info(("spk", spk.name))
|
81 |
else:
|
82 |
-
|
|
|
|
|
|
|
|
|
83 |
|
84 |
logger.info(
|
85 |
{
|
|
|
79 |
params_infer_code["spk_emb"] = spk.emb
|
80 |
logger.info(("spk", spk.name))
|
81 |
else:
|
82 |
+
logger.warn(
|
83 |
+
f"spk must be int or Speaker, but: <{type(spk)}> {spk}, wiil set to default voice"
|
84 |
+
)
|
85 |
+
with SeedContext(2, True):
|
86 |
+
params_infer_code["spk_emb"] = chat_tts.sample_random_speaker()
|
87 |
|
88 |
logger.info(
|
89 |
{
|
modules/normalization.py
CHANGED
@@ -5,6 +5,10 @@ from modules.utils.markdown import markdown_to_text
|
|
5 |
from modules import models
|
6 |
import re
|
7 |
|
|
|
|
|
|
|
|
|
8 |
|
9 |
@lru_cache(maxsize=64)
|
10 |
def is_chinese(text):
|
@@ -159,6 +163,8 @@ def replace_unk_tokens(text):
|
|
159 |
"""
|
160 |
把不在字典里的字符替换为 " , "
|
161 |
"""
|
|
|
|
|
162 |
chat_tts = models.load_chat_tts()
|
163 |
if "tokenizer" not in chat_tts.pretrain_models:
|
164 |
# 这个地方只有在 huggingface spaces 中才会触发
|
|
|
5 |
from modules import models
|
6 |
import re
|
7 |
|
8 |
+
# 是否关闭 unk token 检查
|
9 |
+
# NOTE: 单测的时候用于跳过模型加载
|
10 |
+
DISABLE_UNK_TOKEN_CHECK = False
|
11 |
+
|
12 |
|
13 |
@lru_cache(maxsize=64)
|
14 |
def is_chinese(text):
|
|
|
163 |
"""
|
164 |
把不在字典里的字符替换为 " , "
|
165 |
"""
|
166 |
+
if DISABLE_UNK_TOKEN_CHECK:
|
167 |
+
return text
|
168 |
chat_tts = models.load_chat_tts()
|
169 |
if "tokenizer" not in chat_tts.pretrain_models:
|
170 |
# 这个地方只有在 huggingface spaces 中才会触发
|
modules/ssml.py
CHANGED
@@ -66,245 +66,3 @@ def apply_random_seed(attrs: dict):
|
|
66 |
seed = random.randint(0, 2**32 - 1)
|
67 |
attrs["seed"] = seed
|
68 |
logger.info(f"random seed: {seed}")
|
69 |
-
|
70 |
-
|
71 |
-
class NotSupportSSML(Exception):
|
72 |
-
pass
|
73 |
-
|
74 |
-
|
75 |
-
def parse_ssml(ssml: str) -> List[Dict[str, Any]]:
|
76 |
-
root = etree.fromstring(ssml)
|
77 |
-
|
78 |
-
ssml_version = root.get("version", "NONE")
|
79 |
-
if ssml_version != "0.1":
|
80 |
-
raise NotSupportSSML("Unsupported ssml version: {ssml_version}")
|
81 |
-
|
82 |
-
segments = []
|
83 |
-
|
84 |
-
for voice in root.findall(".//voice"):
|
85 |
-
voice_attrs = {
|
86 |
-
"spk": voice.get("spk"),
|
87 |
-
"style": voice.get("style"),
|
88 |
-
"seed": voice.get("seed"),
|
89 |
-
"top_p": voice.get("top_p"),
|
90 |
-
"top_k": voice.get("top_k"),
|
91 |
-
"temp": voice.get("temp"),
|
92 |
-
"prompt1": voice.get("prompt1"),
|
93 |
-
"prompt2": voice.get("prompt2"),
|
94 |
-
"prefix": voice.get("prefix"),
|
95 |
-
"normalize": voice.get("normalize"),
|
96 |
-
}
|
97 |
-
|
98 |
-
voice_attrs = {k: v for k, v in voice_attrs.items() if v is not None}
|
99 |
-
|
100 |
-
expand_spk(voice_attrs)
|
101 |
-
expand_style(voice_attrs)
|
102 |
-
|
103 |
-
merge_prompt(voice_attrs, voice)
|
104 |
-
apply_random_seed(voice_attrs)
|
105 |
-
|
106 |
-
voice_segments = []
|
107 |
-
|
108 |
-
if voice_attrs.get("temp", "") == "min":
|
109 |
-
# ref: https://github.com/2noise/ChatTTS/issues/123#issue-2326908144
|
110 |
-
voice_attrs["temp"] = 0.000000000001
|
111 |
-
if voice_attrs.get("temp", "") == "max":
|
112 |
-
voice_attrs["temp"] = 1
|
113 |
-
|
114 |
-
# 处理 voice 开头的文本
|
115 |
-
if voice.text and voice.text.strip():
|
116 |
-
voice_segments.append(
|
117 |
-
{"text": voice.text.strip(), "attrs": voice_attrs.copy()}
|
118 |
-
)
|
119 |
-
|
120 |
-
# 处理 voice 内部的文本和 prosody 元素
|
121 |
-
for node in voice.iterchildren():
|
122 |
-
if node.tag == "prosody":
|
123 |
-
prosody_attrs = voice_attrs.copy()
|
124 |
-
new_attrs = {
|
125 |
-
"rate": node.get("rate"),
|
126 |
-
"volume": node.get("volume"),
|
127 |
-
"pitch": node.get("pitch"),
|
128 |
-
}
|
129 |
-
prosody_attrs.update(
|
130 |
-
{k: v for k, v in new_attrs.items() if v is not None}
|
131 |
-
)
|
132 |
-
expand_style(prosody_attrs)
|
133 |
-
merge_prompt(prosody_attrs, node)
|
134 |
-
apply_random_seed(voice_attrs)
|
135 |
-
|
136 |
-
if node.text and node.text.strip():
|
137 |
-
voice_segments.append(
|
138 |
-
{"text": node.text.strip(), "attrs": prosody_attrs}
|
139 |
-
)
|
140 |
-
elif node.tag == "break":
|
141 |
-
time_ms = int(node.get("time", "0").replace("ms", ""))
|
142 |
-
segment = {"break": time_ms}
|
143 |
-
voice_segments.append(segment)
|
144 |
-
|
145 |
-
if node.tail and node.tail.strip():
|
146 |
-
voice_segments.append(
|
147 |
-
{"text": node.tail.strip(), "attrs": voice_attrs.copy()}
|
148 |
-
)
|
149 |
-
|
150 |
-
end_segment = voice_segments[-1]
|
151 |
-
end_segment["is_end"] = True
|
152 |
-
|
153 |
-
segments = segments + voice_segments
|
154 |
-
|
155 |
-
logger.info(f"collect len(segments): {len(segments)}")
|
156 |
-
# logger.info(f"segments: {json.dumps(segments, ensure_ascii=False)}")
|
157 |
-
|
158 |
-
return segments
|
159 |
-
|
160 |
-
|
161 |
-
if __name__ == "__main__":
|
162 |
-
# 示例 SSML 输入
|
163 |
-
ssml1 = """
|
164 |
-
<speak version="0.1">
|
165 |
-
<voice spk="20398768" seed="42" temp="min" top_p="0.9" top_k="20">
|
166 |
-
电影中梁朝伟扮演的陈永仁的
|
167 |
-
<prosody volume="5">
|
168 |
-
编号27149
|
169 |
-
</prosody>
|
170 |
-
<prosody rate="2">
|
171 |
-
编号27149
|
172 |
-
</prosody>
|
173 |
-
<prosody pitch="-12">
|
174 |
-
编号27149
|
175 |
-
</prosody>
|
176 |
-
<prosody pitch="12">
|
177 |
-
编号27149
|
178 |
-
</prosody>
|
179 |
-
</voice>
|
180 |
-
<voice spk="20398768" seed="42" speed="9">
|
181 |
-
编号27149
|
182 |
-
</voice>
|
183 |
-
<voice spk="20398768" seed="42">
|
184 |
-
电影中梁朝伟扮演的陈永仁的编号27149
|
185 |
-
</voice>
|
186 |
-
</speak>
|
187 |
-
"""
|
188 |
-
|
189 |
-
ssml2 = """
|
190 |
-
<speak version="0.1">
|
191 |
-
<voice spk="Bob">
|
192 |
-
也可以合成多角色多情感的有声 [uv_break] 书 [uv_break] ,例如:
|
193 |
-
</voice>
|
194 |
-
<voice spk="Bob">
|
195 |
-
黛玉冷笑道:
|
196 |
-
</voice>
|
197 |
-
<voice spk="female2">
|
198 |
-
我说呢,亏了绊住,不然,早就飞了来了。
|
199 |
-
</voice>
|
200 |
-
<voice spk="Bob" speed="0">
|
201 |
-
宝玉道:
|
202 |
-
</voice>
|
203 |
-
<voice spk="Alice">
|
204 |
-
“只许和你玩,替你解闷。不过偶然到他那里,就说这些闲话。”
|
205 |
-
</voice>
|
206 |
-
<voice spk="female2">
|
207 |
-
“好没意思的话!去不去,关我什么事儿?又没叫你替我解闷儿,还许你不理我呢”
|
208 |
-
</voice>
|
209 |
-
<voice spk="Bob">
|
210 |
-
说着,便赌气回房去了。
|
211 |
-
</voice>
|
212 |
-
</speak>
|
213 |
-
"""
|
214 |
-
ssml22 = """
|
215 |
-
<speak version="0.1">
|
216 |
-
<voice spk="Bob" style="narration-relaxed">
|
217 |
-
下面是一个 ChatTTS 用于合成多角色多情感的有声书示例
|
218 |
-
</voice>
|
219 |
-
<voice spk="Bob" style="narration-relaxed">
|
220 |
-
黛玉冷笑道:
|
221 |
-
</voice>
|
222 |
-
<voice spk="female2" style="angry">
|
223 |
-
我说呢 [uv_break] ,亏了绊住,不然,早就飞起来了。
|
224 |
-
</voice>
|
225 |
-
<voice spk="Bob" style="narration-relaxed">
|
226 |
-
宝玉道:
|
227 |
-
</voice>
|
228 |
-
<voice spk="Alice" style="unfriendly">
|
229 |
-
“只许和你玩 [uv_break] ,替你解闷。不过偶然到他那里,就说这些闲话。”
|
230 |
-
</voice>
|
231 |
-
<voice spk="female2" style="angry">
|
232 |
-
“好没意思的话![uv_break] 去不去,关我什么事儿? 又没叫你替我解闷儿 [uv_break],还许你不理我呢”
|
233 |
-
</voice>
|
234 |
-
<voice spk="Bob" style="narration-relaxed">
|
235 |
-
说着,便赌气回房去了。
|
236 |
-
</voice>
|
237 |
-
</speak>
|
238 |
-
"""
|
239 |
-
|
240 |
-
ssml3 = """
|
241 |
-
<speak version="0.1">
|
242 |
-
<voice spk="Bob" style="angry">
|
243 |
-
“你到底在想什么?这已经是第三次了!每次我都告诉你要按时完成任务,可你总是拖延。你知道这对整个团队有多大的影响吗?!”
|
244 |
-
</voice>
|
245 |
-
<voice spk="Bob" style="assistant">
|
246 |
-
“你到底在想什么?这已经是第三次了!每次我都告诉你要按时完成任务,可你总是拖延。你知道这对整个团队有多大的影响吗?!”
|
247 |
-
</voice>
|
248 |
-
<voice spk="Bob" style="gentle">
|
249 |
-
“你到底在想什么?这已经是第三次了!每次我都告诉你要按时完成任务,可你总是拖延。你知道这对整个团队有多大的影响吗?!”
|
250 |
-
</voice>
|
251 |
-
</speak>
|
252 |
-
"""
|
253 |
-
|
254 |
-
ssml4 = """
|
255 |
-
<speak version="0.1">
|
256 |
-
<voice spk="Bob" style="narration-relaxed">
|
257 |
-
使用 prosody 控制生成文本的语速语调和音量,示例如下
|
258 |
-
|
259 |
-
<prosody>
|
260 |
-
无任何限制将会继承父级voice配置进行生成
|
261 |
-
</prosody>
|
262 |
-
<prosody rate="1.5">
|
263 |
-
设置 rate 大于1表示加速,小于1为减速
|
264 |
-
</prosody>
|
265 |
-
<prosody pitch="6">
|
266 |
-
设置 pitch 调整音调,设置为6表示提高6个半音
|
267 |
-
</prosody>
|
268 |
-
<prosody volume="2">
|
269 |
-
设置 volume 调整音量,设置为2表示提高2个分贝
|
270 |
-
</prosody>
|
271 |
-
|
272 |
-
在 voice 中无prosody包裹的文本即为默认生成状态下的语音
|
273 |
-
</voice>
|
274 |
-
</speak>
|
275 |
-
"""
|
276 |
-
|
277 |
-
ssml5 = """
|
278 |
-
<speak version="0.1">
|
279 |
-
<voice spk="Bob" style="narration-relaxed">
|
280 |
-
使用 break 标签将会简单的
|
281 |
-
|
282 |
-
<break time="500" />
|
283 |
-
|
284 |
-
插入一段空白到生成结果中
|
285 |
-
</voice>
|
286 |
-
</speak>
|
287 |
-
"""
|
288 |
-
|
289 |
-
ssml6 = """
|
290 |
-
<speak version="0.1">
|
291 |
-
<voice spk="Bob" style="excited">
|
292 |
-
temperature for sampling (may be overridden by style or speaker)
|
293 |
-
<break time="500" />
|
294 |
-
温度值用于采样,这个值有可能被 style 或者 speaker 覆盖
|
295 |
-
<break time="500" />
|
296 |
-
temperature for sampling ,这个值有可能被 style 或者 speaker 覆盖
|
297 |
-
<break time="500" />
|
298 |
-
温度值用于采样,(may be overridden by style or speaker)
|
299 |
-
</voice>
|
300 |
-
</speak>
|
301 |
-
"""
|
302 |
-
|
303 |
-
segments = parse_ssml(ssml6)
|
304 |
-
|
305 |
-
print(segments)
|
306 |
-
|
307 |
-
# audio_segments = synthesize_segments(segments)
|
308 |
-
# combined_audio = combine_audio_segments(audio_segments)
|
309 |
-
|
310 |
-
# combined_audio.export("output.wav", format="wav")
|
|
|
66 |
seed = random.randint(0, 2**32 - 1)
|
67 |
attrs["seed"] = seed
|
68 |
logger.info(f"random seed: {seed}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
modules/ssml_parser/SSMLParser.py
CHANGED
@@ -29,6 +29,12 @@ class SSMLContext(Box):
|
|
29 |
self.prompt2 = None
|
30 |
self.prefix = None
|
31 |
|
|
|
|
|
|
|
|
|
|
|
|
|
32 |
|
33 |
class SSMLSegment(Box):
|
34 |
def __init__(self, text: str, attrs=SSMLContext()):
|
@@ -84,7 +90,7 @@ def create_ssml_parser():
|
|
84 |
|
85 |
@parser.resolver("speak")
|
86 |
def tag_speak(element, context, segments, parser):
|
87 |
-
ctx =
|
88 |
|
89 |
version = element.get("version")
|
90 |
if version != "0.1":
|
@@ -95,7 +101,7 @@ def create_ssml_parser():
|
|
95 |
|
96 |
@parser.resolver("voice")
|
97 |
def tag_voice(element, context, segments, parser):
|
98 |
-
ctx =
|
99 |
|
100 |
ctx.spk = element.get("spk", ctx.spk)
|
101 |
ctx.style = element.get("style", ctx.style)
|
@@ -131,7 +137,7 @@ def create_ssml_parser():
|
|
131 |
|
132 |
@parser.resolver("prosody")
|
133 |
def tag_prosody(element, context, segments, parser):
|
134 |
-
ctx =
|
135 |
|
136 |
ctx.spk = element.get("spk", ctx.spk)
|
137 |
ctx.style = element.get("style", ctx.style)
|
|
|
29 |
self.prompt2 = None
|
30 |
self.prefix = None
|
31 |
|
32 |
+
def clone(self):
|
33 |
+
ctx = SSMLContext()
|
34 |
+
for k, v in self.items():
|
35 |
+
ctx[k] = v
|
36 |
+
return ctx
|
37 |
+
|
38 |
|
39 |
class SSMLSegment(Box):
|
40 |
def __init__(self, text: str, attrs=SSMLContext()):
|
|
|
90 |
|
91 |
@parser.resolver("speak")
|
92 |
def tag_speak(element, context, segments, parser):
|
93 |
+
ctx = context.clone() if context is not None else SSMLContext()
|
94 |
|
95 |
version = element.get("version")
|
96 |
if version != "0.1":
|
|
|
101 |
|
102 |
@parser.resolver("voice")
|
103 |
def tag_voice(element, context, segments, parser):
|
104 |
+
ctx = context.clone() if context is not None else SSMLContext()
|
105 |
|
106 |
ctx.spk = element.get("spk", ctx.spk)
|
107 |
ctx.style = element.get("style", ctx.style)
|
|
|
137 |
|
138 |
@parser.resolver("prosody")
|
139 |
def tag_prosody(element, context, segments, parser):
|
140 |
+
ctx = context.clone() if context is not None else SSMLContext()
|
141 |
|
142 |
ctx.spk = element.get("spk", ctx.spk)
|
143 |
ctx.style = element.get("style", ctx.style)
|
modules/utils/git.py
CHANGED
@@ -2,14 +2,25 @@ from functools import lru_cache
|
|
2 |
import os
|
3 |
import subprocess
|
4 |
|
|
|
5 |
from modules.utils import constants
|
6 |
|
|
|
|
|
|
|
|
|
|
|
|
|
7 |
git = os.environ.get("GIT", "git")
|
8 |
|
|
|
|
|
9 |
|
10 |
@lru_cache()
|
11 |
def commit_hash():
|
12 |
try:
|
|
|
|
|
13 |
return subprocess.check_output(
|
14 |
[git, "-C", constants.ROOT_DIR, "rev-parse", "HEAD"],
|
15 |
shell=False,
|
@@ -22,6 +33,8 @@ def commit_hash():
|
|
22 |
@lru_cache()
|
23 |
def git_tag():
|
24 |
try:
|
|
|
|
|
25 |
return subprocess.check_output(
|
26 |
[git, "-C", constants.ROOT_DIR, "describe", "--tags"],
|
27 |
shell=False,
|
@@ -44,6 +57,8 @@ def git_tag():
|
|
44 |
@lru_cache()
|
45 |
def branch_name():
|
46 |
try:
|
|
|
|
|
47 |
return subprocess.check_output(
|
48 |
[git, "-C", constants.ROOT_DIR, "rev-parse", "--abbrev-ref", "HEAD"],
|
49 |
shell=False,
|
|
|
2 |
import os
|
3 |
import subprocess
|
4 |
|
5 |
+
|
6 |
from modules.utils import constants
|
7 |
|
8 |
+
# 用于判断是否在hf spaces
|
9 |
+
try:
|
10 |
+
import spaces
|
11 |
+
except:
|
12 |
+
spaces = None
|
13 |
+
|
14 |
git = os.environ.get("GIT", "git")
|
15 |
|
16 |
+
in_hf_spaces = spaces is not None
|
17 |
+
|
18 |
|
19 |
@lru_cache()
|
20 |
def commit_hash():
|
21 |
try:
|
22 |
+
if in_hf_spaces:
|
23 |
+
return "<hf>"
|
24 |
return subprocess.check_output(
|
25 |
[git, "-C", constants.ROOT_DIR, "rev-parse", "HEAD"],
|
26 |
shell=False,
|
|
|
33 |
@lru_cache()
|
34 |
def git_tag():
|
35 |
try:
|
36 |
+
if in_hf_spaces:
|
37 |
+
return "<hf>"
|
38 |
return subprocess.check_output(
|
39 |
[git, "-C", constants.ROOT_DIR, "describe", "--tags"],
|
40 |
shell=False,
|
|
|
57 |
@lru_cache()
|
58 |
def branch_name():
|
59 |
try:
|
60 |
+
if in_hf_spaces:
|
61 |
+
return "<hf>"
|
62 |
return subprocess.check_output(
|
63 |
[git, "-C", constants.ROOT_DIR, "rev-parse", "--abbrev-ref", "HEAD"],
|
64 |
shell=False,
|
modules/utils/markdown.py
CHANGED
@@ -46,6 +46,10 @@ class PlainTextRenderer(mistune.HTMLRenderer):
|
|
46 |
# remove code
|
47 |
return ""
|
48 |
|
|
|
|
|
|
|
|
|
49 |
|
50 |
def markdown_to_text(markdown_text):
|
51 |
renderer = PlainTextRenderer()
|
@@ -69,6 +73,9 @@ console.log(1)
|
|
69 |
- 列表项 2
|
70 |
- 列表项 3
|
71 |
|
|
|
|
|
|
|
72 |
> 这是一个引用。
|
73 |
|
74 |
`代码片段`
|
|
|
46 |
# remove code
|
47 |
return ""
|
48 |
|
49 |
+
def thematic_break(self) -> str:
|
50 |
+
# remove break
|
51 |
+
return "\n"
|
52 |
+
|
53 |
|
54 |
def markdown_to_text(markdown_text):
|
55 |
renderer = PlainTextRenderer()
|
|
|
73 |
- 列表项 2
|
74 |
- 列表项 3
|
75 |
|
76 |
+
1. 第一
|
77 |
+
2. 第二
|
78 |
+
|
79 |
> 这是一个引用。
|
80 |
|
81 |
`代码片段`
|
modules/webui/localization.py
CHANGED
@@ -1,7 +1,9 @@
|
|
1 |
import json
|
2 |
import os
|
3 |
import gradio as gr
|
|
|
4 |
|
|
|
5 |
|
6 |
current_translation = {}
|
7 |
localization_root = os.path.join(
|
@@ -24,11 +26,15 @@ def localization_js(filename):
|
|
24 |
assert isinstance(v, str) or isinstance(
|
25 |
v, list
|
26 |
), f"Value for key {k} is not a string or list"
|
|
|
|
|
27 |
except Exception as e:
|
28 |
-
|
29 |
-
|
30 |
else:
|
31 |
-
|
|
|
|
|
32 |
|
33 |
# current_translation = {k: 'XXX' for k in current_translation.keys()} # use this to see if all texts are covered
|
34 |
|
|
|
1 |
import json
|
2 |
import os
|
3 |
import gradio as gr
|
4 |
+
import logging
|
5 |
|
6 |
+
logger = logging.getLogger(__name__)
|
7 |
|
8 |
current_translation = {}
|
9 |
localization_root = os.path.join(
|
|
|
26 |
assert isinstance(v, str) or isinstance(
|
27 |
v, list
|
28 |
), f"Value for key {k} is not a string or list"
|
29 |
+
|
30 |
+
logger.info(f"Loaded localization file {full_name}")
|
31 |
except Exception as e:
|
32 |
+
logger.warning(str(e))
|
33 |
+
logger.warning(f"Failed to load localization file {full_name}")
|
34 |
else:
|
35 |
+
logger.warning(f"Localization file {full_name} does not exist")
|
36 |
+
else:
|
37 |
+
logger.warning(f"Localization file {filename} is not a string")
|
38 |
|
39 |
# current_translation = {k: 'XXX' for k in current_translation.keys()} # use this to see if all texts are covered
|
40 |
|
modules/webui/speaker/speaker_editor.py
ADDED
@@ -0,0 +1,147 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import torch
|
3 |
+
from modules.speaker import Speaker
|
4 |
+
from modules.hf import spaces
|
5 |
+
from modules.webui import webui_config
|
6 |
+
from modules.webui.webui_utils import tts_generate
|
7 |
+
|
8 |
+
import tempfile
|
9 |
+
|
10 |
+
|
11 |
+
@torch.inference_mode()
|
12 |
+
@spaces.GPU
|
13 |
+
def test_spk_voice(spk_file, text: str):
|
14 |
+
if spk_file == "" or spk_file is None:
|
15 |
+
return None
|
16 |
+
spk = Speaker.from_file(spk_file)
|
17 |
+
return tts_generate(
|
18 |
+
spk=spk,
|
19 |
+
text=text,
|
20 |
+
)
|
21 |
+
|
22 |
+
|
23 |
+
def speaker_editor_ui():
|
24 |
+
def on_generate(spk_file, name, gender, desc):
|
25 |
+
spk: Speaker = Speaker.from_file(spk_file)
|
26 |
+
spk.name = name
|
27 |
+
spk.gender = gender
|
28 |
+
spk.desc = desc
|
29 |
+
|
30 |
+
with tempfile.NamedTemporaryFile(delete=False, suffix=".pt") as tmp_file:
|
31 |
+
torch.save(spk, tmp_file)
|
32 |
+
tmp_file_path = tmp_file.name
|
33 |
+
|
34 |
+
return tmp_file_path
|
35 |
+
|
36 |
+
def create_test_voice_card(spk_file):
|
37 |
+
with gr.Group():
|
38 |
+
gr.Markdown("🎤Test voice")
|
39 |
+
with gr.Row():
|
40 |
+
test_voice_btn = gr.Button(
|
41 |
+
"Test Voice", variant="secondary", interactive=False
|
42 |
+
)
|
43 |
+
|
44 |
+
with gr.Column(scale=4):
|
45 |
+
test_text = gr.Textbox(
|
46 |
+
label="Test Text",
|
47 |
+
placeholder="Please input test text",
|
48 |
+
value=webui_config.localization.DEFAULT_SPEAKER_TEST_TEXT,
|
49 |
+
)
|
50 |
+
with gr.Row():
|
51 |
+
with gr.Column(scale=4):
|
52 |
+
output_audio = gr.Audio(label="Output Audio", format="mp3")
|
53 |
+
|
54 |
+
test_voice_btn.click(
|
55 |
+
fn=test_spk_voice,
|
56 |
+
inputs=[spk_file, test_text],
|
57 |
+
outputs=[output_audio],
|
58 |
+
)
|
59 |
+
|
60 |
+
return test_voice_btn
|
61 |
+
|
62 |
+
has_file = gr.State(False)
|
63 |
+
|
64 |
+
# TODO 也许需要写个说明?
|
65 |
+
# gr.Markdown("SPEAKER_CREATOR_GUIDE")
|
66 |
+
|
67 |
+
with gr.Row():
|
68 |
+
with gr.Column(scale=2):
|
69 |
+
with gr.Group():
|
70 |
+
gr.Markdown("💼Speaker file")
|
71 |
+
spk_file = gr.File(label="*.pt file", file_types=[".pt"])
|
72 |
+
|
73 |
+
with gr.Group():
|
74 |
+
gr.Markdown("ℹ️Speaker info")
|
75 |
+
name_input = gr.Textbox(
|
76 |
+
label="Name",
|
77 |
+
placeholder="Enter speaker name",
|
78 |
+
value="*",
|
79 |
+
interactive=False,
|
80 |
+
)
|
81 |
+
gender_input = gr.Textbox(
|
82 |
+
label="Gender",
|
83 |
+
placeholder="Enter gender",
|
84 |
+
value="*",
|
85 |
+
interactive=False,
|
86 |
+
)
|
87 |
+
desc_input = gr.Textbox(
|
88 |
+
label="Description",
|
89 |
+
placeholder="Enter description",
|
90 |
+
value="*",
|
91 |
+
interactive=False,
|
92 |
+
)
|
93 |
+
with gr.Group():
|
94 |
+
gr.Markdown("🔊Generate speaker.pt")
|
95 |
+
generate_button = gr.Button("Save .pt file", interactive=False)
|
96 |
+
output_file = gr.File(label="Save to File")
|
97 |
+
with gr.Column(scale=5):
|
98 |
+
btn1 = create_test_voice_card(spk_file=spk_file)
|
99 |
+
btn2 = create_test_voice_card(spk_file=spk_file)
|
100 |
+
btn3 = create_test_voice_card(spk_file=spk_file)
|
101 |
+
btn4 = create_test_voice_card(spk_file=spk_file)
|
102 |
+
|
103 |
+
generate_button.click(
|
104 |
+
fn=on_generate,
|
105 |
+
inputs=[spk_file, name_input, gender_input, desc_input],
|
106 |
+
outputs=[output_file],
|
107 |
+
)
|
108 |
+
|
109 |
+
def spk_file_change(spk_file):
|
110 |
+
empty = spk_file is None or spk_file == ""
|
111 |
+
if empty:
|
112 |
+
return [
|
113 |
+
gr.Textbox(value="*", interactive=False),
|
114 |
+
gr.Textbox(value="*", interactive=False),
|
115 |
+
gr.Textbox(value="*", interactive=False),
|
116 |
+
gr.Button(interactive=False),
|
117 |
+
gr.Button(interactive=False),
|
118 |
+
gr.Button(interactive=False),
|
119 |
+
gr.Button(interactive=False),
|
120 |
+
gr.Button(interactive=False),
|
121 |
+
]
|
122 |
+
spk: Speaker = Speaker.from_file(spk_file)
|
123 |
+
return [
|
124 |
+
gr.Textbox(value=spk.name, interactive=True),
|
125 |
+
gr.Textbox(value=spk.gender, interactive=True),
|
126 |
+
gr.Textbox(value=spk.describe, interactive=True),
|
127 |
+
gr.Button(interactive=True),
|
128 |
+
gr.Button(interactive=True),
|
129 |
+
gr.Button(interactive=True),
|
130 |
+
gr.Button(interactive=True),
|
131 |
+
gr.Button(interactive=True),
|
132 |
+
]
|
133 |
+
|
134 |
+
spk_file.change(
|
135 |
+
fn=spk_file_change,
|
136 |
+
inputs=[spk_file],
|
137 |
+
outputs=[
|
138 |
+
name_input,
|
139 |
+
gender_input,
|
140 |
+
desc_input,
|
141 |
+
generate_button,
|
142 |
+
btn1,
|
143 |
+
btn2,
|
144 |
+
btn3,
|
145 |
+
btn4,
|
146 |
+
],
|
147 |
+
)
|
modules/webui/speaker_tab.py
CHANGED
@@ -1,5 +1,6 @@
|
|
1 |
import gradio as gr
|
2 |
|
|
|
3 |
from modules.webui.speaker.speaker_merger import create_speaker_merger
|
4 |
from modules.webui.speaker.speaker_creator import speaker_creator_ui
|
5 |
|
@@ -7,6 +8,8 @@ from modules.webui.speaker.speaker_creator import speaker_creator_ui
|
|
7 |
def create_speaker_panel():
|
8 |
|
9 |
with gr.Tabs():
|
|
|
|
|
10 |
with gr.TabItem("Creator"):
|
11 |
speaker_creator_ui()
|
12 |
with gr.TabItem("Merger"):
|
|
|
1 |
import gradio as gr
|
2 |
|
3 |
+
from modules.webui.speaker.speaker_editor import speaker_editor_ui
|
4 |
from modules.webui.speaker.speaker_merger import create_speaker_merger
|
5 |
from modules.webui.speaker.speaker_creator import speaker_creator_ui
|
6 |
|
|
|
8 |
def create_speaker_panel():
|
9 |
|
10 |
with gr.Tabs():
|
11 |
+
with gr.Tab("Editor"):
|
12 |
+
speaker_editor_ui()
|
13 |
with gr.TabItem("Creator"):
|
14 |
speaker_creator_ui()
|
15 |
with gr.TabItem("Merger"):
|
modules/webui/ssml/podcast_tab.py
CHANGED
@@ -7,45 +7,65 @@ from modules.webui import webui_utils
|
|
7 |
from modules.hf import spaces
|
8 |
|
9 |
podcast_default_case = [
|
10 |
-
[
|
11 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
12 |
[
|
13 |
3,
|
14 |
"Bob",
|
15 |
-
"没错,中华料理有着几千年的历史,而且每个地区都有自己的特色菜。",
|
16 |
-
"
|
17 |
],
|
18 |
[
|
19 |
4,
|
20 |
"female2",
|
21 |
-
"那我们先从最有名的川菜开始吧。川菜以其麻辣著称,是很多人的最爱。",
|
22 |
-
"
|
23 |
],
|
24 |
[
|
25 |
5,
|
26 |
"Alice",
|
27 |
-
"对,我特别喜欢吃麻婆豆腐和辣子鸡。那种麻辣的感觉真是让人难以忘怀。",
|
28 |
-
"
|
29 |
],
|
30 |
[
|
31 |
6,
|
32 |
"Bob",
|
33 |
-
"除了川菜,粤菜也是很受欢迎的。粤菜讲究鲜美,像是白切鸡和蒸鱼都是经典。",
|
34 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
35 |
],
|
36 |
-
[7, "female2", "对啊,粤菜的烹饪方式比较清淡,更注重食材本身的味道。", "chat"],
|
37 |
-
[8, "Alice", "还有北京的京菜,像北京烤鸭,那可是来北京必吃的美食。", "chat"],
|
38 |
[
|
39 |
9,
|
40 |
"Bob",
|
41 |
-
"不仅如此,还有淮扬菜、湘菜、鲁菜等等,每个菜系都有其独特的风味。",
|
42 |
-
"
|
43 |
],
|
44 |
[
|
45 |
10,
|
46 |
"female2",
|
47 |
-
"对对对,像淮扬菜的狮子头,湘菜的剁椒鱼头,都是让人垂涎三尺的美味。",
|
48 |
-
"
|
49 |
],
|
50 |
]
|
51 |
|
@@ -111,10 +131,11 @@ def create_ssml_podcast_tab(ssml_input: gr.Textbox, tabs1: gr.Tabs, tabs2: gr.Ta
|
|
111 |
script_table = gr.DataFrame(
|
112 |
headers=["index", "speaker", "text", "style"],
|
113 |
datatype=["number", "str", "str", "str"],
|
114 |
-
interactive=
|
115 |
wrap=True,
|
116 |
value=podcast_default_case,
|
117 |
row_count=(0, "dynamic"),
|
|
|
118 |
)
|
119 |
|
120 |
send_to_ssml_btn = gr.Button("📩Send to SSML", variant="primary")
|
|
|
7 |
from modules.hf import spaces
|
8 |
|
9 |
podcast_default_case = [
|
10 |
+
[
|
11 |
+
1,
|
12 |
+
"female2",
|
13 |
+
"你好,欢迎收听今天的播客内容。今天我们要聊的是中华料理。 [lbreak]",
|
14 |
+
"podcast_p",
|
15 |
+
],
|
16 |
+
[
|
17 |
+
2,
|
18 |
+
"Alice",
|
19 |
+
"嗨,我特别期待这个话题!中华料理真的是博大精深。 [lbreak]",
|
20 |
+
"podcast_p",
|
21 |
+
],
|
22 |
[
|
23 |
3,
|
24 |
"Bob",
|
25 |
+
"没错,中华料理有着几千年的历史,而且每个地区都有自己的特色菜。 [lbreak]",
|
26 |
+
"podcast_p",
|
27 |
],
|
28 |
[
|
29 |
4,
|
30 |
"female2",
|
31 |
+
"那我们先从最有名的川菜开始吧。川菜以其麻辣著称,是很多人的最爱。 [lbreak]",
|
32 |
+
"podcast_p",
|
33 |
],
|
34 |
[
|
35 |
5,
|
36 |
"Alice",
|
37 |
+
"对,我特别喜欢吃麻婆豆腐和辣子鸡。那种麻辣的感觉真是让人难以忘怀。 [lbreak]",
|
38 |
+
"podcast_p",
|
39 |
],
|
40 |
[
|
41 |
6,
|
42 |
"Bob",
|
43 |
+
"除了川菜,粤菜也是很受欢迎的。粤菜讲究鲜美,像是白切鸡和蒸鱼都是经典。 [lbreak]",
|
44 |
+
"podcast_p",
|
45 |
+
],
|
46 |
+
[
|
47 |
+
7,
|
48 |
+
"female2",
|
49 |
+
"对啊,粤菜的烹饪方式比较清淡,更注重食材本身的味道。 [lbreak]",
|
50 |
+
"podcast_p",
|
51 |
+
],
|
52 |
+
[
|
53 |
+
8,
|
54 |
+
"Alice",
|
55 |
+
"还有北京的京菜,像北京烤鸭,那可是来北京必吃的美食。 [lbreak]",
|
56 |
+
"podcast_p",
|
57 |
],
|
|
|
|
|
58 |
[
|
59 |
9,
|
60 |
"Bob",
|
61 |
+
"不仅如此,还有淮扬菜、湘菜、鲁菜等等,每个菜系都有其独特的风味。 [lbreak]",
|
62 |
+
"podcast_p",
|
63 |
],
|
64 |
[
|
65 |
10,
|
66 |
"female2",
|
67 |
+
"对对对,像淮扬菜的狮子头,湘菜的剁椒鱼头,都是让人垂涎三尺的美味。 [lbreak]",
|
68 |
+
"podcast_p",
|
69 |
],
|
70 |
]
|
71 |
|
|
|
131 |
script_table = gr.DataFrame(
|
132 |
headers=["index", "speaker", "text", "style"],
|
133 |
datatype=["number", "str", "str", "str"],
|
134 |
+
interactive=True,
|
135 |
wrap=True,
|
136 |
value=podcast_default_case,
|
137 |
row_count=(0, "dynamic"),
|
138 |
+
col_count=(4, "fixed"),
|
139 |
)
|
140 |
|
141 |
send_to_ssml_btn = gr.Button("📩Send to SSML", variant="primary")
|
modules/webui/tts_tab.py
CHANGED
@@ -91,7 +91,9 @@ def create_tts_interface():
|
|
91 |
)
|
92 |
|
93 |
with gr.Tab(label="Upload"):
|
94 |
-
spk_file_upload = gr.File(
|
|
|
|
|
95 |
|
96 |
gr.Markdown("📝Speaker info")
|
97 |
infos = gr.Markdown("empty")
|
|
|
91 |
)
|
92 |
|
93 |
with gr.Tab(label="Upload"):
|
94 |
+
spk_file_upload = gr.File(
|
95 |
+
label="Speaker (Upload)", file_types=[".pt"]
|
96 |
+
)
|
97 |
|
98 |
gr.Markdown("📝Speaker info")
|
99 |
infos = gr.Markdown("empty")
|
modules/webui/webui_utils.py
CHANGED
@@ -93,13 +93,11 @@ def apply_audio_enhance(audio_data, sr, enable_denoise, enable_enhance):
|
|
93 |
tensor = torch.from_numpy(audio_data).float().squeeze().cpu()
|
94 |
enhancer = load_enhancer(device)
|
95 |
|
96 |
-
if enable_enhance:
|
97 |
lambd = 0.9 if enable_denoise else 0.1
|
98 |
tensor, sr = enhancer.enhance(
|
99 |
tensor, sr, tau=0.5, nfe=64, solver="rk4", lambd=lambd, device=device
|
100 |
)
|
101 |
-
elif enable_denoise:
|
102 |
-
tensor, sr = enhancer.denoise(tensor, sr)
|
103 |
|
104 |
audio_data = tensor.cpu().numpy()
|
105 |
return audio_data, int(sr)
|
|
|
93 |
tensor = torch.from_numpy(audio_data).float().squeeze().cpu()
|
94 |
enhancer = load_enhancer(device)
|
95 |
|
96 |
+
if enable_enhance or enable_denoise:
|
97 |
lambd = 0.9 if enable_denoise else 0.1
|
98 |
tensor, sr = enhancer.enhance(
|
99 |
tensor, sr, tau=0.5, nfe=64, solver="rk4", lambd=lambd, device=device
|
100 |
)
|
|
|
|
|
101 |
|
102 |
audio_data = tensor.cpu().numpy()
|
103 |
return audio_data, int(sr)
|
webui.py
CHANGED
@@ -84,7 +84,6 @@ if __name__ == "__main__":
|
|
84 |
parser.add_argument(
|
85 |
"--language",
|
86 |
type=str,
|
87 |
-
default="zh-CN",
|
88 |
help="Set the default language for the webui",
|
89 |
)
|
90 |
args = parser.parse_args()
|
@@ -106,7 +105,7 @@ if __name__ == "__main__":
|
|
106 |
device_id = get_and_update_env(args, "device_id", None, str)
|
107 |
use_cpu = get_and_update_env(args, "use_cpu", [], list)
|
108 |
compile = get_and_update_env(args, "compile", False, bool)
|
109 |
-
language = get_and_update_env(args, "language",
|
110 |
|
111 |
webui_config.experimental = get_and_update_env(
|
112 |
args, "webui_experimental", False, bool
|
@@ -115,8 +114,6 @@ if __name__ == "__main__":
|
|
115 |
webui_config.ssml_max = get_and_update_env(args, "ssml_max_len", 5000, int)
|
116 |
webui_config.max_batch_size = get_and_update_env(args, "max_batch_size", 8, int)
|
117 |
|
118 |
-
config.runtime_env_vars.language = "zh-CN"
|
119 |
-
|
120 |
webui_init()
|
121 |
demo = create_interface()
|
122 |
|
|
|
84 |
parser.add_argument(
|
85 |
"--language",
|
86 |
type=str,
|
|
|
87 |
help="Set the default language for the webui",
|
88 |
)
|
89 |
args = parser.parse_args()
|
|
|
105 |
device_id = get_and_update_env(args, "device_id", None, str)
|
106 |
use_cpu = get_and_update_env(args, "use_cpu", [], list)
|
107 |
compile = get_and_update_env(args, "compile", False, bool)
|
108 |
+
language = get_and_update_env(args, "language", "zh-CN", str)
|
109 |
|
110 |
webui_config.experimental = get_and_update_env(
|
111 |
args, "webui_experimental", False, bool
|
|
|
114 |
webui_config.ssml_max = get_and_update_env(args, "ssml_max_len", 5000, int)
|
115 |
webui_config.max_batch_size = get_and_update_env(args, "max_batch_size", 8, int)
|
116 |
|
|
|
|
|
117 |
webui_init()
|
118 |
demo = create_interface()
|
119 |
|