File size: 12,250 Bytes
be9690e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42a2435
c0b6e11
42a2435
 
be9690e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bb0955a
42a2435
 
 
 
be9690e
 
 
42a2435
 
 
 
 
 
 
 
be9690e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42a2435
be9690e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42a2435
f4470dc
42a2435
 
 
 
 
 
c0b6e11
 
42a2435
24b8dff
 
c0b6e11
 
 
 
 
 
9cd1263
 
c0b6e11
 
 
42a2435
c0b6e11
42a2435
 
c0b6e11
42a2435
 
 
 
 
 
 
 
c0b6e11
 
42a2435
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
sys.path.append('{}/third_party/Matcha-TTS'.format(ROOT_DIR))

import argparse
import gradio as gr
import numpy as np
import torch
import torchaudio
import random
import librosa

import logging
logging.getLogger('matplotlib').setLevel(logging.WARNING)

from cosyvoice.cli.cosyvoice import CosyVoice
from cosyvoice.utils.file_utils import load_wav

import spaces
from textwrap import dedent

logging.basicConfig(level=logging.WARNING,
                    format='%(asctime)s %(levelname)s %(message)s')

def generate_seed():
    """Draw a fresh random inference seed, packaged as a Gradio update.

    Returns:
        dict: An update payload (``"__type__": "update"``) whose ``"value"``
        is an integer drawn from 1 to 100000000 inclusive.
    """
    new_seed = random.randint(1, 100000000)
    update = dict(__type__="update", value=new_seed)
    return update

def set_all_random_seed(seed):
    """Seed every random number generator used during inference.

    Seeds Python's ``random``, NumPy's global generator and torch (CPU and
    all CUDA devices) with the same integer so a given seed reproduces the
    same output.

    Args:
        seed: Seed value. It is coerced with ``int()`` first because the
            value comes from a numeric UI widget and may arrive as a float
            (e.g. ``42.0``), which ``np.random.seed`` rejects with a
            ``TypeError``.

    Raises:
        TypeError, ValueError: If ``seed`` cannot be converted to an int.
    """
    seed = int(seed)
    for seeder in (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    ):
        seeder(seed)

max_val = 0.8  # peak-amplitude ceiling enforced by postprocess()


def postprocess(speech, top_db=60, hop_length=220, win_length=440):
    """Trim, peak-limit and pad a prompt waveform.

    Args:
        speech: Waveform tensor. It is concatenated along ``dim=1`` with a
            ``(1, N)`` zero tensor, so a ``(1, num_samples)`` layout is
            expected.
        top_db: Threshold passed to ``librosa.effects.trim``.
        hop_length: Hop length passed to ``librosa.effects.trim``.
        win_length: Passed to ``librosa.effects.trim`` as ``frame_length``.

    Returns:
        The trimmed waveform, rescaled so its absolute peak is ``max_val``
        when it exceeded that value, followed by ``int(target_sr * 0.2)``
        zero samples.
    """
    trimmed, _ = librosa.effects.trim(
        speech,
        top_db=top_db,
        frame_length=win_length,
        hop_length=hop_length,
    )
    peak = trimmed.abs().max()
    if peak > max_val:
        # Scale down so the loudest sample sits exactly at max_val.
        trimmed = trimmed / peak * max_val
    tail = torch.zeros(1, int(target_sr * 0.2))
    return torch.concat([trimmed, tail], dim=1)

# Inference modes offered in the UI; only natural-language (instruct) control
# is exposed here.
inference_mode_list = ['自然语言控制']

# Step-by-step usage hint shown for each inference mode, keyed by mode name.
instruct_dict = {
    # Pretrained voice (SFT) mode.
    '预训练音色': '1. 选择预训练音色\n2.点击生成音频按钮',
    # 3-second rapid cloning (zero-shot) mode.
    '3s极速复刻': '1. 本地上传参考音频,或麦克风录入\n2. 输入参考音频对应的文本以及希望声音复刻的文本\n3.点击“一键开启声音复刻💕”',
    # Cross-lingual cloning mode.
    '跨语种复刻': '1. 本地上传参考音频,或麦克风录入\n2. **无需输入**参考音频对应的文本\n3.点击“一键开启声音复刻💕”',
    # Natural-language control (instruct) mode.
    '自然语言控制': '1. 输入instruct文本\n2.点击生成音频按钮',
}


def change_instruction(mode_checkbox_group):
    """Look up the usage hint for the selected inference mode.

    Args:
        mode_checkbox_group: Name of the selected mode; must be a key of
            ``instruct_dict``.

    Returns:
        str: The hint text registered for that mode.

    Raises:
        KeyError: If the mode is not a key of ``instruct_dict``.
    """
    hint = instruct_dict[mode_checkbox_group]
    return hint

@spaces.GPU(duration=70)
def generate_audio(tts_text, mode_checkbox_group, sft_dropdown, prompt_text, prompt_wav_upload, instruct_text, seed):
    """Validate a synthesis request for the selected mode and run inference.

    Args:
        tts_text: Text to synthesize. Line breaks are removed and ".。" is
            appended before inference.
        mode_checkbox_group: Name of the selected inference mode. Any value
            other than the SFT, zero-shot and cross-lingual modes is handled
            as instruct inference.
        sft_dropdown: Pretrained speaker id, used by the SFT and instruct
            modes.
        prompt_text: Transcript of the reference audio (required by the
            zero-shot mode only).
        prompt_wav_upload: Path of the uploaded reference audio, or ``None``.
        instruct_text: Natural-language description of the desired voice
            (required by the instruct mode only).
        seed: Value passed to ``set_all_random_seed`` right before inference.

    Returns:
        tuple: ``(target_sr, samples)``. ``samples`` is the flattened
        ``tts_speech`` output of the model, or ``default_data`` when the
        request fails validation (``gr.Warning`` is called first in that
        case).
    """
    # Collapse multi-line input into one line; ".。" is always appended to
    # the text that gets synthesized.
    tts_text = "".join(line for line in tts_text.strip().split("\n") if line != "") + ".。"
    print(tts_text)
    prompt_text = "".join(line for line in prompt_text.strip().split("\n") if line != "")
    # This function receives no microphone recording, so the uploaded file is
    # the only possible source of prompt audio.
    prompt_wav = prompt_wav_upload
    # The warnings below used to read `args.model_dir`, but no `args` object
    # exists in this script (the model path is hard-coded at load time), so
    # those paths raised NameError. Fall back to the hard-coded path if the
    # model object does not expose `model_dir`.
    model_dir = getattr(cosyvoice, 'model_dir', 'pretrained_models/CosyVoice-300M-Instruct')

    # if instruct mode, please make sure that model is iic/CosyVoice-300M-Instruct and not cross_lingual mode
    if mode_checkbox_group == '自然语言控制':
        if cosyvoice.frontend.instruct is False:
            gr.Warning('您正在使用自然语言控制模式, {}模型不支持此模式, 请使用iic/CosyVoice-300M-Instruct模型'.format(model_dir))
            return (target_sr, default_data)
        if instruct_text == '':
            gr.Warning('您正在使用自然语言控制模式, 请输入instruct文本')
            return (target_sr, default_data)
        if prompt_wav is not None or prompt_text != '':
            gr.Info('您正在使用自然语言控制模式, prompt音频/prompt文本会被忽略')
    # if cross_lingual mode, please make sure that model is iic/CosyVoice-300M and tts_text prompt_text are different language
    if mode_checkbox_group == '跨语种复刻':
        if cosyvoice.frontend.instruct is True:
            gr.Warning('您正在使用跨语种复刻模式, {}模型不支持此模式, 请使用iic/CosyVoice-300M模型'.format(model_dir))
            return (target_sr, default_data)
        if instruct_text != '':
            gr.Info('您正在使用跨语种复刻模式, instruct文本会被忽略')
        if prompt_wav is None:
            gr.Warning('您正在使用跨语种复刻模式, 请提供prompt音频')
            return (target_sr, default_data)
        gr.Info('您正在使用跨语种复刻模式, 请确保合成文本和参考音频对应的文本为不同语言')
    # if in zero_shot cross_lingual, please make sure that prompt_text and prompt_wav meets requirements
    if mode_checkbox_group in ['3s极速复刻', '跨语种复刻']:
        if prompt_wav is None:
            gr.Warning('prompt音频为空,您是否忘记输入prompt音频?')
            return (target_sr, default_data)
        # Read the file header once and reuse the sample rate for both the
        # check and the message.
        prompt_wav_sr = torchaudio.info(prompt_wav).sample_rate
        if prompt_wav_sr < prompt_sr:
            gr.Warning('prompt音频采样率{}低于{}'.format(prompt_wav_sr, prompt_sr))
            return (target_sr, default_data)
    # sft mode only use sft_dropdown
    if mode_checkbox_group == '预训练音色':
        if instruct_text != '' or prompt_wav is not None or prompt_text != '':
            gr.Info('您正在使用预训练音色模式,prompt文本/prompt音频/instruct文本会被忽略!')
    # zero_shot mode only use prompt_wav prompt text
    if mode_checkbox_group == '3s极速复刻':
        if prompt_text == '':
            gr.Warning('prompt文本为空,您是否忘记输入prompt文本?')
            return (target_sr, default_data)
        if instruct_text != '':
            gr.Info('您正在使用3s极速复刻模式,预训练音色/instruct文本会被忽略!')

    # Dispatch to the inference entry point of the selected mode. The seed is
    # set immediately before each model call.
    if mode_checkbox_group == '预训练音色':
        logging.info('get sft inference request')
        set_all_random_seed(seed)
        output = cosyvoice.inference_sft(tts_text, sft_dropdown)
    elif mode_checkbox_group == '3s极速复刻':
        logging.info('get zero_shot inference request')
        prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
        set_all_random_seed(seed)
        output = cosyvoice.inference_zero_shot(tts_text, prompt_text, prompt_speech_16k)
    elif mode_checkbox_group == '跨语种复刻':
        logging.info('get cross_lingual inference request')
        prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
        set_all_random_seed(seed)
        output = cosyvoice.inference_cross_lingual(tts_text, prompt_speech_16k)
    else:
        # Every other mode value is treated as instruct inference.
        logging.info('get instruct inference request')
        set_all_random_seed(seed)
        output = cosyvoice.inference_instruct(tts_text, sft_dropdown, instruct_text)
    audio_data = output['tts_speech'].numpy().flatten()
    return (target_sr, audio_data)


# Sample rates: prompt audio is loaded at prompt_sr, and generated audio is
# handed back to the UI tagged with target_sr.
prompt_sr = 16000
target_sr = 22050
# Placeholder audio returned when a request fails validation
# (target_sr zero-valued samples).
default_data = np.zeros(target_sr)
# Load the instruct model once at import time and cache its speaker list.
cosyvoice = CosyVoice('pretrained_models/CosyVoice-300M-Instruct')
sft_spk = cosyvoice.list_avaliable_spks()
    
# Build the Gradio UI: header text, a collapsed accordion of example instruct
# prompts, the input widgets, the output player, and the event wiring.
app = gr.Blocks(theme="JohnSmith9982/small_and_pretty")
with app:
    gr.Markdown("# <center>🌊💕🎶 [CosyVoice](https://www.bilibili.com/video/BV1vz421q7ir/) Instruct 一句话实现声音定制,听你想听</center>")
    gr.Markdown("## <center>🌟 只需自然语言,就可以控制说话人的音色、语调、情感!</center>")
    gr.Markdown("### <center>🤗 更多精彩,尽在[滔滔AI](https://www.talktalkai.com/);滔滔AI,为爱滔滔!💕</center>")
    # Collapsed by default (open=False); lists sample instruct prompts as a
    # Markdown bullet list. The text is dedented before rendering.
    with gr.Accordion("💡 查看自然语言定制声音的例子(您只需要复制英文部分的内容)", open=False):
        _ = f"""
            * 义愤填膺: Theo 'Crimson', is a fiery, passionate rebel leader. Fights with fervor for justice, but struggles with impulsiveness.
            * 冷静安抚: Kai 'Torrent', is a cool-headed, tactical water mage who plans his moves carefully. A soothing presence with hidden depths.
            * 活力鲁莽: Zara 'Wildfire', is an impulsive, fearless firebrand who loves a challenge. Her bravery inspires others, though she often acts recklessly.
            * 神秘优雅: Selene 'Moonshade', is a mysterious, elegant dancer with a connection to the night. Her movements are both mesmerizing and deadly.
            * 友善亲切: Priya, the humanitarian doctor, heals wounds of the world with her boundless empathy and skill.
            * 智慧善良: Ivan, the old sea captain, navigates life's storms with timeless wisdom and a heart of gold.
            * 诗词朗诵: A male voice with very slow pace and very sad emotion that is perfect for reciting poems.
            * 霸道总裁: A male tyrannical arrogant CEO that controls everything in his company.\n
            您也可以这样控制说话人: A [male/female] speaker with [normal/high/low] pitch, [normal/fast/slow] speaking rate, and [happy/sad/angry] emotion.
            """
        gr.Markdown(dedent(_))
    # First input row: text to synthesize, (hidden) mode selector and usage
    # hint, pretrained speaker dropdown, and the seed controls.
    with gr.Row():
        tts_text = gr.Textbox(label="请填写您希望声音定制的文本内容", lines=3, info="中文文本建议不超过100个字,英文文本不超过100个单词", placeholder="想说却还没说的,还很多...")
        # Hidden (visible=False): the only choice is inference_mode_list[0],
        # but the value is still passed to generate_audio.
        mode_checkbox_group = gr.Radio(choices=inference_mode_list, label='请选择声音复刻类型', value=inference_mode_list[0], info="如果声音复刻的文本和参考音频对应的文本是同一种语言,请选择“3s极速复刻”;不同语言,请选“跨语种复刻”", visible=False)
        # NOTE(review): scale is given as a float here and below; confirm the
        # installed Gradio version accepts non-integer scale values.
        instruction_text = gr.Text(label="📔 操作指南", value=instruct_dict[inference_mode_list[0]], scale=0.5, visible=False)
        # Defaults to the second entry of sft_spk; indexing raises IndexError
        # if the model reports fewer than two speakers.
        sft_dropdown = gr.Dropdown(choices=sft_spk, label='选择预训练音色', info="会在一定程度上影响生成结果", value=sft_spk[1], scale=0.25)
        with gr.Column(scale=0.25):
            # Dice-emoji button that fills the seed field with a random value.
            seed_button = gr.Button(value="\U0001F3B2", visible=True)
            seed = gr.Number(value=0, label="随机推理种子", info="若数值保持不变,则每次生成结果一致", visible=True)

    # Second input row: hidden prompt inputs (still passed to generate_audio),
    # the instruct text box, and the generate button.
    with gr.Row():
        prompt_text = gr.Textbox(label="请填写参考音频对应的文本内容", lines=3, placeholder="告诉我参考音频说了些什么吧...", visible=False)
        prompt_wav_upload = gr.Audio(type='filepath', label='请从本地上传您喜欢的参考音频,注意采样率不低于16kHz,时长不超过30s', visible=False)
        # A microphone-recording gr.Audio input used to be declared here; it is
        # disabled, so the upload above is the only reference-audio source.
        instruct_text = gr.Textbox(label="请用自然语言定制您的说话人,仅限英文。您可以参考我们在上方提供的例子", lines=1, value="", placeholder="A male voice with very slow pace and very sad emotion that is perfect for reciting poems.")
        generate_button = gr.Button("一键开启声音定制💕", variant="primary")


    # Player for the (sample_rate, samples) tuple returned by generate_audio.
    audio_output = gr.Audio(label="为您生成的专属音频🎶")

    # Event wiring.
    seed_button.click(fn=generate_seed, inputs=[], outputs=seed)
    # NOTE(review): queue=False is set on this event while generate_audio
    # calls gr.Warning/gr.Info; confirm those notifications still reach the
    # user in the installed Gradio version.
    generate_button.click(fn=generate_audio,
                            inputs=[tts_text, mode_checkbox_group, sft_dropdown, prompt_text, prompt_wav_upload, instruct_text, seed],
                            outputs=audio_output, queue=False)
    # Refresh the (hidden) usage hint whenever the mode selection changes.
    mode_checkbox_group.change(fn=change_instruction, inputs=[mode_checkbox_group], outputs=[instruction_text])
    # Usage disclaimer and footer.
    gr.Markdown("### <center>注意❗:请不要生成会对个人以及组织造成侵害的内容,此程序仅供科研、学习及个人娱乐使用。请自觉合规使用此程序,程序开发者不负有任何责任。</center>")
    gr.HTML('''
        <div class="footer">
                    <p>🌊🏞️🎶 - 江水东流急,滔滔无尽声。 明·顾璘
                    </p>
        </div>
    ''')
# Explicit queue configuration is currently disabled:
#app.queue(max_size=40, api_open=False)
# Start the server at import time; show_error=True is passed to launch().
app.launch(show_error=True)