import gradio as gr
from transformers import VitsModel, AutoTokenizer, set_seed
import torch
import scipy.io.wavfile
from ruaccent import RUAccent

# Map speaker names to the model's speaker ids
speakers = {"woman": 0,
            "man": 1}

# Load the multispeaker Russian VITS model and its tokenizer from the Hugging Face Hub
model = VitsModel.from_pretrained("utrobinmv/tts_ru_free_hf_vits_low_multispeaker")
tokenizer = AutoTokenizer.from_pretrained("utrobinmv/tts_ru_free_hf_vits_low_multispeaker")
model.eval()
set_seed(555)

# RUAccent places stress marks on Russian text and resolves homographs before synthesis
accentizer = RUAccent()
accentizer.load(omograph_model_size='turbo', use_dictionary=True)


def generate_audio(speaker_name, text):
    # Add stress marks, tokenize the text, and pick the requested speaker
    text = accentizer.process_all(text)
    inputs = tokenizer(text, return_tensors="pt")
    inputs['speaker_id'] = speakers[speaker_name]

    # Run inference without tracking gradients
    with torch.no_grad():
        output = model(**inputs).waveform

    # Save the waveform as a WAV file at the model's sampling rate
    scipy.io.wavfile.write("output.wav", rate=model.config.sampling_rate, data=output[0].cpu().numpy())

    return "output.wav"


# Dropdown for choosing between the two available voices
speaker_dropdown = gr.Dropdown(
    choices=list(speakers.keys()),
    label="Speaker",
    value='woman',
    info="The model is trained on 2 speakers",
    interactive=True
)

iface = gr.Interface(
    fn=generate_audio,
    inputs=[speaker_dropdown, "text"],
    outputs="audio",
    title="Text to Speech Russian free multispeaker model",
    description="Введите текст на русском языке, чтобы преобразовать его в русскую звуковую речь. Пример текста: Привет, как дела? А у тебя как?",
)
iface.launch(share=True)