"""Gradio demo: Russian multi-speaker text-to-speech using a VITS model."""

import tempfile

import gradio as gr
import scipy.io.wavfile
import torch
from ruaccent import RUAccent
from transformers import AutoTokenizer, VitsModel, set_seed

MODEL_NAME = "utrobinmv/tts_ru_free_hf_vits_low_multispeaker"

# UI speaker label -> speaker id passed to the model.
speakers = {"woman": 0, "man": 1}

model = VitsModel.from_pretrained(MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model.eval()

# Seed the RNGs once at startup.
set_seed(555)

# The accentizer pre-processes the input text before it is tokenized.
accentizer = RUAccent()
accentizer.load(omograph_model_size='turbo', use_dictionary=True)


def generate_audio(speaker_name, text):
    """Synthesize ``text`` with the chosen speaker and return a WAV file path.

    Args:
        speaker_name: Key of ``speakers`` ("woman" or "man").
        text: Text to synthesize.

    Returns:
        Path to a newly written WAV file containing the generated waveform.

    Raises:
        gr.Error: If ``text`` is empty or contains only whitespace.
        KeyError: If ``speaker_name`` is not a key of ``speakers``.
    """
    if not text or not text.strip():
        # Nothing to synthesize; show a message in the UI instead of
        # pushing an empty string through the accentizer and the model.
        raise gr.Error("Please enter some text to synthesize.")

    text = accentizer.process_all(text)
    inputs = tokenizer(text, return_tensors="pt")

    with torch.no_grad():
        output = model(**inputs, speaker_id=speakers[speaker_name]).waveform

    # Use a unique file per request: a fixed "output.wav" would be
    # overwritten when several users submit at the same time.
    # NOTE: delete=False leaves the file on disk after the request;
    # clean-up is left to the OS temp directory policy.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        output_path = tmp.name

    scipy.io.wavfile.write(
        output_path,
        rate=model.config.sampling_rate,
        data=output[0].cpu().numpy(),
    )
    return output_path


speaker_dropdown = gr.Dropdown(
    choices=list(speakers),
    label="Speaker id",
    value='woman',
    info=f"Models are trained on {len(speakers)} speakers",
    interactive=True,
)

iface = gr.Interface(
    fn=generate_audio,
    inputs=[speaker_dropdown, "text"],
    outputs="audio",
    title="Text to Speech Russian free multispeaker model",
    description="Введите текст на русском языке, чтобы преобразовать его в русскую звуковую речь. Пример текста: Привет, как дела? А у тебя как?",
)

iface.launch(share=True)