import os import gradio as gr import numpy as np from fairseq.checkpoint_utils import load_model_ensemble_and_task_from_hf_hub from fairseq.models.speech_to_speech.hub_interface import S2SHubInterface from fairseq.models.speech_to_text.hub_interface import S2THubInterface from audio_pipe import SpeechToSpeechPipeline io1 = gr.Interface.load("huggingface/facebook/xm_transformer_s2ut_en-hk", api_key=os.environ['api_key']) io2 = gr.Interface.load("huggingface/facebook/xm_transformer_s2ut_hk-en", api_key=os.environ['api_key']) io3 = gr.Interface.load("huggingface/facebook/xm_transformer_unity_en-hk", api_key=os.environ['api_key']) io4 = gr.Interface.load("huggingface/facebook/xm_transformer_unity_hk-en", api_key=os.environ['api_key']) pipe = SpeechToSpeechPipeline("facebook/xm_transformer_unity_hk-en") def call_model(audio, model): # pipe = SpeechToSpeechPipeline("facebook/xm_transformer_unity_hk-en") # wav, sr, text = pipe(audio) temp_file = pipe(audio) return gr.Audio(temp_file) def inference(audio, model): if model == "xm_transformer_s2ut_en-hk": out_audio = io1(audio) elif model == "xm_transformer_s2ut_hk-en": out_audio = io2(audio) elif model == "xm_transformer_unity_en-hk": out_audio = io3(audio) elif model == "xm_transformer_unity_hk-en_gpu": out_audio = call_model(audio, model) else: out_audio = io4(audio) return out_audio css = """ .gradio-container { font-family: 'IBM Plex Sans', sans-serif; } .gr-button { color: black; border-color: grey; background: white; } input[type='range'] { accent-color: black; } .dark input[type='range'] { accent-color: #dfdfdf; } .container { max-width: 730px; margin: auto; padding-top: 1.5rem; } .details:hover { text-decoration: underline; } .gr-button { white-space: nowrap; } .gr-button:focus { border-color: rgb(147 197 253 / var(--tw-border-opacity)); outline: none; box-shadow: var(--tw-ring-offset-shadow), var(--tw-ring-shadow), var(--tw-shadow, 0 0 #0000); --tw-border-opacity: 1; --tw-ring-offset-shadow: var(--tw-ring-inset) 0 0 0 var(--tw-ring-offset-width) var(--tw-ring-offset-color); --tw-ring-shadow: var(--tw-ring-inset) 0 0 0 calc(3px var(--tw-ring-offset-width)) var(--tw-ring-color); --tw-ring-color: rgb(191 219 254 / var(--tw-ring-opacity)); --tw-ring-opacity: .5; } .footer { margin-bottom: 45px; margin-top: 35px; text-align: center; border-bottom: 1px solid #e5e5e5; } .footer>p { font-size: .8rem; display: inline-block; padding: 0 10px; transform: translateY(10px); background: white; } .dark .footer { border-color: #303030; } .dark .footer>p { background: #0b0f19; } .prompt h4{ margin: 1.25em 0 .25em 0; font-weight: bold; font-size: 115%; } .animate-spin { animation: spin 1s linear infinite; } @keyframes spin { from { transform: rotate(0deg); } to { transform: rotate(360deg); } } """ block = gr.Blocks(css=css) with block: gr.HTML( """
A demo for fairseq speech-to-speech translation models. It supports S2UT and UnitY models for bidirectional Hokkien and English translation. Please select the model and record the input to submit.