import gradio as gr
import numpy as np
import torch
import scipy.io.wavfile as wavfile
from transformers import AutoProcessor, SeamlessM4TModel, pipeline
# Earlier text-to-speech experiment with SeamlessM4T, kept for reference:
# tokenizer = AutoProcessor.from_pretrained("facebook/hf-seamless-m4t-medium")
# model = SeamlessM4TModel.from_pretrained("facebook/hf-seamless-m4t-medium")
# text = "some example text in the English language"
# def greet(text):
#     inputs = tokenizer(text, return_tensors="pt")
#     with torch.no_grad():
#         output = model(**inputs, decoder_input_ids=inputs["input_ids"]).waveform
#     out = output[0]
#     wavfile.write("tmp.wav", rate=16000, data=out)
#     return open("tmp.wav", "rb").read()
# Load the ASR pipeline once at startup rather than on every request.
transcriber = pipeline("automatic-speech-recognition", model="facebook/hubert-large-ls960-ft")

def stt(audio):
    # Gradio's audio component passes a (sample_rate, int16 numpy array) tuple.
    sample_rate, data = audio
    # The pipeline expects mono float32 samples together with their sampling rate.
    data = data.astype(np.float32) / 32768.0
    if data.ndim > 1:
        data = data.mean(axis=1)
    result = transcriber({"sampling_rate": sample_rate, "raw": data})
    return result["text"]

iface = gr.Interface(fn=stt, inputs="audio", outputs="text")
iface.launch()