# NOTE: web-viewer residue (file-size banner, blame hashes, line-number gutter) removed here.
import gradio as gr
import torch
import scipy.io.wavfile as wavfile
from transformers import AutoProcessor, SeamlessM4TModel, pipeline

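# Disabled experiment, kept for reference: it appears to run SeamlessM4T on a
# text prompt and write the resulting waveform to tmp.wav. Not used by the app.
# tokenizer = AutoProcessor.from_pretrained("facebook/hf-seamless-m4t-medium")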
# model = SeamlessM4TModel.from_pretrained("facebook/hf-seamless-m4t-medium")

# text = "some example text in the English language"

# def greet(text):
#     inputs = tokenizer(text, return_tensors="pt")
#     with torch.no_grad():
#         output = model(**inputs, decoder_input_ids=inputs["input_ids"]).waveform
#         out = output[0]
#         wavfile.write("tmp.wav", rate=16000, data=out)
#         return open("tmp.wav", "rb").read()

_ASR_MODEL = "facebook/hubert-large-ls960-ft"
_transcriber = None  # shared pipeline instance, built on first use


def _get_transcriber():
    """Return the shared speech-recognition pipeline, creating it once."""
    global _transcriber
    if _transcriber is None:
        # Building the pipeline loads the model, so do it once rather than on
        # every request.
        _transcriber = pipeline("automatic-speech-recognition", model=_ASR_MODEL)
    return _transcriber


def _prepare_audio(audio):
    """Turn a ``(sample_rate, samples)`` pair into the pipeline's dict input.

    Returns ``None`` when there is nothing to transcribe (no recording, or a
    recording with zero samples).
    """
    if audio is None:
        return None

    import numpy as np  # local import so the file's import block is untouched

    rate, samples = audio
    samples = np.asarray(samples)
    if samples.size == 0:
        return None
    if samples.ndim > 1:
        # Assumes a (samples, channels) layout, as Gradio documents for its
        # numpy audio format; the pipeline only accepts single-channel input.
        samples = samples.mean(axis=1)
    samples = samples.astype(np.float32)
    peak = np.max(np.abs(samples))
    if peak > 0:  # leave pure silence untouched instead of dividing by zero
        samples /= peak
    # Passing the rate lets the pipeline resample to what the model expects
    # (NOTE(review): transformers may require torchaudio for that step).
    return {"sampling_rate": int(rate), "raw": samples}


def stt(audio):
    """Transcribe a recording to text.

    Args:
        audio: ``(sample_rate, samples)`` as delivered by the ``"audio"``
            input, or ``None`` when nothing was recorded.

    Returns:
        The recognised text, or an empty string when there is no audio.
    """
    prepared = _prepare_audio(audio)
    if prepared is None:
        return ""
    # The pipeline returns a dict; the "text" output wants the plain string.
    return _get_transcriber()(prepared)["text"]

# Wire the transcription function into a minimal web UI: one audio input,
# one text output.
iface = gr.Interface(
    fn=stt,
    inputs="audio",
    outputs="text",
)
# Start serving the app.
iface.launch()