# import torch
# import torchaudio
import numpy as np
from espnet2.bin.st_inference_streaming import Speech2TextStreaming
import gradio as gr
import soundfile as sf
import librosa
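
# Hedged alternative: instead of hard-coding "cuda" below, the device could be
# picked at runtime (torch is already installed as an ESPnet dependency);
# uncomment these two lines and pass device=device to the constructor.
# import torch
# device = "cuda" if torch.cuda.is_available() else "cpu"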

# Load your custom streaming speech translation model
model = Speech2TextStreaming(
    st_model_file="/data1/ankita/st1/exp/st_train_st_raw_en_de_bpe_de2000_sp/valid.acc.ave_10best.pth",  # path to your model weights
    st_train_config="/data1/ankita/st1/exp/st_train_st_raw_en_de_bpe_de2000_sp/config.yaml",  # path to your config file
    device="cuda",  # change to "cpu" if no GPU is available
    minlenratio=0.1,
    maxlenratio=0.7,
    beam_size=1,
)

silence_threshold = 0.01  # adjust this threshold based on your audio levels
silence_duration = 1.0  # duration of silence to detect (in seconds)
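
# Hedged sketch: one way to calibrate silence_threshold is to record a short clip
# of background noise and inspect its mean absolute amplitude ("noise_sample.wav"
# is a hypothetical file name); set the threshold slightly above the printed value.
# noise, _ = sf.read("noise_sample.wav", dtype="float32")
# print("background level:", np.mean(np.abs(noise)))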

def is_silence(audio_chunk, sr, threshold=silence_threshold):
    # A chunk counts as silence when its mean absolute amplitude is below the threshold
    return np.mean(np.abs(audio_chunk)) < threshold

def transcribe(state, new_chunk):
    stream, silence_time = state
    if new_chunk is None:
        return (None, 0), ""
    sr, y = new_chunk
    y = y.astype(np.float32)
    if sr != 16000:
        # The model expects 16 kHz audio
        y = librosa.resample(y=y, orig_sr=sr, target_sr=16000)
        sr = 16000
    peak = np.max(np.abs(y))
    if peak > 0:
        y /= peak  # normalize the chunk to [-1, 1]
    if stream is not None:
        stream = np.concatenate([stream, y])
    else:
        stream = y
        # New utterance: reset the streaming decoder state with a zero input
        model(np.zeros(stream.shape), is_final=True)
    # Track how long the input has been silent
    if is_silence(y, sr):
        silence_time += len(y) / sr
    else:
        silence_time = 0
    if silence_time >= silence_duration:
        # Enough trailing silence: finalize the hypothesis and clear the buffer
        output = model(stream, is_final=True)
        return (None, 0), output[0][0] if output else ""
    else:
        # Still speaking: return the current partial hypothesis
        output = model(stream)
        return (stream, silence_time), output[0][0] if output else ""

def clear_transcription():
    return (None, 0), ""

with gr.Blocks() as demo:
    state = gr.State((None, 0))
    audio = gr.Audio(sources=["microphone"], type="numpy", streaming=True)
    text = gr.Textbox()
    clear_button = gr.Button("Clear")
    audio.stream(transcribe, inputs=[state, audio], outputs=[state, text])
    clear_button.click(clear_transcription, inputs=[], outputs=[state, text])

demo.launch()
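
# Usage note (assuming the script is saved as app.py): run `python app.py` and open
# the local URL that Gradio prints (http://127.0.0.1:7860 by default); pass
# demo.launch(share=True) to get a temporary public link.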