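# Gradio app: streaming speech translation with an ESPnet2 Speech2TextStreaming model
# (the model/config paths below point to an English-to-German ST experiment).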
# import torch
# import torchaudio
import numpy as np
from espnet2.bin.st_inference_streaming import Speech2TextStreaming
import gradio as gr
import soundfile as sf
import librosa
# Load the custom streaming speech-translation model
model = Speech2TextStreaming(
    st_model_file="/data1/ankita/st1/exp/st_train_st_raw_en_de_bpe_de2000_sp/valid.acc.ave_10best.pth",  # path to the model weights
    st_train_config="/data1/ankita/st1/exp/st_train_st_raw_en_de_bpe_de2000_sp/config.yaml",  # path to the config file
    device="cuda",   # use "cpu" if no GPU is available
    minlenratio=0.1,
    maxlenratio=0.7,
    beam_size=1,     # beam size 1 = greedy decoding for low latency
)
silence_threshold = 0.01  # adjust this threshold based on your audio levels
silence_duration = 1.0    # duration of silence to detect (in seconds)


def is_silence(audio_chunk, sr, threshold=silence_threshold):
    # A chunk counts as silence when its mean absolute amplitude is below the threshold.
    return np.mean(np.abs(audio_chunk)) < threshold
def transcribe(state, new_chunk):
    stream, silence_time = state
    if new_chunk is None:
        return (None, 0), ""
    sr, y = new_chunk
    y = y.astype(np.float32)
    if sr != 16000:
        y = librosa.resample(y=y, orig_sr=sr, target_sr=16000)
    peak = np.max(np.abs(y))
    if peak > 0:
        y /= peak  # normalize; skip when the chunk is all zeros
    if stream is not None:
        stream = np.concatenate([stream, y])
    else:
        stream = y
        # Reset the model's internal streaming state at the start of a new utterance.
        model(np.zeros(stream.shape), is_final=True)
    if is_silence(y, sr):
        silence_time += len(y) / sr
    else:
        silence_time = 0
    if silence_time >= silence_duration:
        # Enough trailing silence: run a final decoding pass and clear the buffer.
        output = model(stream, is_final=True)
        return (None, 0), output[0][0] if output else ""
    else:
        # Mid-utterance: return the current partial hypothesis and keep buffering.
        output = model(stream)
        return (stream, silence_time), output[0][0] if output else ""
def clear_transcription():
    # Reset the audio buffer, silence timer, and displayed text.
    return (None, 0), ""
with gr.Blocks() as demo:
    state = gr.State((None, 0))
    audio = gr.Audio(sources=["microphone"], type="numpy", streaming=True)
    text = gr.Textbox()
    clear_button = gr.Button("Clear")

    audio.stream(transcribe, inputs=[state, audio], outputs=[state, text])
    clear_button.click(clear_transcription, inputs=[], outputs=[state, text])
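# Optional: enabling Gradio's request queue, e.g. demo.queue().launch(), can help
# keep streaming updates responsive when several users connect at once.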
demo.launch()