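"""Streaming English-to-German speech translation demo.

Captures microphone audio with Gradio, accumulates chunks into a buffer,
and decodes it with an ESPnet2 streaming ST model; a stretch of trailing
silence finalizes the current utterance and resets the buffer.
"""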
import numpy as np
import gradio as gr
import librosa
from espnet2.bin.st_inference_streaming import Speech2TextStreaming

# Load the custom streaming speech translation model (ESPnet2 ST).
model = Speech2TextStreaming(
    st_model_file="/data1/ankita/st1/exp/st_train_st_raw_en_de_bpe_de2000_sp/valid.acc.ave_10best.pth",  # trained model weights
    st_train_config="/data1/ankita/st1/exp/st_train_st_raw_en_de_bpe_de2000_sp/config.yaml",  # matching training config
    device="cuda",  # use "cpu" if no GPU is available
    minlenratio=0.1,
    maxlenratio=0.7,
    beam_size=1,  # greedy search keeps per-chunk latency low
)
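
# Calling the model on a waveform returns a list of hypotheses whose first
# element starts with the decoded text (hence output[0][0] below); passing
# is_final=True flushes and resets the internal streaming state.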



silence_threshold = 0.01  # mean absolute amplitude below this counts as silence (normalized audio)
silence_duration = 1.0    # seconds of continuous silence that ends the current utterance

def is_silence(audio_chunk, threshold=silence_threshold):
    # A chunk is "silent" when its mean absolute amplitude falls below the threshold.
    return np.mean(np.abs(audio_chunk)) < threshold

def transcribe(state, new_chunk):
    stream, silence_time = state
    if new_chunk is None:
        return (None, 0), ""

    sr, y = new_chunk
    y = y.astype(np.float32)

    # The model expects 16 kHz audio; resample anything else.
    if sr != 16000:
        y = librosa.resample(y=y, orig_sr=sr, target_sr=16000)
        sr = 16000

    # Peak-normalize the chunk, guarding against all-zero chunks to avoid
    # division by zero. Note that per-chunk normalization also amplifies
    # quiet chunks, so silence_threshold applies to the normalized signal.
    peak = np.max(np.abs(y))
    if peak > 0:
        y /= peak

    if stream is not None:
        stream = np.concatenate([stream, y])
    else:
        # New utterance: start a fresh buffer and flush the decoder with a
        # dummy final call to reset its internal streaming state.
        stream = y
        model(np.zeros_like(stream), is_final=True)

    # Track how long the input has been continuously silent.
    if is_silence(y):
        silence_time += len(y) / sr
    else:
        silence_time = 0

    if silence_time >= silence_duration:
        # Enough trailing silence: finalize the utterance and reset the buffer.
        output = model(stream, is_final=True)
        return (None, 0), output[0][0] if output else ""
    else:
        # Mid-utterance: return the current partial hypothesis.
        output = model(stream)
        return (stream, silence_time), output[0][0] if output else ""

def clear_transcription():
    return (None, 0), ""
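

# Optional offline test helper, a sketch not wired into the app: feed a wav
# file through transcribe() in fixed-size chunks to mimic the microphone
# stream. Assumes the `soundfile` package is installed and a mono wav; the
# 0.5 s chunk length is an arbitrary choice.
def simulate_stream(wav_path, chunk_seconds=0.5):
    import soundfile as sf

    y, sr = sf.read(wav_path, dtype="float32")
    state, text_out = (None, 0), ""
    chunk_len = int(sr * chunk_seconds)
    for start in range(0, len(y), chunk_len):
        state, text_out = transcribe(state, (sr, y[start:start + chunk_len]))
    # Force-finalize whatever audio is still buffered.
    if state[0] is not None:
        output = model(state[0], is_final=True)
        text_out = output[0][0] if output else text_out
    return text_out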

with gr.Blocks() as demo:
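    # App state holds (accumulated audio buffer, seconds of trailing silence).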
    state = gr.State((None, 0))
    audio = gr.Audio(sources=["microphone"], type="numpy", streaming=True)
    text = gr.Textbox()
    clear_button = gr.Button("Clear")

    audio.stream(transcribe, inputs=[state, audio], outputs=[state, text])
    clear_button.click(clear_transcription, inputs=[], outputs=[state, text])

demo.launch()