File size: 2,898 Bytes
2ddf3fe 0412962 50a5992 2ddf3fe b0ab37c a00d114 0412962 b0ab37c a00d114 0412962 a00d114 0412962 2ddf3fe b0ab37c a00d114 50a5992 2ddf3fe 50a5992 0412962 50a5992 2ddf3fe 50a5992 2ddf3fe b865c6d 50a5992 a00d114 2ddf3fe 50a5992 0412962 2ddf3fe 0412962 50a5992 2ddf3fe 50a5992 2ddf3fe |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 |
import gradio as gr
import numpy as np
from vad_utils import get_speech_probs, make_visualization, probs2speech_timestamps, read_audio
import torch
import pandas as pd
import gdown
# Module-level cache shared by the UI callbacks: process_audio() fills both
# values in, and process_parameters() reads them back. Both stay None until
# process_audio() has run at least once.
probs = audio_length_samples = None
def process_audio(audio_input):
    """Compute speech probabilities for an audio file and visualize them.

    The probabilities and the waveform length are stored in the module-level
    ``probs`` and ``audio_length_samples`` so that ``process_parameters`` can
    reuse them without reading the audio again.

    Args:
        audio_input: Path of the audio file to analyse, as delivered by the
            ``gr.Audio(type="filepath")`` component. Falsy (``None`` or an
            empty string) when no audio has been provided.

    Returns:
        The result of ``make_visualization`` for the computed probabilities.

    Raises:
        gr.Error: If no audio file has been provided.
    """
    global probs
    global audio_length_samples

    # Without this guard an empty Audio component would reach read_audio()
    # and fail with an error that means nothing to the user.
    if not audio_input:
        raise gr.Error("Please upload or download an audio file first.")

    sampling_rate = 16_000
    wav = read_audio(audio_input, sampling_rate=sampling_rate)
    audio_length_samples = len(wav)
    probs = get_speech_probs(wav, sampling_rate=sampling_rate)
    # 512 samples expressed in seconds at the sampling rate above.
    # NOTE(review): presumably the step between consecutive probabilities --
    # confirm against get_speech_probs / make_visualization.
    return make_visualization(probs, 512 / sampling_rate)
def process_parameters(threshold, min_speech_duration_ms, min_silence_duration_ms, window_size_samples, speech_pad_ms):
    """Turn the cached speech probabilities into speech timestamps.

    Uses the module-level ``probs`` and ``audio_length_samples`` written by
    ``process_audio``, forwards the tuning parameters unchanged to
    ``probs2speech_timestamps``, and writes the result to ``timestamps.txt``
    as tab-separated values without header or index.

    Args:
        threshold: Forwarded as ``threshold``.
        min_speech_duration_ms: Forwarded as ``min_speech_duration_ms``.
        min_silence_duration_ms: Forwarded as ``min_silence_duration_ms``.
        window_size_samples: Forwarded as ``window_size_samples``.
        speech_pad_ms: Forwarded as ``speech_pad_ms``.

    Returns:
        A ``(path, dataframe)`` tuple: the path of the written file
        (``"timestamps.txt"``) and the DataFrame built from the timestamps,
        with an extra empty ``"note"`` column.

    Raises:
        gr.Error: If ``process_audio`` has not been run yet, so there are no
            cached probabilities to work from.
    """
    # The globals are still None when this button is clicked before the
    # probabilities were computed; report that instead of crashing.
    if probs is None or audio_length_samples is None:
        raise gr.Error("Compute the speech probabilities first.")

    timestamps = probs2speech_timestamps(
        probs,
        audio_length_samples,
        threshold=threshold,
        min_speech_duration_ms=min_speech_duration_ms,
        min_silence_duration_ms=min_silence_duration_ms,
        window_size_samples=window_size_samples,
        speech_pad_ms=speech_pad_ms,
    )

    df = pd.DataFrame(timestamps)
    # Empty column appended to every row of the table and the exported file.
    df["note"] = ""

    output_path = "timestamps.txt"
    df.to_csv(output_path, sep="\t", header=False, index=False)
    return output_path, df
def download_gdrive(id):
    """Download a file from Google Drive and return its local path.

    Args:
        id: Google Drive file ID, interpolated into the
            ``https://drive.google.com/uc?id=...`` download URL. (The name
            shadows the ``id`` builtin but is kept so existing callers are
            unaffected.)

    Returns:
        The local path the file was saved to (``"audio.wav"``).
    """
    output_file = "audio.wav"  # fixed local filename for the downloaded audio
    gdown.download(f"https://drive.google.com/uc?id={id}", output_file)
    # Return the path that was actually written; the previous hard-coded
    # "output_file.wav" pointed at a file that is never created.
    return output_file
# Build the Gradio Blocks UI, wire its three buttons to the callbacks defined
# in this file, and launch the app.
# NOTE(review): the leading indentation of this function is missing in this
# copy, so which components sit inside which gr.Row() -- and whether
# demo.launch() is inside or after the gr.Blocks() context -- cannot be read
# off the text. Restore the nesting from the original file before running.
def main():
with gr.Blocks() as demo:
# Google Drive download controls.
# NOTE(review): "File ID" is passed positionally; presumably that makes it the
# textbox's initial value rather than its label -- confirm against gr.Text.
with gr.Row():
gdrive_str = gr.Text("File ID")
download_button = gr.Button("Download Audio")
# Audio input (handed to callbacks as a file path), trigger button and plot.
with gr.Row():
audio_input = gr.Audio(type="filepath")
button1 = gr.Button("Compute Speech Probabilities")
figure = gr.Plot()
# The download callback's return value is written into the audio component;
# the probability callback's return value is shown in the plot.
download_button.click(download_gdrive, inputs=[gdrive_str], outputs=audio_input)
button1.click(process_audio, inputs=[audio_input], outputs=figure)
# Tuning parameters passed to process_parameters.
with gr.Row():
threshold = gr.Number(label="Threshold", value=0.5, minimum=0.0, maximum=1.0)
min_speech_duration_ms = gr.Number(label="Min Speech Duration (ms)", value=250)
min_silence_duration_ms = gr.Number(label="Min Silence Duration (ms)", value=100)
window_size_samples = gr.Dropdown(label="Window Size Samples", choices=[512, 1024, 1536], value=1536)
speech_pad_ms = gr.Number(label="Speech Pad (ms)", value=30)
button2 = gr.Button("Compute Speech Timestamps")
# Outputs of process_parameters: the written file and the timestamp table.
output_file = gr.File()
with gr.Row():
output_df = gr.DataFrame()
# The inputs list follows the parameter order of process_parameters.
button2.click(process_parameters, inputs=[threshold, min_speech_duration_ms, min_silence_duration_ms, window_size_samples, speech_pad_ms],
outputs=[output_file, output_df])
demo.launch()
# Start the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|