import gradio as gr
# whisper_at exposes a Whisper-compatible API and is used as `whisper` below
import whisper_at as whisper

link = "https://github.com/YuanGongND/whisper-AT"
text = "[Github]"
paper_link = "https://arxiv.org/pdf/2307.03183.pdf"
paper_text = "[Paper]"
def predict(audio_path, time_resolution):
    # Snap the requested resolution to the nearest multiple of 0.4 s,
    # the base time resolution of Whisper-AT's audio tagging output.
    def round_time_resolution(time_resolution):
        multiple = float(time_resolution) / 0.4  # textbox input arrives as a string
        rounded_multiple = round(multiple)
        rounded_time_resolution = rounded_multiple * 0.4
        return rounded_time_resolution

    audio_tagging_time_resolution = round_time_resolution(time_resolution)
    model = whisper.load_model("tiny")
    result = model.transcribe(audio_path, at_time_res=audio_tagging_time_resolution)
    # ASR results
    print(result["text"])

    # Audio tagging results (top-5 AudioSet labels per tagging window)
    audio_tag_result = whisper.parse_at_label(result, language='follow_asr', top_k=5, p_threshold=-1, include_class_list=list(range(527)))
    print(audio_tag_result)

    # Format the transcript as one line per segment: "<start>s-<end>s <text>"
    asr_output = ""
    for segment in result['segments']:
        asr_output = asr_output + str(segment['start']) + 's-' + str(segment['end']) + 's' + segment['text'] + '\n'
    return asr_output, audio_tag_result
iface = gr.Interface(fn=predict,
                     inputs=[gr.Audio(type="filepath", source='microphone'),
                             gr.Textbox(value='10', label='Time Resolution in Seconds (must be an integer multiple of 0.4, e.g., 0.4, 2, 10)')],
                     outputs=[gr.Textbox(label="ASR Output"), gr.Textbox(label="Audio Tagging Output")],
                     cache_examples=True,
                     title="Quick Demo of Whisper-AT",
                     description="We are glad to introduce Whisper-AT, a new joint audio tagging and speech recognition model that outputs background sound labels in addition to text. " + f"<a href='{paper_link}'>{paper_text}</a> " + f"<a href='{link}'>{text}</a> <br>" +
                                 "Whisper-AT is authored by Yuan Gong, Sameer Khurana, Leonid Karlinsky, and James Glass (MIT & MIT-IBM Watson AI Lab).")
iface.launch()
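# A minimal sketch of exercising the same pipeline without the Gradio UI,
# assuming a local audio file at "sample.wav" (hypothetical path, not part of the demo):
#
#   asr_text, tag_result = predict("sample.wav", "2")
#   print(asr_text)     # time-stamped transcript, one segment per line
#   print(tag_result)   # audio tags parsed by whisper.parse_at_label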