yuangongfdu committed
Commit 0ea1ca0
Parent(s): 7653446
Create app.py
app.py
ADDED
@@ -0,0 +1,36 @@
import gradio as gr
# The whisper_at package is imported under the name `whisper`, matching its use below.
import whisper_at as whisper

link = "https://github.com/YuanGongND/whisper-AT"
text = "[Github]"
paper_link = "https://arxiv.org/pdf/2307.03183.pdf"
paper_text = "[Paper]"

def predict(audio_path, time_resolution):
    # Snap the requested tagging resolution to the nearest multiple of 0.4 s.
    def round_time_resolution(time_resolution):
        multiple = float(time_resolution) / 0.4
        rounded_multiple = round(multiple)
        rounded_time_resolution = rounded_multiple * 0.4
        return rounded_time_resolution
    audio_tagging_time_resolution = round_time_resolution(time_resolution)
    model = whisper.load_model("large-v1")
    result = model.transcribe(audio_path, at_time_res=audio_tagging_time_resolution)
    # ASR results
    print(result["text"])
    # Audio tagging results
    audio_tag_result = whisper.parse_at_label(result, language='follow_asr', top_k=5, p_threshold=-1, include_class_list=list(range(527)))
    print(audio_tag_result)

    asr_output = ""
    for segment in result['segments']:
        asr_output = asr_output + str(segment['start']) + 's-' + str(segment['end']) + 's ' + segment['text'] + '\n'
    return asr_output, audio_tag_result

iface = gr.Interface(fn=predict,
                     inputs=[gr.Audio(type="filepath"), gr.Textbox(value='10', label='Time Resolution in Seconds (must be an integer multiple of 0.4, e.g., 0.4, 2, 10)')],
                     outputs=[gr.Textbox(label="ASR Output"), gr.Textbox(label="Audio Tagging Output")],
                     cache_examples=True,
                     title="Quick Demo of Whisper-AT",
                     description="We are glad to introduce Whisper-AT, a new joint audio tagging and speech recognition model that outputs background sound labels in addition to text. " + f"<a href='{paper_link}'>{paper_text}</a> " + f"<a href='{link}'>{text}</a> <br>" +
                                 "Whisper-AT is authored by Yuan Gong, Sameer Khurana, Leonid Karlinsky, and James Glass (MIT & MIT-IBM Watson AI Lab).")
iface.launch(debug=False, share=True)