Spaces:
Running
Running
yuangongfdu
committed on
Commit
•
3d1236b
1
Parent(s):
3b082be
Update app.py
Browse files
app.py
CHANGED
@@ -15,28 +15,26 @@ def round_time_resolution(time_resolution):
|
|
15 |
rounded_time_resolution = rounded_multiple * 0.4
|
16 |
return rounded_time_resolution
|
17 |
|
18 |
-
def predict(
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
#asr_output, at_output = str(result["text"]), 'k'
|
36 |
-
return asr_output, at_output
|
37 |
|
38 |
iface = gr.Interface(fn=predict,
|
39 |
-
inputs=[gr.Audio(type="filepath", source='microphone'), gr.Textbox(value='10', label='Time Resolution in Seconds (Must be must be an integer multiple of 0.4, e.g., 0.4, 2, 10)')],
|
40 |
outputs=[gr.Textbox(label="Speech Output"), gr.Textbox(label="Audio Tag Output")],
|
41 |
cache_examples=True,
|
42 |
title="Quick Demo of Whisper-AT",
|
|
|
15 |
rounded_time_resolution = rounded_multiple * 0.4
|
16 |
return rounded_time_resolution
|
17 |
|
18 |
+
def predict(audio_path_m, audio_path_t, time_resolution):
    """Transcribe one recording and tag its audio events for the demo UI.

    Exactly one of the two audio inputs must be supplied. If both or
    neither are given, the same user-facing message is returned in both
    output slots instead of running the model.

    Args:
        audio_path_m: File path of the microphone recording, or None.
        audio_path_t: File path of the second (uploaded) recording, or None.
        time_resolution: Requested audio-tagging time resolution; it is
            passed through round_time_resolution before being used.

    Returns:
        A tuple ``(asr_output, at_output)`` of strings. Each holds one
        ``"<start>s-<end>s: ..."`` line per segment: the transcribed text
        for ``asr_output`` and the comma-joined tag names for ``at_output``.
    """
    # Reject the call unless exactly one input is present. The original
    # check used `!=`, which rejected the single-recording case and let the
    # both/neither cases through to the model.
    if (audio_path_m is None) == (audio_path_t is None):
        message = "Please only upload one recording, either upload it or record using microphone."
        return message, message

    audio_path = audio_path_m if audio_path_m is not None else audio_path_t
    audio_tagging_time_resolution = round_time_resolution(time_resolution)
    result = model.transcribe(audio_path, at_time_res=audio_tagging_time_resolution)
    audio_tag_result = whisper.parse_at_label(
        result,
        language='follow_asr',
        top_k=5,
        p_threshold=-1,
        include_class_list=list(range(527)),
    )

    asr_output = "".join(
        str(segment['start']) + 's-' + str(segment['end']) + 's: ' + segment['text'] + '\n'
        for segment in result['segments']
    )

    at_lines = []
    for segment in audio_tag_result:
        # Kept from the original: echoes each tagging segment to stdout.
        print(segment)
        tag_names = ','.join([tag[0] for tag in segment['audio tags']])
        at_lines.append(
            str(segment['time']['start']) + 's-' + str(segment['time']['end']) + 's: ' + tag_names + '\n'
        )
    at_output = "".join(at_lines)
    print(at_output)
    return asr_output, at_output
|
|
|
|
|
35 |
|
36 |
iface = gr.Interface(fn=predict,
|
37 |
+
inputs=[gr.Audio(type="filepath", source='microphone'), gr.Audio(type="filepath"), gr.Textbox(value='10', label='Time Resolution in Seconds (Must be must be an integer multiple of 0.4, e.g., 0.4, 2, 10)')],
|
38 |
outputs=[gr.Textbox(label="Speech Output"), gr.Textbox(label="Audio Tag Output")],
|
39 |
cache_examples=True,
|
40 |
title="Quick Demo of Whisper-AT",
|