Spaces:

yuangongfdu
/

whisper-at

Running

App Files Files Community

yuangongfdu committed on Jul 19, 2023

Commit

3d1236b

•

1 Parent(s): 3b082be

Update app.py

Browse files

Files changed (1) hide show

app.py +18 -20

app.py CHANGED Viewed

@@ -15,28 +15,26 @@ def round_time_resolution(time_resolution):
     rounded_time_resolution = rounded_multiple * 0.4
     return rounded_time_resolution
-def predict(audio_path, time_resolution):
-    """Run Whisper-AT on one recording and format both result streams as text.
-
-    Args:
-        audio_path: Path of the audio file handed to ``model.transcribe``.
-        time_resolution: Requested audio-tagging resolution; it is passed
-            through ``round_time_resolution`` before being used.
-
-    Returns:
-        A pair ``(asr_output, at_output)``: one line per ASR segment and one
-        line per audio-tagging segment, each prefixed with its time span.
-    """
-    audio_tagging_time_resolution = round_time_resolution(time_resolution)
-    result = model.transcribe(audio_path, at_time_res=audio_tagging_time_resolution)
-    # # ASR Results
-    # print(result["text"])
-    # # Audio Tagging Results
-    # NOTE(review): range(527) presumably selects every tag class the model
-    # knows about -- confirm against the whisper-at documentation.
-    audio_tag_result = whisper.parse_at_label(result, language='follow_asr', top_k=5, p_threshold=-1, include_class_list=list(range(527)))
-    # print(audio_tag_result)
-    # One "<start>s-<end>s: <text>" line per transcription segment.
-    asr_output = ""
-    for segment in result['segments']:
-      asr_output = asr_output + str(segment['start']) + 's-' + str(segment['end']) + 's: ' + segment['text'] + '\n'
-    # One "<start>s-<end>s: <tag>,<tag>,..." line per tagging segment; only
-    # the first element of each 'audio tags' entry is kept.
-    at_output = ""
-    for segment in audio_tag_result:
-        print(segment)
-        at_output = at_output + str(segment['time']['start']) + 's-' + str(segment['time']['end']) + 's: ' + ','.join([x[0] for x in segment['audio tags']]) + '\n'
-        # Prints the whole accumulated text on every iteration (debug output).
-        print(at_output)
-    #asr_output, at_output = str(result["text"]), 'k'
-    return asr_output, at_output
 iface = gr.Interface(fn=predict,
-                    inputs=[gr.Audio(type="filepath", source='microphone'), gr.Textbox(value='10', label='Time Resolution in Seconds (Must be must be an integer multiple of 0.4, e.g., 0.4, 2, 10)')],
                     outputs=[gr.Textbox(label="Speech Output"), gr.Textbox(label="Audio Tag Output")],
                     cache_examples=True,
                     title="Quick Demo of Whisper-AT",

     rounded_time_resolution = rounded_multiple * 0.4
     return rounded_time_resolution
+def predict(audio_path_m, audio_path_t, time_resolution):
+    """Transcribe and audio-tag one recording for the demo interface.
+
+    Exactly one of the two audio inputs must be supplied. If both or neither
+    are given, a usage message is returned in both output slots and the model
+    is not called.
+
+    Args:
+        audio_path_m: File path from the microphone input, or ``None``.
+        audio_path_t: File path from the second audio input, or ``None``.
+        time_resolution: Requested audio-tagging resolution; it is passed
+            through ``round_time_resolution`` before being used.
+
+    Returns:
+        A pair ``(asr_output, at_output)``: one line per ASR segment and one
+        line per audio-tagging segment, each prefixed with its time span.
+        On invalid input, the same usage message twice.
+    """
+    # The two "is None" flags are equal exactly when both inputs are missing
+    # or both are present -- the cases to reject. The previous "!=" test
+    # rejected the valid single-input case instead.
+    if (audio_path_m is None) == (audio_path_t is None):
+        message = "Please only upload one recording, either upload it or record using microphone."
+        return message, message
+
+    # Exactly one path is set here; pick it by identity rather than truthiness.
+    audio_path = audio_path_m if audio_path_m is not None else audio_path_t
+    audio_tagging_time_resolution = round_time_resolution(time_resolution)
+    result = model.transcribe(audio_path, at_time_res=audio_tagging_time_resolution)
+    audio_tag_result = whisper.parse_at_label(result, language='follow_asr', top_k=5, p_threshold=-1, include_class_list=list(range(527)))
+
+    # One "<start>s-<end>s: <text>" line per transcription segment.
+    asr_output = ''.join(
+        f"{segment['start']}s-{segment['end']}s: {segment['text']}\n"
+        for segment in result['segments']
+    )
+
+    # One "<start>s-<end>s: <tag>,<tag>,..." line per tagging segment; only
+    # the first element of each 'audio tags' entry is kept.
+    at_output = ""
+    for segment in audio_tag_result:
+        print(segment)
+        tags = ','.join(x[0] for x in segment['audio tags'])
+        at_output += f"{segment['time']['start']}s-{segment['time']['end']}s: {tags}\n"
+        print(at_output)
+    return asr_output, at_output
 iface = gr.Interface(fn=predict,
+                    inputs=[gr.Audio(type="filepath", source='microphone'), gr.Audio(type="filepath"), gr.Textbox(value='10', label='Time Resolution in Seconds (Must be must be an integer multiple of 0.4, e.g., 0.4, 2, 10)')],
                     outputs=[gr.Textbox(label="Speech Output"), gr.Textbox(label="Audio Tag Output")],
                     cache_examples=True,
                     title="Quick Demo of Whisper-AT",