Spaces:

yuangongfdu
/

whisper-at

Running

App Files Community

yuangongfdu committed on Sep 10, 2023

Commit

d45d3e7

•

1 Parent(s): cf1523c

Update app.py

Browse files

Files changed (1) hide show

app.py +2 -2

app.py CHANGED Viewed

@@ -34,7 +34,7 @@ def predict(audio_path_m, audio_path_t, model_size, language, top_k, threshold,
             result = model.transcribe(audio_path, at_time_res=audio_tagging_time_resolution)
         else:
             result = model.transcribe(audio_path, at_time_res=audio_tagging_time_resolution, language=lan_dict[language])
-        audio_tag_result = whisper.parse_at_label(result, language='follow_asr', top_k=top_k, p_threshold=threshold, include_class_list=list(range(527)))
         asr_output = ""
         for segment in result['segments']:
           asr_output = asr_output + format(segment['start'], ".1f") + 's-' + format(segment['end'], ".1f") + 's: ' + segment['text'] + '\n'
@@ -49,7 +49,7 @@ iface = gr.Interface(fn=predict,
                     inputs=[gr.Audio(type="filepath", source='microphone', label='Please either upload an audio file or record using the microphone.', show_label=True), gr.Audio(type="filepath"),
                             gr.Radio(["tiny", "tiny.en", "small", "large"], value='large', label="Model size", info="The larger the model, the better the performance and the slower the speed."),
                             gr.Radio(["Auto Detection", "English", "Chinese"], value='Auto Detection', label="Language", info="Please specify the language, or let the model detect it automatically"),
-                            gr.Slider(1, 10, value=5, label="Top-K", info="The max number of labels to predict."),
                             gr.Slider(-10, 0, value=-1, label="Prediction Threshold", info="The lower the threshold, the more predicted labels."),
                             gr.Textbox(value='10', label='Time Resolution in Seconds (Must be must be an integer multiple of 0.4, e.g., 0.4, 2, 10)')],
                     outputs=[gr.Textbox(label="Speech Output"), gr.Textbox(label="Audio Tag Output")],

             result = model.transcribe(audio_path, at_time_res=audio_tagging_time_resolution)
         else:
             result = model.transcribe(audio_path, at_time_res=audio_tagging_time_resolution, language=lan_dict[language])
+        audio_tag_result = whisper.parse_at_label(result, language='follow_asr', top_k=int(top_k), p_threshold=threshold, include_class_list=list(range(527)))
         asr_output = ""
         for segment in result['segments']:
           asr_output = asr_output + format(segment['start'], ".1f") + 's-' + format(segment['end'], ".1f") + 's: ' + segment['text'] + '\n'
                     inputs=[gr.Audio(type="filepath", source='microphone', label='Please either upload an audio file or record using the microphone.', show_label=True), gr.Audio(type="filepath"),
                             gr.Radio(["tiny", "tiny.en", "small", "large"], value='large', label="Model size", info="The larger the model, the better the performance and the slower the speed."),
                             gr.Radio(["Auto Detection", "English", "Chinese"], value='Auto Detection', label="Language", info="Please specify the language, or let the model detect it automatically"),
+                            gr.Slider(1, 10, value=5, step=1, label="Top-K", info="The max number of labels to predict."),
                             gr.Slider(-10, 0, value=-1, label="Prediction Threshold", info="The lower the threshold, the more predicted labels."),
                             gr.Textbox(value='10', label='Time Resolution in Seconds (Must be must be an integer multiple of 0.4, e.g., 0.4, 2, 10)')],
                     outputs=[gr.Textbox(label="Speech Output"), gr.Textbox(label="Audio Tag Output")],