Spaces:
Running
Running
yuangongfdu
committed on
Commit
•
d45d3e7
1
Parent(s):
cf1523c
Update app.py
Browse files
app.py
CHANGED
@@ -34,7 +34,7 @@ def predict(audio_path_m, audio_path_t, model_size, language, top_k, threshold,
|
|
34 |
result = model.transcribe(audio_path, at_time_res=audio_tagging_time_resolution)
|
35 |
else:
|
36 |
result = model.transcribe(audio_path, at_time_res=audio_tagging_time_resolution, language=lan_dict[language])
|
37 |
-
audio_tag_result = whisper.parse_at_label(result, language='follow_asr', top_k=top_k, p_threshold=threshold, include_class_list=list(range(527)))
|
38 |
asr_output = ""
|
39 |
for segment in result['segments']:
|
40 |
asr_output = asr_output + format(segment['start'], ".1f") + 's-' + format(segment['end'], ".1f") + 's: ' + segment['text'] + '\n'
|
@@ -49,7 +49,7 @@ iface = gr.Interface(fn=predict,
|
|
49 |
inputs=[gr.Audio(type="filepath", source='microphone', label='Please either upload an audio file or record using the microphone.', show_label=True), gr.Audio(type="filepath"),
|
50 |
gr.Radio(["tiny", "tiny.en", "small", "large"], value='large', label="Model size", info="The larger the model, the better the performance and the slower the speed."),
|
51 |
gr.Radio(["Auto Detection", "English", "Chinese"], value='Auto Detection', label="Language", info="Please specify the language, or let the model detect it automatically"),
|
52 |
-
gr.Slider(1, 10, value=5, label="Top-K", info="The max number of labels to predict."),
|
53 |
gr.Slider(-10, 0, value=-1, label="Prediction Threshold", info="The lower the threshold, the more predicted labels."),
|
54 |
gr.Textbox(value='10', label='Time Resolution in Seconds (Must be must be an integer multiple of 0.4, e.g., 0.4, 2, 10)')],
|
55 |
outputs=[gr.Textbox(label="Speech Output"), gr.Textbox(label="Audio Tag Output")],
|
|
|
34 |
result = model.transcribe(audio_path, at_time_res=audio_tagging_time_resolution)
|
35 |
else:
|
36 |
result = model.transcribe(audio_path, at_time_res=audio_tagging_time_resolution, language=lan_dict[language])
|
37 |
+
audio_tag_result = whisper.parse_at_label(result, language='follow_asr', top_k=int(top_k), p_threshold=threshold, include_class_list=list(range(527)))
|
38 |
asr_output = ""
|
39 |
for segment in result['segments']:
|
40 |
asr_output = asr_output + format(segment['start'], ".1f") + 's-' + format(segment['end'], ".1f") + 's: ' + segment['text'] + '\n'
|
|
|
49 |
inputs=[gr.Audio(type="filepath", source='microphone', label='Please either upload an audio file or record using the microphone.', show_label=True), gr.Audio(type="filepath"),
|
50 |
gr.Radio(["tiny", "tiny.en", "small", "large"], value='large', label="Model size", info="The larger the model, the better the performance and the slower the speed."),
|
51 |
gr.Radio(["Auto Detection", "English", "Chinese"], value='Auto Detection', label="Language", info="Please specify the language, or let the model detect it automatically"),
|
52 |
+
gr.Slider(1, 10, value=5, step=1, label="Top-K", info="The max number of labels to predict."),
|
53 |
gr.Slider(-10, 0, value=-1, label="Prediction Threshold", info="The lower the threshold, the more predicted labels."),
|
54 |
gr.Textbox(value='10', label='Time Resolution in Seconds (Must be must be an integer multiple of 0.4, e.g., 0.4, 2, 10)')],
|
55 |
outputs=[gr.Textbox(label="Speech Output"), gr.Textbox(label="Audio Tag Output")],
|