yuangongfdu committed on
Commit
d45d3e7
1 Parent(s): cf1523c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +2 -2
app.py CHANGED
@@ -34,7 +34,7 @@ def predict(audio_path_m, audio_path_t, model_size, language, top_k, threshold,
34
  result = model.transcribe(audio_path, at_time_res=audio_tagging_time_resolution)
35
  else:
36
  result = model.transcribe(audio_path, at_time_res=audio_tagging_time_resolution, language=lan_dict[language])
37
- audio_tag_result = whisper.parse_at_label(result, language='follow_asr', top_k=top_k, p_threshold=threshold, include_class_list=list(range(527)))
38
  asr_output = ""
39
  for segment in result['segments']:
40
  asr_output = asr_output + format(segment['start'], ".1f") + 's-' + format(segment['end'], ".1f") + 's: ' + segment['text'] + '\n'
@@ -49,7 +49,7 @@ iface = gr.Interface(fn=predict,
49
  inputs=[gr.Audio(type="filepath", source='microphone', label='Please either upload an audio file or record using the microphone.', show_label=True), gr.Audio(type="filepath"),
50
  gr.Radio(["tiny", "tiny.en", "small", "large"], value='large', label="Model size", info="The larger the model, the better the performance and the slower the speed."),
51
  gr.Radio(["Auto Detection", "English", "Chinese"], value='Auto Detection', label="Language", info="Please specify the language, or let the model detect it automatically"),
52
- gr.Slider(1, 10, value=5, label="Top-K", info="The max number of labels to predict."),
53
  gr.Slider(-10, 0, value=-1, label="Prediction Threshold", info="The lower the threshold, the more predicted labels."),
54
  gr.Textbox(value='10', label='Time Resolution in Seconds (Must be must be an integer multiple of 0.4, e.g., 0.4, 2, 10)')],
55
  outputs=[gr.Textbox(label="Speech Output"), gr.Textbox(label="Audio Tag Output")],
 
34
  result = model.transcribe(audio_path, at_time_res=audio_tagging_time_resolution)
35
  else:
36
  result = model.transcribe(audio_path, at_time_res=audio_tagging_time_resolution, language=lan_dict[language])
37
+ audio_tag_result = whisper.parse_at_label(result, language='follow_asr', top_k=int(top_k), p_threshold=threshold, include_class_list=list(range(527)))
38
  asr_output = ""
39
  for segment in result['segments']:
40
  asr_output = asr_output + format(segment['start'], ".1f") + 's-' + format(segment['end'], ".1f") + 's: ' + segment['text'] + '\n'
 
49
  inputs=[gr.Audio(type="filepath", source='microphone', label='Please either upload an audio file or record using the microphone.', show_label=True), gr.Audio(type="filepath"),
50
  gr.Radio(["tiny", "tiny.en", "small", "large"], value='large', label="Model size", info="The larger the model, the better the performance and the slower the speed."),
51
  gr.Radio(["Auto Detection", "English", "Chinese"], value='Auto Detection', label="Language", info="Please specify the language, or let the model detect it automatically"),
52
+ gr.Slider(1, 10, value=5, step=1, label="Top-K", info="The max number of labels to predict."),
53
  gr.Slider(-10, 0, value=-1, label="Prediction Threshold", info="The lower the threshold, the more predicted labels."),
54
  gr.Textbox(value='10', label='Time Resolution in Seconds (Must be must be an integer multiple of 0.4, e.g., 0.4, 2, 10)')],
55
  outputs=[gr.Textbox(label="Speech Output"), gr.Textbox(label="Audio Tag Output")],