yuangongfdu commited on
Commit
3d1236b
1 Parent(s): 3b082be

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +18 -20
app.py CHANGED
@@ -15,28 +15,26 @@ def round_time_resolution(time_resolution):
15
  rounded_time_resolution = rounded_multiple * 0.4
16
  return rounded_time_resolution
17
 
18
def predict(audio_path, time_resolution):
    """Run Whisper-AT on one recording: speech transcription plus audio tagging.

    :param audio_path: path to the input audio file.
    :param time_resolution: requested audio-tagging window in seconds; snapped
        to a multiple of 0.4 via ``round_time_resolution``.
    :return: tuple ``(asr_output, at_output)`` — newline-separated, timestamped
        transcript lines and audio-tag lines.
    """
    at_res = round_time_resolution(time_resolution)
    result = model.transcribe(audio_path, at_time_res=at_res)
    audio_tag_result = whisper.parse_at_label(
        result, language='follow_asr', top_k=5, p_threshold=-1,
        include_class_list=list(range(527)))

    # One "<start>s-<end>s: <text>" line per ASR segment.
    asr_lines = [
        str(seg['start']) + 's-' + str(seg['end']) + 's: ' + seg['text'] + '\n'
        for seg in result['segments']
    ]
    asr_output = ''.join(asr_lines)

    # One "<start>s-<end>s: <tag,tag,...>" line per tagging window.
    at_output = ""
    for seg in audio_tag_result:
        print(seg)
        tag_names = ','.join([x[0] for x in seg['audio tags']])
        at_output += str(seg['time']['start']) + 's-' + str(seg['time']['end']) + 's: ' + tag_names + '\n'
    print(at_output)
    return asr_output, at_output
37
 
38
  iface = gr.Interface(fn=predict,
39
- inputs=[gr.Audio(type="filepath", source='microphone'), gr.Textbox(value='10', label='Time Resolution in Seconds (Must be must be an integer multiple of 0.4, e.g., 0.4, 2, 10)')],
40
  outputs=[gr.Textbox(label="Speech Output"), gr.Textbox(label="Audio Tag Output")],
41
  cache_examples=True,
42
  title="Quick Demo of Whisper-AT",
 
15
  rounded_time_resolution = rounded_multiple * 0.4
16
  return rounded_time_resolution
17
 
18
def predict(audio_path_m, audio_path_t, time_resolution):
    """Transcribe speech and tag audio events for a single recording.

    Exactly one of the two audio inputs (microphone recording or file upload)
    must be provided; supplying both, or neither, returns an explanatory
    message in both output slots instead of running the model.

    :param audio_path_m: path to a microphone recording, or None.
    :param audio_path_t: path to an uploaded audio file, or None.
    :param time_resolution: requested audio-tagging window in seconds; snapped
        to a multiple of 0.4 via ``round_time_resolution``.
    :return: tuple ``(asr_output, at_output)`` — newline-separated, timestamped
        transcript lines and audio-tag lines, or a pair of identical
        user-facing messages when the inputs are invalid.
    """
    # BUG FIX: the original guard `(audio_path_m is None) != (audio_path_t is None)`
    # is an XOR that fired when exactly ONE input was given — the valid case —
    # while both-given fell through to transcription of an arbitrary input and
    # neither-given crashed on model.transcribe(None).
    if audio_path_m is not None and audio_path_t is not None:
        msg = 'Please only upload one recording, either upload it or record using microphone.'
        return msg, msg
    if audio_path_m is None and audio_path_t is None:
        msg = 'Please provide a recording, either upload a file or record using the microphone.'
        return msg, msg

    audio_path = audio_path_m or audio_path_t
    audio_tagging_time_resolution = round_time_resolution(time_resolution)
    result = model.transcribe(audio_path, at_time_res=audio_tagging_time_resolution)
    audio_tag_result = whisper.parse_at_label(result, language='follow_asr', top_k=5, p_threshold=-1, include_class_list=list(range(527)))

    # Build "<start>s-<end>s: <text>" lines for the ASR transcript.
    asr_output = ""
    for segment in result['segments']:
        asr_output = asr_output + str(segment['start']) + 's-' + str(segment['end']) + 's: ' + segment['text'] + '\n'

    # Build "<start>s-<end>s: <tag,tag,...>" lines for the audio tags.
    at_output = ""
    for segment in audio_tag_result:
        print(segment)
        at_output = at_output + str(segment['time']['start']) + 's-' + str(segment['time']['end']) + 's: ' + ','.join([x[0] for x in segment['audio tags']]) + '\n'
    print(at_output)
    return asr_output, at_output
 
 
35
 
36
  iface = gr.Interface(fn=predict,
37
+ inputs=[gr.Audio(type="filepath", source='microphone'), gr.Audio(type="filepath"), gr.Textbox(value='10', label='Time Resolution in Seconds (Must be must be an integer multiple of 0.4, e.g., 0.4, 2, 10)')],
38
  outputs=[gr.Textbox(label="Speech Output"), gr.Textbox(label="Audio Tag Output")],
39
  cache_examples=True,
40
  title="Quick Demo of Whisper-AT",