MikeTangoEcho committed on
Commit
c092255
1 Parent(s): 75b7975

fix: app.py

Browse files
Files changed (1) hide show
  1. app.py +10 -1
app.py CHANGED
@@ -57,7 +57,16 @@ def transcribe(audio: str | Path | bytes | tuple[int, np.ndarray] | None):
57
  # https://huggingface.co/docs/transformers/main_classes/pipelines#transformers.AutomaticSpeechRecognitionPipeline.__call__
58
  # Whisper input format for tuple differ from output provided by gradio audio component
59
  if asr_model.startswith("openai/whisper"):
60
- inputs = {"sampling_rate": audio[0], "raw": audio[1]} if type(audio) is tuple else audio
 
 
 
 
 
 
 
 
 
61
  transcript = asr(inputs)
62
  text = transcript['text']
63
 
 
57
  # https://huggingface.co/docs/transformers/main_classes/pipelines#transformers.AutomaticSpeechRecognitionPipeline.__call__
58
  # Whisper input format for tuple differ from output provided by gradio audio component
59
  if asr_model.startswith("openai/whisper"):
60
+ sampling_rate, raw = audio
61
+
62
+ # Convert to mono if stereo
63
+ if raw.ndim > 1:
64
+ raw = raw.mean(axis=1)
65
+
66
+ raw = raw.astype(np.float32)
67
+ raw /= np.max(np.abs(raw))
68
+
69
+ inputs = {"sampling_rate": sampling_rate, "raw": raw} if type(audio) is tuple else audio
70
  transcript = asr(inputs)
71
  text = transcript['text']
72