Spaces:

anzorq
/

w2v-bert-2.0-kbd

Paused

App Files Community

anzorq committed on May 22

Commit

800e3a8

•

1 Parent(s): 7cdf3f3

Update app.py

Browse files

Files changed (1) hide show

app.py +12 -20

app.py CHANGED Viewed

@@ -51,32 +51,22 @@ def wiener_filter(audio_tensor):
     return torch.tensor(filtered_audio, dtype=audio_tensor.dtype)
 @spaces.GPU
-def transcribe_speech(audio, apply_wiener_filter=False, apply_normalization=False, apply_spectral_gating=False, progress=gr.Progress()):
-    if audio is None:
         return "No audio received.", None
     progress(0.1, desc="Preprocessing audio...")
-    audio_tensor, original_sample_rate = torchaudio.load(audio)
     audio_tensor = preprocess_audio(audio_tensor, original_sample_rate, apply_normalization)
-    if apply_wiener_filter:
-        progress(0.3, desc="Applying Wiener filter...")
-        audio_tensor = wiener_filter(audio_tensor)
-    if apply_spectral_gating:
-        progress(0.5, desc="Applying Spectral Gating filter...")
-        audio_tensor = spectral_gating(audio_tensor)
     progress(0.7, desc="Transcribing audio...")
     audio_np = audio_tensor.numpy().squeeze()
     transcription = pipe(audio_np, chunk_length_s=10)['text']
     transcription = replace_symbols_back(transcription)
-    audio_np = audio_tensor.numpy().squeeze()
-    sf.write("temp_audio.wav", audio_np, 16000, subtype='PCM_16')
-    return transcription, "temp_audio.wav"
 def transcribe_from_youtube(url, apply_wiener_filter, apply_normalization, apply_spectral_gating, progress=gr.Progress()):
     progress(0, "Downloading YouTube audio...")
@@ -95,10 +85,13 @@ def transcribe_from_youtube(url, apply_wiener_filter, apply_normalization, apply
             audio_tensor = wiener_filter(audio_tensor)
         if apply_spectral_gating:
-            progress(0.4, "Applying Spectral Gating filter...")
             audio_tensor = spectral_gating(audio_tensor)
-        transcription, _ = transcribe_speech(audio_tensor)
         audio_np = audio_tensor.numpy().squeeze()
         sf.write("temp_audio.wav", audio_np, 16000, subtype='PCM_16')
@@ -106,7 +99,7 @@ def transcribe_from_youtube(url, apply_wiener_filter, apply_normalization, apply
     except Exception as e:
         return str(e), None
-    return transcription, "temp_audio.wav"
 def populate_metadata(url):
     yt = YouTube(url)
@@ -131,9 +124,8 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
         mic_audio = gr.Audio(sources=['microphone','upload'], type="filepath", label="Record or upload an audio")
         transcribe_button = gr.Button("Transcribe")
         transcription_output = gr.Textbox(label="Transcription")
-        audio_output = gr.Audio(label="Processed Audio")
-        transcribe_button.click(fn=transcribe_speech, inputs=[mic_audio], outputs=[transcription_output, audio_output])
     with gr.Tab("YouTube URL"):
         gr.Markdown("## Transcribe speech from YouTube video")

     return torch.tensor(filtered_audio, dtype=audio_tensor.dtype)
 @spaces.GPU
+def transcribe_speech(audio_path, progress=gr.Progress()):
+    if audio_path is None:
         return "No audio received.", None
     progress(0.1, desc="Preprocessing audio...")
+    audio_tensor, original_sample_rate = torchaudio.load(audio_path)
     audio_tensor = preprocess_audio(audio_tensor, original_sample_rate, apply_normalization)
     progress(0.7, desc="Transcribing audio...")
     audio_np = audio_tensor.numpy().squeeze()
     transcription = pipe(audio_np, chunk_length_s=10)['text']
     transcription = replace_symbols_back(transcription)
+    return transcription
+@spaces.GPU
 def transcribe_from_youtube(url, apply_wiener_filter, apply_normalization, apply_spectral_gating, progress=gr.Progress()):
     progress(0, "Downloading YouTube audio...")
             audio_tensor = wiener_filter(audio_tensor)
         if apply_spectral_gating:
+            progress(0.6, "Applying Spectral Gating filter...")
             audio_tensor = spectral_gating(audio_tensor)
+        progress(0.8, "Transcribing audio...")
+        audio_np = audio_tensor.numpy().squeeze()
+        transcription = pipe(audio_np, chunk_length_s=10)['text']
+        transcription = replace_symbols_back(transcription)
         audio_np = audio_tensor.numpy().squeeze()
         sf.write("temp_audio.wav", audio_np, 16000, subtype='PCM_16')
     except Exception as e:
         return str(e), None
+    return transcription, "temp_audio.wav"
 def populate_metadata(url):
     yt = YouTube(url)
         mic_audio = gr.Audio(sources=['microphone','upload'], type="filepath", label="Record or upload an audio")
         transcribe_button = gr.Button("Transcribe")
         transcription_output = gr.Textbox(label="Transcription")
+        transcribe_button.click(fn=transcribe_speech, inputs=[mic_audio], outputs=[transcription_output])
     with gr.Tab("YouTube URL"):
         gr.Markdown("## Transcribe speech from YouTube video")