Spaces:
Paused
Paused
Update app.py
Browse files
app.py
CHANGED
@@ -36,7 +36,6 @@ def preprocess_audio(audio_tensor, original_sample_rate, apply_normalization):
|
|
36 |
|
37 |
if apply_normalization:
|
38 |
audio_tensor = audio_tensor / torch.max(torch.abs(audio_tensor)) # Normalize
|
39 |
-
# audio_tensor = torch.clamp(audio_tensor, min=-1, max=1)
|
40 |
|
41 |
audio_tensor = torchaudio.functional.resample(audio_tensor, orig_freq=original_sample_rate, new_freq=16000) # Resample
|
42 |
return audio_tensor
|
@@ -52,13 +51,31 @@ def wiener_filter(audio_tensor):
|
|
52 |
return torch.tensor(filtered_audio, dtype=audio_tensor.dtype)
|
53 |
|
54 |
@spaces.GPU
|
55 |
-
def transcribe_speech(audio, progress=gr.Progress()):
|
56 |
if audio is None:
|
57 |
return "No audio received.", None
|
58 |
-
|
59 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
60 |
transcription = pipe(audio_np, chunk_length_s=10)['text']
|
61 |
-
|
|
|
|
|
|
|
|
|
|
|
62 |
|
63 |
def transcribe_from_youtube(url, apply_wiener_filter, apply_normalization, apply_spectral_gating, progress=gr.Progress()):
|
64 |
progress(0, "Downloading YouTube audio...")
|
@@ -70,20 +87,20 @@ def transcribe_from_youtube(url, apply_wiener_filter, apply_normalization, apply
|
|
70 |
stream.stream_to_buffer(audio_data)
|
71 |
audio_data.seek(0)
|
72 |
|
73 |
-
|
74 |
-
|
75 |
|
76 |
if apply_wiener_filter:
|
77 |
progress(0.4, "Applying Wiener filter...")
|
78 |
-
|
79 |
|
80 |
if apply_spectral_gating:
|
81 |
progress(0.4, "Applying Spectral Gating filter...")
|
82 |
-
|
83 |
|
84 |
-
transcription, _ = transcribe_speech(
|
85 |
|
86 |
-
audio_np =
|
87 |
sf.write("temp_audio.wav", audio_np, 16000, subtype='PCM_16')
|
88 |
|
89 |
except Exception as e:
|
@@ -116,7 +133,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
|
116 |
transcription_output = gr.Textbox(label="Transcription")
|
117 |
audio_output = gr.Audio(label="Processed Audio")
|
118 |
|
119 |
-
transcribe_button.click(fn=transcribe_speech, inputs=mic_audio, outputs=[transcription_output, audio_output])
|
120 |
|
121 |
with gr.Tab("YouTube URL"):
|
122 |
gr.Markdown("## Transcribe speech from YouTube video")
|
|
|
36 |
|
37 |
if apply_normalization:
|
38 |
audio_tensor = audio_tensor / torch.max(torch.abs(audio_tensor)) # Normalize
|
|
|
39 |
|
40 |
audio_tensor = torchaudio.functional.resample(audio_tensor, orig_freq=original_sample_rate, new_freq=16000) # Resample
|
41 |
return audio_tensor
|
|
|
51 |
return torch.tensor(filtered_audio, dtype=audio_tensor.dtype)
|
52 |
|
53 |
@spaces.GPU
def transcribe_speech(audio, apply_wiener_filter=False, apply_normalization=False, apply_spectral_gating=False, progress=gr.Progress()):
    """Transcribe an audio file and write the processed audio to disk.

    The audio is loaded, preprocessed (optionally peak-normalized, then
    resampled to 16 kHz by ``preprocess_audio``), optionally passed through
    the Wiener and spectral-gating filters, and finally fed to ``pipe``.

    Args:
        audio: Anything ``torchaudio.load`` accepts, or ``None`` when no
            recording was supplied.
        apply_wiener_filter: Run ``wiener_filter`` on the audio when true.
        apply_normalization: Forwarded to ``preprocess_audio``.
        apply_spectral_gating: Run ``spectral_gating`` on the audio when true.
        progress: Progress callback used to report each stage.

    Returns:
        A ``(transcription, audio_path)`` tuple. When ``audio`` is ``None``
        the tuple is ``("No audio received.", None)``; otherwise
        ``audio_path`` is ``"temp_audio.wav"``, the processed audio written
        as 16-bit PCM at 16 kHz.
    """
    if audio is None:
        return "No audio received.", None

    progress(0.1, desc="Preprocessing audio...")
    audio_tensor, original_sample_rate = torchaudio.load(audio)

    # torchaudio.load returns (channels, frames) by default. A multi-channel
    # recording would survive the later .squeeze() as a 2-D array, which the
    # transcription call and the (frames, channels) layout expected by
    # sf.write cannot handle, so mix down to a single channel up front.
    # NOTE(review): assumes `pipe` needs 1-D mono input -- confirm.
    if audio_tensor.dim() > 1 and audio_tensor.size(0) > 1:
        audio_tensor = audio_tensor.mean(dim=0, keepdim=True)

    audio_tensor = preprocess_audio(audio_tensor, original_sample_rate, apply_normalization)

    if apply_wiener_filter:
        progress(0.3, desc="Applying Wiener filter...")
        audio_tensor = wiener_filter(audio_tensor)

    if apply_spectral_gating:
        progress(0.5, desc="Applying Spectral Gating filter...")
        audio_tensor = spectral_gating(audio_tensor)

    progress(0.7, desc="Transcribing audio...")
    # Converted once and reused for both transcription and the WAV export.
    audio_np = audio_tensor.numpy().squeeze()
    transcription = pipe(audio_np, chunk_length_s=10)['text']
    # NOTE(review): presumably restores symbols substituted in the model's
    # output vocabulary -- verify against replace_symbols_back.
    transcription = replace_symbols_back(transcription)

    sf.write("temp_audio.wav", audio_np, 16000, subtype='PCM_16')

    return transcription, "temp_audio.wav"
|
79 |
|
80 |
def transcribe_from_youtube(url, apply_wiener_filter, apply_normalization, apply_spectral_gating, progress=gr.Progress()):
|
81 |
progress(0, "Downloading YouTube audio...")
|
|
|
87 |
stream.stream_to_buffer(audio_data)
|
88 |
audio_data.seek(0)
|
89 |
|
90 |
+
audio_tensor, original_sample_rate = torchaudio.load(audio_data)
|
91 |
+
audio_tensor = preprocess_audio(audio_tensor, original_sample_rate, apply_normalization)
|
92 |
|
93 |
if apply_wiener_filter:
|
94 |
progress(0.4, "Applying Wiener filter...")
|
95 |
+
audio_tensor = wiener_filter(audio_tensor)
|
96 |
|
97 |
if apply_spectral_gating:
|
98 |
progress(0.4, "Applying Spectral Gating filter...")
|
99 |
+
audio_tensor = spectral_gating(audio_tensor)
|
100 |
|
101 |
+
transcription, _ = transcribe_speech(audio_tensor)
|
102 |
|
103 |
+
audio_np = audio_tensor.numpy().squeeze()
|
104 |
sf.write("temp_audio.wav", audio_np, 16000, subtype='PCM_16')
|
105 |
|
106 |
except Exception as e:
|
|
|
133 |
transcription_output = gr.Textbox(label="Transcription")
|
134 |
audio_output = gr.Audio(label="Processed Audio")
|
135 |
|
136 |
+
transcribe_button.click(fn=transcribe_speech, inputs=[mic_audio], outputs=[transcription_output, audio_output])
|
137 |
|
138 |
with gr.Tab("YouTube URL"):
|
139 |
gr.Markdown("## Transcribe speech from YouTube video")
|