anzorq committed on
Commit
550d732
1 Parent(s): 6fd478d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +19 -9
app.py CHANGED
@@ -17,22 +17,23 @@ def transcribe_speech(audio):
17
  waveform, sr = torchaudio.load(audio)
18
 
19
  # Resample the audio if needed
20
- resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)
21
- waveform = resampler(waveform)
 
22
 
23
  # Convert to mono if needed
24
  if waveform.dim() > 1:
25
- waveform = torch.mean(waveform, dim=0)
26
 
27
  # Normalize the audio
28
  waveform = waveform / torch.max(torch.abs(waveform))
29
 
30
  # Extract input features
31
- input_features = processor(waveform.unsqueeze(0), sampling_rate=16000).input_features
32
- input_features = torch.from_numpy(input_features).to(device)
33
-
34
- # Generate logits using the model
35
  with torch.no_grad():
 
 
 
 
36
  logits = model(input_features).logits
37
 
38
  # Decode the predicted ids to text
@@ -44,7 +45,7 @@ def transcribe_speech(audio):
44
  @spaces.GPU
45
  def transcribe_from_youtube(url):
46
  # Download audio from YouTube using yt-dlp
47
- audio_path = "downloaded_audio.wav"
48
  ydl_opts = {
49
  'format': 'bestaudio/best',
50
  'outtmpl': audio_path,
@@ -60,8 +61,17 @@ def transcribe_from_youtube(url):
60
  with yt_dlp.YoutubeDL(ydl_opts) as ydl:
61
  ydl.download([url])
62
 
 
 
 
 
63
  # Transcribe the downloaded audio
64
- return transcribe_speech(audio_path)
 
 
 
 
 
65
 
66
  with gr.Blocks() as demo:
67
  with gr.Tab("Microphone Input"):
 
17
  waveform, sr = torchaudio.load(audio)
18
 
19
  # Resample the audio if needed
20
+ if sr != 16000:
21
+ resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)
22
+ waveform = resampler(waveform)
23
 
24
  # Convert to mono if needed
25
  if waveform.dim() > 1:
26
+ waveform = torchaudio.transforms.DownmixMono()(waveform)
27
 
28
  # Normalize the audio
29
  waveform = waveform / torch.max(torch.abs(waveform))
30
 
31
  # Extract input features
 
 
 
 
32
  with torch.no_grad():
33
+ input_features = processor(waveform.unsqueeze(0), sampling_rate=16000).input_features
34
+ input_features = torch.from_numpy(input_features).to(device)
35
+
36
+ # Generate logits using the model
37
  logits = model(input_features).logits
38
 
39
  # Decode the predicted ids to text
 
45
  @spaces.GPU
46
  def transcribe_from_youtube(url):
47
  # Download audio from YouTube using yt-dlp
48
+ audio_path = f"downloaded_audio_{url.split('=')[-1]}.wav"
49
  ydl_opts = {
50
  'format': 'bestaudio/best',
51
  'outtmpl': audio_path,
 
61
  with yt_dlp.YoutubeDL(ydl_opts) as ydl:
62
  ydl.download([url])
63
 
64
+ # # Check if the file exists
65
+ # if not os.path.exists(audio_path):
66
+ # raise FileNotFoundError(f"Failed to find the audio file {audio_path}")
67
+
68
  # Transcribe the downloaded audio
69
+ transcription = transcribe_speech(audio_path)
70
+
71
+ # Optionally, clean up the downloaded file
72
+ os.remove(audio_path)
73
+
74
+ return transcription
75
 
76
  with gr.Blocks() as demo:
77
  with gr.Tab("Microphone Input"):