import spaces
import gradio as gr
import torch
import torchaudio
from transformers import pipeline
from pytube import YouTube
import re
import numpy as np
from scipy.signal import wiener
from io import BytesIO
import noisereduce as nr
import soundfile as sf
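# ASR pipeline: a Wav2Vec2-BERT model fine-tuned for Kabardian speech recognition.
# device=0 runs inference on the first GPU.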
pipe = pipeline(model="anzorq/w2v-bert-2.0-kbd-v2", device=0)
# Map Kabardian digraphs/trigraphs to the single phonetic symbols used by the model's vocabulary
replacements = [
    ('гъ', 'ɣ'), ('дж', 'j'), ('дз', 'ӡ'), ('жь', 'ʐ'), ('кӏ', 'қ'),
    ('кхъ', 'qҳ'), ('къ', 'q'), ('лъ', 'ɬ'), ('лӏ', 'ԯ'), ('пӏ', 'ԥ'),
    ('тӏ', 'ҭ'), ('фӏ', 'ჶ'), ('хь', 'h'), ('хъ', 'ҳ'), ('цӏ', 'ҵ'),
    ('щӏ', 'ɕ'), ('я', 'йа')
]
# Reverse mapping to restore the standard orthography after transcription
reverse_replacements = {v: k for k, v in replacements}
reverse_pattern = re.compile('|'.join(re.escape(key) for key in reverse_replacements))
def replace_symbols_back(text):
    return reverse_pattern.sub(lambda match: reverse_replacements[match.group(0)], text)
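# Illustrative round-trip (a hypothetical model output, not a fixed test case):
# the model emits single-symbol phones such as 'бɣэ', which maps back to the
# standard orthography 'бгъэ'. Because 'qҳ' precedes 'q' and 'ҳ' in the
# alternation above, the trigraph 'кхъ' is restored before its substrings.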
def preprocess_audio(audio_tensor, original_sample_rate, apply_normalization):
    audio_tensor = audio_tensor.to(dtype=torch.float32)
    audio_tensor = torch.mean(audio_tensor, dim=0, keepdim=True)  # Convert to mono
    if apply_normalization:
        audio_tensor = audio_tensor / torch.max(torch.abs(audio_tensor))  # Peak-normalize to [-1, 1]
    audio_tensor = torchaudio.functional.resample(audio_tensor, orig_freq=original_sample_rate, new_freq=16000)  # Resample to 16 kHz
    return audio_tensor
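# Shape sketch: a 10 s stereo 44.1 kHz clip of shape (2, 441000) comes out as a
# mono 16 kHz tensor of shape (1, 160000), matching the model's expected input rate.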
def spectral_gating(audio_tensor):
    audio_data = audio_tensor.numpy()
    reduced_noise = nr.reduce_noise(y=audio_data, sr=16_000)
    return torch.tensor(reduced_noise, dtype=audio_tensor.dtype)
def wiener_filter(audio_tensor):
    audio_data = audio_tensor.numpy()
    filtered_audio = wiener(audio_data)
    return torch.tensor(filtered_audio, dtype=audio_tensor.dtype)
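# Both denoisers operate on the resampled 16 kHz mono tensor: noisereduce performs
# spectral gating against an estimated noise profile, while scipy.signal.wiener
# applies a local-statistics Wiener filter. The YouTube tab lets users combine them.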
@spaces.GPU
def transcribe_speech(audio_path, progress=gr.Progress()):
    if audio_path is None:
        return "No audio received."

    progress(0.1, desc="Preprocessing audio...")
    audio_tensor, original_sample_rate = torchaudio.load(audio_path)
    # Microphone/upload input is transcribed as-is; normalization is only exposed in the YouTube tab.
    audio_tensor = preprocess_audio(audio_tensor, original_sample_rate, apply_normalization=False)

    progress(0.7, desc="Transcribing audio...")
    audio_np = audio_tensor.numpy().squeeze()
    transcription = pipe(audio_np, chunk_length_s=10)['text']
    transcription = replace_symbols_back(transcription)

    return transcription
@spaces.GPU
def transcribe_from_youtube(url, apply_wiener_filter, apply_normalization, apply_spectral_gating, progress=gr.Progress()):
    progress(0, desc="Downloading YouTube audio...")
    try:
        yt = YouTube(url)
        stream = yt.streams.filter(only_audio=True).first()
        audio_data = BytesIO()
        stream.stream_to_buffer(audio_data)
        audio_data.seek(0)

        audio_tensor, original_sample_rate = torchaudio.load(audio_data)
        audio_tensor = preprocess_audio(audio_tensor, original_sample_rate, apply_normalization)

        if apply_wiener_filter:
            progress(0.4, desc="Applying Wiener filter...")
            audio_tensor = wiener_filter(audio_tensor)

        if apply_spectral_gating:
            progress(0.6, desc="Applying Spectral Gating filter...")
            audio_tensor = spectral_gating(audio_tensor)

        progress(0.8, desc="Transcribing audio...")
        audio_np = audio_tensor.numpy().squeeze()
        transcription = pipe(audio_np, chunk_length_s=10)['text']
        transcription = replace_symbols_back(transcription)

        # Save the processed audio so the user can hear exactly what was transcribed
        sf.write("temp_audio.wav", audio_np, 16000, subtype='PCM_16')
    except Exception as e:
        return str(e), None

    return transcription, "temp_audio.wav"
def populate_metadata(url):
    yt = YouTube(url)
    return yt.thumbnail_url, yt.title
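# Note: populate_metadata is bound to youtube_url.change below, so it fires on
# every edit to the URL box; pytube raises on partial or invalid URLs until a
# complete video URL has been entered.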
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.HTML(
        """
        <div style="text-align: center; max-width: 500px; margin: 0 auto;">
          <div>
            <h1>Kabardian Speech Transcription</h1>
          </div>
          <p style="margin-bottom: 10px; font-size: 94%">
            Kabardian speech-to-text transcription using a fine-tuned Wav2Vec2-BERT model
          </p>
        </div>
        """
    )
with gr.Tab("Microphone Input"):
gr.Markdown("## Transcribe speech from microphone")
mic_audio = gr.Audio(sources=['microphone','upload'], type="filepath", label="Record or upload an audio")
transcribe_button = gr.Button("Transcribe")
transcription_output = gr.Textbox(label="Transcription")
transcribe_button.click(fn=transcribe_speech, inputs=[mic_audio], outputs=[transcription_output])
with gr.Tab("YouTube URL"):
gr.Markdown("## Transcribe speech from YouTube video")
youtube_url = gr.Textbox(label="Enter YouTube video URL")
with gr.Accordion("Audio Improvements", open=False):
apply_normalization = gr.Checkbox(label="Normalize audio volume", value=False)
apply_spectral_gating = gr.Checkbox(label="Apply Spectral Gating filter", info="Noise reduction", value=False)
apply_wiener = gr.Checkbox(label="Apply Wiener filter", info="Noise reduction", value=False)
with gr.Row():
img = gr.Image(label="Thumbnail", height=240, width=240, scale=1)
title = gr.Label(label="Video Title", scale=2)
transcribe_button = gr.Button("Transcribe")
transcription_output = gr.Textbox(label="Transcription", placeholder="Transcription Output", lines=10)
audio_output = gr.Audio(label="Processed Audio")
transcribe_button.click(fn=transcribe_from_youtube, inputs=[youtube_url, apply_wiener, apply_normalization, apply_spectral_gating], outputs=[transcription_output, audio_output])
youtube_url.change(populate_metadata, inputs=[youtube_url], outputs=[img, title])
demo.launch()