import spaces  # HF Spaces GPU helper; imported before torch, as ZeroGPU requires
import gradio as gr
import torch
import torchaudio
from transformers import pipeline
from pytube import YouTube
import re
from scipy.signal import wiener
from io import BytesIO

# Load the fine-tuned Wav2Vec2-BERT ASR pipeline on the GPU
pipe = pipeline(model="anzorq/w2v-bert-2.0-kbd-v2", device=0)
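
# Note: device=0 assumes a CUDA GPU. A CPU-only fallback would look like this
# (a sketch, using the standard transformers convention of device=-1 for CPU):
#   pipe = pipeline(model="anzorq/w2v-bert-2.0-kbd-v2", device=-1)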

# Define the replacements for Kabardian transcription
replacements = [
    ('гъ', 'ɣ'), ('дж', 'j'), ('дз', 'ӡ'), ('жь', 'ʐ'), ('кӏ', 'қ'),
    ('кхъ', 'qҳ'), ('къ', 'q'), ('лъ', 'ɬ'), ('лӏ', 'ԯ'), ('пӏ', 'ԥ'),
    ('тӏ', 'ҭ'), ('фӏ', 'ჶ'), ('хь', 'h'), ('хъ', 'ҳ'), ('цӏ', 'ҵ'),
    ('щӏ', 'ɕ'), ('я', 'йа')
]

# Reverse replacements for mapping model output back to Kabardian orthography
reverse_replacements = {v: k for k, v in replacements}
reverse_pattern = re.compile('|'.join(re.escape(key) for key in reverse_replacements))

def replace_symbols_back(text):
    return reverse_pattern.sub(lambda match: reverse_replacements[match.group(0)], text)
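
# Quick sanity check (illustrative, not part of the app): the alternation in
# reverse_pattern tries keys in dict insertion order, so the two-character
# key "qҳ" (-> "кхъ") is matched before the shorter "q" (-> "къ"):
#   replace_symbols_back("qҳ") == "кхъ"
#   replace_symbols_back("q")  == "къ"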

def preprocess_audio(audio_tensor, original_sample_rate):
    audio_tensor = audio_tensor.to(dtype=torch.float32)
    audio_tensor = torch.mean(audio_tensor, dim=0, keepdim=True)  # Downmix to mono
    audio_tensor = audio_tensor / torch.max(torch.abs(audio_tensor)).clamp(min=1e-8)  # Peak-normalize (guard against silent input)
    audio_tensor = torchaudio.functional.resample(audio_tensor, orig_freq=original_sample_rate, new_freq=16000)  # Resample to the model's 16 kHz
    return audio_tensor
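
# Example usage (a sketch; "sample.wav" is a hypothetical local file):
#   waveform, sr = torchaudio.load("sample.wav")
#   waveform = preprocess_audio(waveform, sr)  # mono, normalized, 16 kHz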

def apply_wiener_filter(audio_tensor):
    # scipy's wiener() operates on numpy arrays; filter the squeezed 1-D signal
    # rather than the (1, N) tensor so the window is applied along time only
    audio_data = audio_tensor.squeeze(0).numpy()
    filtered_audio = wiener(audio_data)
    return torch.tensor(filtered_audio, dtype=audio_tensor.dtype).unsqueeze(0)
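
# scipy.signal.wiener defaults to a 3-sample window; a larger window smooths
# more aggressively. Sketch of an explicit window size (mysize is a real
# scipy parameter; 29 is an arbitrary illustrative value, not the app's setting):
#   filtered = wiener(audio_tensor.squeeze(0).numpy(), mysize=29)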

def transcribe_speech(audio, progress=gr.Progress()):
    if audio is None:
        return "No audio received."
    progress(0.5, desc="Transcribing audio...")
    if isinstance(audio, str):
        # The microphone/upload component passes a filepath; load and preprocess
        # it here (the YouTube path passes an already-preprocessed tensor)
        waveform, sample_rate = torchaudio.load(audio)
        audio = preprocess_audio(waveform, sample_rate)
    audio_np = audio.numpy().squeeze()
    transcription = pipe(audio_np, chunk_length_s=10)['text']
    return replace_symbols_back(transcription)
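
# The ASR pipeline also accepts stride_length_s to overlap chunk boundaries,
# which can reduce errors at the 10 s seams (a sketch; 2 s is an illustrative
# value, not the app's setting):
#   transcription = pipe(audio_np, chunk_length_s=10, stride_length_s=2)['text']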

def transcribe_from_youtube(url, apply_improvements, progress=gr.Progress()):
    progress(0, desc="Downloading YouTube audio...")
    try:
        yt = YouTube(url)
        stream = yt.streams.filter(only_audio=True).first()
        audio_data = BytesIO()
        stream.stream_to_buffer(audio_data)
        audio_data.seek(0)

        # Decode directly from the in-memory buffer
        audio, original_sample_rate = torchaudio.load(audio_data)
        audio = preprocess_audio(audio, original_sample_rate)

        if apply_improvements:
            progress(0.4, desc="Applying Wiener filter...")
            audio = apply_wiener_filter(audio)

        transcription = transcribe_speech(audio)
    except Exception as e:
        return str(e)

    return transcription

def populate_metadata(url):
    try:
        yt = YouTube(url)
        return yt.thumbnail_url, yt.title
    except Exception:  # .change() fires on partial URLs; fail quietly
        return None, None

with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.HTML(
        """
        <div style="text-align: center; max-width: 500px; margin: 0 auto;">
          <div>
            <h1>Kabardian Speech Transcription</h1>
          </div>
          <p style="margin-bottom: 10px; font-size: 94%">
            Kabardian speech-to-text transcription using a fine-tuned Wav2Vec2-BERT model
          </p>
        </div>
        """
    )
| with gr.Tab("Microphone Input"): | |
| gr.Markdown("## Transcribe speech from microphone") | |
| mic_audio = gr.Audio(sources=['microphone','upload'], type="filepath", label="Record or upload an audio") | |
| transcribe_button = gr.Button("Transcribe") | |
| transcription_output = gr.Textbox(label="Transcription") | |
| transcribe_button.click(fn=transcribe_speech, inputs=mic_audio, outputs=transcription_output) | |
| with gr.Tab("YouTube URL"): | |
| gr.Markdown("## Transcribe speech from YouTube video") | |
| youtube_url = gr.Textbox(label="Enter YouTube video URL") | |
| apply_improvements = gr.Checkbox(label="Apply Audio Improvements", value=True) | |
| with gr.Row(): | |
| img = gr.Image(label="Thumbnail", height=240, width=240, scale=1) | |
| title = gr.Label(label="Video Title", scale=2) | |
| transcribe_button = gr.Button("Transcribe") | |
| transcription_output = gr.Textbox(label="Transcription", placeholder="Transcription Output", lines=10) | |
| transcribe_button.click(fn=transcribe_from_youtube, inputs=[youtube_url, apply_improvements], outputs=transcription_output) | |
| youtube_url.change(populate_metadata, inputs=[youtube_url], outputs=[img, title]) | |
| demo.launch() |