import spaces import os import gradio as gr import torch import torchaudio from transformers import AutoModelForCTC, Wav2Vec2BertProcessor from pytube import YouTube from transformers import pipeline import re # pipe = pipeline(model="anzorq/w2v-bert-2.0-kbd", device=0) # old model pipe = pipeline(model="anzorq/w2v-bert-2.0-kbd-v2", device=0) # new model with a new tokenizer replacements = [ ('гъ', 'ɣ'), ('дж', 'j'), ('дз', 'ӡ'), ('жь', 'ʐ'), ('кӏ', 'қ'), ('кхъ', 'qҳ'), ('къ', 'q'), ('лъ', 'ɬ'), ('лӏ', 'ԯ'), ('пӏ', 'ԥ'), ('тӏ', 'ҭ'), ('фӏ', 'ჶ'), ('хь', 'h'), ('хъ', 'ҳ'), ('цӏ', 'ҵ'), ('щӏ', 'ɕ'), ('я', 'йа') ] reverse_replacements = {v: k for k, v in replacements} reverse_pattern = re.compile('|'.join(re.escape(key) for key in reverse_replacements)) def replace_symbols_back(text): return reverse_pattern.sub(lambda match: reverse_replacements[match.group(0)], text) @spaces.GPU def transcribe_speech(audio, progress=gr.Progress()): if audio is None: # Handle the NoneType error for microphone input return "No audio received." progress(0, desc="Transcribing audio...") transcription = pipe(audio, chunk_length_s=10)['text'] progress(1, desc="Transcription finished") return replace_symbols_back(transcription) def transcribe_from_youtube(url, progress=gr.Progress()): progress(0, "Starting YouTube audio download...") # Download audio from YouTube using pytube audio_path = YouTube(url).streams.filter(only_audio=True)[0].download(filename="tmp.mp4") progress(50, "Transcribing audio...") transcription = transcribe_speech(audio_path) os.remove(audio_path) progress(100, "Done!") return transcription def populate_metadata(url): yt = YouTube(url) return yt.thumbnail_url, yt.title with gr.Blocks(theme=gr.themes.Soft()) as demo: gr.HTML( """
Kabardian speech to text transcription using a fine-tuned Wav2Vec2-BERT model