import spaces
import os
import re

import gradio as gr
import torch
import torchaudio
from transformers import AutoModelForCTC, Wav2Vec2BertProcessor, pipeline
from pytube import YouTube

# pipe = pipeline(model="anzorq/w2v-bert-2.0-kbd", device=0)  # old model
pipe = pipeline(model="anzorq/w2v-bert-2.0-kbd-v2", device=0)  # new model with a new tokenizer

# Kabardian digraphs/trigraphs are mapped to single characters so that each
# sound corresponds to one symbol in the model's vocabulary.
replacements = [
    ('гъ', 'ɣ'), ('дж', 'j'), ('дз', 'ӡ'), ('жь', 'ʐ'), ('кӏ', 'қ'),
    ('кхъ', 'qҳ'), ('къ', 'q'), ('лъ', 'ɬ'), ('лӏ', 'ԯ'), ('пӏ', 'ԥ'),
    ('тӏ', 'ҭ'), ('фӏ', 'ჶ'), ('хь', 'h'), ('хъ', 'ҳ'), ('цӏ', 'ҵ'),
    ('щӏ', 'ɕ'), ('я', 'йа')
]

# Reverse mapping used to restore the standard orthography in the model output.
reverse_replacements = {v: k for k, v in replacements}
reverse_pattern = re.compile('|'.join(re.escape(key) for key in reverse_replacements))

def replace_symbols_back(text):
    return reverse_pattern.sub(lambda match: reverse_replacements[match.group(0)], text)

@spaces.GPU
def transcribe_speech(audio, progress=gr.Progress()):
    if audio is None:  # Handle the NoneType error for microphone input
        return "No audio received."

    progress(0, desc="Transcribing audio...")
    transcription = pipe(audio, chunk_length_s=10)['text']

    progress(1, desc="Transcription finished")
    return replace_symbols_back(transcription)

def transcribe_from_youtube(url, progress=gr.Progress()):
    progress(0, desc="Starting YouTube audio download...")

    # Download audio from YouTube using pytube
    audio_path = YouTube(url).streams.filter(only_audio=True)[0].download(filename="tmp.mp4")

    progress(0.5, desc="Transcribing audio...")
    transcription = transcribe_speech(audio_path)

    os.remove(audio_path)
    progress(1, desc="Done!")
    return transcription

def populate_metadata(url):
    yt = YouTube(url)
    return yt.thumbnail_url, yt.title

with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.HTML(
        """
        <div style="text-align: center; max-width: 700px; margin: 0 auto;">
            <h1>Kabardian Speech Transcription</h1>
            <p>Kabardian speech to text transcription using a fine-tuned Wav2Vec2-BERT model</p>
        </div>
        """
    )

    with gr.Tab("Microphone Input"):
        gr.Markdown("## Transcribe speech from microphone")
        mic_audio = gr.Audio(sources=['microphone', 'upload'], type="filepath", label="Record or upload an audio")
        transcribe_button = gr.Button("Transcribe")
        transcription_output = gr.Textbox(label="Transcription")

        transcribe_button.click(fn=transcribe_speech, inputs=mic_audio, outputs=transcription_output)

    with gr.Tab("YouTube URL"):
        gr.Markdown("## Transcribe speech from YouTube video")
        youtube_url = gr.Textbox(label="Enter YouTube video URL")
        title = gr.Label(label="Video Title")
        img = gr.Image(label="Thumbnail", height=120, width=120)
        transcribe_button = gr.Button("Transcribe")
        transcription_output = gr.Textbox(label="Transcription", placeholder="Transcription Output", lines=10)

        transcribe_button.click(fn=transcribe_from_youtube, inputs=youtube_url, outputs=transcription_output)
        youtube_url.change(populate_metadata, inputs=[youtube_url], outputs=[img, title])

demo.launch()