"""Kabardian Speech Transcription."""

import spaces
import os
import gradio as gr
import torch
import torchaudio
from transformers import AutoModelForCTC, Wav2Vec2BertProcessor
from pytube import YouTube
from transformers import pipeline
import re

# Hub id of the fine-tuned Kabardian checkpoint. This "-v2" model comes with a
# new tokenizer and replaces the older "anzorq/w2v-bert-2.0-kbd" checkpoint.
MODEL_ID = "anzorq/w2v-bert-2.0-kbd-v2"

# One pipeline instance, built at import time and reused by every request.
# device=0 selects the first GPU.
pipe = pipeline(model=MODEL_ID, device=0)

# (original spelling, placeholder) pairs: each Kabardian multi-letter sequence
# on the left is represented by the placeholder symbol(s) on the right.
replacements = [
    ('гъ', 'ɣ'), ('дж', 'j'), ('дз', 'ӡ'), ('жь', 'ʐ'), ('кӏ', 'қ'),
    ('кхъ', 'qҳ'), ('къ', 'q'), ('лъ', 'ɬ'), ('лӏ', 'ԯ'), ('пӏ', 'ԥ'),
    ('тӏ', 'ҭ'), ('фӏ', 'ჶ'), ('хь', 'h'), ('хъ', 'ҳ'), ('цӏ', 'ҵ'),
    ('щӏ', 'ɕ'), ('я', 'йа')
]

# Placeholder -> original spelling, used to undo the substitutions above.
reverse_replacements = {
    placeholder: original for original, placeholder in replacements
}

# Regex alternation picks the first branch that matches, not the longest one.
# Listing longer placeholders first guarantees that e.g. 'qҳ' is restored as a
# unit instead of as 'q' followed by 'ҳ', regardless of the order of the table.
# sorted() is stable, so equal-length placeholders keep their table order.
reverse_pattern = re.compile(
    '|'.join(
        re.escape(key)
        for key in sorted(reverse_replacements, key=len, reverse=True)
    )
)

def replace_symbols_back(text):
    """Return *text* with every placeholder symbol restored to its original spelling.

    Each match of ``reverse_pattern`` is looked up in ``reverse_replacements``
    and replaced by the mapped value; all other characters are left untouched.
    """
    def _restore(match):
        # The pattern is built from the mapping's keys, so the lookup always hits.
        return reverse_replacements[match.group(0)]

    return reverse_pattern.sub(_restore, text)

@spaces.GPU
def transcribe_speech(audio, progress=gr.Progress()):
    """Transcribe an audio file and return the text in its original spelling.

    Args:
        audio: Input handed to the speech pipeline (the UI passes a file path),
            or ``None`` when nothing was recorded or uploaded.
        progress: Gradio progress tracker used to report start and completion.

    Returns:
        The transcription with placeholder symbols restored, or the notice
        ``"No audio received."`` when *audio* is ``None``.
    """
    # The microphone widget yields None when the user submits without recording.
    if audio is None:
        return "No audio received."

    progress(0, desc="Transcribing audio...")
    # chunk_length_s=10 makes the pipeline process the audio in 10-second chunks.
    result = pipe(audio, chunk_length_s=10)
    raw_text = result['text']
    progress(1, desc="Transcription finished")

    return replace_symbols_back(raw_text)

def transcribe_from_youtube(url, progress=gr.Progress()):
    """Download the audio track of a YouTube video and transcribe it.

    Args:
        url: URL of the YouTube video.
        progress: Gradio progress tracker; values are fractions in [0, 1].

    Returns:
        The transcription text, or a short notice when no URL was given.

    Raises:
        gr.Error: If the video exposes no audio-only stream.
    """
    import tempfile

    # Mirror transcribe_speech: answer with a notice instead of failing on empty input.
    if not url or not url.strip():
        return "No YouTube URL provided."
    url = url.strip()

    progress(0, "Starting YouTube audio download...")
    # first() returns None on an empty query, where indexing [0] would raise IndexError.
    audio_stream = YouTube(url).streams.filter(only_audio=True).first()
    if audio_stream is None:
        raise gr.Error("No audio stream found for this video.")

    # A private temporary directory keeps concurrent requests from overwriting
    # each other's download and removes the file even if transcription fails.
    with tempfile.TemporaryDirectory() as tmp_dir:
        audio_path = audio_stream.download(output_path=tmp_dir, filename="tmp.mp4")

        # Gradio expects a float between 0 and 1 here, not a percentage.
        progress(0.5, "Transcribing audio...")
        transcription = transcribe_speech(audio_path)

    progress(1, "Done!")
    return transcription

def populate_metadata(url):
    """Look up the thumbnail URL and title for a YouTube video URL.

    This is wired to the URL textbox's change event, so it is also called with
    empty or partially typed input. The preview is best-effort: when the URL is
    empty or pytube rejects it, both outputs are cleared instead of raising.

    Args:
        url: Current content of the YouTube URL textbox.

    Returns:
        A ``(thumbnail_url, title)`` tuple, or ``(None, None)`` when no
        metadata can be shown.
    """
    if not url or not url.strip():
        return None, None

    from pytube.exceptions import PytubeError

    try:
        yt = YouTube(url.strip())
        return yt.thumbnail_url, yt.title
    except PytubeError:
        # Invalid or unavailable video: show no preview. Pressing "Transcribe"
        # with the same URL will still surface the underlying error.
        return None, None

# Gradio UI with two tabs: one for recorded/uploaded audio, one for YouTube URLs.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    # Centered page header with the app title and a one-line description.
    gr.HTML(
        """
            <div style="text-align: center; max-width: 500px; margin: 0 auto;">
              <div>
                <h1>Kabardian Speech Transcription</h1>
              </div>
              <p style="margin-bottom: 10px; font-size: 94%">
                Kabardian speech to text transcription using a fine-tuned Wav2Vec2-BERT model
              </p>
            </div>
        """
    )

    # Tab 1: audio from the microphone or an uploaded file, passed on as a file path.
    with gr.Tab("Microphone Input"):
        gr.Markdown("## Transcribe speech from microphone")
        mic_audio = gr.Audio(
            sources=['microphone','upload'],
            type="filepath",
            label="Record or upload an audio",
        )
        mic_button = gr.Button("Transcribe")
        mic_output = gr.Textbox(label="Transcription")

        mic_button.click(fn=transcribe_speech, inputs=mic_audio, outputs=mic_output)

    # Tab 2: audio taken from a YouTube video, with a title/thumbnail preview.
    with gr.Tab("YouTube URL"):
        gr.Markdown("## Transcribe speech from YouTube video")
        youtube_url = gr.Textbox(label="Enter YouTube video URL")
        title = gr.Label(label="Video Title")
        img = gr.Image(label="Thumbnail", height=120, width=120)
        yt_button = gr.Button("Transcribe")
        yt_output = gr.Textbox(
            label="Transcription",
            placeholder="Transcription Output",
            lines=10,
        )

        yt_button.click(fn=transcribe_from_youtube, inputs=youtube_url, outputs=yt_output)
        # Refresh the preview whenever the URL text changes.
        youtube_url.change(fn=populate_metadata, inputs=[youtube_url], outputs=[img, title])

demo.launch()