anzorq's picture
Update app.py
8884e76 verified
raw
history blame
3.42 kB
import spaces
import os
import gradio as gr
import torch
import torchaudio
from transformers import AutoModelForCTC, Wav2Vec2BertProcessor
from pytube import YouTube
from transformers import pipeline
import re
# pipe = pipeline(model="anzorq/w2v-bert-2.0-kbd", device=0) # old model
pipe = pipeline(model="anzorq/w2v-bert-2.0-kbd-v2", device=0) # new model with a new tokenizer
replacements = [
('гъ', 'ɣ'), ('дж', 'j'), ('дз', 'ӡ'), ('жь', 'ʐ'), ('кӏ', 'қ'),
('кхъ', 'qҳ'), ('къ', 'q'), ('лъ', 'ɬ'), ('лӏ', 'ԯ'), ('пӏ', 'ԥ'),
('тӏ', 'ҭ'), ('фӏ', 'ჶ'), ('хь', 'h'), ('хъ', 'ҳ'), ('цӏ', 'ҵ'),
('щӏ', 'ɕ'), ('я', 'йа')
]
reverse_replacements = {v: k for k, v in replacements}
reverse_pattern = re.compile('|'.join(re.escape(key) for key in reverse_replacements))
def replace_symbols_back(text):
return reverse_pattern.sub(lambda match: reverse_replacements[match.group(0)], text)
@spaces.GPU
def transcribe_speech(audio, progress=gr.Progress()):
if audio is None: # Handle the NoneType error for microphone input
return "No audio received."
progress(0, desc="Transcribing audio...")
transcription = pipe(audio, chunk_length_s=10)['text']
progress(1, desc="Transcription finished")
return replace_symbols_back(transcription)
def transcribe_from_youtube(url, progress=gr.Progress()):
progress(0, "Starting YouTube audio download...")
# Download audio from YouTube using pytube
audio_path = YouTube(url).streams.filter(only_audio=True)[0].download(filename="tmp.mp4")
progress(50, "Transcribing audio...")
transcription = transcribe_speech(audio_path)
os.remove(audio_path)
progress(100, "Done!")
return transcription
def populate_metadata(url):
yt = YouTube(url)
return yt.thumbnail_url, yt.title
with gr.Blocks(theme=gr.themes.Soft()) as demo:
gr.HTML(
"""
<div style="text-align: center; max-width: 500px; margin: 0 auto;">
<div>
<h1>Kabardian Speech Transcription</h1>
</div>
<p style="margin-bottom: 10px; font-size: 94%">
Kabardian speech to text transcription using a fine-tuned Wav2Vec2-BERT model
</p>
</div>
"""
)
with gr.Tab("Microphone Input"):
gr.Markdown("## Transcribe speech from microphone")
mic_audio = gr.Audio(sources=['microphone','upload'], type="filepath", label="Record or upload an audio")
transcribe_button = gr.Button("Transcribe")
transcription_output = gr.Textbox(label="Transcription")
transcribe_button.click(fn=transcribe_speech, inputs=mic_audio, outputs=transcription_output)
with gr.Tab("YouTube URL"):
gr.Markdown("## Transcribe speech from YouTube video")
youtube_url = gr.Textbox(label="Enter YouTube video URL")
title = gr.Label(label="Video Title")
img = gr.Image(label="Thumbnail", height=120, width=120)
transcribe_button = gr.Button("Transcribe")
transcription_output = gr.Textbox(label="Transcription", placeholder="Transcription Output", lines=10)
transcribe_button.click(fn=transcribe_from_youtube, inputs=youtube_url, outputs=transcription_output)
youtube_url.change(populate_metadata, inputs=[youtube_url], outputs=[img, title])
demo.launch()