import os

import spaces
import gradio as gr
import torch
import torchaudio
from transformers import AutoModelForCTC, Wav2Vec2BertProcessor
from pytube import YouTube
from transformers import pipeline

# Load the Kabardian Wav2Vec2-BERT CTC model once at startup; device=0 selects the first GPU.
pipe = pipeline(model="anzorq/w2v-bert-2.0-kbd", device=0)


@spaces.GPU
def transcribe_speech(audio):
    """Transcribe an audio file with the ASR pipeline.

    Args:
        audio: Path to an audio file, or None (Gradio passes None when the
            microphone recorded nothing).

    Returns:
        The transcription text, or a notice string when no audio was given.
    """
    if audio is None:  # Handle the NoneType case for empty microphone input.
        return "No audio received."
    # Split long recordings into 10 s chunks so arbitrarily long audio fits in memory.
    return pipe(audio, chunk_length_s=10)['text']  # , return_timestamps='word')


def transcribe_from_youtube(url):
    """Download a YouTube video's audio track and transcribe it.

    Args:
        url: Full YouTube video URL.

    Returns:
        The transcription text for the video's audio.
    """
    # Download the audio-only stream via pytube.
    yt = YouTube(url)
    audio_path = yt.streams.filter(only_audio=True)[0].download(filename="tmp.mp4")
    try:
        # Transcribe the downloaded audio.
        transcription = transcribe_speech(audio_path)
    finally:
        # Clean up the downloaded file even if transcription fails,
        # so failed runs do not leak tmp.mp4 on disk.
        if os.path.exists(audio_path):
            os.remove(audio_path)
    return transcription


def populate_metadata(url):
    """Look up a video's thumbnail URL and title for display in the UI.

    The URL textbox's .change event fires on every keystroke, so partial or
    invalid URLs are expected; lookup failures return empty metadata instead
    of surfacing an error to the user.

    Args:
        url: YouTube video URL (possibly incomplete while the user types).

    Returns:
        (thumbnail_url, title), or (None, None) if the URL can't be resolved.
    """
    try:
        yt = YouTube(url)
        return yt.thumbnail_url, yt.title
    except Exception:
        return None, None


with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.HTML(
        """
Youtube Speech Transcription

Speech to text transcription of Youtube videos using Wav2Vec2-BERT
"""
    )

    with gr.Tab("Microphone Input"):
        gr.Markdown("## Transcribe speech from microphone")
        mic_audio = gr.Audio(sources="microphone", type="filepath", label="Speak into your microphone")
        transcribe_button = gr.Button("Transcribe")
        transcription_output = gr.Textbox(label="Transcription")

        transcribe_button.click(fn=transcribe_speech, inputs=mic_audio, outputs=transcription_output)

    with gr.Tab("YouTube URL"):
        gr.Markdown("## Transcribe speech from YouTube video")
        youtube_url = gr.Textbox(label="Enter YouTube video URL")
        title = gr.Label(label="Video Title")
        img = gr.Image(label="Thumbnail", height=120, width=120)
        transcribe_button = gr.Button("Transcribe")
        transcription_output = gr.Textbox(label="Transcription", placeholder="Transcription Output", lines=10)

        transcribe_button.click(fn=transcribe_from_youtube, inputs=youtube_url, outputs=transcription_output)
        # Refresh thumbnail + title as the user edits the URL field.
        youtube_url.change(populate_metadata, inputs=[youtube_url], outputs=[img, title])

demo.launch()