import spaces import os import gradio as gr import torch import torchaudio from transformers import AutoModelForCTC, Wav2Vec2BertProcessor from pytube import YouTube from transformers import pipeline pipe = pipeline(model="anzorq/w2v-bert-2.0-kbd", device=0) @spaces.GPU def transcribe_speech(audio): if audio is None: # Handle the NoneType error for microphone input return "No audio received." return pipe(audio, chunk_length_s=10)['text']#, return_timestamps='word') def transcribe_from_youtube(url): # Download audio from YouTube using pytube yt = YouTube(url) audio_path = yt.streams.filter(only_audio=True)[0].download(filename="tmp.mp4") # Transcribe the downloaded audio transcription = transcribe_speech(audio_path) # Clean up the downloaded file os.remove(audio_path) return transcription def populate_metadata(url): yt = YouTube(url) return yt.thumbnail_url, yt.title with gr.Blocks(theme=gr.themes.Soft()) as demo: gr.HTML( """
Speech to text transcription of Youtube videos using Wav2Vec2-BERT