import os
import tempfile
from typing import Optional, Tuple

import gradio as gr
from pytube import YouTube
from pytube.exceptions import PytubeError
from transformers import pipeline

# Speech-recognition pipeline, built once at import time and shared by every
# handler below.
# change to "your-username/the-name-you-picked"
pipe = pipeline(model="GIanlucaRub/whisper-tiny-it-6")


def transcribe_yt(link: str) -> str:
    """Download the audio track of a YouTube video and return its transcription.

    The audio is saved into a private temporary directory so that concurrent
    requests cannot overwrite each other's download, and so that the file is
    removed even when the download or the transcription fails.

    Args:
        link: URL of the YouTube video.

    Returns:
        The ``"text"`` field of the pipeline output.

    Raises:
        ValueError: If the video exposes no audio-only stream.
    """
    yt = YouTube(link)
    stream = yt.streams.filter(only_audio=True).first()
    if stream is None:
        raise ValueError(f"No audio-only stream available for {link!r}")

    with tempfile.TemporaryDirectory() as workdir:
        audio = stream.download(output_path=workdir, filename="audio.mp3")
        # Transcribe while the directory still exists; it is deleted on exit.
        return pipe(audio)["text"]


def transcribe_audio(audio: Optional[str]) -> str:
    """Return the transcription of a local audio file.

    Args:
        audio: Path of the audio file, as supplied by a ``gr.Audio`` component
            with ``type="filepath"``. ``None`` or an empty path means nothing
            was recorded or uploaded.

    Returns:
        The ``"text"`` field of the pipeline output, or an empty string when
        no audio was provided.
    """
    if not audio:
        # Button clicked with no recording/upload: nothing to transcribe.
        return ""
    return pipe(audio)["text"]


def populate_metadata(link: str) -> Tuple[Optional[str], Optional[str]]:
    """Look up the thumbnail URL and title of a YouTube video.

    This handler is bound to the link textbox's ``change`` event, so it also
    runs while the URL is empty or only partially typed. In those cases, and
    whenever pytube cannot resolve the video, both outputs are cleared rather
    than raising an error in the UI.

    Args:
        link: Current content of the YouTube link textbox.

    Returns:
        A ``(thumbnail_url, title)`` tuple, or ``(None, None)`` when the link
        is empty or pytube rejects it.
    """
    if not link:
        return None, None
    try:
        yt = YouTube(link)
        return yt.thumbnail_url, yt.title
    except PytubeError:
        return None, None


title = "Youtube Whisperer"
description = "Speech to text transcription of Youtube videos using OpenAI's Whisper"

block = gr.Blocks()

with block:
    gr.HTML(
        """

Youtube Whisperer

Speech to text transcription of Youtube videos using OpenAI's Whisper

"""
    )
    with gr.Group():
        with gr.Box():
            # Shared output box for all three transcription sources.
            text = gr.Textbox(
                label="Transcription",
                placeholder="Transcription Output",
                lines=5,
            )

            # Source 1: microphone recording.
            microphone = gr.Audio(source="microphone", type="filepath")
            with gr.Row().style(mobile_collapse=False, equal_height=True):
                btn_microphone = gr.Button("Transcribe microphone audio")

            # Source 2: uploaded audio file.
            audio_uploaded = gr.Audio(source="upload", type="filepath")
            with gr.Row().style(mobile_collapse=False, equal_height=True):
                btn_audio_uploaded = gr.Button("Transcribe audio uploaded")

            # Source 3: YouTube video.
            link = gr.Textbox(label="YouTube Link")
            with gr.Row().style(mobile_collapse=False, equal_height=True):
                btn_youtube = gr.Button("Transcribe Youtube video")
            with gr.Row().style(mobile_collapse=False, equal_height=True):
                # NOTE: rebinds the module-level `title` string defined above;
                # the name is kept unchanged for compatibility.
                title = gr.Label(label="Video Title", placeholder="Title")
                img = gr.Image(label="Thumbnail")

            # Events
            btn_youtube.click(transcribe_yt, inputs=[link], outputs=[text])
            btn_microphone.click(
                transcribe_audio, inputs=[microphone], outputs=[text]
            )
            btn_audio_uploaded.click(
                transcribe_audio, inputs=[audio_uploaded], outputs=[text]
            )
            # Refresh thumbnail and title whenever the link text changes.
            link.change(populate_metadata, inputs=[link], outputs=[img, title])

block.launch(debug=True)