"""Gradio demo for Chinese speech recognition with a fine-tuned Whisper model.

The app accepts either a YouTube URL or a microphone recording and returns
the text produced by the speech-recognition pipeline.
"""

import tempfile

import gradio as gr
import pytube
from transformers import pipeline

# Built once at module import so every request reuses the same pipeline.
pipe = pipeline(model="kk90ujhun/whisper-small-zh")


def transcribe(my_url, audio):
    """Transcribe a YouTube video or a recorded audio clip.

    Args:
        my_url: YouTube URL. When non-empty it takes precedence over
            ``audio``.
        audio: Path to an audio file (the Audio input below is configured
            with ``type="filepath"``). Used only when no URL is given.

    Returns:
        The ``"text"`` entry of the pipeline's result.

    Raises:
        ValueError: If neither a URL nor an audio recording is supplied, or
            if the video exposes no mp4 stream.
    """
    # A whitespace-only textbox value should count as "no URL".
    url = my_url.strip() if isinstance(my_url, str) else my_url

    if url:
        stream = pytube.YouTube(url).streams.filter(subtype="mp4").first()
        if stream is None:
            raise ValueError(f"No mp4 stream available for {url!r}.")
        # Download into a throwaway directory so media files do not pile up
        # in the working directory after each request.
        with tempfile.TemporaryDirectory() as workdir:
            media_path = stream.download(output_path=workdir)
            return pipe(media_path)["text"]

    if not audio:
        raise ValueError(
            "Provide a YouTube URL or record audio with the microphone."
        )
    return pipe(audio)["text"]


iface = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Textbox(label="Enter your YouTube URL:"),
        gr.Audio(
            label="Speak to your microphone",
            source="microphone",
            type="filepath",
        ),
    ],
    # Interface needs an output component; the transcription is plain text.
    outputs="text",
    title="Whisper Small Chinese",
    description="Realtime demo for Chinese speech recognition using a fine-tuned Whisper small model.",
)

iface.launch()