from transformers import pipeline
import gradio as gr
from pytube import YouTube
from datasets import Dataset, Audio

pipe = pipeline(model="Neprox/model")


def transcribe(audio, url):
    if url:
        # Download the audio stream of the YouTube video
        streams = YouTube(url).streams.filter(only_audio=True, file_extension='mp4')
        audio_fpath = streams.first().download()

        # TODO:
        # Process up to 10 minutes by segmenting into 30 second blocks
        # Use moviepy for selecting time ranges
        # Query every block individually
        # Annotate text with timestamps

        audio_dataset = Dataset.from_dict({"audio": [audio_fpath]}).cast_column("audio", Audio())
        text = pipe(audio_dataset[0]["audio"])["text"]
        return text
    else:
        text = pipe(audio)["text"]
        return text


iface = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(source="microphone", type="filepath"),
        gr.Text(max_lines=1, placeholder="Enter YouTube link with Swedish speech to be transcribed"),
    ],
    outputs="text",
    title="Whisper Small Swedish",
    description="Realtime demo for Swedish speech recognition using a fine-tuned Whisper small model.",
)

iface.launch()
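
# --------------------------------------------------------------------------
# Sketch of the segmentation approach outlined in the TODO above. This is an
# illustrative draft, not part of the running app: it assumes moviepy is
# installed, and transcribe_in_blocks() is a hypothetical helper that would
# be called with the path returned by streams.first().download().
from moviepy.editor import AudioFileClip


def transcribe_in_blocks(audio_fpath, block_s=30, max_s=600):
    """Transcribe up to `max_s` seconds of audio in `block_s`-second blocks,
    prefixing each block's text with its start timestamp."""
    clip = AudioFileClip(audio_fpath)
    duration = min(clip.duration, max_s)

    lines = []
    start = 0
    while start < duration:
        end = min(start + block_s, duration)

        # Cut out the current time range and write it to a temporary wav
        # file that the pipeline can read directly.
        block_path = f"block_{int(start)}.wav"
        clip.subclip(start, end).write_audiofile(block_path, logger=None)

        # Query the model for this block only.
        block_text = pipe(block_path)["text"]

        # Annotate the text with the block's start timestamp (mm:ss).
        minutes, seconds = divmod(int(start), 60)
        lines.append(f"[{minutes:02d}:{seconds:02d}] {block_text}")

        start = end

    clip.close()
    return "\n".join(lines)
# --------------------------------------------------------------------------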