"""Gradio demo for Swedish speech recognition with a fine-tuned Whisper model.

Two tabs are exposed: one transcribes audio recorded from the microphone,
the other downloads the audio track of a YouTube video with ``youtube-dl``
and transcribes it.
"""
import os
import re
import subprocess

import gradio as gr
from pytube import YouTube
from transformers import pipeline

# Speech-recognition pipeline shared by both tabs; loaded once at import time.
pipe = pipeline(model="irena/whisper-small-sv-SE")

WATCH_URL_PREFIX = "https://www.youtube.com/watch?v="
# Video used when the input does not contain a recognisable watch URL.
DEFAULT_VIDEO_ID = "xOZM-1p-jAk"
# Leading run of characters accepted as a video id; anything after it
# (e.g. "&t=10s", "&list=...") is dropped.
_VIDEO_ID_RE = re.compile(r"[A-Za-z0-9_-]+")


def _extract_video_id(url: str) -> str:
    """Return the video id found after the watch-URL prefix in *url*.

    Falls back to ``DEFAULT_VIDEO_ID`` when the prefix is absent or is not
    followed by at least one id character.
    """
    parts = url.split(WATCH_URL_PREFIX)
    if len(parts) > 1:
        match = _VIDEO_ID_RE.match(parts[1])
        if match:
            return match.group(0)
    return DEFAULT_VIDEO_ID


def main_generator(youtube_id: str) -> str:
    """Download the audio of a YouTube video and return its transcription.

    Args:
        youtube_id: Text entered by the user, expected to be a
            ``https://www.youtube.com/watch?v=...`` link. Any other input
            falls back to a default video.

    Returns:
        The ``"text"`` field of the pipeline result for the downloaded audio.

    Raises:
        FileNotFoundError: If the expected audio file is missing after the
            ``youtube-dl`` invocation.
    """
    video_id = _extract_video_id(youtube_id)
    output_file = f"test_audio_youtube_{video_id}.m4a"

    # Argument list instead of a shell string: the id comes from user input,
    # so it must never be interpreted by a shell. The "--" separator keeps an
    # id that starts with "-" from being parsed as an option.
    subprocess.run(
        [
            "youtube-dl",
            "-o", output_file,
            "--extract-audio",
            "--restrict-filenames",
            "-f", "bestaudio[ext=m4a]",
            "--",
            video_id,
        ],
        check=False,
    )
    if not os.path.exists(output_file):
        raise FileNotFoundError(
            f"youtube-dl did not produce {output_file!r} for video {video_id!r}"
        )

    result = pipe(output_file)
    return result["text"]


def transcribe(audio: "str | None") -> str:
    """Transcribe the audio file at path *audio*.

    Returns an empty string when *audio* is ``None`` (nothing was recorded).
    """
    if audio is None:
        return ""
    return pipe(audio)["text"]


demo = gr.Blocks()

# Tab 1: microphone input, passed to ``transcribe`` as a file path.
iface = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(source="microphone", type="filepath"),
    outputs="text",
    title="Whisper Small Swedish-Microphone",
    description="Realtime demo for Swedish speech recognition using a fine-tuned Whisper small model. An audio for recognize.",
)

# Tab 2: YouTube link input, passed to ``main_generator``.
inputs = [gr.Textbox(lines=1, placeholder="Link of youtube video here...", label="Input")]
# NOTE(review): main_generator returns a plain string; confirm that
# HighlightedText renders it as intended in the Gradio version in use.
outputs = gr.HighlightedText()
title = "Transcription of Swedish videos"
description = "This demo uses small Whisper to transcribe what is spoken in a swedish video"
examples = ['https://www.youtube.com/watch?v=6eWhV7xYH-Q']

# Pieces are concatenated into a single-line stylesheet.
_BUTTON_CSS = (
    ".gr-button-primary { "
    "background: -webkit-linear-gradient( 90deg, #355764 0%, #55a8a1 100% ) !important; "
    "background: #355764; "
    "background: linear-gradient( 90deg, #355764 0%, #55a8a1 100% ) !important; "
    "background: -moz-linear-gradient( 90deg, #355764 0%, #55a8a1 100% ) !important; "
    "background: -webkit-linear-gradient( 90deg, #355764 0%, #55a8a1 100% ) !important; "
    "color:white !important}"
)

io = gr.Interface(
    fn=main_generator,
    inputs=inputs,
    outputs=outputs,
    title=title,
    description=description,
    examples=examples,
    css=_BUTTON_CSS,
)

with demo:
    # The YouTube interface is bound to ``io``; the previous reference to an
    # undefined ``yt`` raised NameError at startup.
    gr.TabbedInterface([iface, io], ["Transcribe Audio", "Transcribe YouTube"])

demo.launch(enable_queue=True)