"""Gradio demo that classifies the music genre of an audio clip.

Builds a two-tab web UI (file upload and microphone recording) around a
Hugging Face ``audio-classification`` pipeline and launches it.
"""

import torch
from transformers import pipeline
from transformers.pipelines.audio_utils import ffmpeg_read
import gradio as gr

MODEL_NAME = "JackismyShephard/whisper-medium.en-finetuned-gtzan"

# Text shared by both tabs; kept in one place so the tabs cannot drift apart.
TITLE = "Classify Genre of Music"
DESCRIPTION = (
    "Classify long-form audio or microphone inputs with the click of a button! Demo uses the"
    f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to classify audio files"
    " of arbitrary length."
)

# Device index 0 when CUDA is available, otherwise the CPU.
device = 0 if torch.cuda.is_available() else "cpu"

pipe = pipeline(
    task="audio-classification",
    model=MODEL_NAME,
    device=device,
)


def classify_audio(filepath):
    """Classify the audio file at *filepath*.

    Args:
        filepath: Path to an audio file, as supplied by a Gradio ``Audio``
            input configured with ``type="filepath"``.

    Returns:
        A dict mapping each predicted label to its score, which is the
        format accepted by Gradio's ``"label"`` output.

    Raises:
        ValueError: If no audio was provided. The inputs are declared
            ``optional=True``, so Gradio may call this with ``None``.
    """
    if filepath is None:
        raise ValueError(
            "No audio provided. Upload or record a clip before submitting."
        )
    preds = pipe(filepath)
    return {p["label"]: p["score"] for p in preds}


demo = gr.Blocks()

# NOTE(review): `gr.inputs.Audio`, `layout`, `theme="huggingface"` and
# `enable_queue` belong to an older Gradio API; they are kept unchanged
# because the pinned Gradio version is not visible here. Confirm before
# upgrading.
file_transcribe = gr.Interface(
    fn=classify_audio,
    inputs=[
        gr.inputs.Audio(source="upload", optional=True, label="Audio file", type="filepath"),
        # TODO: add a YouTube URL input source here, if possible.
    ],
    outputs="label",
    layout="horizontal",
    theme="huggingface",
    title=TITLE,
    description=DESCRIPTION,
    # One value per row: the interface has a single (audio) input.
    examples=[
        ["./example.flac"],
    ],
    cache_examples=True,
    allow_flagging="never",
)

mic_transcribe = gr.Interface(
    fn=classify_audio,
    inputs=[
        gr.inputs.Audio(source="microphone", type="filepath", optional=True),
    ],
    outputs="label",
    layout="horizontal",
    theme="huggingface",
    title=TITLE,
    description=DESCRIPTION,
    allow_flagging="never",
)

with demo:
    gr.TabbedInterface(
        [file_transcribe, mic_transcribe],
        ["Classify Audio File", "classify Microphone input"],
    )

demo.launch(enable_queue=True)