"""Gradio demo: classify the genre of an audio clip with a fastai image model.

The audio file is converted into a mel-spectrogram image, which is then fed to
a fastai learner downloaded from the Hugging Face Hub.
"""

import tempfile

import gradio
import torchaudio
from fastai.vision.all import *
from fastai.learner import load_learner
from torchvision.utils import save_image
from huggingface_hub import hf_hub_download

# NOTE(review): load_learner unpickles the downloaded "model.pkl"; only point
# this at a Hub repository you trust.
model = load_learner(
    hf_hub_download("kurianbenoy/music_genre_classification_baseline", "model.pkl")
)

EXAMPLES_PATH = Path("./examples")
labels = model.dls.vocab

interface_options = {
    "title": "Music Genre Classification",
    "description": (
        "A simple baseline model for classifying music genres with fast.ai on "
        "[Kaggle competition data]"
        "(https://www.kaggle.com/competitions/kaggle-pog-series-s01e02/data)"
    ),
    "examples": [f"{EXAMPLES_PATH}/{f.name}" for f in EXAMPLES_PATH.iterdir()],
    "interpretation": "default",
    "layout": "horizontal",
    "theme": "default",
}

# STFT parameters used for the mel spectrogram.
N_FFT = 2048
HOP_LEN = 1024


def create_spectrogram(filename):
    """Return a min-max normalised mel spectrogram (in dB) for an audio file.

    Args:
        filename: Path of the audio file, passed to ``torchaudio.load``.

    Returns:
        A tensor with values scaled into ``[0, 1]``. A constant spectrogram
        (for example pure silence) yields all zeros instead of NaNs.
    """
    audio, sr = torchaudio.load(filename)
    to_mel = torchaudio.transforms.MelSpectrogram(
        sample_rate=sr,
        n_fft=N_FFT,
        win_length=N_FFT,
        hop_length=HOP_LEN,
        center=True,
        pad_mode="reflect",
        power=2.0,
        norm="slaney",
        onesided=True,
        n_mels=224,
        mel_scale="htk",
    )
    # Average over the first axis of the transform output (the channel axis
    # for torchaudio.load's default channels-first layout).
    specgram = to_mel(audio).mean(axis=0)
    specgram = torchaudio.transforms.AmplitudeToDB()(specgram)

    # Min-max scale; skip the division when the range is zero, otherwise the
    # result would be 0 / 0 = NaN everywhere.
    specgram = specgram - specgram.min()
    peak = specgram.max()
    if peak > 0:
        specgram = specgram / peak
    return specgram


def create_image(filename, dest="temp.png"):
    """Render the spectrogram of ``filename`` to an image file.

    Args:
        filename: Path of the audio file to convert.
        dest: Where to write the image. Defaults to ``"temp.png"`` in the
            current working directory, matching the previous behaviour.

    Returns:
        The ``Path`` the image was written to.
    """
    dest = Path(dest)
    specgram = create_spectrogram(filename)
    save_image(specgram, str(dest))
    return dest


def predict(img):
    """Classify a spectrogram image.

    Args:
        img: Anything accepted by ``PILImage.create`` (e.g. a file path).

    Returns:
        A dict mapping every label in the model vocabulary to its predicted
        probability as a Python float.
    """
    img = PILImage.create(img)
    _pred, _pred_w_idx, probs = model.predict(img)
    return {label: float(prob) for label, prob in zip(labels, probs)}


def end2endpipeline(filename):
    """Run the full audio -> spectrogram image -> genre probabilities pipeline.

    Args:
        filename: Path of the audio file supplied by the Gradio interface.

    Returns:
        The label -> probability dict produced by ``predict``.
    """
    # A per-call temporary directory keeps overlapping requests from
    # overwriting each other's image and removes the file afterwards.
    with tempfile.TemporaryDirectory() as tmp_dir:
        image_path = create_image(filename, Path(tmp_dir) / "temp.png")
        return predict(image_path)


demo = gradio.Interface(
    fn=end2endpipeline,
    inputs=gradio.inputs.Audio(
        source="microphone", type="filepath", label="Record/ Drop audio"
    ),
    outputs=gradio.outputs.Label(num_top_classes=5),
    **interface_options,
)

launch_options = {
    "enable_queue": True,
    "share": False,
}

demo.launch(**launch_options)