"""Gradio app: classify a recorded Spanish vowel (a, e, i, o, u).

The recording is rendered as a grayscale spectrogram image, which is then
fed to a fastai image classifier exported to 'spanish_vowels.pkl'.
"""

# Third-party imports (deduplicated; the original imported fastai.vision twice).
from fastai.vision.all import *
from fastcore.all import *
import gradio as gr
import librosa
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image
from matplotlib.backends.backend_agg import FigureCanvasAgg
import os

# Exported fastai learner and the label set it was trained on.
# NOTE(review): assumes 'spanish_vowels.pkl' sits in the working directory.
learn = load_learner('spanish_vowels.pkl')
categories = ('a', 'e', 'i', 'o', 'u')


def classify_image(img):
    """Classify a spectrogram image.

    Returns a dict mapping each vowel label to its predicted probability.
    """
    pred, idx, probs = learn.predict(img)
    return dict(zip(categories, map(float, probs)))


def classify_voice(voice):
    """Classify a recorded voice file (path).

    Returns a (probabilities-dict, spectrogram PIL image) pair so the UI can
    show both the prediction and the image the model actually saw.
    """
    img = voice_to_image(voice)
    pred, idx, probs = learn.predict(img)
    return dict(zip(categories, map(float, probs))), img


def get_voice(voice):
    """Debug helper: stash the last recording path in a module-level global."""
    global voice_rec
    voice_rec = voice


def voice_to_image(voice):
    """Render *voice* (path to an audio file) as a spectrogram PIL image.

    Returns an RGB ``PIL.Image`` at matplotlib's default figure size.
    """
    audio_data, sample_rate = librosa.load(voice)

    # Short-time Fourier transform -> magnitude in decibels.
    spectrogram = librosa.stft(audio_data)
    spectrogram_db = librosa.amplitude_to_db(np.abs(spectrogram))

    # Frameless figure whose single axes fill the whole canvas, so the saved
    # image is just the spectrogram plot.  (The original also called
    # fig.tight_layout(), which is incompatible with add_axes() and only
    # produced a warning, so it is dropped.)
    fig = plt.figure(frameon=False)
    ax = fig.add_axes([0, 0, 1, 1])
    ax.pcolormesh(spectrogram_db, cmap="gray")
    ax.set_axisbelow(True)
    ax.set_xlabel("Time")
    ax.set_ylabel("Frequency")
    ax.set_title("Spectrogram")

    # Rasterize the figure and wrap the pixel buffer in a PIL image.
    # FIX: FigureCanvasAgg.tostring_rgb() was deprecated in matplotlib 3.8 and
    # removed in 3.10 — buffer_rgba() is the supported API, so we read RGBA and
    # convert to RGB.  get_width_height() gives the true canvas pixel size,
    # avoiding rounding errors from get_size_inches() * dpi.
    canvas = FigureCanvasAgg(fig)
    canvas.draw()
    width, height = canvas.get_width_height()
    image = Image.frombytes(
        "RGBA", (width, height), bytes(canvas.buffer_rgba())
    ).convert("RGB")

    # Close the figure to release memory (matplotlib keeps figures alive otherwise).
    plt.close(fig)
    return image


# Gradio wiring: audio input delivered as a temp-file path, outputs are the
# probability label plus the rendered spectrogram.
voice = gr.Audio(type='filepath')
label = [gr.Label(), gr.Image()]

intf = gr.Interface(fn = classify_voice, inputs = voice, outputs = label)
# intf.launch(debug=True)