from fastai.vision.all import *
from fastcore.all import *
import gradio as gr
import librosa
import matplotlib.pyplot as plt
import numpy as np
from PIL import Image
from matplotlib.backends.backend_agg import FigureCanvasAgg

# Exported fastai learner trained on spectrogram images of the five Spanish vowels
learn = load_learner('spanish_vowels.pkl')

categories = ('a', 'e', 'i', 'o', 'u')

def classify_image(img):
    """Classify a spectrogram image directly."""
    pred, idx, probs = learn.predict(img)
    return dict(zip(categories, map(float, probs)))

def classify_voice(voice):
    """Convert a recording to a spectrogram image, then classify it."""
    img = voice_to_image(voice)
    pred, idx, probs = learn.predict(img)
    return dict(zip(categories, map(float, probs))), img

def get_voice(voice):
    # Debug helper kept from development: stashes the last recording globally.
    global voice_rec
    voice_rec = voice

# Gradio components: an audio input (delivered to the function as a file path)
# and two outputs: the predicted probabilities and the generated spectrogram.
voice = gr.Audio(type='filepath')
outputs = [gr.Label(), gr.Image()]

def voice_to_image(voice):
    """Render a recording as a grayscale spectrogram and return it as a PIL image."""
    audio_data, sample_rate = librosa.load(voice)

    # Short-time Fourier transform, converted to a dB-scaled magnitude spectrogram
    spectrogram = librosa.stft(audio_data)
    spectrogram_db = librosa.amplitude_to_db(np.abs(spectrogram))

    # Plot the spectrogram on a borderless matplotlib figure
    # fig = plt.figure(frameon=False, figsize=(320 / 80, 240 / 80), dpi=80)
    fig = plt.figure(frameon=False)
    ax = fig.add_axes([0, 0, 1, 1])
    ax.pcolormesh(spectrogram_db, cmap="gray")
    ax.set_axisbelow(True)
    ax.set_xlabel("Time")
    ax.set_ylabel("Frequency")
    ax.set_title("Spectrogram")

    # Remove the extra whitespace around the plot
    fig.tight_layout(pad=0)

    # Render the figure and convert its pixel buffer to a PIL image.
    # (canvas.tostring_rgb() is deprecated in newer Matplotlib, so use buffer_rgba.)
    canvas = FigureCanvasAgg(fig)
    canvas.draw()
    width, height = canvas.get_width_height()
    image = Image.frombytes("RGBA", (width, height), bytes(canvas.buffer_rgba())).convert("RGB")

    # fig.savefig('tmp.jpg')
    # image = Image.open('tmp.jpg')

    # Close the figure to release memory
    plt.close(fig)

    return image

# Alternative interface that accepts a spectrogram image directly:
# intf = gr.Interface(fn=classify_image, inputs=gr.Image(), outputs=outputs)
intf = gr.Interface(fn=classify_voice, inputs=voice, outputs=outputs)
intf.launch(debug=True)
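
# Optional local check, commented out because launch(debug=True) already blocks.
# This is only a sketch and assumes a sample recording such as 'sample_a.wav'
# exists next to the script; it exercises the spectrogram conversion and the
# classifier without starting the Gradio UI.
# probs, img = classify_voice('sample_a.wav')
# print(probs)
# img.save('sample_a_spectrogram.png')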