from transformers import pipeline
from sessions import sessions
import torchaudio
import torchaudio.transforms as T
import gradio as gr

# Speech-emotion classifier: DistilHuBERT fine-tuned on the RAVDESS dataset
# (the model expects 16 kHz mono audio).
pipe = pipeline(
    "audio-classification",
    model="BilalHasan/distilhubert-finetuned-ravdess",
)
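
# For a raw 1-D waveform, the audio-classification pipeline returns a
# score-sorted list of {"label": ..., "score": ...} dicts, so [0]["label"]
# is the most likely emotion. It assumes the array is already at the model's
# sampling rate (16 kHz here), which the resampling in prediction() ensures.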


def split_audio(array):
    # Split a 1-D waveform into 30-second chunks (at 16 kHz) with a 50%
    # overlap between consecutive chunks. The batch is built locally rather
    # than in a module-level list, so repeated calls don't accumulate chunks
    # left over from earlier requests.
    chunk_len = 30 * 16000  # samples per 30 s chunk at 16 kHz
    audio_batch = []
    while len(array) > chunk_len:
        audio_batch.append(array[:chunk_len])
        array = array[chunk_len // 2:]  # hop forward by half a chunk
    audio_batch.append(array)  # final (possibly shorter) remainder
    return audio_batch
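
# Chunking arithmetic: a 60 s clip at 16 kHz is 960,000 samples, so
# split_audio() yields three overlapping chunks covering 0-30 s, 15-45 s,
# and 30-60 s; a clip of 30 s or less comes back as a single chunk.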


def prediction(path):
    # Load the file and resample to the 16 kHz the model was trained on;
    # only the first channel is used, since the model expects mono audio.
    array, sr = torchaudio.load(path)
    resampler = T.Resample(sr, 16000)
    resampled_audio = resampler(array)
    audio_batch = split_audio(resampled_audio[0].numpy())

    # Classify each chunk and take a majority vote over the predicted labels.
    predictions = [pipe(chunk)[0]["label"] for chunk in audio_batch]
    mood = max(set(predictions), key=predictions.count)

    # These four labels are folded into a single "other" category before the
    # session lookup.
    if mood in ["neutral", "calm", "happy", "surprised"]:
        mood = "other"
    session = sessions.mood2session[mood]
    return mood, session
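
# Note: the `sessions` module is local and not shown here. RAVDESS covers
# eight emotions (neutral, calm, happy, sad, angry, fearful, disgust,
# surprised), so after the fold above, `sessions.mood2session` is assumed to
# map "sad", "angry", "fearful", "disgust", and "other" to a yoga session.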


demo = gr.Interface(
    fn=prediction,
    inputs=[gr.Audio(type="filepath")],
    outputs=[gr.Textbox(label="Mood"), gr.Textbox(label="Recommended Yoga Session")],
)

demo.launch()