import gradio as gr
import torch
import torch.nn.functional as F
from transformers import Wav2Vec2Processor, Wav2Vec2ForSequenceClassification
import torchaudio

# Emotion classes predicted by the model; the returned probabilities are zipped
# against this list below, so its order is assumed to match the model's class-index order.
emotion_labels = ["angry", "calm", "disgust", "fearful", "happy", "neutral", "sad", "surprised"]

model_name = "Dpngtm/wav2vec2-emotion-recognition"
# num_labels is a model-config argument, so it belongs on the classifier rather than the processor
model = Wav2Vec2ForSequenceClassification.from_pretrained(model_name, num_labels=len(emotion_labels))
processor = Wav2Vec2Processor.from_pretrained(model_name)

device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
model.eval()


def recognize_emotion(audio):
    try:
        if audio is None:
            return {emotion: 0.0 for emotion in emotion_labels}

        # Gradio passes a filepath string; fall back to .name for file-like objects
        audio_path = audio if isinstance(audio, str) else audio.name
        speech_array, sampling_rate = torchaudio.load(audio_path)

        duration = speech_array.shape[1] / sampling_rate
        if duration > 60:
            return {
                "Error": "Audio too long (max 1 minute)",
                **{emotion: 0.0 for emotion in emotion_labels}
            }

        # Wav2Vec2 expects 16 kHz input
        if sampling_rate != 16000:
            resampler = torchaudio.transforms.Resample(orig_freq=sampling_rate, new_freq=16000)
            speech_array = resampler(speech_array)

        # Downmix multi-channel audio to mono
        if speech_array.shape[0] > 1:
            speech_array = torch.mean(speech_array, dim=0, keepdim=True)

        # Peak-normalize, guarding against division by zero on silent input
        peak = torch.max(torch.abs(speech_array))
        if peak > 0:
            speech_array = speech_array / peak
        speech_array = speech_array.squeeze().numpy()

        inputs = processor(speech_array, sampling_rate=16000, return_tensors='pt', padding=True)
        input_values = inputs.input_values.to(device)

        with torch.no_grad():
            outputs = model(input_values)
            logits = outputs.logits
            probs = F.softmax(logits, dim=-1)[0].cpu().numpy()

        # Report percentages, highest-confidence emotion first
        confidence_scores = {
            emotion: round(float(prob) * 100, 2)
            for emotion, prob in zip(emotion_labels, probs)
        }
        sorted_scores = dict(sorted(
            confidence_scores.items(),
            key=lambda x: x[1],
            reverse=True
        ))

        return sorted_scores

    except Exception as e:
        return {
            "Error": str(e),
            **{emotion: 0.0 for emotion in emotion_labels}
        }
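
# A minimal sketch of calling recognize_emotion directly, outside the Gradio UI.
# "example_speech.wav" is a placeholder path, not a file shipped with this script:
#
#   scores = recognize_emotion("example_speech.wav")
#   print(scores)  # e.g. {"happy": 72.31, "neutral": 12.04, ...} -- illustrative values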


interface = gr.Interface(
    fn=recognize_emotion,
    inputs=gr.Audio(
        sources=["microphone", "upload"],
        type="filepath",
        label="Upload audio or record from microphone",
        max_length=60
    ),
    outputs=gr.Label(
        num_top_classes=len(emotion_labels),
        label="Emotion Predictions"
    ),
    title="Speech Emotion Recognition",
    description="""
## Speech Emotion Recognition using Wav2Vec2

This model recognizes emotions from speech audio in the following categories:
- Angry 😠
- Calm 😌
- Disgust 🤢
- Fearful 😨
- Happy 😊
- Neutral 😐
- Sad 😢
- Surprised 😲

### Instructions:
1. Upload an audio file or record through the microphone
2. Wait for processing
3. View the predicted emotions with confidence scores

### Notes:
- Maximum audio length: 1 minute
- Best results with clear speech and minimal background noise
- Confidence scores are shown as percentages
    """
)
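
# Note: share=True below also requests a temporary public *.gradio.live URL in
# addition to the local server; set share=False to serve on the local address only.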


if __name__ == "__main__":
    interface.launch(
        share=True,
        debug=True,
        server_name="0.0.0.0",
        server_port=7860
    )