import gradio as gr
import torch
import torch.nn.functional as F
from transformers import Wav2Vec2Processor, Wav2Vec2ForSequenceClassification
import torchaudio
import numpy as np

# Emotion classes; recognize_emotion pairs these positionally with the
# model's output probabilities, so the order here matters.
emotion_labels = [
    "angry",
    "calm",
    "disgust",
    "fearful",
    "happy",
    "neutral",
    "sad",
    "surprised",
]

# Run on the GPU when CUDA is available, otherwise on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Pretrained checkpoint identifier shared by the classifier and its processor.
model_name = "Dpngtm/wav2vec2-emotion-recognition"
model = Wav2Vec2ForSequenceClassification.from_pretrained(model_name)
# NOTE(review): num_labels looks like a model-config option rather than a
# processor option -- confirm whether it has any effect here.
processor = Wav2Vec2Processor.from_pretrained(
    model_name,
    num_labels=len(emotion_labels),
)

# Inference only: place the weights on the chosen device and switch to eval mode.
model = model.to(device).eval()

TARGET_SAMPLE_RATE = 16000  # sampling rate handed to the processor
MAX_DURATION_SECONDS = 60  # longest clip accepted, in seconds


def _prepare_waveform(speech_array, sampling_rate):
    """Turn a (channels, samples) tensor into a mono 16 kHz 1-D numpy array.

    The signal is downmixed to one channel, resampled to TARGET_SAMPLE_RATE
    when needed, and scaled so its absolute peak is 1.
    """
    # Downmix before resampling so the resampler only processes one channel.
    if speech_array.shape[0] > 1:
        speech_array = torch.mean(speech_array, dim=0, keepdim=True)

    if sampling_rate != TARGET_SAMPLE_RATE:
        resampler = torchaudio.transforms.Resample(
            orig_freq=sampling_rate,
            new_freq=TARGET_SAMPLE_RATE,
        )
        speech_array = resampler(speech_array)

    # Silent clips have a zero peak; dividing by it would fill the signal
    # with NaN, so leave all-zero input untouched.
    peak = torch.max(torch.abs(speech_array))
    if peak > 0:
        speech_array = speech_array / peak

    # squeeze(0) drops only the channel axis, so even a one-sample clip
    # stays 1-D instead of collapsing to a scalar.
    return speech_array.squeeze(0).numpy()


def recognize_emotion(audio):
    """Classify the emotion expressed in an audio clip.

    Args:
        audio: Path to an audio file, or an object whose ``name`` attribute
            is such a path. ``None`` means no audio was provided.

    Returns:
        A dict mapping each entry of ``emotion_labels`` to its softmax
        probability (a float in [0, 1], rounded to 4 decimals), ordered from
        most to least likely. All values are 0.0 when ``audio`` is ``None``.
        Probabilities are returned as fractions because ``gr.Label`` renders
        confidences as percentages itself.

    Raises:
        gr.Error: If the clip is empty, longer than MAX_DURATION_SECONDS, or
            cannot be loaded or processed. The output component only accepts
            numeric confidences, so failures are reported through Gradio's
            error mechanism instead of being mixed into the result dict.
    """
    if audio is None:
        return {emotion: 0.0 for emotion in emotion_labels}

    try:
        audio_path = audio if isinstance(audio, str) else audio.name
        speech_array, sampling_rate = torchaudio.load(audio_path)

        num_samples = speech_array.shape[1]
        if num_samples == 0:
            raise gr.Error("Audio is empty")

        if num_samples / sampling_rate > MAX_DURATION_SECONDS:
            raise gr.Error("Audio too long (max 1 minute)")

        waveform = _prepare_waveform(speech_array, sampling_rate)
        inputs = processor(
            waveform,
            sampling_rate=TARGET_SAMPLE_RATE,
            return_tensors='pt',
            padding=True,
        )
        input_values = inputs.input_values.to(device)

        # Gradients are not needed for inference.
        with torch.no_grad():
            logits = model(input_values).logits
        probs = F.softmax(logits, dim=-1)[0].cpu().numpy()
    except gr.Error:
        # Already a user-facing error; pass it through unchanged.
        raise
    except Exception as e:
        raise gr.Error(str(e)) from e

    confidence_scores = {
        emotion: round(float(prob), 4)
        for emotion, prob in zip(emotion_labels, probs)
    }
    return dict(sorted(
        confidence_scores.items(),
        key=lambda item: item[1],
        reverse=True,
    ))

# Markdown shown under the title of the demo page.
_APP_DESCRIPTION = """
    ## Speech Emotion Recognition using Wav2Vec2
    
    This model recognizes emotions from speech audio in the following categories:
    - Angry 😠
    - Calm 😌
    - Disgust 🤢
    - Fearful 😨
    - Happy 😊
    - Neutral 😐
    - Sad 😢
    - Surprised 😲
    
    ### Instructions:
    1. Upload an audio file or record through the microphone
    2. Wait for processing
    3. View predicted emotions with confidence scores
    
    ### Notes:
    - Maximum audio length: 1 minute
    - Best results with clear speech and minimal background noise
    - Confidence scores are shown as percentages
    """

# Input widget: microphone recording or file upload, passed to the callback
# as a file path.
_audio_input = gr.Audio(
    sources=["microphone", "upload"],
    type="filepath",
    label="Upload audio or record from microphone",
    max_length=60,
)

# Output widget: one row per emotion class.
_label_output = gr.Label(
    num_top_classes=len(emotion_labels),
    label="Emotion Predictions",
)

# Wire the classifier callback to its input and output widgets.
interface = gr.Interface(
    fn=recognize_emotion,
    inputs=_audio_input,
    outputs=_label_output,
    title="Speech Emotion Recognition",
    description=_APP_DESCRIPTION,
)

if __name__ == "__main__":
    # Launch settings for running this file as a script: sharing and debug
    # mode enabled, served on 0.0.0.0 port 7860.
    launch_options = {
        "share": True,
        "debug": True,
        "server_name": "0.0.0.0",
        "server_port": 7860,
    }
    interface.launch(**launch_options)