import gradio as gr
import torch
import warnings
import librosa
import numpy as np
from transformers import pipeline

warnings.filterwarnings("ignore")

MODEL_NAME = "openai/whisper-small"
BATCH_SIZE = 8
device = 0 if torch.cuda.is_available() else "cpu"

# Whisper pipeline for speech-to-text transcription
pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    chunk_length_s=30,
    device=device,
)

# Text-classification pipeline for emotion detection on the transcript
emotion_classifier = pipeline(
    "text-classification",
    model="MilaNLProc/xlm-emo-t",
    return_all_scores=True,
)


# Extract prosodic features (pitch, intensity, loudness) using librosa
def extract_audio_features(audio_file):
    y, sr = librosa.load(audio_file)

    # Pitch (fundamental frequency): mean over bins where a pitch was detected
    pitches, magnitudes = librosa.piptrack(y=y, sr=sr)
    voiced = pitches[magnitudes > 0]
    pitch = float(np.mean(voiced)) if voiced.size > 0 else 0.0

    # Intensity: mean RMS energy
    rms = float(np.mean(librosa.feature.rms(y=y)))

    # Loudness: perceptually (A-)weighted power spectrogram, averaged in dB
    S = np.abs(librosa.stft(y)) ** 2
    loudness = float(
        np.mean(librosa.perceptual_weighting(S, librosa.fft_frequencies(sr=sr)))
    )

    return {"pitch": pitch, "rms": rms, "loudness": loudness}


# Transcribe the audio and classify emotions (dual pipeline: text + prosody)
def translate_and_classify(audio):
    # Step 1: Transcribe audio to text using Whisper
    text_result = pipe(audio, batch_size=BATCH_SIZE)["text"]

    # Step 2: Extract prosodic features from the raw audio using librosa
    prosodic_features = extract_audio_features(audio)

    # Step 3: Run the emotion classifier on the transcribed text
    emotion = emotion_classifier(text_result)
    detected_emotion = {item["label"]: item["score"] for item in emotion[0]}

    return text_result, detected_emotion, prosodic_features


# Gradio UI
with gr.Blocks() as demo:
    gr.Markdown(
        """# Emotion Detection from Speech
##### Detection of anger, sadness, joy, and fear in speech using OpenAI Whisper, XLM-RoBERTa, and prosodic features (pitch, loudness, intensity)"""
    )

    with gr.Column():
        with gr.Tab("Record Audio"):
            audio_input_r = gr.Audio(
                label="Record Audio Input", sources=["microphone"], type="filepath"
            )
            transcribe_audio_r = gr.Button("Transcribe")

        with gr.Tab("Upload Audio as File"):
            audio_input_u = gr.Audio(
                label="Upload Audio", sources=["upload"], type="filepath"
            )
            transcribe_audio_u = gr.Button("Transcribe")

        with gr.Row():
            transcript_output = gr.Textbox(label="Transcription", lines=3)
            emotion_output = gr.Label(label="Detected Emotion from Text")
            prosody_output = gr.Label(
                label="Prosodic Features (Pitch, Loudness, Intensity)"
            )

    transcribe_audio_r.click(
        translate_and_classify,
        inputs=audio_input_r,
        outputs=[transcript_output, emotion_output, prosody_output],
    )
    transcribe_audio_u.click(
        translate_and_classify,
        inputs=audio_input_u,
        outputs=[transcript_output, emotion_output, prosody_output],
    )

demo.launch(share=True)
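
# Optional: a minimal smoke test of translate_and_classify without the Gradio UI.
# This is a sketch only; "sample.wav" is a hypothetical placeholder path, not a
# file shipped with this script. It is left commented out so nothing runs besides
# demo.launch() above.
#
# if __name__ == "__main__":
#     transcription, text_emotion, prosody = translate_and_classify("sample.wav")
#     print("Transcription:", transcription)
#     print("Text-based emotion scores:", text_emotion)
#     print("Prosodic features:", prosody)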