import gradio as gr
import torch
import warnings
import os
import librosa
import numpy as np
from transformers import pipeline
warnings.filterwarnings('ignore')
MODEL_NAME = "openai/whisper-small"
BATCH_SIZE = 8
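# Use the first CUDA GPU if available; transformers pipelines accept an integer device index or "cpu"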
device = 0 if torch.cuda.is_available() else "cpu"
# Whisper for transcription
pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    chunk_length_s=30,
    device=device,
)
# Emotion classifier for text-based classification
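# return_all_scores=True returns a score for every emotion label (newer transformers releases use top_k=None for the same behavior)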
emotion_classifier = pipeline("text-classification", model='MilaNLProc/xlm-emo-t', return_all_scores=True)
# Function to extract prosodic features using librosa
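# Returned units: pitch in Hz, rms as mean linear amplitude, loudness in dB after perceptual weighting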
def extract_audio_features(audio_file):
    y, sr = librosa.load(audio_file)

    # Pitch (fundamental frequency): average the pitch-track bins that have non-zero magnitude
    pitches, magnitudes = librosa.piptrack(y=y, sr=sr)
    voiced = pitches[magnitudes > 0]
    pitch = float(np.mean(voiced)) if voiced.size > 0 else 0.0

    # Intensity (RMS energy)
    rms = float(np.mean(librosa.feature.rms(y=y)))

    # Loudness: perceptually weighted (A-weighted by default) power spectrogram, in dB
    S = np.abs(librosa.stft(y)) ** 2
    loudness = float(np.mean(librosa.perceptual_weighting(S, librosa.fft_frequencies(sr=sr))))

    return {
        "pitch": pitch,
        "rms": rms,
        "loudness": loudness,
    }
# Function to transcribe and classify emotions (dual-pipeline)
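# Text emotions come from the XLM-EMO classifier applied to the transcript; prosody (pitch, RMS, loudness) is computed directly from the waveform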
def translate_and_classify(audio):
    # Step 1: Transcribe audio to text using Whisper
    text_result = pipe(audio, batch_size=BATCH_SIZE)["text"]

    # Step 2: Extract prosodic features from the audio using librosa
    prosodic_features = extract_audio_features(audio)

    # Step 3: Use the emotion classifier on the transcribed text
    emotion = emotion_classifier(text_result)
    detected_emotion = {}
    for emotion_item in emotion[0]:
        detected_emotion[emotion_item["label"]] = emotion_item["score"]

    # Combine prosodic features and text-based emotion detection
    combined_result = {
        "transcription": text_result,
        "text_based_emotion": detected_emotion,
        "prosody": prosodic_features,
    }
    return combined_result["transcription"], combined_result["text_based_emotion"], combined_result["prosody"]
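# The three return values map, in order, to the transcription textbox, the emotion label, and the prosody panel in the UI below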
# Gradio UI
with gr.Blocks() as demo:
    gr.Markdown(
        """# Emotion Detection from Speech
##### Detection of anger, sadness, joy, and fear in speech using OpenAI Whisper, XLM-RoBERTa, and prosodic features (pitch, loudness, intensity)
""")

    with gr.Column():
        with gr.Tab("Record Audio"):
            audio_input_r = gr.Audio(label='Record Audio Input', sources=["microphone"], type="filepath")
            transcribe_audio_r = gr.Button('Transcribe')
        with gr.Tab("Upload Audio as File"):
            audio_input_u = gr.Audio(label='Upload Audio', sources=["upload"], type="filepath")
            transcribe_audio_u = gr.Button('Transcribe')
        with gr.Row():
            transcript_output = gr.Textbox(label="Transcription", lines=3)
            emotion_output = gr.Label(label="Detected Emotion from Text")
            prosody_output = gr.JSON(label="Prosodic Features")

    # Hook both buttons to the dual pipeline; the JSON panel surfaces the prosodic features returned by translate_and_classify
    transcribe_audio_r.click(translate_and_classify, inputs=audio_input_r, outputs=[transcript_output, emotion_output, prosody_output])
    transcribe_audio_u.click(translate_and_classify, inputs=audio_input_u, outputs=[transcript_output, emotion_output, prosody_output])

# share=True additionally creates a temporary public gradio.live link when the app is run locally
demo.launch(share=True)