import gradio as gr
import torch
import warnings

# Silence non-critical library warnings before loading the models.
warnings.filterwarnings('ignore')

from transformers import pipeline

MODEL_NAME = "openai/whisper-small"
BATCH_SIZE = 8

# transformers pipelines accept a GPU index (0) or the string "cpu" as the device.
device = 0 if torch.cuda.is_available() else "cpu"

# Speech-to-text pipeline. chunk_length_s=30 lets the pipeline handle audio longer than
# Whisper's 30-second window by transcribing it in chunks.
pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    chunk_length_s=30,
    device=device,
)

# Multilingual emotion classifier (XLM-RoBERTa based); return_all_scores=True returns a
# score for every emotion label rather than just the top prediction.
emotion_classifier = pipeline(
    "text-classification",
    model="MilaNLProc/xlm-emo-t",
    return_all_scores=True,
)
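# For a single input string, the classifier returns a list with one element: a list of
# {"label": ..., "score": ...} dicts, one per emotion label (hence the [0] indexing below).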

def transcribe(microphone, file_upload, task):
    output = ""
    if (microphone is not None) and (file_upload is not None):
        # Both inputs were provided: prefer the microphone recording and warn the user.
        output = (
            "WARNING: You've uploaded an audio file and used the microphone. "
            "The recorded file from the microphone will be used and the uploaded audio will be discarded.\n"
        )
    elif (microphone is None) and (file_upload is None):
        raise gr.Error("You have to either use the microphone or upload an audio file")

    file = microphone if microphone is not None else file_upload

    # task is "transcribe" (keep the spoken language) or "translate" (translate to English).
    text = pipe(file, batch_size=BATCH_SIZE, generate_kwargs={"task": task})["text"]

    return output + text

def translate_and_classify(audio):
    # Transcribe the audio in its original language, then classify the emotion of the transcript.
    text_result = transcribe(audio, None, "transcribe")
    emotion_scores = emotion_classifier(text_result)

    # Map each emotion label to its score, e.g. {"joy": 0.91, "anger": 0.04, ...}.
    detected_emotion = {}
    for prediction in emotion_scores[0]:
        detected_emotion[prediction["label"]] = prediction["score"]

    return text_result, detected_emotion

with gr.Blocks() as demo:
    gr.Markdown(
        """ # Emotion Detection from Speech
        ##### Detection of anger, sadness, joy, fear in speech using OpenAI Whisper and XLM-RoBERTa
        """
    )

    with gr.Column():
        with gr.Tab("Record Audio"):
            audio_input_r = gr.Audio(label="Record Audio Input", sources=["microphone"], type="filepath")
            transcribe_audio_r = gr.Button("Transcribe")

        with gr.Tab("Upload Audio as File"):
            audio_input_u = gr.Audio(label="Upload Audio", sources=["upload"], type="filepath")
            transcribe_audio_u = gr.Button("Transcribe")

        with gr.Row():
            transcript_output = gr.Textbox(label="Transcription in the language of speech/audio", lines=3)
            emotion_output = gr.Label(label="Detected Emotion")

    transcribe_audio_r.click(translate_and_classify, inputs=audio_input_r, outputs=[transcript_output, emotion_output])
    transcribe_audio_u.click(translate_and_classify, inputs=audio_input_u, outputs=[transcript_output, emotion_output])
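
# share=True additionally creates a temporary public Gradio link; drop it to serve the app locally only.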
demo.launch(share=True)