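# Gradio demo: multimodal emotion recognition from a video clip and its
# transcription, using a pretrained Model_LA checkpoint.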
import re
import glob
import pickle
import os
import torch
import numpy as np
from utils.audio import load_spectrograms
from utils.compute_args import compute_args
from utils.tokenize import sent_to_ix, pad_feature
from model_LA import Model_LA
import gradio as gr

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# load model
ckpts_path = "ckpt"
model_name = "Model_LA_e"
# Collect "best*" checkpoints, reverse-sorted; ckpts[0] is loaded below
ckpts = sorted(glob.glob(os.path.join(ckpts_path, model_name, "best*")), reverse=True)

# Load args and weights from the checkpoint (read it once)
ckpt = torch.load(ckpts[0], map_location=device)
args = compute_args(ckpt["args"])
pretrained_emb = np.load("train_glove.npy")
with open("token_to_ix.pkl", "rb") as f:
    token_to_ix = pickle.load(f)
state_dict = ckpt["state_dict"]

net = Model_LA(args, len(token_to_ix), pretrained_emb).to(device)
net.load_state_dict(state_dict)


def inference(source_video, transcription):
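    """Predict emotions for one video clip and its transcription.

    Returns a dict mapping each emotion label to a boolean prediction.
    """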
    # data preprocessing
    # text
    def clean(w):
        return (
            re.sub(r"([.,'!?\"()*#:;])", "", w.lower())
            .replace("-", " ")
            .replace("/", " ")
        )

    # Lowercase, strip punctuation, and drop tokens that end up empty
    s = [clean(w) for w in transcription.split() if clean(w) != ""]

    # Sound: only the mel spectrogram is used; the magnitude spectrogram is discarded
    _, mel, _ = load_spectrograms(source_video)

    l_max_len = args.lang_seq_len
    a_max_len = args.audio_seq_len
    v_max_len = args.video_seq_len
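    # NOTE: no separate visual features are extracted here; the mel
    # spectrogram is padded to the video sequence length and reused as the
    # visual-stream input.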
    L = sent_to_ix(s, token_to_ix, max_token=l_max_len)
    A = pad_feature(mel, a_max_len)
    V = pad_feature(mel, v_max_len)
    # print shapes
    print(f"Processed text shape from {len(s)} to {L.shape}")
    print(f"Processed audio shape from {mel.shape} to {A.shape}")
    print(f"Processed video shape from {mel.shape} to {V.shape}")

    net.eval()
    x = torch.from_numpy(np.expand_dims(L, axis=0)).to(device)
    y = torch.from_numpy(np.expand_dims(A, axis=0)).to(device)
    z = torch.from_numpy(np.expand_dims(V, axis=0)).float().to(device)
    with torch.no_grad():
        pred = net(x, y, z).cpu().numpy()[0]
    # pred = np.exp(pred) / np.sum(np.exp(pred))  # optional softmax for confidences
    label_to_ix = ["happy", "sad", "angry", "fear", "disgust", "surprise"]
    # result_dict = {label_to_ix[i]: float(pred[i]) for i in range(len(label_to_ix))}  # raw scores
    # Multi-label output: threshold each raw score at zero
    result_dict = {label_to_ix[i]: float(pred[i]) > 0 for i in range(len(label_to_ix))}
    return result_dict
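# Example with one of the bundled demo clips:
#   inference("examples/03bSnISJMiM_1.mp4", "IT WAS REALLY GOOD")
#   -> e.g. {"happy": True, "sad": False, "angry": False, ...}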


title = "Emotion Recognition"
description = "Multimodal emotion recognition from a video clip and its transcription."

examples = [
    [
        "examples/0h-zjBukYpk_2.mp4",
        "NOW IM NOT EVEN GONNA SUGAR COAT THIS THIS MOVIE FRUSTRATED ME TO SUCH AN EXTREME EXTENT THAT I WAS LOUDLY EXCLAIMING WHY AT THE END OF THE FILM",
    ],
    ["examples/0h-zjBukYpk_19.mp4", "NOW OTHER PERFORMANCES ARE BORDERLINE OKAY"],
    ["examples/03bSnISJMiM_1.mp4", "IT WAS REALLY GOOD "],
    ["examples/03bSnISJMiM_5.mp4", "AND THEY SHOULDVE I GUESS "],
]

# Build and launch the demo UI; gr.inputs.* is the legacy (pre-4.x) Gradio API
gr.Interface(
    inference,
    inputs=[gr.inputs.Video(type="avi", source="upload"), "text"],
    outputs=["label"],
    title=title,
    description=description,
    examples=examples,
).launch(debug=True)