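# Gradio demo: multimodal emotion recognition from a video clip and its
# transcription, using a pretrained Model_LA checkpoint.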
import re
import glob
import pickle
import os
import torch
import numpy as np
from utils.audio import load_spectrograms
from utils.compute_args import compute_args
from utils.tokenize import sent_to_ix, pad_feature
from model_LA import Model_LA
import gradio as gr

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# load model
ckpts_path = "ckpt"
model_name = "Model_LA_e"
# Collect "best*" checkpoints, reverse-sorted; ckpts[0] is loaded below
ckpts = sorted(glob.glob(os.path.join(ckpts_path, model_name, "best*")), reverse=True)

# Load args and weights from the checkpoint (read it once)
ckpt = torch.load(ckpts[0], map_location=device)
args = compute_args(ckpt["args"])
pretrained_emb = np.load("train_glove.npy")
with open("token_to_ix.pkl", "rb") as f:
    token_to_ix = pickle.load(f)
state_dict = ckpt["state_dict"]

net = Model_LA(args, len(token_to_ix), pretrained_emb).to(device)
net.load_state_dict(state_dict)


def inference(source_video, transcription):
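    """Predict emotions for one video clip and its transcription.

    Returns a dict mapping each emotion label to a boolean prediction.
    """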
    # data preprocessing
    # text
    def clean(w):
        return (
            re.sub(r"([.,'!?\"()*#:;])", "", w.lower())
            .replace("-", " ")
            .replace("/", " ")
        )

    # Lowercase, strip punctuation, and drop tokens that end up empty
    s = [clean(w) for w in transcription.split() if clean(w) != ""]

    # Sound: only the mel spectrogram is used; the magnitude spectrogram is discarded
    _, mel, _ = load_spectrograms(source_video)

    l_max_len = args.lang_seq_len
    a_max_len = args.audio_seq_len
    v_max_len = args.video_seq_len
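    # NOTE: no separate visual features are extracted here; the mel
    # spectrogram is padded to the video sequence length and reused as the
    # visual-stream input.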
    L = sent_to_ix(s, token_to_ix, max_token=l_max_len)
    A = pad_feature(mel, a_max_len)
    V = pad_feature(mel, v_max_len)
    # print shapes
    print(f"Processed text shape from {len(s)} to {L.shape}")
    print(f"Processed audio shape from {mel.shape} to {A.shape}")
    print(f"Processed video shape from {mel.shape} to {V.shape}")

    net.eval()
    x = torch.from_numpy(np.expand_dims(L, axis=0)).to(device)
    y = torch.from_numpy(np.expand_dims(A, axis=0)).to(device)
    z = torch.from_numpy(np.expand_dims(V, axis=0)).float().to(device)
    with torch.no_grad():
        pred = net(x, y, z).cpu().numpy()[0]
    # pred = np.exp(pred) / np.sum(np.exp(pred))  # optional softmax for confidences
    label_to_ix = ["happy", "sad", "angry", "fear", "disgust", "surprise"]
    # result_dict = {label_to_ix[i]: float(pred[i]) for i in range(len(label_to_ix))}  # raw scores
    # Multi-label output: threshold each raw score at zero
    result_dict = {label_to_ix[i]: float(pred[i]) > 0 for i in range(len(label_to_ix))}
    return result_dict
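# Example with one of the bundled demo clips:
#   inference("examples/03bSnISJMiM_1.mp4", "IT WAS REALLY GOOD")
#   -> e.g. {"happy": True, "sad": False, "angry": False, ...}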


title = "Emotion Recognition"
description = "Multimodal emotion recognition from a video clip and its transcription."

examples = [
    [
        "examples/0h-zjBukYpk_2.mp4",
        "NOW IM NOT EVEN GONNA SUGAR COAT THIS THIS MOVIE FRUSTRATED ME TO SUCH AN EXTREME EXTENT THAT I WAS LOUDLY EXCLAIMING WHY AT THE END OF THE FILM",
    ],
    ["examples/0h-zjBukYpk_19.mp4", "NOW OTHER PERFORMANCES ARE BORDERLINE OKAY"],
    ["examples/03bSnISJMiM_1.mp4", "IT WAS REALLY GOOD "],
    ["examples/03bSnISJMiM_5.mp4", "AND THEY SHOULDVE I GUESS "],
]

# Build and launch the demo UI; gr.inputs.* is the legacy (pre-4.x) Gradio API
gr.Interface(
    inference,
    inputs=[gr.inputs.Video(type="avi", source="upload"), "text"],
    outputs=["label"],
    title=title,
    description=description,
    examples=examples,
).launch(debug=True)