Spaces:

nouamanetazi
/

emotion_recognition

Build error

App Files Files Community

nouamanetazi HF staff committed on Mar 1, 2022

Commit

ff43e05

1 Parent(s): 0f6f21e

initial commit

Browse files

Files changed (17) hide show

.gitattributes +2 -0
.gitignore +166 -0
ckpt/Model_LA_e/best81.21325494388027_1117766.pkl +3 -0
inference.ipynb +0 -0
layers/fc.py +37 -0
layers/layer_norm.py +16 -0
model_LA.py +343 -0
model_LAV.py +367 -0
token_to_ix.pkl +3 -0
train_glove.npy +3 -0
utils/__init__.py +0 -0
utils/audio.py +163 -0
utils/audio_params.py +47 -0
utils/compute_args.py +28 -0
utils/plot.py +13 -0
utils/pred_func.py +9 -0
utils/tokenize.py +103 -0

.gitattributes CHANGED Viewed

@@ -25,3 +25,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zstandard filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zstandard filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,166 @@

+*.tar.gz
+data
+# Initially taken from Github's Python gitignore file
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# tests and logs
+tests/fixtures/cached_*_text.txt
+logs/
+lightning_logs/
+lang_code_data/
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+.python-version
+# celery beat schedule file
+celerybeat-schedule
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# vscode
+.vs
+.vscode
+# Pycharm
+.idea
+# TF code
+tensorflow_code
+# Models
+proc_data
+# examples
+runs
+/runs_old
+/wandb
+/examples/runs
+/examples/**/*.args
+/examples/rag/sweep
+# data
+/data
+serialization_dir
+# emacs
+*.*~
+debug.env
+# vim
+.*.swp
+#ctags
+tags
+# pre-commit
+.pre-commit*
+# .lock
+*.lock

ckpt/Model_LA_e/best81.21325494388027_1117766.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:8c49968f8c2bcd7ec0489bd88c1f41418d15f01932264487c6d088807dcaaf4c
+size 391671429

inference.ipynb ADDED Viewed

The diff for this file is too large to render. See raw diff

layers/fc.py ADDED Viewed

	@@ -0,0 +1,37 @@

+import torch.nn as nn
+class FC(nn.Module):
+    """Linear layer optionally followed by ReLU and dropout."""
+    def __init__(self, in_size, out_size, dropout_r=0., use_relu=True):
+        super().__init__()
+        self.dropout_r = dropout_r
+        self.use_relu = use_relu
+        self.linear = nn.Linear(in_size, out_size)
+        # The optional sub-modules are only created when they are enabled.
+        if use_relu:
+            self.relu = nn.ReLU(inplace=True)
+        if dropout_r > 0:
+            self.dropout = nn.Dropout(dropout_r)
+    def forward(self, x):
+        out = self.linear(x)
+        out = self.relu(out) if self.use_relu else out
+        return self.dropout(out) if self.dropout_r > 0 else out
+class MLP(nn.Module):
+    """Two-layer perceptron: an ``FC`` block followed by a plain linear projection."""
+    def __init__(self, in_size, mid_size, out_size, dropout_r=0., use_relu=True):
+        super().__init__()
+        # ReLU and dropout (when enabled) act on the hidden layer only.
+        self.fc = FC(in_size, mid_size, dropout_r=dropout_r, use_relu=use_relu)
+        self.linear = nn.Linear(mid_size, out_size)
+    def forward(self, x):
+        hidden = self.fc(x)
+        return self.linear(hidden)

layers/layer_norm.py ADDED Viewed

	@@ -0,0 +1,16 @@

+import torch.nn as nn
+import torch
+class LayerNorm(nn.Module):
+    """Normalizes the last dimension with a learnable scale (``a_2``) and shift (``b_2``)."""
+    def __init__(self, size, eps=1e-6):
+        super().__init__()
+        self.eps = eps
+        self.a_2 = nn.Parameter(torch.ones(size))
+        self.b_2 = nn.Parameter(torch.zeros(size))
+    def forward(self, x):
+        # eps is added to the standard deviation (not the variance) before dividing.
+        centered = x - x.mean(-1, keepdim=True)
+        spread = x.std(-1, keepdim=True) + self.eps
+        return self.a_2 * centered / spread + self.b_2

model_LA.py ADDED Viewed

	@@ -0,0 +1,343 @@

+import math
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from layers.fc import MLP, FC
+from layers.layer_norm import LayerNorm
+# ------------------------------------
+# ---------- Masking sequence --------
+# ------------------------------------
+def make_mask(feature):
+    """Return a boolean mask that is True where the absolute values of
+    `feature` sum to zero over the last dimension; singleton dimensions are
+    inserted at positions 1 and 2 of the result."""
+    abs_total = torch.abs(feature).sum(dim=-1)
+    is_zero = abs_total == 0
+    return is_zero.unsqueeze(1).unsqueeze(2)
+# ------------------------------
+# ---------- Flattening --------
+# ------------------------------
+class AttFlat(nn.Module):
+    """Attention pooling: an MLP scores every position and `flat_glimpse`
+    attention-weighted sums of the input are returned, optionally merged by
+    a linear layer into a single vector of size 2 * hidden_size."""
+    def __init__(self, args, flat_glimpse, merge=False):
+        super().__init__()
+        self.args = args
+        self.merge = merge
+        self.flat_glimpse = flat_glimpse
+        self.mlp = MLP(
+            in_size=args.hidden_size,
+            mid_size=args.ff_size,
+            out_size=flat_glimpse,
+            dropout_r=args.dropout_r,
+            use_relu=True,
+        )
+        if self.merge:
+            self.linear_merge = nn.Linear(
+                args.hidden_size * flat_glimpse,
+                args.hidden_size * 2,
+            )
+    def forward(self, x, x_mask):
+        scores = self.mlp(x)
+        if x_mask is not None:
+            # Masked positions get a large negative score before the softmax.
+            position_mask = x_mask.squeeze(1).squeeze(1).unsqueeze(2)
+            scores = scores.masked_fill(position_mask, -1e9)
+        # Softmax runs over dim 1, i.e. across positions for each glimpse.
+        weights = F.softmax(scores, dim=1)
+        glimpses = [
+            torch.sum(weights[:, :, g: g + 1] * x, dim=1)
+            for g in range(self.flat_glimpse)
+        ]
+        if not self.merge:
+            return torch.stack(glimpses).transpose_(0, 1)
+        return self.linear_merge(torch.cat(glimpses, dim=1))
+# ------------------------
+# ---- Self Attention ----
+# ------------------------
+class SA(nn.Module):
+    """Self-attention layer: multi-head attention followed by a feed-forward
+    net, each wrapped in dropout, a residual addition and layer norm."""
+    def __init__(self, args):
+        super().__init__()
+        self.mhatt = MHAtt(args)
+        self.ffn = FFN(args)
+        self.dropout1 = nn.Dropout(args.dropout_r)
+        self.norm1 = LayerNorm(args.hidden_size)
+        self.dropout2 = nn.Dropout(args.dropout_r)
+        self.norm2 = LayerNorm(args.hidden_size)
+    def forward(self, y, y_mask):
+        # y serves as value, key and query.
+        attended = self.mhatt(y, y, y, y_mask)
+        y = self.norm1(y + self.dropout1(attended))
+        transformed = self.ffn(y)
+        return self.norm2(y + self.dropout2(transformed))
+# -------------------------------
+# ---- Self Guided Attention ----
+# -------------------------------
+class SGA(nn.Module):
+    """Guided-attention layer: `x` attends to itself, then queries `y`
+    (keys and values come from `y`), then passes through a feed-forward net.
+    Every step is wrapped in dropout, a residual addition and layer norm."""
+    def __init__(self, args):
+        super().__init__()
+        self.mhatt1 = MHAtt(args)
+        self.mhatt2 = MHAtt(args)
+        self.ffn = FFN(args)
+        self.dropout1 = nn.Dropout(args.dropout_r)
+        self.norm1 = LayerNorm(args.hidden_size)
+        self.dropout2 = nn.Dropout(args.dropout_r)
+        self.norm2 = LayerNorm(args.hidden_size)
+        self.dropout3 = nn.Dropout(args.dropout_r)
+        self.norm3 = LayerNorm(args.hidden_size)
+    def forward(self, x, y, x_mask, y_mask):
+        self_attended = self.mhatt1(v=x, k=x, q=x, mask=x_mask)
+        x = self.norm1(x + self.dropout1(self_attended))
+        cross_attended = self.mhatt2(v=y, k=y, q=x, mask=y_mask)
+        x = self.norm2(x + self.dropout2(cross_attended))
+        transformed = self.ffn(x)
+        return self.norm3(x + self.dropout3(transformed))
+# ------------------------------
+# ---- Multi-Head Attention ----
+# ------------------------------
+class MHAtt(nn.Module):
+    """Multi-head scaled dot-product attention with `args.multi_head` heads."""
+    def __init__(self, args):
+        super().__init__()
+        self.args = args
+        self.linear_v = nn.Linear(args.hidden_size, args.hidden_size)
+        self.linear_k = nn.Linear(args.hidden_size, args.hidden_size)
+        self.linear_q = nn.Linear(args.hidden_size, args.hidden_size)
+        self.linear_merge = nn.Linear(args.hidden_size, args.hidden_size)
+        self.dropout = nn.Dropout(args.dropout_r)
+    def _split_heads(self, projected, n_batches):
+        # View as (n_batches, -1, heads, head_dim), then swap dims 1 and 2.
+        heads = self.args.multi_head
+        head_dim = int(self.args.hidden_size / heads)
+        return projected.view(n_batches, -1, heads, head_dim).transpose(1, 2)
+    def forward(self, v, k, q, mask):
+        n_batches = q.size(0)
+        v = self._split_heads(self.linear_v(v), n_batches)
+        k = self._split_heads(self.linear_k(k), n_batches)
+        q = self._split_heads(self.linear_q(q), n_batches)
+        per_head = self.att(v, k, q, mask)
+        # Undo the head split so the last dimension is hidden_size again.
+        merged = per_head.transpose(1, 2).contiguous().view(
+            n_batches, -1, self.args.hidden_size
+        )
+        return self.linear_merge(merged)
+    def att(self, value, key, query, mask):
+        scale = math.sqrt(query.size(-1))
+        scores = torch.matmul(query, key.transpose(-2, -1)) / scale
+        if mask is not None:
+            # True entries of the mask are pushed to a large negative score.
+            scores = scores.masked_fill(mask, -1e9)
+        att_map = self.dropout(F.softmax(scores, dim=-1))
+        return torch.matmul(att_map, value)
+# ---------------------------
+# ---- Feed Forward Nets ----
+# ---------------------------
+class FFN(nn.Module):
+    """Position-wise feed-forward net: hidden_size -> ff_size -> hidden_size."""
+    def __init__(self, args):
+        super().__init__()
+        self.mlp = MLP(
+            in_size=args.hidden_size,
+            mid_size=args.ff_size,
+            out_size=args.hidden_size,
+            dropout_r=args.dropout_r,
+            use_relu=True,
+        )
+    def forward(self, x):
+        out = self.mlp(x)
+        return out
+# ---------------------------
+# ---- FF + norm  -----------
+# ---------------------------
+class FFAndNorm(nn.Module):
+    """Layer norm, then a feed-forward net with dropout, a residual addition
+    and a second layer norm."""
+    def __init__(self, args):
+        super().__init__()
+        self.ffn = FFN(args)
+        self.norm1 = LayerNorm(args.hidden_size)
+        self.dropout2 = nn.Dropout(args.dropout_r)
+        self.norm2 = LayerNorm(args.hidden_size)
+    def forward(self, x):
+        normed = self.norm1(x)
+        residual = self.dropout2(self.ffn(normed))
+        return self.norm2(normed + residual)
+class Block(nn.Module):
+    """One language/audio encoder block.
+    `x` goes through self-attention (`sa1`) and `y` through attention guided
+    by `x` (`sa3`), each with a residual addition. Every block except the
+    last then adds an attention-pooled copy of each stream and normalizes it.
+    """
+    def __init__(self, args, i):
+        super().__init__()
+        self.args = args
+        self.sa1 = SA(args)
+        self.sa3 = SGA(args)
+        self.last = (i == args.layer-1)
+        if self.last:
+            # The final block has no pooling / normalization sub-modules.
+            return
+        self.att_lang = AttFlat(args, args.lang_seq_len, merge=False)
+        self.att_audio = AttFlat(args, args.audio_seq_len, merge=False)
+        self.norm_l = LayerNorm(args.hidden_size)
+        self.norm_i = LayerNorm(args.hidden_size)
+        self.dropout = nn.Dropout(args.dropout_r)
+    def forward(self, x, x_mask, y, y_mask):
+        # Both attention outputs are computed from the incoming x.
+        attended_x = self.sa1(x, x_mask)
+        attended_y = self.sa3(y, x, y_mask, x_mask)
+        x, y = attended_x + x, attended_y + y
+        if not self.last:
+            pooled_x = self.att_lang(x, x_mask)
+            pooled_y = self.att_audio(y, y_mask)
+            x = self.norm_l(x + self.dropout(pooled_x))
+            y = self.norm_i(y + self.dropout(pooled_y))
+        return x, y
+class Model_LA(nn.Module):
+    """Language + audio classifier.
+    Word ids are embedded (weights copied from `pretrained_emb`) and run
+    through an LSTM; audio features are projected by a linear adapter. Both
+    streams pass through `args.layer` encoder blocks, are attention-pooled,
+    summed, normalized and projected to `args.ans_size` logits.
+    """
+    def __init__(self, args, vocab_size, pretrained_emb):
+        super(Model_LA, self).__init__()
+        self.args = args
+        # Word embedding
+        self.embedding = nn.Embedding(
+            num_embeddings=vocab_size,
+            embedding_dim=args.word_embed_size
+        )
+        # Loading the GloVe embedding weights (pretrained_emb is a numpy array)
+        self.embedding.weight.data.copy_(torch.from_numpy(pretrained_emb))
+        # LSTM over the embedded words
+        self.lstm_x = nn.LSTM(
+            input_size=args.word_embed_size,
+            hidden_size=args.hidden_size,
+            num_layers=1,
+            batch_first=True
+        )
+        # Feature size to hid size
+        self.adapter = nn.Linear(args.audio_feat_size, args.hidden_size)
+        # Encoder blocks
+        self.enc_list = nn.ModuleList([Block(args, i) for i in range(args.layer)])
+        # Flattening features before proj
+        self.attflat_img  = AttFlat(args, 1, merge=True)
+        self.attflat_lang = AttFlat(args, 1, merge=True)
+        # Classification layers
+        self.proj_norm = LayerNorm(2 * args.hidden_size)
+        self.proj = nn.Linear(2 * args.hidden_size, args.ans_size)
+    def forward(self, x, y, _):
+        """Return the logits for word ids `x` and audio features `y`.
+        The third positional argument is accepted but ignored.
+        """
+        # Masks are derived from the raw inputs, before embedding / projection.
+        x_mask = make_mask(x.unsqueeze(2))
+        y_mask = make_mask(y)
+        x, _lstm_state = self.lstm_x(self.embedding(x))
+        y = self.adapter(y)
+        for i, enc in enumerate(self.enc_list):
+            # Only the first block receives the masks; later blocks get None.
+            x_m, y_m = (x_mask, y_mask) if i == 0 else (None, None)
+            x, y = enc(x, x_m, y, y_m)
+        x = self.attflat_lang(x, None)
+        y = self.attflat_img(y, None)
+        # Classification layers
+        proj_feat = self.proj_norm(x + y)
+        return self.proj(proj_feat)

model_LAV.py ADDED Viewed

	@@ -0,0 +1,367 @@

+import math
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from layers.fc import MLP
+from layers.layer_norm import LayerNorm
+# ------------------------------------
+# ---------- Masking sequence --------
+# ------------------------------------
+def make_mask(feature):
+    """Return a boolean mask that is True where the absolute values of
+    `feature` sum to zero over the last dimension; singleton dimensions are
+    inserted at positions 1 and 2 of the result."""
+    abs_total = torch.abs(feature).sum(dim=-1)
+    is_zero = abs_total == 0
+    return is_zero.unsqueeze(1).unsqueeze(2)
+# ------------------------------
+# ---------- Flattening --------
+# ------------------------------
+class AttFlat(nn.Module):
+    """Attention pooling: an MLP scores every position and `flat_glimpse`
+    attention-weighted sums of the input are returned, optionally merged by
+    a linear layer into a single vector of size 2 * hidden_size."""
+    def __init__(self, args, flat_glimpse, merge=False):
+        super().__init__()
+        self.args = args
+        self.merge = merge
+        self.flat_glimpse = flat_glimpse
+        self.mlp = MLP(
+            in_size=args.hidden_size,
+            mid_size=args.ff_size,
+            out_size=flat_glimpse,
+            dropout_r=args.dropout_r,
+            use_relu=True,
+        )
+        if self.merge:
+            self.linear_merge = nn.Linear(
+                args.hidden_size * flat_glimpse,
+                args.hidden_size * 2,
+            )
+    def forward(self, x, x_mask):
+        scores = self.mlp(x)
+        if x_mask is not None:
+            # Masked positions get a large negative score before the softmax.
+            position_mask = x_mask.squeeze(1).squeeze(1).unsqueeze(2)
+            scores = scores.masked_fill(position_mask, -1e9)
+        # Softmax runs over dim 1, i.e. across positions for each glimpse.
+        weights = F.softmax(scores, dim=1)
+        glimpses = [
+            torch.sum(weights[:, :, g: g + 1] * x, dim=1)
+            for g in range(self.flat_glimpse)
+        ]
+        if not self.merge:
+            return torch.stack(glimpses).transpose_(0, 1)
+        return self.linear_merge(torch.cat(glimpses, dim=1))
+# ------------------------
+# ---- Self Attention ----
+# ------------------------
+class SA(nn.Module):
+    """Self-attention layer: multi-head attention followed by a feed-forward
+    net, each wrapped in dropout, a residual addition and layer norm."""
+    def __init__(self, args):
+        super().__init__()
+        self.mhatt = MHAtt(args)
+        self.ffn = FFN(args)
+        self.dropout1 = nn.Dropout(args.dropout_r)
+        self.norm1 = LayerNorm(args.hidden_size)
+        self.dropout2 = nn.Dropout(args.dropout_r)
+        self.norm2 = LayerNorm(args.hidden_size)
+    def forward(self, y, y_mask):
+        # y serves as value, key and query.
+        attended = self.mhatt(y, y, y, y_mask)
+        y = self.norm1(y + self.dropout1(attended))
+        transformed = self.ffn(y)
+        return self.norm2(y + self.dropout2(transformed))
+# -------------------------------
+# ---- Self Guided Attention ----
+# -------------------------------
+class SGA(nn.Module):
+    """Guided-attention layer: `x` attends to itself, then queries `y`
+    (keys and values come from `y`), then passes through a feed-forward net.
+    Every step is wrapped in dropout, a residual addition and layer norm."""
+    def __init__(self, args):
+        super().__init__()
+        self.mhatt1 = MHAtt(args)
+        self.mhatt2 = MHAtt(args)
+        self.ffn = FFN(args)
+        self.dropout1 = nn.Dropout(args.dropout_r)
+        self.norm1 = LayerNorm(args.hidden_size)
+        self.dropout2 = nn.Dropout(args.dropout_r)
+        self.norm2 = LayerNorm(args.hidden_size)
+        self.dropout3 = nn.Dropout(args.dropout_r)
+        self.norm3 = LayerNorm(args.hidden_size)
+    def forward(self, x, y, x_mask, y_mask):
+        self_attended = self.mhatt1(v=x, k=x, q=x, mask=x_mask)
+        x = self.norm1(x + self.dropout1(self_attended))
+        cross_attended = self.mhatt2(v=y, k=y, q=x, mask=y_mask)
+        x = self.norm2(x + self.dropout2(cross_attended))
+        transformed = self.ffn(x)
+        return self.norm3(x + self.dropout3(transformed))
+# ------------------------------
+# ---- Multi-Head Attention ----
+# ------------------------------
+class MHAtt(nn.Module):
+    """Multi-head scaled dot-product attention with `args.multi_head` heads."""
+    def __init__(self, args):
+        super().__init__()
+        self.args = args
+        self.linear_v = nn.Linear(args.hidden_size, args.hidden_size)
+        self.linear_k = nn.Linear(args.hidden_size, args.hidden_size)
+        self.linear_q = nn.Linear(args.hidden_size, args.hidden_size)
+        self.linear_merge = nn.Linear(args.hidden_size, args.hidden_size)
+        self.dropout = nn.Dropout(args.dropout_r)
+    def _split_heads(self, projected, n_batches):
+        # View as (n_batches, -1, heads, head_dim), then swap dims 1 and 2.
+        heads = self.args.multi_head
+        head_dim = int(self.args.hidden_size / heads)
+        return projected.view(n_batches, -1, heads, head_dim).transpose(1, 2)
+    def forward(self, v, k, q, mask):
+        n_batches = q.size(0)
+        v = self._split_heads(self.linear_v(v), n_batches)
+        k = self._split_heads(self.linear_k(k), n_batches)
+        q = self._split_heads(self.linear_q(q), n_batches)
+        per_head = self.att(v, k, q, mask)
+        # Undo the head split so the last dimension is hidden_size again.
+        merged = per_head.transpose(1, 2).contiguous().view(
+            n_batches, -1, self.args.hidden_size
+        )
+        return self.linear_merge(merged)
+    def att(self, value, key, query, mask):
+        scale = math.sqrt(query.size(-1))
+        scores = torch.matmul(query, key.transpose(-2, -1)) / scale
+        if mask is not None:
+            # True entries of the mask are pushed to a large negative score.
+            scores = scores.masked_fill(mask, -1e9)
+        att_map = self.dropout(F.softmax(scores, dim=-1))
+        return torch.matmul(att_map, value)
+# ---------------------------
+# ---- Feed Forward Nets ----
+# ---------------------------
+class FFN(nn.Module):
+    """Position-wise feed-forward net: hidden_size -> ff_size -> hidden_size."""
+    def __init__(self, args):
+        super().__init__()
+        self.mlp = MLP(
+            in_size=args.hidden_size,
+            mid_size=args.ff_size,
+            out_size=args.hidden_size,
+            dropout_r=args.dropout_r,
+            use_relu=True,
+        )
+    def forward(self, x):
+        out = self.mlp(x)
+        return out
+# ---------------------------
+# ---- FF + norm  -----------
+# ---------------------------
+class FFAndNorm(nn.Module):
+    """Layer norm, then a feed-forward net with dropout, a residual addition
+    and a second layer norm."""
+    def __init__(self, args):
+        super().__init__()
+        self.ffn = FFN(args)
+        self.norm1 = LayerNorm(args.hidden_size)
+        self.dropout2 = nn.Dropout(args.dropout_r)
+        self.norm2 = LayerNorm(args.hidden_size)
+    def forward(self, x):
+        normed = self.norm1(x)
+        residual = self.dropout2(self.ffn(normed))
+        return self.norm2(normed + residual)
+class Block(nn.Module):
+    """One language/audio/video encoder block.
+    `x` goes through self-attention (`sa1`); `y` and `z` go through attention
+    guided by `x` (`sa2`, `sa3`); each result is added back to its input.
+    Every block except the last then adds an attention-pooled copy of each
+    stream and normalizes it.
+    """
+    def __init__(self, args, i):
+        super(Block, self).__init__()
+        self.args = args
+        self.sa1 = SA(args)
+        self.sa2 = SGA(args)
+        self.sa3 = SGA(args)
+        self.last = (i == args.layer-1)
+        if not self.last:
+            self.att_lang = AttFlat(args, args.lang_seq_len, merge=False)
+            self.att_audio = AttFlat(args, args.audio_seq_len, merge=False)
+            self.att_vid = AttFlat(args, args.video_seq_len, merge=False)
+            self.norm_l = LayerNorm(args.hidden_size)
+            self.norm_a = LayerNorm(args.hidden_size)
+            self.norm_v = LayerNorm(args.hidden_size)
+            self.dropout = nn.Dropout(args.dropout_r)
+    def forward(self, x, x_mask, y, y_mask, z, z_mask):
+        # All three attention outputs are computed from the incoming x.
+        ax = self.sa1(x, x_mask)
+        ay = self.sa2(y, x, y_mask, x_mask)
+        az = self.sa3(z, x, z_mask, x_mask)
+        x = ax + x
+        y = ay + y
+        z = az + z
+        if self.last:
+            return x, y, z
+        ax = self.att_lang(x, x_mask)
+        ay = self.att_audio(y, y_mask)
+        # Pool the video stream with its own mask (the audio mask was used here before).
+        az = self.att_vid(z, z_mask)
+        return self.norm_l(x + self.dropout(ax)), \
+               self.norm_a(y + self.dropout(ay)), \
+               self.norm_v(z + self.dropout(az))
+class Model_LAV(nn.Module):
+    """Language + audio + video classifier.
+    Word ids are embedded (weights copied from `pretrained_emb`) and run
+    through an LSTM; audio and video features are projected by linear
+    adapters. The three streams pass through `args.layer` encoder blocks, are
+    attention-pooled, summed, normalized and projected to the class logits.
+    Raises:
+      ValueError: if `args.task` is neither "sentiment" nor "emotion".
+    """
+    def __init__(self, args, vocab_size, pretrained_emb):
+        super(Model_LAV, self).__init__()
+        self.args = args
+        # Word embedding
+        self.embedding = nn.Embedding(
+            num_embeddings=vocab_size,
+            embedding_dim=args.word_embed_size
+        )
+        # Loading the GloVe embedding weights (pretrained_emb is a numpy array)
+        self.embedding.weight.data.copy_(torch.from_numpy(pretrained_emb))
+        # LSTM over the embedded words
+        self.lstm_x = nn.LSTM(
+            input_size=args.word_embed_size,
+            hidden_size=args.hidden_size,
+            num_layers=1,
+            batch_first=True
+        )
+        # Feature size to hid size
+        self.adapter_y = nn.Linear(args.audio_feat_size, args.hidden_size)
+        self.adapter_z = nn.Linear(args.video_feat_size, args.hidden_size)
+        # Encoder blocks
+        self.enc_list = nn.ModuleList([Block(args, i) for i in range(args.layer)])
+        # Flattening features before proj
+        self.attflat_ac   = AttFlat(args, 1, merge=True)
+        self.attflat_vid  = AttFlat(args, 1, merge=True)
+        self.attflat_lang = AttFlat(args, 1, merge=True)
+        # Classification layers
+        self.proj_norm = LayerNorm(2 * args.hidden_size)
+        if self.args.task == "sentiment":
+            n_classes = 2 if self.args.task_binary else 7
+        elif self.args.task == "emotion":
+            n_classes = 6
+        else:
+            # Without this, self.proj would be missing and forward() would fail later.
+            raise ValueError("Unsupported task: {!r}".format(self.args.task))
+        self.proj = nn.Linear(2 * args.hidden_size, n_classes)
+    def forward(self, x, y, z):
+        """Return the logits for word ids `x`, audio features `y` and video features `z`."""
+        # Masks are derived from the raw inputs, before embedding / projection.
+        x_mask = make_mask(x.unsqueeze(2))
+        y_mask = make_mask(y)
+        z_mask = make_mask(z)
+        x, _ = self.lstm_x(self.embedding(x))
+        y, z = self.adapter_y(y), self.adapter_z(z)
+        for i, enc in enumerate(self.enc_list):
+            # Only the first block receives the masks; later blocks get None.
+            x_m, y_m, z_m = (x_mask, y_mask, z_mask) if i == 0 else (None, None, None)
+            x, y, z = enc(x, x_m, y, y_m, z, z_m)
+        x = self.attflat_lang(x, None)
+        y = self.attflat_ac(y, None)
+        z = self.attflat_vid(z, None)
+        # Classification layers
+        proj_feat = self.proj_norm(x + y + z)
+        return self.proj(proj_feat)

token_to_ix.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e1b468b2048c2ac08aaae32ba38c69fc9535af97bf7946e39ba4888794a8574d
+size 286216

train_glove.npy ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c73c457f2e7d047538488d411bcc851ae45b53cf3526482c5b0f6d4b745ebd55
+size 17012528

utils/__init__.py ADDED Viewed

File without changes

utils/audio.py ADDED Viewed

	@@ -0,0 +1,163 @@

+# -*- coding: utf-8 -*-
+#/usr/bin/python2
+'''
+By kyubyong park. kbpark.linguist@gmail.com.
+https://www.github.com/kyubyong/dc_tts
+'''
+from __future__ import print_function, division
+import numpy as np
+import librosa
+import os, copy
+import matplotlib
+matplotlib.use('pdf')
+import matplotlib.pyplot as plt
+from scipy import signal
+from .audio_params import Hyperparams as hp
+import tensorflow as tf
+def get_spectrograms(fpath):
+    '''Parse the wave file in `fpath` and
+    Returns normalized melspectrogram and linear spectrogram.
+    Args:
+      fpath: A string. The full path of a sound file.
+    Returns:
+      mel: A 2d array of shape (T, n_mels) and dtype of float32.
+      mag: A 2d array of shape (T, 1+n_fft/2) and dtype of float32.
+    '''
+    # Loading sound file
+    y, sr = librosa.load(fpath, sr=hp.sr)
+    # Trimming
+    y, _ = librosa.effects.trim(y)
+    # Preemphasis
+    y = np.append(y[0], y[1:] - hp.preemphasis * y[:-1])
+    # stft
+    linear = librosa.stft(y=y,
+                          n_fft=hp.n_fft,
+                          hop_length=hp.hop_length,
+                          win_length=hp.win_length)
+    # magnitude spectrogram
+    mag = np.abs(linear)  # (1+n_fft//2, T)
+    # mel spectrogram
+    # Keyword arguments: newer librosa releases no longer accept these positionally.
+    mel_basis = librosa.filters.mel(sr=hp.sr, n_fft=hp.n_fft, n_mels=hp.n_mels)  # (n_mels, 1+n_fft//2)
+    mel = np.dot(mel_basis, mag)  # (n_mels, t)
+    # to decibel (floored at 1e-5 to avoid log of zero)
+    mel = 20 * np.log10(np.maximum(1e-5, mel))
+    mag = 20 * np.log10(np.maximum(1e-5, mag))
+    # normalize
+    mel = np.clip((mel - hp.ref_db + hp.max_db) / hp.max_db, 1e-8, 1)
+    mag = np.clip((mag - hp.ref_db + hp.max_db) / hp.max_db, 1e-8, 1)
+    # Transpose
+    mel = mel.T.astype(np.float32)  # (T, n_mels)
+    mag = mag.T.astype(np.float32)  # (T, 1+n_fft//2)
+    return mel, mag
+def spectrogram2wav(mag):
+    '''Reconstruct a waveform from a normalized linear magnitude spectrogram.
+    Args:
+      mag: A numpy array of (T, 1+n_fft//2)
+    Returns:
+      wav: A 1-D numpy array.
+    '''
+    # transpose and de-normalize (back to decibels)
+    db = (np.clip(mag.T, 0, 1) * hp.max_db) - hp.max_db + hp.ref_db
+    # to amplitude
+    amplitude = np.power(10.0, db * 0.05)
+    # wav reconstruction
+    wav = griffin_lim(amplitude**hp.power)
+    # de-preemphasis
+    wav = signal.lfilter([1], [1, -hp.preemphasis], wav)
+    # trim
+    trimmed, _ = librosa.effects.trim(wav)
+    return trimmed.astype(np.float32)
+def griffin_lim(spectrogram):
+    '''Applies the Griffin-Lim algorithm.
+    Runs `hp.n_iter` rounds of inverting the current estimate, re-analysing
+    it with an STFT and keeping only its phase, then returns the real part of
+    the final inversion.
+    '''
+    X_best = copy.deepcopy(spectrogram)
+    for _ in range(hp.n_iter):
+        X_t = invert_spectrogram(X_best)
+        # Keyword arguments: newer librosa releases no longer accept these positionally.
+        est = librosa.stft(X_t, n_fft=hp.n_fft, hop_length=hp.hop_length, win_length=hp.win_length)
+        # Unit-magnitude phase; the floor avoids division by zero.
+        phase = est / np.maximum(1e-8, np.abs(est))
+        X_best = spectrogram * phase
+    X_t = invert_spectrogram(X_best)
+    y = np.real(X_t)
+    return y
+def invert_spectrogram(spectrogram):
+    '''Applies inverse fft.
+    Args:
+      spectrogram: [1+n_fft//2, t]
+    '''
+    # hop_length is passed by keyword: newer librosa releases reject it positionally.
+    return librosa.istft(spectrogram, hop_length=hp.hop_length, win_length=hp.win_length, window="hann")
+def plot_alignment(alignment, gs, dir=hp.logdir):
+    """Plots the alignment.
+    Args:
+      alignment: A numpy array with shape of (encoder_steps, decoder_steps)
+      gs: (int) global step.
+      dir: Output path.
+    """
+    # makedirs also creates missing parents (the default is a nested path) and
+    # tolerates an already existing directory.
+    os.makedirs(dir, exist_ok=True)
+    fig, ax = plt.subplots()
+    im = ax.imshow(alignment)
+    fig.colorbar(im)
+    plt.title('{} Steps'.format(gs))
+    plt.savefig('{}/alignment_{}.png'.format(dir, gs), format='png')
+    plt.close(fig)
+def guided_attention(g=0.2):
+    '''Guided attention. Refer to page 3 on the paper.
+    Returns a float32 array W of shape (hp.max_N, hp.max_T) with
+    W[n, t] = 1 - exp(-(t / max_T - n / max_N) ** 2 / (2 * g * g)).
+    '''
+    # Broadcast a column of n / max_N against a row of t / max_T instead of looping.
+    n_pos = np.arange(hp.max_N, dtype=np.float64)[:, None] / float(hp.max_N)
+    t_pos = np.arange(hp.max_T, dtype=np.float64)[None, :] / float(hp.max_T)
+    W = 1 - np.exp(-(t_pos - n_pos) ** 2 / (2 * g * g))
+    return W.astype(np.float32)
+def learning_rate_decay(init_lr, global_step, warmup_steps = 4000.0):
+    '''Noam scheme from tensor2tensor.
+    Computes init_lr * warmup_steps**0.5 * min(step * warmup_steps**-1.5, step**-0.5)
+    with step = global_step + 1.
+    '''
+    # tf.cast replaces tf.to_float, which is not available in TensorFlow 2.x.
+    step = tf.cast(global_step + 1, tf.float32)
+    return init_lr * warmup_steps**0.5 * tf.minimum(step * warmup_steps**-1.5, step**-0.5)
+def load_spectrograms(fpath):
+    '''Extract spectrograms from the wave file `fpath`.
+    Returns the file's base name, the mel spectrogram reduced to every
+    `hp.r`-th frame, and the zero-padded linear spectrogram.'''
+    fname = os.path.basename(fpath)
+    mel, mag = get_spectrograms(fpath)
+    # Marginal padding: extend the time axis to the next multiple of hp.r.
+    num_paddings = (-mel.shape[0]) % hp.r
+    pad_width = [[0, num_paddings], [0, 0]]
+    mel = np.pad(mel, pad_width, mode="constant")
+    mag = np.pad(mag, pad_width, mode="constant")
+    # Reduction
+    return fname, mel[::hp.r, :], mag

utils/audio_params.py ADDED Viewed

	@@ -0,0 +1,47 @@

+# -*- coding: utf-8 -*-
+#/usr/bin/python2
+'''
+By kyubyong park. kbpark.linguist@gmail.com.
+https://www.github.com/kyubyong/dc_tts
+'''
+class Hyperparams:
+    '''Hyper parameters'''
+    # All values are plain class attributes; hop_length and win_length are
+    # computed from sr once, when the class body is executed.
+    # pipeline
+    prepro = True  # if True, run `python prepro.py` first before running `python train.py`.
+    # signal processing
+    sr = 22050  # Sampling rate.
+    n_fft = 2048  # fft points (samples)
+    frame_shift = 0.0125  # seconds
+    frame_length = 0.05  # seconds
+    hop_length = int(sr * frame_shift)  # samples. =275 (275.625 truncated by int()).
+    win_length = int(sr * frame_length)  # samples. =1102.
+    n_mels = 80  # Number of Mel banks to generate
+    power = 1.5  # Exponent for amplifying the predicted magnitude
+    n_iter = 50  # Number of inversion iterations
+    preemphasis = .97
+    max_db = 100
+    ref_db = 20
+    # Model
+    r = 4 # Reduction factor. Do not change this.
+    dropout_rate = 0.05
+    e = 128 # == embedding
+    d = 256 # == hidden units of Text2Mel
+    c = 512 # == hidden units of SSRN
+    attention_win_size = 3
+    # data
+    data = "/data/private/voice/LJSpeech-1.0"
+    # data = "/data/private/voice/kate"
+    test_data = 'harvard_sentences.txt'
+    vocab = "PE abcdefghijklmnopqrstuvwxyz'.?" # P: Padding, E: EOS.
+    max_N = 180 # Maximum number of characters.
+    max_T = 210 # Maximum number of mel frames.
+    # training scheme
+    lr = 0.001 # Initial learning rate.
+    logdir = "logdir/LJ01"
+    sampledir = 'samples'
+    B = 32 # batch size
+    num_iterations = 2000000

utils/compute_args.py ADDED Viewed

	@@ -0,0 +1,28 @@

+import torch
+def compute_args(args):
+    """Fill in the settings derived from `args.dataset` and `args.task`.
+    Sets `dataloader`, `loss_fn`, `ans_size` and `pred_func` on `args` where
+    the dataset/task combination is known, and returns the same object.
+    `dataset` defaults to 'MOSEI' when the attribute is missing.
+    """
+    if not hasattr(args, 'dataset'):  # fix for previous version
+        args.dataset = 'MOSEI'
+    if args.dataset == 'MOSEI':
+        args.dataloader = 'Mosei_Dataset'
+        args.pred_func = "amax"
+        if args.task == 'sentiment':
+            args.loss_fn = torch.nn.CrossEntropyLoss(reduction="sum")
+            args.ans_size = 2 if args.task_binary else 7
+        elif args.task == 'emotion':
+            # Emotion on MOSEI is scored per label, hence BCE and multi_label.
+            args.loss_fn = torch.nn.BCEWithLogitsLoss(reduction="sum")
+            args.ans_size = 6
+            args.pred_func = "multi_label"
+    elif args.dataset == 'MELD':
+        args.dataloader = 'Meld_Dataset'
+        args.loss_fn = torch.nn.CrossEntropyLoss(reduction="sum")
+        args.pred_func = "amax"
+        if args.task == "emotion":
+            args.ans_size = 7
+        elif args.task == "sentiment":
+            args.ans_size = 3
+    return args

utils/plot.py ADDED Viewed

	@@ -0,0 +1,13 @@

+# import matplotlib.pyplot as plt
+# import numpy as np
+#
+# def plot(d):
+#     # An "interface" to matplotlib.axes.Axes.hist() method
+#     n, bins, patches = plt.hist(x=d, bins='auto', color='#0504aa',
+#                                 alpha=0.7, rwidth=0.85)
+#     plt.grid(axis='y', alpha=0.75)
+#     plt.title('My Very Own Histogram')
+#     maxfreq = n.max()
+#     # Set a clean upper y-axis limit.
+#     plt.ylim(ymax=np.ceil(maxfreq / 10) * 10 if maxfreq % 10 else maxfreq + 10)
+#     plt.show()

utils/pred_func.py ADDED Viewed

	@@ -0,0 +1,9 @@

+import numpy as np
+def amax(x):
+    return np.argmax(x, axis=1)
+def multi_label(x):
+    return (x > 0)

utils/tokenize.py ADDED Viewed

	@@ -0,0 +1,103 @@

+# $ wget https://github.com/explosion/spacy-models/releases/download/en_vectors_web_lg-2.1.0/en_vectors_web_lg-2.1.0.tar.gz -O en_vectors_web_lg-2.1.0.tar.gz
+# $ pip install en_vectors_web_lg-2.1.0.tar.gz
+import en_vectors_web_lg
+import re
+import numpy as np
+import os
+import pickle
+def clean(w):
+    return re.sub(
+            r"([.,'!?\"()*#:;])",
+            '',
+            w.lower()
+            ).replace('-', ' ').replace('/', ' ')
+def tokenize(key_to_word):
+    key_to_sentence = {}
+    for k, v in key_to_word.items():
+        key_to_sentence[k] = [clean(w) for w in v if clean(w) != '']
+    return key_to_sentence
+def create_dict(key_to_sentence, dataroot, use_glove=True):
+    token_file = dataroot+"/token_to_ix.pkl"
+    glove_file = dataroot+"/train_glove.npy"
+    if os.path.exists(glove_file) and os.path.exists(token_file):
+        print("Loading train language files")
+        return pickle.load(open(token_file, "rb")), np.load(glove_file)
+    print("Creating train language files")
+    token_to_ix = {
+        'UNK': 1,
+    }
+    spacy_tool = None
+    pretrained_emb = []
+    if use_glove:
+        spacy_tool = en_vectors_web_lg.load()
+        pretrained_emb.append(spacy_tool('UNK').vector)
+    for k, v in key_to_sentence.items():
+        for word in v:
+            if word not in token_to_ix:
+                token_to_ix[word] = len(token_to_ix)
+                if use_glove:
+                    pretrained_emb.append(spacy_tool(word).vector)
+    pretrained_emb = np.array(pretrained_emb)
+    np.save(glove_file, pretrained_emb)
+    pickle.dump(token_to_ix, open(token_file, "wb"))
+    return token_to_ix, pretrained_emb
+def sent_to_ix(s, token_to_ix, max_token=100):
+    ques_ix = np.zeros(max_token, np.int64)
+    for ix, word in enumerate(s):
+        if word in token_to_ix:
+            ques_ix[ix] = token_to_ix[word]
+        else:
+            ques_ix[ix] = token_to_ix['UNK']
+        if ix + 1 == max_token:
+            break
+    return ques_ix
+def cmumosei_7(a):
+    if a < -2:
+        res = 0
+    if -2 <= a and a < -1:
+        res = 1
+    if -1 <= a and a < 0:
+        res = 2
+    if 0 <= a and a <= 0:
+        res = 3
+    if 0 < a and a <= 1:
+        res = 4
+    if 1 < a and a <= 2:
+        res = 5
+    if a > 2:
+        res = 6
+    return res
+def cmumosei_2(a):
+    if a < 0:
+        return 0
+    if a >= 0:
+        return 1
+def pad_feature(feat, max_len):
+    if feat.shape[0] > max_len:
+        feat = feat[:max_len]
+    feat = np.pad(
+        feat,
+        ((0, max_len - feat.shape[0]), (0, 0)),
+        mode='constant',
+        constant_values=0
+    )
+    return feat