File size: 3,913 Bytes
3f7a513 51af749 3f7a513 6aeb766 3f7a513 8e3d0e5 0dad9f3 6eba34a 0dad9f3 9eb0395 0dad9f3 1ea7d89 0dad9f3 6a9abe6 0dad9f3 6a9abe6 0dad9f3 6a9abe6 0dad9f3 6a9abe6 0dad9f3 6a9abe6 8e3d0e5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 |
---
language: en
datasets:
- msp-podcast
inference: true
tags:
- speech
- audio
- wav2vec2
- audio-classification
- emotion-recognition
license: cc-by-nc-sa-4.0
pipeline_tag: audio-classification
---
# Model for Dimensional Speech Emotion Recognition based on Wav2vec 2.0
Please note that this model is for research purpose only.
A commercial license for a model
that has been trained on much more data
can be acquired with [audEERING](https://www.audeering.com/products/devaice/).
The model expects a raw audio signal as input,
and outputs predictions for arousal, dominance and valence in a range of approximately 0...1.
In addition,
it provides the pooled states of the last transformer layer.
The model was created by fine-tuning
[Wav2Vec2-Large-Robust](https://huggingface.co/facebook/wav2vec2-large-robust)
on [MSP-Podcast](https://ecs.utdallas.edu/research/researchlabs/msp-lab/MSP-Podcast.html) (v1.7).
The model was pruned from 24 to 12 transformer layers before fine-tuning.
An [ONNX](https://onnx.ai/) export of the model is available from [doi:10.5281/zenodo.6221127](https://zenodo.org/record/6221127).
Further details are given in the associated [paper](https://arxiv.org/abs/2203.07378) and [tutorial](https://github.com/audeering/w2v2-how-to).
# Usage
```python
import numpy as np
import torch
import torch.nn as nn
from transformers import Wav2Vec2Processor
from transformers.models.wav2vec2.modeling_wav2vec2 import (
Wav2Vec2Model,
Wav2Vec2PreTrainedModel,
)
class RegressionHead(nn.Module):
r"""Classification head."""
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.dropout = nn.Dropout(config.final_dropout)
self.out_proj = nn.Linear(config.hidden_size, config.num_labels)
def forward(self, features, **kwargs):
x = features
x = self.dropout(x)
x = self.dense(x)
x = torch.tanh(x)
x = self.dropout(x)
x = self.out_proj(x)
return x
class EmotionModel(Wav2Vec2PreTrainedModel):
r"""Speech emotion classifier."""
def __init__(self, config):
super().__init__(config)
self.config = config
self.wav2vec2 = Wav2Vec2Model(config)
self.classifier = RegressionHead(config)
self.init_weights()
def forward(
self,
input_values,
):
outputs = self.wav2vec2(input_values)
hidden_states = outputs[0]
hidden_states = torch.mean(hidden_states, dim=1)
logits = self.classifier(hidden_states)
return hidden_states, logits
# load model from hub
device = 'cpu'
model_name = 'audeering/wav2vec2-large-robust-12-ft-emotion-msp-dim'
processor = Wav2Vec2Processor.from_pretrained(model_name)
model = EmotionModel.from_pretrained(model_name).to(device)
# dummy signal
sampling_rate = 16000
signal = np.zeros((1, sampling_rate), dtype=np.float32)
def process_func(
x: np.ndarray,
sampling_rate: int,
embeddings: bool = False,
) -> np.ndarray:
r"""Predict emotions or extract embeddings from raw audio signal."""
# run through processor to normalize signal
# always returns a batch, so we just get the first entry
# then we put it on the device
y = processor(x, sampling_rate=sampling_rate)
y = y['input_values'][0]
y = y.reshape(1, -1)
y = torch.from_numpy(y).to(device)
# run through model
with torch.no_grad():
y = model(y)[0 if embeddings else 1]
# convert to numpy
y = y.detach().cpu().numpy()
return y
print(process_func(signal, sampling_rate))
# Arousal dominance valence
# [[0.5460754 0.6062266 0.40431657]]
print(process_func(signal, sampling_rate, embeddings=True))
# Pooled hidden states of last transformer layer
# [[-0.00752167 0.0065819 -0.00746342 ... 0.00663632 0.00848748
# 0.00599211]]
``` |