Upload 6 files

Browse files

Files changed (6) hide show

README.md +68 -0
config.yaml +113 -0
configuration.json +13 -0
emotion2vec_base.pt +3 -0
example/test.wav +0 -0
tokens.txt +9 -0

README.md CHANGED Viewed

@@ -1,3 +1,71 @@
 ---
 license: apache-2.0
 ---

 ---
+frameworks:
+- Pytorch
 license: apache-2.0
+tasks:
+- emotion-recognition
 ---
+# 安装环境
+- modelscope>=1.11.1
+- funasr>=1.0.5
+# 用法
+## 基于modelscope进行推理
+```python
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+inference_pipeline = pipeline(
+    task=Tasks.emotion_recognition,
+    model="iic/emotion2vec_base_finetuned", model_revision="v2.0.4")
+rec_result = inference_pipeline('https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav', granularity="utterance", extract_embedding=False)
+print(rec_result)
+```
+## 基于FunASR进行推理
+```python
+from funasr import AutoModel
+model = AutoModel(model="iic/emotion2vec_base_finetuned", model_revision="v2.0.4")
+wav_file = f"{model.model_path}/example/test.wav"
+res = model.generate(wav_file, output_dir="./outputs", granularity="utterance", extract_embedding=False)
+print(res)
+```
+注：模型会自动下载。
+
+支持输入文件列表，wav.scp（kaldi风格）：
+```shell
+$ cat wav.scp
+wav_name1 wav_path1.wav
+wav_name2 wav_path2.wav
+...
+```
+输出为情感表征向量，以numpy格式保存在`output_dir`中（可以用np.load()加载）。
+# 说明
+本仓库为emotion2vec的modelscope版本，模型参数完全一致。
+原始仓库地址: [https://github.com/ddlBoJack/emotion2vec](https://github.com/ddlBoJack/emotion2vec)
+modelscope版本仓库：[https://github.com/alibaba-damo-academy/FunASR](https://github.com/alibaba-damo-academy/FunASR/tree/funasr1.0/examples/industrial_data_pretraining/emotion2vec)
+# 相关论文以及引用信息
+```BibTeX
+@article{ma2023emotion2vec,
+  title={emotion2vec: Self-Supervised Pre-Training for Speech Emotion Representation},
+  author={Ma, Ziyang and Zheng, Zhisheng and Ye, Jiaxin and Li, Jinchao and Gao, Zhifu and Zhang, Shiliang and Chen, Xie},
+  journal={arXiv preprint arXiv:2312.15185},
+  year={2023}
+}
+```

config.yaml ADDED Viewed

	@@ -0,0 +1,113 @@

+# network architecture
+model: Emotion2vec
+model_conf:
+    loss_beta: 0.0
+    loss_scale: null
+    depth: 8
+    start_drop_path_rate: 0.0
+    end_drop_path_rate: 0.0
+    num_heads: 12
+    norm_eps: 1.0e-05
+    norm_affine: true
+    encoder_dropout: 0.1
+    post_mlp_drop: 0.1
+    attention_dropout: 0.1
+    activation_dropout: 0.0
+    dropout_input: 0.0
+    layerdrop: 0.05
+    embed_dim: 768
+    mlp_ratio: 4.0
+    layer_norm_first: false
+    average_top_k_layers: 8
+    end_of_block_targets: false
+    clone_batch: 8
+    layer_norm_target_layer: false
+    batch_norm_target_layer: false
+    instance_norm_target_layer: true
+    instance_norm_targets: false
+    layer_norm_targets: false
+    ema_decay: 0.999
+    ema_same_dtype: true
+    log_norms: true
+    ema_end_decay: 0.99999
+    ema_anneal_end_step: 20000
+    ema_encoder_only: false
+    max_update: 100000
+    extractor_mode: layer_norm
+    shared_decoder: null
+    min_target_var: 0.1
+    min_pred_var: 0.01
+    supported_modality: AUDIO
+    mae_init: false
+    seed: 1
+    skip_ema: false
+    cls_loss: 1.0
+    recon_loss: 0.0
+    d2v_loss: 1.0
+    decoder_group: false
+    adversarial_training: false
+    adversarial_hidden_dim: 128
+    adversarial_weight: 0.1
+    cls_type: chunk
+    normalize: true
+    project_dim: null
+    modalities:
+        audio:
+            type: AUDIO
+            prenet_depth: 4
+            prenet_layerdrop: 0.05
+            prenet_dropout: 0.1
+            start_drop_path_rate: 0.0
+            end_drop_path_rate: 0.0
+            num_extra_tokens: 10
+            init_extra_token_zero: true
+            mask_noise_std: 0.01
+            mask_prob_min: null
+            mask_prob: 0.5
+            inverse_mask: false
+            mask_prob_adjust: 0.05
+            keep_masked_pct: 0.0
+            mask_length: 5
+            add_masks: false
+            remove_masks: false
+            mask_dropout: 0.0
+            encoder_zero_mask: true
+            mask_channel_prob: 0.0
+            mask_channel_length: 64
+            ema_local_encoder: false
+            local_grad_mult: 1.0
+            use_alibi_encoder: true
+            alibi_scale: 1.0
+            learned_alibi: false
+            alibi_max_pos: null
+            learned_alibi_scale: true
+            learned_alibi_scale_per_head: true
+            learned_alibi_scale_per_layer: false
+            num_alibi_heads: 12
+            model_depth: 8
+            decoder:
+                decoder_dim: 384
+                decoder_groups: 16
+                decoder_kernel: 7
+                decoder_layers: 4
+                input_dropout: 0.1
+                add_positions_masked: false
+                add_positions_all: false
+                decoder_residual: true
+                projection_layers: 1
+                projection_ratio: 2.0
+            extractor_mode: layer_norm
+            feature_encoder_spec: '[(512, 10, 5)] + [(512, 3, 2)] * 4 + [(512,2,2)] + [(512,2,2)]'
+            conv_pos_width: 95
+            conv_pos_groups: 16
+            conv_pos_depth: 5
+            conv_pos_pre_ln: false
+tokenizer: CharTokenizer
+tokenizer_conf:
+  unk_symbol: <unk>
+  split_with_space: true

configuration.json ADDED Viewed

	@@ -0,0 +1,13 @@

+{
+  "framework": "pytorch",
+  "task" : "emotion-recognition",
+  "pipeline": {"type":"funasr-pipeline"},
+  "model": {"type" : "funasr"},
+  "file_path_metas": {
+    "init_param":"emotion2vec_base.pt",
+    "tokenizer_conf": {"token_list": "tokens.txt"},
+    "config":"config.yaml"},
+  "model_name_in_hub": {
+    "ms":"iic/emotion2vec_base",
+    "hf":""}
+}

emotion2vec_base.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7a6169355e0611d690f165f67901ce90fa905ee86a236ee54f4a14baf7d1689e
+size 1123130820

example/test.wav ADDED Viewed

Binary file (321 kB). View file

tokens.txt ADDED Viewed

	@@ -0,0 +1,9 @@

+生气/angry
+厌恶/disgusted
+恐惧/fearful
+开心/happy
+中立/neutral
+其他/other
+难过/sad
+吃惊/surprised
+<unk>