#!/usr/bin/env python
# -*- coding: UTF-8 -*-
'''
@Project :EMO_digitalhuman
@File :wav_clip.py
@Author :juzhen.czy
@Date :2024/3/4 19:04
'''
from transformers import Wav2Vec2Model, Wav2Vec2Processor
import torch
from torch import nn
import librosa
from diffusers.models.modeling_utils import ModelMixin


class Wav2Vec(ModelMixin):
    """Frozen wav2vec2 wrapper that turns a raw waveform into per-frame audio features."""

    def __init__(self, model_path):
        super().__init__()
        self.processor = Wav2Vec2Processor.from_pretrained(model_path)
        self.wav2Vec = Wav2Vec2Model.from_pretrained(model_path)
        self.wav2Vec.eval()

    def forward(self, x):
        # The backbone is frozen: feature extraction only, no gradients.
        with torch.no_grad():
            return self.wav2Vec(x).last_hidden_state
    def process(self, x):
        # Wrap the raw 16 kHz waveform into a (1, num_samples) tensor on the model's device.
        return self.processor(x, sampling_rate=16000, return_tensors="pt").input_values.to(self.device)
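
# Hypothetical usage sketch (not in the original file; "facebook/wav2vec2-base-960h"
# is a stand-in Hub id for the local checkpoint used in test() below). With the
# base model's ~20 ms frame stride, one second of 16 kHz audio yields roughly
# 49 hidden states of width 768:
#
#     model = Wav2Vec("facebook/wav2vec2-base-960h")
#     wav, _ = librosa.load("clip.wav", sr=16000)
#     feats = model(model.process(wav))   # approx. (1, 49 * seconds, 768)
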
class AudioFeatureMapper(ModelMixin):
    """Maps input_num audio frames to output_num tokens via a learned linear
    projection along the temporal axis."""

    def __init__(self, input_num=15, output_num=77, model_path=None):
        super().__init__()
        self.linear = nn.Linear(input_num, output_num)
        if model_path is not None:
            self.load_state_dict(torch.load(model_path, map_location="cpu"))
    def forward(self, x):
        # x: (B, input_num, D). The linear layer acts on the temporal axis, so
        # move it to the last position, project input_num -> output_num, then
        # permute back to (B, output_num, D).
        result = self.linear(x.permute(0, 2, 1))
        result = result.permute(0, 2, 1)
        return result
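
# Minimal shape-check sketch for AudioFeatureMapper (illustrative addition; the
# 768-dim hidden size matches wav2vec2-base but is otherwise an assumption):
def test_mapper():
    mapper = AudioFeatureMapper(input_num=15, output_num=77)
    x = torch.randn(2, 15, 768)  # (batch, audio frames, hidden dim)
    out = mapper(x)
    print(f"input shape: {x.shape}, output shape: {out.shape}")  # (2, 77, 768)
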
def test():
    # Load the model
    model_path = "/ossfs/workspace/projects/model_weights/Moore-AnimateAnyone/wav2vec2-base-960h"
    model = Wav2Vec(model_path)
    print("### model loaded ###")

    # Load the audio at 16 kHz, as expected by wav2vec2
    audio_path = "/ossfs/workspace/projects/Moore-AnimateAnyone-master/assets/taken_clip.wav"
    input_audio, rate = librosa.load(audio_path, sr=16000)
    print(f"input shape: {input_audio.shape}, rate: {rate}")

    # Preprocess: shape becomes (1, input_audio.shape[0]); a batch dimension is
    # added, the length of the audio signal itself is unchanged.
    input_v = model.process(input_audio)

    # Extract per-frame features
    out = model(input_v)
    print(f"input shape: {input_v.shape}, output shape: {out.shape}")