# MotionDiffuse / datasets /
# initial commit
import torch
from torch.utils import data
import numpy as np
import os
from os.path import join as pjoin
import random
import codecs as cs
from tqdm import tqdm
class Text2MotionDataset(data.Dataset):
    """Dataset for Text2Motion generation task.

    Every entry pairs one motion array with a list of text annotations
    (``{'caption': str, 'tokens': list[str]}``).  Annotations that carry a
    non-zero from/to tag produce an additional entry holding only the tagged
    sub-clip of the motion.

    ``__getitem__`` returns motions cropped or zero-padded to
    ``opt.max_motion_length`` frames and Z-normalized with ``mean``/``std``.
    """

    # Factor converting an annotation's from/to tag into a frame index
    # (presumably seconds at 20 fps -- confirm against the data format).
    _TAG_TO_FRAME = 20
    # Motions / sub-clips with this many frames or more are discarded.
    _MAX_RAW_LENGTH = 200
    # Pool of one-letter prefixes used to build unique names for sub-clips.
    _NAME_PREFIXES = 'ABCDEFGHIJKLMNOPQRSTUVW'

    def __init__(self, opt, mean, std, split_file, times=1, w_vectorizer=None, eval_mode=False):
        """Load every motion listed in ``split_file`` together with its texts.

        Args:
            opt: Option object.  Attributes read here: ``dataset_name``,
                ``joints_num``, ``motion_dir``, ``text_dir``, ``is_train``
                and, when ``is_train`` is true, ``feat_bias`` and
                ``meta_dir``.  ``__getitem__`` additionally reads
                ``max_motion_length`` and ``max_text_len``.
            mean: Per-feature mean used for Z-normalization.
            std: Per-feature standard deviation.  When ``opt.is_train`` is
                true this array is rescaled IN PLACE by ``opt.feat_bias``.
            split_file: Text file with one sample name per line.
            times: The dataset length is reported as ``times`` repetitions
                of the real number of entries.
            w_vectorizer: Mapping ``token -> (word_embedding, pos_one_hot)``;
                only used when ``eval_mode`` is true.
            eval_mode: When true, ``__getitem__`` also returns word
                embeddings and part-of-speech one-hots.

        Raises:
            ValueError: If no sample survives the length filtering.
        """
        self.opt = opt
        self.max_length = 20
        self.times = times
        self.w_vectorizer = w_vectorizer
        self.eval_mode = eval_mode
        min_motion_len = 40 if self.opt.dataset_name == 't2m' else 24
        joints_num = opt.joints_num

        with cs.open(split_file, 'r') as f:
            # Blank lines would otherwise become empty sample names.
            id_list = [line.strip() for line in f if line.strip()]

        data_dict = {}
        new_name_list = []
        length_list = []
        for name in tqdm(id_list):
            try:
                motion = np.load(pjoin(opt.motion_dir, name + '.npy'))
                if not self._length_ok(len(motion), min_motion_len):
                    continue
                # NOTE(review): the directory attribute was lost in the
                # damaged source; `opt.text_dir` is reconstructed by analogy
                # with `opt.motion_dir` -- confirm.
                annotations = self._read_annotations(
                    pjoin(opt.text_dir, name + '.txt'))
            except FileNotFoundError:
                # Some motion may not exist in KIT dataset
                # NOTE(review): the original handler was lost; narrowed to
                # missing files based on the comment above -- confirm.
                continue

            whole_motion_texts = []
            for text_dict, f_tag, to_tag in annotations:
                if f_tag == 0.0 and to_tag == 0.0:
                    # Untagged annotation: it describes the whole motion.
                    whole_motion_texts.append(text_dict)
                    continue

                # Tagged annotation: it describes only a sub-clip, which is
                # stored as a separate entry under a prefixed name.
                n_motion = motion[int(f_tag * self._TAG_TO_FRAME):
                                  int(to_tag * self._TAG_TO_FRAME)]
                if not self._length_ok(len(n_motion), min_motion_len):
                    continue
                new_name = random.choice(self._NAME_PREFIXES) + '_' + name
                while new_name in data_dict:
                    new_name = random.choice(self._NAME_PREFIXES) + '_' + name
                data_dict[new_name] = {'motion': n_motion,
                                       'length': len(n_motion),
                                       'text': [text_dict]}
                new_name_list.append(new_name)
                length_list.append(len(n_motion))

            if whole_motion_texts:
                data_dict[name] = {'motion': motion,
                                   'length': len(motion),
                                   'text': whole_motion_texts}
                new_name_list.append(name)
                length_list.append(len(motion))

        if not new_name_list:
            # zip(*sorted(...)) on an empty list fails with an opaque
            # "not enough values to unpack" ValueError; say what happened.
            raise ValueError(
                'No usable motion found for split file %r' % (split_file,))
        # Entries are ordered by increasing motion length.
        name_list, length_list = zip(
            *sorted(zip(new_name_list, length_list), key=lambda x: x[1]))

        if opt.is_train:
            assert 4 + (joints_num - 1) * 9 + joints_num * 3 + 4 == mean.shape[-1]
            self._scale_std_by_feat_bias(std, joints_num, opt.feat_bias)
            # NOTE(review): the target directory was lost in the damaged
            # source; `opt.meta_dir` is a reconstruction -- confirm.
            np.save(pjoin(opt.meta_dir, 'mean.npy'), mean)
            np.save(pjoin(opt.meta_dir, 'std.npy'), std)

        self.mean = mean
        self.std = std
        self.length_arr = np.array(length_list)
        self.data_dict = data_dict
        self.name_list = name_list

    @classmethod
    def _length_ok(cls, num_frames, min_motion_len):
        """Return True when ``min_motion_len <= num_frames < _MAX_RAW_LENGTH``."""
        return min_motion_len <= num_frames < cls._MAX_RAW_LENGTH

    @staticmethod
    def _read_annotations(text_path):
        """Parse an annotation file into ``(text_dict, f_tag, to_tag)`` tuples.

        Each non-blank line has the form ``caption#tokens#f_tag#to_tag`` where
        ``tokens`` is space separated.  NaN tags are replaced by ``0.0``.
        """
        annotations = []
        with cs.open(text_path) as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                line_split = line.split('#')
                caption = line_split[0]
                tokens = line_split[1].split(' ')
                f_tag = float(line_split[2])
                to_tag = float(line_split[3])
                f_tag = 0.0 if np.isnan(f_tag) else f_tag
                to_tag = 0.0 if np.isnan(to_tag) else to_tag
                text_dict = {'caption': caption, 'tokens': tokens}
                annotations.append((text_dict, f_tag, to_tag))
        return annotations

    @staticmethod
    def _scale_std_by_feat_bias(std, joints_num, feat_bias):
        """Divide the root and foot-contact entries of ``std`` by ``feat_bias``.

        ``std`` is modified in place.  The feature layout along the last axis
        is, in order:

        * root_rot_velocity     (1)                    -> scaled
        * root_linear_velocity  (2)                    -> scaled
        * root_y                (1)                    -> scaled
        * ric_data              ((joints_num - 1) * 3) -> unchanged
        * rot_data              ((joints_num - 1) * 6) -> unchanged
        * local_velocity        (joints_num * 3)       -> unchanged
        * foot contact          (remaining entries)    -> scaled
        """
        foot_start = 4 + (joints_num - 1) * 9 + joints_num * 3
        # root_rot_velocity (B, seq_len, 1)
        std[0:1] = std[0:1] / feat_bias
        # root_linear_velocity (B, seq_len, 2)
        std[1:3] = std[1:3] / feat_bias
        # root_y (B, seq_len, 1)
        std[3:4] = std[3:4] / feat_bias
        # foot contact (B, seq_len, 4)
        std[foot_start:] = std[foot_start:] / feat_bias

    def inv_transform(self, data):
        """Undo the Z-normalization: ``data * std + mean``."""
        return data * self.std + self.mean

    def real_len(self):
        """Return the number of distinct entries, ignoring ``times``."""
        return len(self.data_dict)

    def __len__(self):
        """Return the reported length: ``real_len() * times``."""
        return self.real_len() * self.times

    def __getitem__(self, item):
        """Return one normalized, fixed-length sample.

        ``item`` wraps around ``real_len()``.  One of the entry's annotations
        is picked at random.  Motions with at least ``opt.max_motion_length``
        frames are randomly cropped to that length; shorter ones are padded
        with zero frames.

        Returns:
            ``(caption, motion, m_length)`` normally, or
            ``(word_embeddings, pos_one_hots, caption, sent_len, motion,
            m_length)`` when ``eval_mode`` is true.  ``m_length`` is the
            stored length of the entry and is NOT reduced when the motion
            is cropped.
        """
        idx = item % self.real_len()
        data = self.data_dict[self.name_list[idx]]
        motion, m_length, text_list = data['motion'], data['length'], data['text']
        # Randomly select a caption
        text_data = random.choice(text_list)
        caption = text_data['caption']

        max_motion_length = self.opt.max_motion_length
        if m_length >= max_motion_length:
            start = random.randint(0, len(motion) - max_motion_length)
            motion = motion[start: start + max_motion_length]
        else:
            padding_len = max_motion_length - m_length
            padding_zeros = np.zeros((padding_len, motion.shape[1]))
            motion = np.concatenate((motion, padding_zeros), axis=0)
        assert len(motion) == max_motion_length

        # Z Normalization
        motion = (motion - self.mean) / self.std

        if not self.eval_mode:
            return caption, motion, m_length

        tokens = text_data['tokens']
        max_text_len = self.opt.max_text_len
        if len(tokens) < max_text_len:
            # pad with "unk"
            tokens = ['sos/OTHER'] + tokens + ['eos/OTHER']
            sent_len = len(tokens)
            tokens = tokens + ['unk/OTHER'] * (max_text_len + 2 - sent_len)
        else:
            # crop
            tokens = tokens[:max_text_len]
            tokens = ['sos/OTHER'] + tokens + ['eos/OTHER']
            sent_len = len(tokens)

        pos_one_hots = []
        word_embeddings = []
        for token in tokens:
            word_emb, pos_oh = self.w_vectorizer[token]
            pos_one_hots.append(pos_oh[None, :])
            word_embeddings.append(word_emb[None, :])
        pos_one_hots = np.concatenate(pos_one_hots, axis=0)
        word_embeddings = np.concatenate(word_embeddings, axis=0)
        return word_embeddings, pos_one_hots, caption, sent_len, motion, m_length