"""Improved diffusion model architecture proposed in the paper
"Analyzing and Improving the Training Dynamics of Diffusion Models"."""

import numpy as np
import torch
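
# Cached construction of constant tensors. The cache avoids re-creating an
# identical tensor (and re-transferring it to the target device) on every call.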

_constant_cache = dict()

def constant(value, shape=None, dtype=None, device=None, memory_format=None):
    value = np.asarray(value)
    if shape is not None:
        shape = tuple(shape)
    if dtype is None:
        dtype = torch.get_default_dtype()
    if device is None:
        device = torch.device('cpu')
    if memory_format is None:
        memory_format = torch.contiguous_format

    key = (value.shape, value.dtype, value.tobytes(), shape, dtype, device, memory_format)
    tensor = _constant_cache.get(key, None)
    if tensor is None:
        tensor = torch.as_tensor(value.copy(), dtype=dtype, device=device)
        if shape is not None:
            tensor, _ = torch.broadcast_tensors(tensor, torch.empty(shape))
        tensor = tensor.contiguous(memory_format=memory_format)
        _constant_cache[key] = tensor
    return tensor
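
# Variant of constant() that inherits dtype and device from the given
# reference tensor by default.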

def const_like(ref, value, shape=None, dtype=None, device=None, memory_format=None):
    if dtype is None:
        dtype = ref.dtype
    if device is None:
        device = ref.device
    return constant(value, shape=shape, dtype=dtype, device=device, memory_format=memory_format)
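
# Normalize the given tensor so that its entries have unit root-mean-square
# magnitude over the given dimensions. Default: all dimensions except the
# first. For example, normalize(torch.randn(8, 128)) rescales each row so
# that the RMS of its 128 entries is approximately 1; the alpha factor
# converts the vector norm into a per-entry RMS, and eps guards against
# division by zero.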

def normalize(x, dim=None, eps=1e-4):
    if dim is None:
        dim = list(range(1, x.ndim))
    norm = torch.linalg.vector_norm(x, dim=dim, keepdim=True, dtype=torch.float32)
    norm = torch.add(eps, norm, alpha=np.sqrt(norm.numel() / x.numel()))
    return x / norm.to(x.dtype)
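
# Module wrapper around normalize().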

class Normalize(torch.nn.Module):
    def __init__(self, dim=None, eps=1e-4):
        super().__init__()
        self.dim = dim
        self.eps = eps

    def forward(self, x):
        return normalize(x, dim=self.dim, eps=self.eps)
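
# Upsample or downsample the given tensor by a factor of 2 using the given
# 1D filter f, applied separably along both spatial dimensions. For example,
# with the default box filter f=[1, 1] and input of shape [N, C, H, W],
# mode='down' yields [N, C, H//2, W//2] and mode='up' yields [N, C, H*2, W*2];
# the factor of 4 in 'up' compensates for the zeros inserted by the strided
# transposed convolution.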

def resample(x, f=[1, 1], mode='keep'):
    if mode == 'keep':
        return x
    f = np.float32(f)
    assert f.ndim == 1 and len(f) % 2 == 0
    pad = (len(f) - 1) // 2
    f = f / f.sum()
    f = np.outer(f, f)[np.newaxis, np.newaxis, :, :]
    f = const_like(x, f)
    c = x.shape[1]
    if mode == 'down':
        return torch.nn.functional.conv2d(x, f.tile([c, 1, 1, 1]), groups=c, stride=2, padding=(pad,))
    assert mode == 'up'
    return torch.nn.functional.conv_transpose2d(x, (f * 4).tile([c, 1, 1, 1]), groups=c, stride=2, padding=(pad,))
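
# Magnitude-preserving SiLU: the constant 0.596 is approximately the
# root-mean-square of silu(x) for x ~ N(0, 1), so unit-magnitude input
# yields unit-magnitude output in expectation.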

def mp_silu(x):
    return torch.nn.functional.silu(x) / 0.596

class MPSiLU(torch.nn.Module):
    def forward(self, x):
        return mp_silu(x)
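
# Magnitude-preserving sum: interpolate between a and b with weight t, then
# rescale so that the result has unit magnitude whenever a and b do. For
# t=0.5 this reduces to (a + b) / sqrt(2), the classical scaling for the sum
# of two uncorrelated unit-variance signals.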

def mp_sum(a, b, t=0.5):
    return a.lerp(b, t) / np.sqrt((1 - t)**2 + t**2)
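
# Magnitude-preserving concatenation: scale a and b so that the concatenated
# result has unit magnitude whenever a and b do, with t controlling their
# relative contribution independently of the channel counts Na and Nb.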

def mp_cat(a, b, dim=1, t=0.5):
    Na = a.shape[dim]
    Nb = b.shape[dim]
    C = np.sqrt((Na + Nb) / ((1 - t)**2 + t**2))
    wa = C / np.sqrt(Na) * (1 - t)
    wb = C / np.sqrt(Nb) * t
    return torch.cat([wa * a, wb * b], dim=dim)
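
# Magnitude-preserving 1D convolution. The weight normalization is baked
# into the stored weight by remove_weight_norm(), which must therefore be
# called before the first forward pass; the module is intended for inference
# from that point on.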

class MPConv1D(torch.nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size):
        super().__init__()
        self.out_channels = out_channels
        self.weight = torch.nn.Parameter(torch.randn(out_channels, in_channels, kernel_size))
        self.weight_norm_removed = False

    def forward(self, x, gain=1):
        assert self.weight_norm_removed, 'call remove_weight_norm() before inference'
        w = self.weight * gain
        if w.ndim == 2:
            return x @ w.t()
        assert w.ndim == 3
        return torch.nn.functional.conv1d(x, w, padding=(w.shape[-1] // 2,))

    def remove_weight_norm(self):
        # Bake the weight normalization into the stored weight: normalize each
        # output channel to unit magnitude, then apply the magnitude-preserving
        # 1/sqrt(fan_in) scaling.
        w = self.weight.to(torch.float32)
        w = normalize(w)
        w = w / np.sqrt(w[0].numel())
        w = w.to(self.weight.dtype)
        self.weight.data.copy_(w)
        self.weight_norm_removed = True
        return self
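
# Minimal sanity check (an illustrative addition, not part of the reference
# code): the magnitude-preserving operations should map unit-magnitude
# inputs to approximately unit-magnitude outputs.
if __name__ == '__main__':
    torch.manual_seed(0)
    a = torch.randn(4, 8, 64)
    b = torch.randn(4, 8, 64)
    rms = lambda y: y.square().mean().sqrt().item()
    print(f'mp_silu : rms = {rms(mp_silu(a)):.3f}')           # expected: ~1.0
    print(f'mp_sum  : rms = {rms(mp_sum(a, b, t=0.3)):.3f}')  # expected: ~1.0
    print(f'mp_cat  : rms = {rms(mp_cat(a, b)):.3f}')         # expected: ~1.0
    conv = MPConv1D(8, 16, kernel_size=5).remove_weight_norm()
    print(f'MPConv1D: rms = {rms(conv(a)):.3f}')              # expected: ~1.0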