AudioLlama

Running on Zero

AudioLlama / mmaudio / model / embeddings.py

Rex Cheng

fix for hf

c4dd2de 10 days ago

1.68 kB

	import torch
	import torch.nn as nn

	# https://github.com/facebookresearch/DiT


	class TimestepEmbedder(nn.Module):
	    """
	    Embeds scalar timesteps into vector representations.

	    Each timestep is first expanded into sinusoidal (cos/sin) features and
	    then projected to ``dim`` channels by a small MLP
	    (Linear -> SiLU -> Linear).

	    :param dim: width of the output embedding; must be even.
	    :param frequency_embedding_size: input width of the MLP, i.e. the number
	                                     of sinusoidal features per timestep.
	    :param max_period: controls the minimum frequency of the embeddings; the
	                       base frequencies are scaled by ``10000 / max_period``.
	    """

	    def __init__(self, dim, frequency_embedding_size, max_period):
	        super().__init__()
	        self.mlp = nn.Sequential(
	            nn.Linear(frequency_embedding_size, dim),
	            nn.SiLU(),
	            nn.Linear(dim, dim),
	        )
	        self.dim = dim
	        self.max_period = max_period
	        assert dim % 2 == 0, 'dim must be even.'

	        # Build the frequency table with autocast disabled so it is computed
	        # in float32.
	        with torch.autocast('cuda', enabled=False):
	            # One frequency per cos/sin pair: arange(0, size, 2) has size / 2
	            # entries when size is even.
	            # NOTE: an odd frequency_embedding_size yields size + 1 features
	            # after the cos/sin concatenation, which does not match the first
	            # Linear layer.
	            base_freqs = (
	                1.0 / (10000**(torch.arange(0, frequency_embedding_size, 2, dtype=torch.float32) /
	                               frequency_embedding_size)))
	            freq_scale = 10000 / max_period
	            # Stored as a Parameter, so it appears in parameters() and in the
	            # state dict under the key 'freqs'.
	            # NOTE(review): nn.Parameter defaults to requires_grad=True, so an
	            # optimizer built from parameters() would update the frequencies -
	            # confirm that this is intended.
	            self.freqs = nn.Parameter(freq_scale * base_freqs)

	    def timestep_embedding(self, t):
	        """
	        Create sinusoidal timestep embeddings.

	        :param t: a 1-D Tensor of N indices, one per batch element.
	                  These may be fractional.
	        :return: an (N, D) Tensor of positional embeddings, where D is twice
	                 the number of entries in ``self.freqs``; the cosine features
	                 come first, followed by the sine features.
	        """
	        # https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py

	        # (N, 1) * (1, F) -> (N, F); t is promoted to float32 first.
	        args = t[:, None].float() * self.freqs[None]
	        embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
	        return embedding

	    def forward(self, t):
	        """
	        Embed the timesteps ``t`` and project them with the MLP.

	        :param t: a 1-D Tensor of N timesteps (floating point or integer).
	        :return: an (N, dim) Tensor of timestep embeddings.
	        """
	        t_freq = self.timestep_embedding(t)
	        if t.is_floating_point():
	            # Keep the features in the caller's floating-point dtype.
	            t_freq = t_freq.to(t.dtype)
	        else:
	            # Casting to an integer dtype would truncate the cos/sin values
	            # and make the Linear layer fail on a dtype mismatch; use the
	            # MLP's weight dtype instead.
	            t_freq = t_freq.to(self.mlp[0].weight.dtype)
	        t_emb = self.mlp(t_freq)
	        return t_emb