Spaces:

multimodalart
/

flux-style-shaping

Running on L40S

App Files Files Community

flux-style-shaping / comfy /ldm /genmo /joint_model /rope_mixed.py

multimodalart HF staff

Squashing commit

4450790 verified 14 days ago

raw

history blame

2.65 kB

	#original code from https://github.com/genmoai/models under apache 2.0 license

	# import functools
	import math

	import torch


	def centers(start: float, stop, num, dtype=None, device=None):
	"""linspace through bin centers.

	Args:
	start (float): Start of the range.
	stop (float): End of the range.
	num (int): Number of points.
	dtype (torch.dtype): Data type of the points.
	device (torch.device): Device of the points.

	Returns:
	centers (Tensor): Centers of the bins. Shape: (num,).
	"""
	edges = torch.linspace(start, stop, num + 1, dtype=dtype, device=device)
	return (edges[:-1] + edges[1:]) / 2


	# @functools.lru_cache(maxsize=1)
	def create_position_matrix(
	T: int,
	pH: int,
	pW: int,
	device: torch.device,
	dtype: torch.dtype,
	*,
	target_area: float = 36864,
	):
	"""
	Args:
	T: int - Temporal dimension
	pH: int - Height dimension after patchify
	pW: int - Width dimension after patchify

	Returns:
	pos: [T * pH * pW, 3] - position matrix
	"""
	# Create 1D tensors for each dimension
	t = torch.arange(T, dtype=dtype)

	# Positionally interpolate to area 36864.
	# (3072x3072 frame with 16x16 patches = 192x192 latents).
	# This automatically scales rope positions when the resolution changes.
	# We use a large target area so the model is more sensitive
	# to changes in the learned pos_frequencies matrix.
	scale = math.sqrt(target_area / (pW * pH))
	w = centers(-pW * scale / 2, pW * scale / 2, pW)
	h = centers(-pH * scale / 2, pH * scale / 2, pH)

	# Use meshgrid to create 3D grids
	grid_t, grid_h, grid_w = torch.meshgrid(t, h, w, indexing="ij")

	# Stack and reshape the grids.
	pos = torch.stack([grid_t, grid_h, grid_w], dim=-1) # [T, pH, pW, 3]
	pos = pos.view(-1, 3) # [T * pH * pW, 3]
	pos = pos.to(dtype=dtype, device=device)

	return pos


	def compute_mixed_rotation(
	freqs: torch.Tensor,
	pos: torch.Tensor,
	):
	"""
	Project each 3-dim position into per-head, per-head-dim 1D frequencies.

	Args:
	freqs: [3, num_heads, num_freqs] - learned rotation frequency (for t, row, col) for each head position
	pos: [N, 3] - position of each token
	num_heads: int

	Returns:
	freqs_cos: [N, num_heads, num_freqs] - cosine components
	freqs_sin: [N, num_heads, num_freqs] - sine components
	"""
	assert freqs.ndim == 3
	freqs_sum = torch.einsum("Nd,dhf->Nhf", pos.to(freqs), freqs)
	freqs_cos = torch.cos(freqs_sum)
	freqs_sin = torch.sin(freqs_sum)
	return freqs_cos, freqs_sin