import math

import torch
import torch.nn as nn
import torch.nn.functional as F

try:
    from mamba_ssm.ops.selective_scan_interface import (
        selective_scan_ref,
        selective_scan_fn,
        mamba_inner_fn,
    )
    from causal_conv1d import causal_conv1d_fn
    import einops
except ModuleNotFoundError:
    print(
        "Unable to import Mamba kernels. Install them from our requirements/requirements-mamba.txt, "
        "or directly from https://github.com/state-spaces/mamba"
    )

from megatron.model.norms import get_norm
from megatron import mpu
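

# Mamba ("S6" selective state-space) mixer block. The input, output, and x
# projections are tensor-parallelized with NeoX's mpu layers, and the conv/SSM
# parameters are partitioned along the inner dimension.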
class ParallelMambaBlock(nn.Module):
    def __init__(
        self,
        neox_args,
        init_method,
        output_layer_init_method,
    ):
        super().__init__()

        self.neox_args = neox_args

        # map the NeoX precision setting to the dtype used for parameter creation
        dtype = {
            "fp16": torch.float16,
            "bf16": torch.bfloat16,
            "fp32": torch.float32,
        }[neox_args.precision]
        self.precision = dtype
        factory_kwargs = {"device": torch.cuda.current_device(), "dtype": dtype}

        assert not (
            neox_args.mamba_use_bias_in_linears and neox_args.mamba_inner_func_fusion
        ), "Mamba fused inner fn and bias in x_proj not compatible!"

        assert (
            neox_args.intermediate_size is None or neox_args.expansion_factor is None
        ), "Pass either the absolute intermediate size or the relative expansion factor for the mamba projections, not both"
        self.d_model = neox_args.hidden_size
        self.d_state = 16
        self.d_conv = 4
        if neox_args.intermediate_size:
            self.d_inner = neox_args.intermediate_size
        else:
            self.expand = (
                neox_args.expansion_factor if neox_args.expansion_factor else 2
            )
            self.d_inner = int(self.expand * self.d_model)
        self.dt_rank = math.ceil(self.d_model / 16)
        self.dt_scale = 1.0

        self.dt_init = "random"
        self.dt_min, self.dt_max, self.dt_init_floor = 0.001, 0.1, 1e-4
        assert self.dt_init in ["constant", "random"]
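
        # partition the inner dimension across model-parallel ranks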
        world_size = mpu.get_model_parallel_world_size()
        self.d_inner_per_rank = mpu.divide(self.d_inner, world_size)
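
        # the fused mamba_inner_fn applies out_proj.weight directly, bypassing
        # RowParallelLinear's all-reduce, so keep a handle to reduce manually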
        if neox_args.mamba_inner_func_fusion and world_size > 1:
            self.reduce = mpu.mappings.reduce_from_model_parallel_region
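
        # up-projection from d_model to 2 * d_inner (SSM input x and gate z),
        # column-parallel across ranks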
        self.in_proj = mpu.ColumnParallelLinear(
            neox_args=neox_args,
            input_size=self.d_model,
            output_size=self.d_inner * 2,
            gather_output=False,
            init_method=init_method,
            skip_bias_add=not neox_args.mamba_use_bias_in_linears,
            bias=neox_args.mamba_use_bias_in_linears,
        )
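
        # depthwise (groups == channels) causal convolution over the sequence,
        # holding only this rank's d_inner_per_rank channels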
        self.conv1d = nn.Conv1d(
            in_channels=self.d_inner_per_rank,
            out_channels=self.d_inner_per_rank,
            bias=neox_args.mamba_use_bias_in_conv,
            kernel_size=self.d_conv,
            groups=self.d_inner_per_rank,
            padding=self.d_conv - 1,
            **factory_kwargs,
        )
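        # defensive cast: keep the conv parameters in the working precision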
        self.conv1d.to(self.precision)

        self.act_fn = F.silu  # the Mamba block uses SiLU as its activation
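
        # x_proj produces the input-dependent dt, B, and C
        # (s_Delta, s_B, s_C in the Mamba paper's selection mechanism)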
        self.x_proj = mpu.RowParallelLinear(
            neox_args=neox_args,
            input_size=self.d_inner,
            output_size=self.dt_rank + self.d_state * 2,
            input_is_parallel=True,
            init_method=init_method,
            skip_bias_add=not neox_args.mamba_use_bias_in_linears,
            parallel_output=True,
            bias=neox_args.mamba_use_bias_in_linears,
        )
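
        # dt_proj maps dt from dt_rank up to d_inner_per_rank; bias=True is kept
        # regardless of the other linears, since the bias carries the special
        # dt initialization below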
        self.dt_proj = nn.Linear(
            self.dt_rank, self.d_inner_per_rank, bias=True, **factory_kwargs
        )

        dt_init_std = (self.dt_rank**-0.5) * self.dt_scale
        if self.dt_init == "constant":
            nn.init.constant_(self.dt_proj.weight, dt_init_std)
        elif self.dt_init == "random":
            nn.init.uniform_(self.dt_proj.weight, -dt_init_std, dt_init_std)
        else:
            raise NotImplementedError
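
        # initialize the dt_proj bias: sample dt log-uniformly in [dt_min, dt_max],
        # clamp at dt_init_floor, and store its softplus inverse so that
        # softplus(dt_proj(...)) starts out in that range (reference Mamba init)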
        dt = torch.exp(
            torch.rand(self.d_inner_per_rank, **factory_kwargs)
            * (math.log(self.dt_max) - math.log(self.dt_min))
            + math.log(self.dt_min)
        ).clamp(min=self.dt_init_floor)
        inv_dt = dt + torch.log(-torch.expm1(-dt))
        with torch.no_grad():
            self.dt_proj.bias.copy_(inv_dt)
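
        # S4D-real style initialization: A takes values 1..d_state along the state
        # dimension, repeated per channel; stored as log(A) in fp32 and excluded
        # from weight decay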
        A = einops.repeat(
            torch.arange(
                1,
                self.d_state + 1,
                dtype=torch.float32,
                device=torch.cuda.current_device(),
            ),
            "n -> d n",
            d=self.d_inner_per_rank,
        ).contiguous()
        A_log = torch.log(A).to(torch.float32)
        self.A_log = nn.Parameter(A_log)
        self.A_log._no_weight_decay = True

        # keep A_log in fp32 when mamba_selective_fp32_params is set
        # (the flag tells DeepSpeed not to cast this param down)
        if self.neox_args.mamba_selective_fp32_params:
            self.A_log._deepspeed_no_cast = True
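
        # D "skip" parameter (per-channel residual scaling), kept in fp32 and
        # excluded from weight decay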
        self.D = nn.Parameter(
            torch.ones(
                self.d_inner_per_rank,
                device=torch.cuda.current_device(),
                dtype=torch.float32,
            )
        ).to(torch.float32)
        self.D._no_weight_decay = True

        # as with A_log, optionally keep D in fp32
        if self.neox_args.mamba_selective_fp32_params:
            self.D._deepspeed_no_cast = True
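
        # down-projection from d_inner back to d_model; row-parallel, so it also
        # reduces partial results across ranks (except on the fused path, which
        # bypasses this module and uses self.reduce instead)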
        self.out_proj = mpu.RowParallelLinear(
            neox_args=neox_args,
            input_size=self.d_inner,
            output_size=self.d_model,
            input_is_parallel=True,
            init_method=output_layer_init_method,
            skip_bias_add=not neox_args.mamba_use_bias_in_linears,
            bias=neox_args.mamba_use_bias_in_linears,
            parallel_output=False,
        )
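
    # selective (S6) scan over the sequence: dispatches to the reference
    # implementation or the fused CUDA kernel based on mamba_selective_scan_fusion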
    def selective_scan(
        self,
        x,
        dt,
        A,
        B,
        C,
        D,
        z=None,
        delta_bias=None,
        delta_softplus=True,
    ):
        if not self.neox_args.mamba_selective_scan_fusion:
            y = selective_scan_ref(
                u=x,
                delta=dt,
                A=A,
                B=B,
                C=C,
                D=D,
                z=z,
                delta_bias=delta_bias,
                delta_softplus=delta_softplus,
                return_last_state=False,
            )
        else:
            y = selective_scan_fn(
                x,
                dt,
                A,
                B,
                C,
                D=D,
                z=z,
                delta_bias=delta_bias,
                delta_softplus=delta_softplus,
                return_last_state=False,
            )

        return y

    def forward(self, hidden_states):
        """hidden_states: [seqlen, batch, hidden]. Returns output of the same shape."""
        assert self.training, "Mamba in NeoX does not support inference!"

        seqlen, batch, dim = hidden_states.shape

        # joint up-projection for x and z: [l, b, d_model] -> [b, 2 * d_inner, l]
        xz, _ = self.in_proj(hidden_states)
        xz = einops.rearrange(xz, "l b d -> b d l")

        # recover the (negative, real) state matrix from its log parameterization
        A = -torch.exp(self.A_log.float())
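
        # fused path: mamba_inner_fn computes the causal conv, the selective scan,
        # and the output projection in a single fused call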
        if self.neox_args.mamba_inner_func_fusion:
            out = mamba_inner_fn(
                xz,
                self.conv1d.weight,
                # cast the conv bias in case it is still held in fp32
                self.conv1d.bias.to(self.precision)
                if self.conv1d.bias is not None
                else self.conv1d.bias,
                self.x_proj.weight,
                self.dt_proj.weight,
                self.out_proj.weight,
                self.out_proj.bias,
                A,
                None,  # B is input-dependent, computed from x_proj inside the kernel
                None,  # C is input-dependent, computed from x_proj inside the kernel
                self.D.float(),
                delta_bias=self.dt_proj.bias.float(),
                delta_softplus=True,
            )
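            # the fused call skipped out_proj's RowParallelLinear, so reduce the
            # partial outputs across model-parallel ranks here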
            if getattr(self, "reduce", None):
                out = self.reduce(out)

            out = einops.rearrange(out, "b l h -> l b h")

            return out
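
        # unfused path: split the up-projection into the SSM input x and the gate z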
        x, z = xz.chunk(2, dim=1)
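
        # causal depthwise convolution along the sequence (padding trimmed back
        # to seqlen), followed by the SiLU activation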
        if not self.neox_args.mamba_causal_conv_fusion:
            self.conv1d.to(self.precision)
            x = self.act_fn(self.conv1d(x)[..., :seqlen])
        else:
            # causal_conv1d_fn fuses the convolution with the SiLU activation
            x = causal_conv1d_fn(
                x=x,
                weight=einops.rearrange(self.conv1d.weight, "d 1 w -> d w"),
                bias=self.conv1d.bias.to(self.precision)
                if self.conv1d.bias is not None
                else self.conv1d.bias,
                activation="silu",
            )
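
        # compute the input-dependent dt, B, and C from x, then restore the
        # (batch, ..., seqlen) layouts expected by the scan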
        x_dbl, _ = self.x_proj(einops.rearrange(x, "b d l -> (b l) d"))

        dt, B, C = torch.split(
            x_dbl, [self.dt_rank, self.d_state, self.d_state], dim=-1
        )

        dt = self.dt_proj.weight @ dt.t()
        dt = einops.rearrange(dt, "d (b l) -> b d l", l=seqlen)

        B = einops.rearrange(B, "(b l) d_state -> b d_state l", l=seqlen).contiguous()
        C = einops.rearrange(C, "(b l) d_state -> b d_state l", l=seqlen).contiguous()
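
        # selective state-space scan; the gate z and the D skip term are applied
        # inside the scan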
        y = self.selective_scan(
            x,
            dt,
            A,
            B,
            C,
            self.D.float(),
            z=z,
            delta_bias=self.dt_proj.bias.float(),
            delta_softplus=True,
        )
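
        # down-project back to d_model and restore the [seqlen, batch, hidden] layout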
        y = einops.rearrange(y, "b d l -> b l d")

        out, _ = self.out_proj(y)

        out = einops.rearrange(out, "b l h -> l b h")

        return out


class ParallelMambaResidualLayer(nn.Module):
    """
    Pre-norm Mamba Block with residual connection. No parallelism yet supported.
    """

    def __init__(
        self,
        neox_args,
        init_method,
        output_layer_init_method,
        layer_number,
    ):
        super().__init__()

        self.layer_number = layer_number

        norm, eps = get_norm(neox_args)
        self.norm = norm(neox_args.hidden_size, eps=eps)

        self.mixer = ParallelMambaBlock(
            neox_args=neox_args,
            init_method=init_method,
            output_layer_init_method=output_layer_init_method,
        )

    def forward(self, x, attention_mask=None, layer_past=None):
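        # pre-norm residual: x = x + mixer(norm(x)); attention_mask and layer_past
        # are accepted only for interface compatibility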
        residual = x

        hidden_states = self.mixer(self.norm(x))

        return hidden_states + residual


class ParallelMambaResidualLayerPipe(ParallelMambaResidualLayer):
    """Extends ParallelMambaResidualLayer to forward attention_mask through the pipeline. DeepSpeed requires this."""

    def forward(self, args):
        assert (
            len(args) == 2
        ), "MambaResidualLayerPipe expects 2 arguments - hidden_states and attention_mask"
        hidden_states, attention_mask = args
        return super().forward(hidden_states, attention_mask), attention_mask