pr-include-rev-in-flake #1
opened by drbh (HF Staff)
- README.md +9 -80
- build.toml +2 -0
- build/torch-universal/triton_layer_norm/__init__.py +2 -114
- build/torch-universal/triton_layer_norm/_ops.py +0 -8
- build/torch-universal/triton_layer_norm/layer_norm.py +244 -338
- build/torch-universal/triton_layer_norm/layers.py +2 -44
- build/torch-universal/triton_layer_norm/utils/__init__.py +0 -0
- build/torch-universal/triton_layer_norm/utils/library.py +0 -66
- build/torch-universal/triton_layer_norm/utils/torch.py +0 -21
- flake.lock +0 -168
- flake.nix +1 -7
- tests/test_layer_norm.py +0 -373
- torch-ext/triton_layer_norm/__init__.py +2 -114
- torch-ext/triton_layer_norm/layer_norm.py +244 -338
- torch-ext/triton_layer_norm/layers.py +2 -44
- torch-ext/triton_layer_norm/utils/__init__.py +0 -0
- torch-ext/triton_layer_norm/utils/library.py +0 -66
- torch-ext/triton_layer_norm/utils/torch.py +0 -21
README.md
CHANGED
@@ -1,80 +1,9 @@
----
-license: bsd-3-clause
-tags:
--
----
-
-
-
-
-
-## Functions
-
-### Function `layer_norm`
-
-`(x: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor, residual: Optional[torch.Tensor] = None, x1: Optional[torch.Tensor] = None, weight1: Optional[torch.Tensor] = None, bias1: Optional[torch.Tensor] = None, eps: float = 1e-06, dropout_p: float = 0.0, rowscale=None, prenorm: bool = False, residual_in_fp32: bool = False, is_rms_norm: bool = False, return_dropout_mask: bool = False, out: Optional[torch.Tensor] = None, residual_out: Optional[torch.Tensor] = None)`
-
-Apply layer normalization to the input tensor with Triton acceleration.
-
-### Parameters
-
-- **x** (*torch.Tensor*) --
-  Input tensor to normalize.
-- **weight** (*torch.Tensor*) --
-  Scale parameter for normalization.
-- **bias** (*torch.Tensor*) --
-  Shift parameter for normalization.
-- **residual** (*torch.Tensor*, *optional*) --
-  Optional residual tensor to add to the input before normalization.
-- **x1** (*torch.Tensor*, *optional*) --
-  Optional second input tensor to combine with *x*. When provided, the function
-  first adds *x1* to *x* and then applies normalization.
-- **weight1** (*torch.Tensor*, *optional*) --
-  Scale parameter for the second normalization.
-- **bias1** (*torch.Tensor*, *optional*) --
-  Shift parameter for the second normalization.
-- **eps** (*float*, *optional*, defaults to 1e-6) --
-  Small constant added for numerical stability in normalization.
-- **dropout_p** (*float*, *optional*, defaults to 0.0) --
-  Dropout probability. If greater than 0, applies dropout to the input before
-  normalization and residual addition.
-- **rowscale** (*torch.Tensor*, *optional*) --
-  Optional scaling factor applied to each row of the input tensor.
-  Not compatible with the use of *x1*.
-- **prenorm** (*bool*, *optional*, defaults to False) --
-  If True, returns both the normalized output and the unnormalized input+residual.
-- **residual_in_fp32** (*bool*, *optional*, defaults to False) --
-  If True, performs the residual connection in FP32 precision.
-- **is_rms_norm** (*bool*, *optional*, defaults to False) --
-  If True, uses RMS normalization instead of layer normalization.
-- **return_dropout_mask** (*bool*, *optional*, defaults to False) --
-  If True, returns the dropout mask used for the computation.
-- **out** (*torch.Tensor*, *optional*) --
-  Output tensor for the normalized result. If *None*, a new tensor is allocated.
-- **residual_out** (*torch.Tensor*, *optional*) --
-  Output tensor for the residual result when using prenorm. If *None*, a new tensor
-  is allocated when needed.
-
-### Returns
-
-**Type**: *torch.Tensor* or tuple of *torch.Tensor*
-
-- The normalized input.
-- The second normalization of the input if *weight1* is provided.
-- The residual tensor if *prenorm* is set.
-- The dropout mask if *return_dropout_mask* is set.
-- The dropout mask for *x1* if *x1* is provided and *return_dropout_mask* is set.
-
-## Layers
-
-### Class `LlamaRMSNorm`
-
-No documentation available.
-
-#### Methods
-
-##### Method `forward`
-
-`(self, hidden_states: torch.Tensor) -> torch.Tensor`
-
-No documentation available.
+---
+license: bsd-3-clause
+tags:
+- kernel
+---
+
+## triton-layer-norm
+
+Triton layer norm [from flash-attention](https://github.com/Dao-AILab/flash-attention).
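For orientation, a minimal usage sketch of the two public entry points the trimmed README still refers to (`layer_norm_fn` and `rms_norm_fn`); the import path, shapes, and dtypes here are illustrative assumptions, not part of the diff:

```python
import torch
from triton_layer_norm import layer_norm_fn, rms_norm_fn

hidden = 768
x = torch.randn(4, 1024, hidden, device="cuda", dtype=torch.float16)
weight = torch.ones(hidden, device="cuda", dtype=torch.float16)
bias = torch.zeros(hidden, device="cuda", dtype=torch.float16)

# Plain fused LayerNorm
y = layer_norm_fn(x, weight, bias, eps=1e-6)

# RMSNorm over a residual stream; prenorm=True also returns the updated residual
residual = torch.zeros_like(x)
y, new_residual = rms_norm_fn(x, weight, None, residual=residual, prenorm=True)
```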
build.toml
CHANGED
@@ -1,3 +1,5 @@
 [general]
 name = "triton_layer_norm"
+
+[torch]
 universal = true
build/torch-universal/triton_layer_norm/__init__.py
CHANGED
@@ -1,117 +1,5 @@
-"""Triton layer normalization kernels
-
-This kernel implements layers normalization using Triton. This kernel is from
-the `flash-attention <https://github.com/Dao-AILab/flash-attention>`_ project.
-"""
-
-from typing import Optional
-
-import torch
-
-from . import layers
 from .layer_norm import layer_norm_fn, layer_norm_linear_fn, rms_norm_fn
 
+from . import layers
 
-def layer_norm(
-    x: torch.Tensor,
-    weight: torch.Tensor,
-    bias: torch.Tensor,
-    residual: Optional[torch.Tensor] = None,
-    x1: Optional[torch.Tensor] = None,
-    weight1: Optional[torch.Tensor] = None,
-    bias1: Optional[torch.Tensor] = None,
-    eps: float = 1e-6,
-    dropout_p: float = 0.0,
-    rowscale=None,
-    prenorm: bool = False,
-    residual_in_fp32: bool = False,
-    zero_centered_weight: bool = False,
-    is_rms_norm: bool = False,
-    return_dropout_mask: bool = False,
-    out: Optional[torch.Tensor] = None,
-    residual_out: Optional[torch.Tensor] = None,
-):
-    """
-    Apply layer normalization to the input tensor with Triton acceleration.
-
-    Args:
-        x (`torch.Tensor`):
-            Input tensor to normalize.
-        weight (`torch.Tensor`):
-            Scale parameter for normalization.
-        bias (`torch.Tensor`):
-            Shift parameter for normalization.
-        residual (`torch.Tensor`, *optional*):
-            Optional residual tensor to add to the input before normalization.
-        x1 (`torch.Tensor`, *optional*):
-            Optional second input tensor to combine with `x`. When provided, the function
-            first adds `x1` to `x` and then applies normalization.
-        weight1 (`torch.Tensor`, *optional*):
-            Scale parameter for the second normalization.
-        bias1 (`torch.Tensor`, *optional*):
-            Shift parameter for the second normalization.
-        eps (`float`, *optional*, defaults to 1e-6):
-            Small constant added for numerical stability in normalization.
-        dropout_p (`float`, *optional*, defaults to 0.0):
-            Dropout probability. If greater than 0, applies dropout to the input before
-            normalization and residual addition.
-        rowscale (`torch.Tensor`, *optional*):
-            Optional scaling factor applied to each row of the input tensor.
-            Not compatible with the use of `x1`.
-        prenorm (`bool`, *optional*, defaults to False):
-            If True, returns both the normalized output and the unnormalized input+residual.
-        residual_in_fp32 (`bool`, *optional*, defaults to False):
-            If True, performs the residual connection in FP32 precision.
-        zero_centered_weight (`bool`, *optional*, defaults to False):
-            When set to true, 1.0 is added to the weight before applying it.
-        is_rms_norm (`bool`, *optional*, defaults to False):
-            If True, uses RMS normalization instead of layer normalization.
-        return_dropout_mask (`bool`, *optional*, defaults to False):
-            If True, returns the dropout mask used for the computation.
-        out (`torch.Tensor`, *optional*):
-            Output tensor for the normalized result. If `None`, a new tensor is allocated.
-        residual_out (`torch.Tensor`, *optional*):
-            Output tensor for the residual result when using prenorm. If `None`, a new tensor
-            is allocated when needed.
-
-    Returns:
-        `torch.Tensor` or tuple of `torch.Tensor`:
-        - The normalized input.
-        - The second normalization of the input if `weight1` is provided.
-        - The residual tensor if `prenorm` is set.
-        - The dropout mask if `return_dropout_mask` is set.
-        - The dropout mask for `x1` if `x1` is provided and `return_dropout_mask` is set.
-    """
-    return layer_norm_fn(
-        x,
-        weight,
-        bias,
-        residual,
-        x1,
-        weight1,
-        bias1,
-        eps,
-        dropout_p,
-        rowscale,
-        prenorm,
-        residual_in_fp32,
-        is_rms_norm,
-        return_dropout_mask,
-        out=out,
-        residual_out=residual_out,
-    )
-
-
-__kernel_metadata__ = {
-    "license": "bsd-3-clause",
-}
-
-
-__all__ = [
-    "__kernel_metadata__",
-    "layers",
-    "layer_norm",
-    "layer_norm_fn",
-    "layer_norm_linear_fn",
-    "rms_norm_fn",
-]
+__all__ = ["layers", "layer_norm_fn", "layer_norm_linear_fn", "rms_norm_fn"]
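Since the convenience `layer_norm(...)` wrapper above is gone, callers now use `layer_norm_fn` directly. A hedged sketch of the pre-norm residual pattern that the removed docstring describes (function name and shapes are illustrative):

```python
import torch
from triton_layer_norm import layer_norm_fn

def prenorm_step(x, residual, weight, bias, eps=1e-6):
    # Fused "add residual, then normalize"; with prenorm=True the call returns
    # both the normalized activations and the updated residual stream, which
    # residual_in_fp32=True keeps in float32 for numerical stability.
    normed, residual = layer_norm_fn(
        x, weight, bias,
        residual=residual,
        eps=eps,
        prenorm=True,
        residual_in_fp32=True,
    )
    return normed, residual
```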
build/torch-universal/triton_layer_norm/_ops.py
DELETED
@@ -1,8 +0,0 @@
-import torch
-ops = torch.ops._triton_layer_norm_9b61b27_dirty
-
-def add_op_namespace_prefix(op_name: str):
-    """
-    Prefix op by namespace.
-    """
-    return f"_triton_layer_norm_9b61b27_dirty::{op_name}"
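The deleted module only built fully qualified op names for the old torch.library registration; a self-contained sketch of what it did, taken from the file contents above:

```python
# Build "<namespace>::<op>" names so the Triton ops could be registered under a
# build-specific namespace (the hash-suffixed name comes from the deleted file).
_NAMESPACE = "_triton_layer_norm_9b61b27_dirty"

def add_op_namespace_prefix(op_name: str) -> str:
    return f"{_NAMESPACE}::{op_name}"

assert add_op_namespace_prefix("layer_norm_fwd_impl") == (
    "_triton_layer_norm_9b61b27_dirty::layer_norm_fwd_impl"
)
```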
build/torch-universal/triton_layer_norm/layer_norm.py
CHANGED
|
@@ -7,40 +7,14 @@
 # The models we train have hidden dim up to 8k anyway (e.g. Llama 70B), so this is fine.
 
 import math
-from typing import Optional, List
 
 import torch
 import torch.nn.functional as F
-from torch import
 
 import triton
 import triton.language as tl
 
-from ._ops import add_op_namespace_prefix
-from .utils.torch import custom_fwd, custom_bwd
-from .utils.library import triton_op
-
-
-def maybe_contiguous_lastdim(x):
-    return x.contiguous() if x is not None and x.stride(-1) != 1 else x
-
-
-def maybe_contiguous(x):
-    return x.contiguous() if x is not None else None
-
-
-def triton_autotune_configs():
-    # Return configs with a valid warp count for the current device
-    configs = []
-    # Maximum threads per block is architecture-dependent in theory, but in reality all are 1024
-    max_threads_per_block = 1024
-    # Default to warp size 32 if not defined by device
-    warp_size = getattr(torch.cuda.get_device_properties(torch.cuda.current_device()), "warp_size", 32)
-    # Autotune for warp counts which are powers of 2 and do not exceed thread per block limit
-    return [triton.Config({}, num_warps=warp_count) for warp_count in [1, 2, 4, 8, 16, 32]
-            if warp_count * warp_size <= max_threads_per_block]
-    # return [triton.Config({}, num_warps=8)]
-
 
 def layer_norm_ref(
     x,
@@ -54,7 +28,6 @@ def layer_norm_ref(
|
|
| 54 |
dropout_p=0.0,
|
| 55 |
rowscale=None,
|
| 56 |
prenorm=False,
|
| 57 |
-
zero_centered_weight=False,
|
| 58 |
dropout_mask=None,
|
| 59 |
dropout_mask1=None,
|
| 60 |
upcast=False,
|
|
@@ -68,10 +41,6 @@ def layer_norm_ref(
|
|
| 68 |
x1 = x1.float() if x1 is not None else None
|
| 69 |
weight1 = weight1.float() if weight1 is not None else None
|
| 70 |
bias1 = bias1.float() if bias1 is not None else None
|
| 71 |
-
if zero_centered_weight:
|
| 72 |
-
weight = weight + 1.0
|
| 73 |
-
if weight1 is not None:
|
| 74 |
-
weight1 = weight1 + 1.0
|
| 75 |
if x1 is not None:
|
| 76 |
assert rowscale is None, "rowscale is not supported with parallel LayerNorm"
|
| 77 |
if rowscale is not None:
|
|
@@ -90,9 +59,9 @@ def layer_norm_ref(
|
|
| 90 |
x = x + x1
|
| 91 |
if residual is not None:
|
| 92 |
x = (x + residual).to(x.dtype)
|
| 93 |
-
out = F.layer_norm(
|
| 94 |
-
dtype
|
| 95 |
-
)
|
| 96 |
if weight1 is None:
|
| 97 |
return out if not prenorm else (out, x)
|
| 98 |
else:
|
|
@@ -114,7 +83,6 @@ def rms_norm_ref(
|
|
| 114 |
dropout_p=0.0,
|
| 115 |
rowscale=None,
|
| 116 |
prenorm=False,
|
| 117 |
-
zero_centered_weight=False,
|
| 118 |
dropout_mask=None,
|
| 119 |
dropout_mask1=None,
|
| 120 |
upcast=False,
|
|
@@ -128,10 +96,6 @@ def rms_norm_ref(
|
|
| 128 |
x1 = x1.float() if x1 is not None else None
|
| 129 |
weight1 = weight1.float() if weight1 is not None else None
|
| 130 |
bias1 = bias1.float() if bias1 is not None else None
|
| 131 |
-
if zero_centered_weight:
|
| 132 |
-
weight = weight + 1.0
|
| 133 |
-
if weight1 is not None:
|
| 134 |
-
weight1 = weight1 + 1.0
|
| 135 |
if x1 is not None:
|
| 136 |
assert rowscale is None, "rowscale is not supported with parallel LayerNorm"
|
| 137 |
if rowscale is not None:
|
|
@@ -151,26 +115,34 @@ def rms_norm_ref(
|
|
| 151 |
if residual is not None:
|
| 152 |
x = (x + residual).to(x.dtype)
|
| 153 |
rstd = 1 / torch.sqrt((x.square()).mean(dim=-1, keepdim=True) + eps)
|
| 154 |
-
out = ((x * rstd * weight) + bias if bias is not None else (x * rstd * weight)).to(
|
|
|
|
|
|
|
| 155 |
if weight1 is None:
|
| 156 |
return out if not prenorm else (out, x)
|
| 157 |
else:
|
| 158 |
-
out1 = (
|
| 159 |
-
|
| 160 |
-
)
|
| 161 |
return (out, out1) if not prenorm else (out, out1, x)
|
| 162 |
|
| 163 |
|
| 164 |
@triton.autotune(
|
| 165 |
-
configs=
|
| 166 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 167 |
)
|
| 168 |
-
# torch compile doesn't like triton.heuristics, so we set these manually when calling the kernel
|
| 169 |
# @triton.heuristics({"HAS_BIAS": lambda args: args["B"] is not None})
|
| 170 |
# @triton.heuristics({"HAS_RESIDUAL": lambda args: args["RESIDUAL"] is not None})
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
| 174 |
@triton.jit
|
| 175 |
def _layer_norm_fwd_1pass_kernel(
|
| 176 |
X, # pointer to the input
|
|
@@ -186,7 +158,6 @@ def _layer_norm_fwd_1pass_kernel(
|
|
| 186 |
ROWSCALE,
|
| 187 |
SEEDS, # Dropout seeds for each row
|
| 188 |
DROPOUT_MASK,
|
| 189 |
-
DROPOUT_MASK1,
|
| 190 |
Mean, # pointer to the mean
|
| 191 |
Rstd, # pointer to the 1/std
|
| 192 |
stride_x_row, # how much to increase the pointer when moving by 1 row
|
|
@@ -199,7 +170,6 @@ def _layer_norm_fwd_1pass_kernel(
|
|
| 199 |
N, # number of columns in X
|
| 200 |
eps, # epsilon to avoid division by zero
|
| 201 |
dropout_p, # Dropout probability
|
| 202 |
-
zero_centered_weight, # If true, add 1.0 to the weight
|
| 203 |
IS_RMS_NORM: tl.constexpr,
|
| 204 |
BLOCK_N: tl.constexpr,
|
| 205 |
HAS_RESIDUAL: tl.constexpr,
|
|
@@ -233,7 +203,9 @@ def _layer_norm_fwd_1pass_kernel(
|
|
| 233 |
if HAS_DROPOUT:
|
| 234 |
# Compute dropout mask
|
| 235 |
# 7 rounds is good enough, and reduces register pressure
|
| 236 |
-
keep_mask =
|
|
|
|
|
|
|
| 237 |
x = tl.where(keep_mask, x / (1.0 - dropout_p), 0.0)
|
| 238 |
if STORE_DROPOUT_MASK:
|
| 239 |
tl.store(DROPOUT_MASK + row * N + cols, keep_mask, mask=cols < N)
|
|
@@ -246,11 +218,12 @@ def _layer_norm_fwd_1pass_kernel(
|
|
| 246 |
# Compute dropout mask
|
| 247 |
# 7 rounds is good enough, and reduces register pressure
|
| 248 |
keep_mask = (
|
| 249 |
-
tl.rand(tl.load(SEEDS + M + row).to(tl.uint32), cols, n_rounds=7)
|
|
|
|
| 250 |
)
|
| 251 |
x1 = tl.where(keep_mask, x1 / (1.0 - dropout_p), 0.0)
|
| 252 |
if STORE_DROPOUT_MASK:
|
| 253 |
-
tl.store(
|
| 254 |
x += x1
|
| 255 |
if HAS_RESIDUAL:
|
| 256 |
residual = tl.load(RESIDUAL + cols, mask=cols < N, other=0.0).to(tl.float32)
|
|
@@ -270,8 +243,6 @@ def _layer_norm_fwd_1pass_kernel(
|
|
| 270 |
# Normalize and apply linear transformation
|
| 271 |
mask = cols < N
|
| 272 |
w = tl.load(W + cols, mask=mask).to(tl.float32)
|
| 273 |
-
if zero_centered_weight:
|
| 274 |
-
w += 1.0
|
| 275 |
if HAS_BIAS:
|
| 276 |
b = tl.load(B + cols, mask=mask).to(tl.float32)
|
| 277 |
x_hat = (x - mean) * rstd if not IS_RMS_NORM else x * rstd
|
|
@@ -280,8 +251,6 @@ def _layer_norm_fwd_1pass_kernel(
|
|
| 280 |
tl.store(Y + cols, y, mask=mask)
|
| 281 |
if HAS_W1:
|
| 282 |
w1 = tl.load(W1 + cols, mask=mask).to(tl.float32)
|
| 283 |
-
if zero_centered_weight:
|
| 284 |
-
w1 += 1.0
|
| 285 |
if HAS_B1:
|
| 286 |
b1 = tl.load(B1 + cols, mask=mask).to(tl.float32)
|
| 287 |
y1 = x_hat * w1 + b1 if HAS_B1 else x_hat * w1
|
|
@@ -289,87 +258,25 @@ def _layer_norm_fwd_1pass_kernel(
|
|
| 289 |
|
| 290 |
|
| 291 |
def _layer_norm_fwd(
|
| 292 |
-
x
|
| 293 |
-
weight
|
| 294 |
-
bias
|
| 295 |
-
eps
|
| 296 |
-
residual
|
| 297 |
-
x1
|
| 298 |
-
weight1
|
| 299 |
-
bias1
|
| 300 |
-
dropout_p
|
| 301 |
-
rowscale
|
| 302 |
-
out_dtype
|
| 303 |
-
residual_dtype
|
| 304 |
-
|
| 305 |
-
|
| 306 |
-
|
| 307 |
-
|
| 308 |
-
|
| 309 |
-
) -> (Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor):
|
| 310 |
-
# Need to wrap to handle the case where residual_out is a alias of x, which makes torch.library
|
| 311 |
-
# and torch.compile unhappy. Also allocate memory for out and residual_out if they are None
|
| 312 |
-
# so that _layer_norm_fwd_impl doesn't have to return them.
|
| 313 |
-
if out is None:
|
| 314 |
-
out = torch.empty_like(x, dtype=x.dtype if out_dtype is None else out_dtype)
|
| 315 |
if residual is not None:
|
| 316 |
residual_dtype = residual.dtype
|
| 317 |
-
if residual_out is None and (
|
| 318 |
-
residual is not None
|
| 319 |
-
or (residual_dtype is not None and residual_dtype != x.dtype)
|
| 320 |
-
or dropout_p > 0.0
|
| 321 |
-
or rowscale is not None
|
| 322 |
-
or x1 is not None
|
| 323 |
-
):
|
| 324 |
-
residual_out = torch.empty_like(
|
| 325 |
-
x, dtype=residual_dtype if residual_dtype is not None else x.dtype
|
| 326 |
-
)
|
| 327 |
-
else:
|
| 328 |
-
residual_out = None
|
| 329 |
-
y1, mean, rstd, seeds, dropout_mask, dropout_mask1 = _layer_norm_fwd_impl(
|
| 330 |
-
x,
|
| 331 |
-
weight,
|
| 332 |
-
bias,
|
| 333 |
-
eps,
|
| 334 |
-
out,
|
| 335 |
-
residual=residual,
|
| 336 |
-
x1=x1,
|
| 337 |
-
weight1=weight1,
|
| 338 |
-
bias1=bias1,
|
| 339 |
-
dropout_p=dropout_p,
|
| 340 |
-
rowscale=rowscale,
|
| 341 |
-
zero_centered_weight=zero_centered_weight,
|
| 342 |
-
is_rms_norm=is_rms_norm,
|
| 343 |
-
return_dropout_mask=return_dropout_mask,
|
| 344 |
-
residual_out=residual_out,
|
| 345 |
-
)
|
| 346 |
-
# residual_out is None if residual is None and residual_dtype == input_dtype and dropout_p == 0.0
|
| 347 |
-
if residual_out is None:
|
| 348 |
-
residual_out = x
|
| 349 |
-
return out, y1, mean, rstd, residual_out, seeds, dropout_mask, dropout_mask1
|
| 350 |
-
|
| 351 |
-
|
| 352 |
-
# [2025-04-28] torch.library.triton_op ignores the schema argument, but here we need the schema
|
| 353 |
-
# since we're returning a tuple of tensors
|
| 354 |
-
@triton_op(add_op_namespace_prefix("layer_norm_fwd_impl"), mutates_args={"out", "residual_out"},
|
| 355 |
-
schema="(Tensor x, Tensor weight, Tensor bias, float eps, Tensor(a!) out, Tensor? residual, Tensor? x1, Tensor? weight1, Tensor? bias1, float dropout_p, Tensor? rowscale, bool zero_centered_weight, bool is_rms_norm, bool return_dropout_mask, Tensor(a!)? residual_out) -> (Tensor y1, Tensor mean, Tensor rstd, Tensor seeds, Tensor dropout_mask, Tensor dropout_mask1)")
|
| 356 |
-
def _layer_norm_fwd_impl(
|
| 357 |
-
x: Tensor,
|
| 358 |
-
weight: Tensor,
|
| 359 |
-
bias: Tensor,
|
| 360 |
-
eps: float,
|
| 361 |
-
out: Tensor,
|
| 362 |
-
residual: Optional[Tensor] = None,
|
| 363 |
-
x1: Optional[Tensor] = None,
|
| 364 |
-
weight1: Optional[Tensor] = None,
|
| 365 |
-
bias1: Optional[Tensor] = None,
|
| 366 |
-
dropout_p: float = 0.0,
|
| 367 |
-
rowscale: Optional[Tensor] = None,
|
| 368 |
-
zero_centered_weight: bool = False,
|
| 369 |
-
is_rms_norm: bool = False,
|
| 370 |
-
return_dropout_mask: bool = False,
|
| 371 |
-
residual_out: Optional[Tensor] = None
|
| 372 |
-
) -> (Tensor, Tensor, Tensor, Tensor, Tensor, Tensor):
|
| 373 |
M, N = x.shape
|
| 374 |
assert x.stride(-1) == 1
|
| 375 |
if residual is not None:
|
|
@@ -393,17 +300,41 @@ def _layer_norm_fwd_impl(
|
|
| 393 |
if rowscale is not None:
|
| 394 |
assert rowscale.is_contiguous()
|
| 395 |
assert rowscale.shape == (M,)
|
| 396 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 397 |
assert out.stride(-1) == 1
|
| 398 |
-
if residual_out is not None:
|
| 399 |
-
assert residual_out.shape == x.shape
|
| 400 |
-
assert residual_out.stride(-1) == 1
|
| 401 |
if weight1 is not None:
|
| 402 |
y1 = torch.empty_like(out)
|
| 403 |
assert y1.stride(-1) == 1
|
| 404 |
else:
|
| 405 |
y1 = None
|
| 406 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 407 |
rstd = torch.empty((M,), dtype=torch.float32, device=x.device)
|
| 408 |
if dropout_p > 0.0:
|
| 409 |
seeds = torch.randint(
|
|
@@ -412,20 +343,18 @@ def _layer_norm_fwd_impl(
|
|
| 412 |
else:
|
| 413 |
seeds = None
|
| 414 |
if return_dropout_mask and dropout_p > 0.0:
|
| 415 |
-
dropout_mask = torch.empty(
|
| 416 |
-
|
| 417 |
-
|
| 418 |
-
else:
|
| 419 |
-
dropout_mask1 = None
|
| 420 |
else:
|
| 421 |
-
dropout_mask
|
| 422 |
# Less than 64KB per feature: enqueue fused kernel
|
| 423 |
MAX_FUSED_SIZE = 65536 // x.element_size()
|
| 424 |
BLOCK_N = min(MAX_FUSED_SIZE, triton.next_power_of_2(N))
|
| 425 |
if N > BLOCK_N:
|
| 426 |
raise RuntimeError("This layer norm doesn't support feature dim >= 64KB.")
|
| 427 |
with torch.cuda.device(x.device.index):
|
| 428 |
-
|
| 429 |
x,
|
| 430 |
out,
|
| 431 |
weight,
|
|
@@ -439,7 +368,6 @@ def _layer_norm_fwd_impl(
|
|
| 439 |
rowscale,
|
| 440 |
seeds,
|
| 441 |
dropout_mask,
|
| 442 |
-
dropout_mask1,
|
| 443 |
mean,
|
| 444 |
rstd,
|
| 445 |
x.stride(0),
|
|
@@ -452,8 +380,6 @@ def _layer_norm_fwd_impl(
|
|
| 452 |
N,
|
| 453 |
eps,
|
| 454 |
dropout_p,
|
| 455 |
-
# Passing bool make torch inductor very unhappy since it then tries to compare to int_max
|
| 456 |
-
int(zero_centered_weight),
|
| 457 |
is_rms_norm,
|
| 458 |
BLOCK_N,
|
| 459 |
residual is not None,
|
|
@@ -462,26 +388,50 @@ def _layer_norm_fwd_impl(
|
|
| 462 |
dropout_p > 0.0,
|
| 463 |
dropout_mask is not None,
|
| 464 |
rowscale is not None,
|
| 465 |
-
HAS_X1=x1 is not None,
|
| 466 |
-
HAS_W1=weight1 is not None,
|
| 467 |
-
HAS_B1=bias1 is not None,
|
| 468 |
)
|
| 469 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 470 |
|
| 471 |
|
| 472 |
@triton.autotune(
|
| 473 |
-
configs=
|
| 474 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 475 |
)
|
| 476 |
-
# torch compile doesn't like triton.heuristics, so we set these manually when calling the kernel
|
| 477 |
# @triton.heuristics({"HAS_BIAS": lambda args: args["B"] is not None})
|
| 478 |
# @triton.heuristics({"HAS_DRESIDUAL": lambda args: args["DRESIDUAL"] is not None})
|
| 479 |
# @triton.heuristics({"STORE_DRESIDUAL": lambda args: args["DRESIDUAL_IN"] is not None})
|
| 480 |
-
|
| 481 |
-
|
| 482 |
-
|
| 483 |
-
|
| 484 |
-
|
| 485 |
@triton.jit
|
| 486 |
def _layer_norm_bwd_kernel(
|
| 487 |
X, # pointer to the input
|
|
@@ -515,7 +465,6 @@ def _layer_norm_bwd_kernel(
|
|
| 515 |
N, # number of columns in X
|
| 516 |
eps, # epsilon to avoid division by zero
|
| 517 |
dropout_p,
|
| 518 |
-
zero_centered_weight,
|
| 519 |
rows_per_program,
|
| 520 |
IS_RMS_NORM: tl.constexpr,
|
| 521 |
BLOCK_N: tl.constexpr,
|
|
@@ -549,14 +498,10 @@ def _layer_norm_bwd_kernel(
|
|
| 549 |
if RECOMPUTE_OUTPUT:
|
| 550 |
Y += row_start * stride_y_row
|
| 551 |
w = tl.load(W + cols, mask=mask).to(tl.float32)
|
| 552 |
-
if zero_centered_weight:
|
| 553 |
-
w += 1.0
|
| 554 |
if RECOMPUTE_OUTPUT and HAS_BIAS:
|
| 555 |
b = tl.load(B + cols, mask=mask, other=0.0).to(tl.float32)
|
| 556 |
if HAS_DY1:
|
| 557 |
w1 = tl.load(W1 + cols, mask=mask).to(tl.float32)
|
| 558 |
-
if zero_centered_weight:
|
| 559 |
-
w1 += 1.0
|
| 560 |
dw = tl.zeros((BLOCK_N,), dtype=tl.float32)
|
| 561 |
if HAS_BIAS:
|
| 562 |
db = tl.zeros((BLOCK_N,), dtype=tl.float32)
|
|
@@ -605,14 +550,18 @@ def _layer_norm_bwd_kernel(
|
|
| 605 |
if HAS_DX1:
|
| 606 |
if HAS_DROPOUT:
|
| 607 |
keep_mask = (
|
| 608 |
-
tl.rand(tl.load(SEEDS + M + row).to(tl.uint32), cols, n_rounds=7)
|
|
|
|
| 609 |
)
|
| 610 |
dx1 = tl.where(keep_mask, dx / (1.0 - dropout_p), 0.0)
|
| 611 |
else:
|
| 612 |
dx1 = dx
|
| 613 |
tl.store(DX1 + cols, dx1, mask=mask)
|
| 614 |
if HAS_DROPOUT:
|
| 615 |
-
keep_mask =
|
|
|
|
|
|
|
|
|
|
| 616 |
dx = tl.where(keep_mask, dx / (1.0 - dropout_p), 0.0)
|
| 617 |
if HAS_ROWSCALE:
|
| 618 |
rowscale = tl.load(ROWSCALE + row).to(tl.float32)
|
|
@@ -642,93 +591,31 @@ def _layer_norm_bwd_kernel(
|
|
| 642 |
|
| 643 |
|
| 644 |
def _layer_norm_bwd(
|
| 645 |
-
dy
|
| 646 |
-
x
|
| 647 |
-
weight
|
| 648 |
-
bias
|
| 649 |
-
eps
|
| 650 |
-
mean
|
| 651 |
-
rstd
|
| 652 |
-
dresidual
|
| 653 |
-
dy1
|
| 654 |
-
weight1
|
| 655 |
-
bias1
|
| 656 |
-
seeds
|
| 657 |
-
dropout_p
|
| 658 |
-
rowscale
|
| 659 |
-
has_residual
|
| 660 |
-
has_x1
|
| 661 |
-
|
| 662 |
-
|
| 663 |
-
|
| 664 |
-
|
| 665 |
-
) -> (Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor):
|
| 666 |
-
# Need to wrap to handle the case where dresidual_in or dx1 are aliases of x,
|
| 667 |
-
# which makes torch.library unhappy
|
| 668 |
-
dx, dw, db, dresidual_in, dx1, dw1, db1, y = _layer_norm_bwd_impl(
|
| 669 |
-
dy,
|
| 670 |
-
x,
|
| 671 |
-
weight,
|
| 672 |
-
bias,
|
| 673 |
-
eps,
|
| 674 |
-
mean,
|
| 675 |
-
rstd,
|
| 676 |
-
dresidual,
|
| 677 |
-
dy1,
|
| 678 |
-
weight1,
|
| 679 |
-
bias1,
|
| 680 |
-
seeds,
|
| 681 |
-
dropout_p,
|
| 682 |
-
rowscale,
|
| 683 |
-
has_residual,
|
| 684 |
-
has_x1,
|
| 685 |
-
zero_centered_weight,
|
| 686 |
-
is_rms_norm,
|
| 687 |
-
x_dtype=x_dtype,
|
| 688 |
-
recompute_output=recompute_output,
|
| 689 |
-
)
|
| 690 |
-
# Don't need to compute dresidual_in separately in this case
|
| 691 |
-
if has_residual and dx.dtype == x.dtype and dropout_p == 0.0 and rowscale is None:
|
| 692 |
-
dresidual_in = dx
|
| 693 |
-
if has_x1 and dropout_p == 0.0:
|
| 694 |
-
dx1 = dx
|
| 695 |
-
return dx, dw, db, dresidual_in, dx1, dw1, db1, y
|
| 696 |
-
|
| 697 |
-
|
| 698 |
-
|
| 699 |
-
@triton_op(add_op_namespace_prefix("layer_norm_bwd_impl"), mutates_args={},
|
| 700 |
-
schema="(Tensor dy, Tensor x, Tensor weight, Tensor bias, float eps, Tensor mean, Tensor rstd, Tensor? dresidual, Tensor? dy1, Tensor? weight1, Tensor? bias1, Tensor? seeds, float dropout_p, Tensor? rowscale, bool has_residual, bool has_x1, bool zero_centered_weight, bool is_rms_norm, ScalarType? x_dtype, bool recompute_output) -> (Tensor dx, Tensor dw, Tensor db, Tensor dresidual_in, Tensor dx1, Tensor dw1, Tensor db1, Tensor y)",
|
| 701 |
-
allow_decomposition=False, # Don't let torch.compile trace inside
|
| 702 |
-
)
|
| 703 |
-
def _layer_norm_bwd_impl(
|
| 704 |
-
dy: Tensor,
|
| 705 |
-
x: Tensor,
|
| 706 |
-
weight: Tensor,
|
| 707 |
-
bias: Tensor,
|
| 708 |
-
eps: float,
|
| 709 |
-
mean: Tensor,
|
| 710 |
-
rstd: Tensor,
|
| 711 |
-
dresidual: Optional[Tensor] = None,
|
| 712 |
-
dy1: Optional[Tensor] = None,
|
| 713 |
-
weight1: Optional[Tensor] = None,
|
| 714 |
-
bias1: Optional[Tensor] = None,
|
| 715 |
-
seeds: Optional[Tensor] = None,
|
| 716 |
-
dropout_p: float = 0.0,
|
| 717 |
-
rowscale: Optional[Tensor] = None,
|
| 718 |
-
has_residual: bool = False,
|
| 719 |
-
has_x1: bool = False,
|
| 720 |
-
zero_centered_weight: bool = False,
|
| 721 |
-
is_rms_norm: bool = False,
|
| 722 |
-
x_dtype: Optional[torch.dtype] = None,
|
| 723 |
-
recompute_output: bool = False,
|
| 724 |
-
) -> (Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor):
|
| 725 |
M, N = x.shape
|
| 726 |
assert x.stride(-1) == 1
|
| 727 |
-
dy = maybe_contiguous_lastdim(dy)
|
| 728 |
assert dy.stride(-1) == 1
|
| 729 |
assert dy.shape == (M, N)
|
| 730 |
if dresidual is not None:
|
| 731 |
-
dresidual = maybe_contiguous_lastdim(dresidual)
|
| 732 |
assert dresidual.stride(-1) == 1
|
| 733 |
assert dresidual.shape == (M, N)
|
| 734 |
assert weight.shape == (N,)
|
|
@@ -737,7 +624,6 @@ def _layer_norm_bwd_impl(
|
|
| 737 |
assert bias.stride(-1) == 1
|
| 738 |
assert bias.shape == (N,)
|
| 739 |
if dy1 is not None:
|
| 740 |
-
dy1 = maybe_contiguous_lastdim(dy1)
|
| 741 |
assert weight1 is not None
|
| 742 |
assert dy1.shape == dy.shape
|
| 743 |
assert dy1.stride(-1) == 1
|
|
@@ -766,18 +652,22 @@ def _layer_norm_bwd_impl(
|
|
| 766 |
else None
|
| 767 |
)
|
| 768 |
dx1 = torch.empty_like(dx) if (has_x1 and dropout_p > 0.0) else None
|
| 769 |
-
y =
|
|
|
|
|
|
|
|
|
|
|
|
|
| 770 |
if recompute_output:
|
| 771 |
-
assert
|
|
|
|
|
|
|
| 772 |
|
| 773 |
# Less than 64KB per feature: enqueue fused kernel
|
| 774 |
MAX_FUSED_SIZE = 65536 // x.element_size()
|
| 775 |
BLOCK_N = min(MAX_FUSED_SIZE, triton.next_power_of_2(N))
|
| 776 |
if N > BLOCK_N:
|
| 777 |
raise RuntimeError("This layer norm doesn't support feature dim >= 64KB.")
|
| 778 |
-
|
| 779 |
-
# latency of the gmem reads/writes, but will increase the time of summing up dw / db.
|
| 780 |
-
sm_count = torch.cuda.get_device_properties(x.device).multi_processor_count * 8
|
| 781 |
_dw = torch.empty((sm_count, N), dtype=torch.float32, device=weight.device)
|
| 782 |
_db = (
|
| 783 |
torch.empty((sm_count, N), dtype=torch.float32, device=bias.device)
|
|
@@ -789,7 +679,7 @@ def _layer_norm_bwd_impl(
|
|
| 789 |
rows_per_program = math.ceil(M / sm_count)
|
| 790 |
grid = (sm_count,)
|
| 791 |
with torch.cuda.device(x.device.index):
|
| 792 |
-
|
| 793 |
x,
|
| 794 |
weight,
|
| 795 |
bias,
|
|
@@ -821,8 +711,6 @@ def _layer_norm_bwd_impl(
|
|
| 821 |
N,
|
| 822 |
eps,
|
| 823 |
dropout_p,
|
| 824 |
-
# Passing bool make torch inductor very unhappy since it then tries to compare to int_max
|
| 825 |
-
int(zero_centered_weight),
|
| 826 |
rows_per_program,
|
| 827 |
is_rms_norm,
|
| 828 |
BLOCK_N,
|
|
@@ -830,22 +718,24 @@ def _layer_norm_bwd_impl(
|
|
| 830 |
dresidual_in is not None,
|
| 831 |
bias is not None,
|
| 832 |
dropout_p > 0.0,
|
| 833 |
-
HAS_ROWSCALE=rowscale is not None,
|
| 834 |
-
HAS_DY1=dy1 is not None,
|
| 835 |
-
HAS_DX1=dx1 is not None,
|
| 836 |
-
HAS_B1=bias1 is not None,
|
| 837 |
-
RECOMPUTE_OUTPUT=y is not None,
|
| 838 |
)
|
| 839 |
dw = _dw.sum(0).to(weight.dtype)
|
| 840 |
db = _db.sum(0).to(bias.dtype) if bias is not None else None
|
| 841 |
dw1 = _dw1.sum(0).to(weight1.dtype) if weight1 is not None else None
|
| 842 |
db1 = _db1.sum(0).to(bias1.dtype) if bias1 is not None else None
|
| 843 |
-
#
|
| 844 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 845 |
|
| 846 |
|
| 847 |
class LayerNormFn(torch.autograd.Function):
|
| 848 |
-
|
| 849 |
@staticmethod
|
| 850 |
def forward(
|
| 851 |
ctx,
|
|
@@ -861,27 +751,34 @@ class LayerNormFn(torch.autograd.Function):
|
|
| 861 |
rowscale=None,
|
| 862 |
prenorm=False,
|
| 863 |
residual_in_fp32=False,
|
| 864 |
-
zero_centered_weight=False,
|
| 865 |
is_rms_norm=False,
|
| 866 |
return_dropout_mask=False,
|
| 867 |
-
out_dtype=None,
|
| 868 |
out=None,
|
| 869 |
-
residual_out=None
|
| 870 |
):
|
| 871 |
x_shape_og = x.shape
|
| 872 |
# reshape input data into 2D tensor
|
| 873 |
-
x =
|
|
|
|
|
|
|
| 874 |
if residual is not None:
|
| 875 |
assert residual.shape == x_shape_og
|
| 876 |
-
residual =
|
|
|
|
|
|
|
| 877 |
if x1 is not None:
|
| 878 |
assert x1.shape == x_shape_og
|
| 879 |
assert rowscale is None, "rowscale is not supported with parallel LayerNorm"
|
| 880 |
-
x1 =
|
|
|
|
|
|
|
| 881 |
weight = weight.contiguous()
|
| 882 |
-
bias
|
| 883 |
-
|
| 884 |
-
|
|
|
|
|
|
|
|
|
|
| 885 |
if rowscale is not None:
|
| 886 |
rowscale = rowscale.reshape(-1).contiguous()
|
| 887 |
residual_dtype = (
|
|
@@ -893,24 +790,24 @@ class LayerNormFn(torch.autograd.Function):
|
|
| 893 |
out = out.reshape(-1, out.shape[-1])
|
| 894 |
if residual_out is not None:
|
| 895 |
residual_out = residual_out.reshape(-1, residual_out.shape[-1])
|
| 896 |
-
y, y1, mean, rstd, residual_out, seeds, dropout_mask, dropout_mask1 =
|
| 897 |
-
|
| 898 |
-
|
| 899 |
-
|
| 900 |
-
|
| 901 |
-
|
| 902 |
-
|
| 903 |
-
|
| 904 |
-
|
| 905 |
-
|
| 906 |
-
|
| 907 |
-
|
| 908 |
-
|
| 909 |
-
|
| 910 |
-
|
| 911 |
-
|
| 912 |
-
|
| 913 |
-
|
| 914 |
)
|
| 915 |
ctx.save_for_backward(
|
| 916 |
residual_out, weight, bias, weight1, bias1, rowscale, seeds, mean, rstd
|
|
@@ -923,12 +820,17 @@ class LayerNormFn(torch.autograd.Function):
|
|
| 923 |
ctx.has_x1 = x1 is not None
|
| 924 |
ctx.prenorm = prenorm
|
| 925 |
ctx.x_dtype = x.dtype
|
| 926 |
-
ctx.zero_centered_weight = zero_centered_weight
|
| 927 |
y = y.reshape(x_shape_og)
|
| 928 |
y1 = y1.reshape(x_shape_og) if y1 is not None else None
|
| 929 |
-
residual_out =
|
| 930 |
-
|
| 931 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 932 |
if not return_dropout_mask:
|
| 933 |
if weight1 is None:
|
| 934 |
return y if not prenorm else (y, residual_out)
|
|
@@ -952,19 +854,26 @@ class LayerNormFn(torch.autograd.Function):
|
|
| 952 |
def backward(ctx, dy, *args):
|
| 953 |
x, weight, bias, weight1, bias1, rowscale, seeds, mean, rstd = ctx.saved_tensors
|
| 954 |
dy = dy.reshape(-1, dy.shape[-1])
|
|
|
|
|
|
|
|
|
|
| 955 |
if weight1 is not None:
|
| 956 |
dy1, args = args[0], args[1:]
|
| 957 |
dy1 = dy1.reshape(-1, dy1.shape[-1])
|
|
|
|
|
|
|
| 958 |
assert dy1.shape == x.shape
|
| 959 |
else:
|
| 960 |
dy1 = None
|
| 961 |
if ctx.prenorm:
|
| 962 |
dresidual = args[0]
|
| 963 |
dresidual = dresidual.reshape(-1, dresidual.shape[-1])
|
|
|
|
|
|
|
| 964 |
assert dresidual.shape == x.shape
|
| 965 |
else:
|
| 966 |
dresidual = None
|
| 967 |
-
dx, dw, db, dresidual_in, dx1, dw1, db1
|
| 968 |
dy,
|
| 969 |
x,
|
| 970 |
weight,
|
|
@@ -981,10 +890,8 @@ class LayerNormFn(torch.autograd.Function):
|
|
| 981 |
rowscale,
|
| 982 |
ctx.has_residual,
|
| 983 |
ctx.has_x1,
|
| 984 |
-
ctx.zero_centered_weight,
|
| 985 |
ctx.is_rms_norm,
|
| 986 |
x_dtype=ctx.x_dtype,
|
| 987 |
-
recompute_output=False,
|
| 988 |
)
|
| 989 |
return (
|
| 990 |
dx.reshape(ctx.x_shape_og),
|
|
@@ -1003,8 +910,6 @@ class LayerNormFn(torch.autograd.Function):
|
|
| 1003 |
None,
|
| 1004 |
None,
|
| 1005 |
None,
|
| 1006 |
-
None,
|
| 1007 |
-
None,
|
| 1008 |
)
|
| 1009 |
|
| 1010 |
|
|
@@ -1021,12 +926,10 @@ def layer_norm_fn(
|
|
| 1021 |
rowscale=None,
|
| 1022 |
prenorm=False,
|
| 1023 |
residual_in_fp32=False,
|
| 1024 |
-
zero_centered_weight=False,
|
| 1025 |
is_rms_norm=False,
|
| 1026 |
return_dropout_mask=False,
|
| 1027 |
-
out_dtype=None,
|
| 1028 |
out=None,
|
| 1029 |
-
residual_out=None
|
| 1030 |
):
|
| 1031 |
return LayerNormFn.apply(
|
| 1032 |
x,
|
|
@@ -1041,12 +944,10 @@ def layer_norm_fn(
|
|
| 1041 |
rowscale,
|
| 1042 |
prenorm,
|
| 1043 |
residual_in_fp32,
|
| 1044 |
-
zero_centered_weight,
|
| 1045 |
is_rms_norm,
|
| 1046 |
return_dropout_mask,
|
| 1047 |
-
out_dtype,
|
| 1048 |
out,
|
| 1049 |
-
residual_out
|
| 1050 |
)
|
| 1051 |
|
| 1052 |
|
|
@@ -1063,11 +964,9 @@ def rms_norm_fn(
|
|
| 1063 |
rowscale=None,
|
| 1064 |
prenorm=False,
|
| 1065 |
residual_in_fp32=False,
|
| 1066 |
-
zero_centered_weight=False,
|
| 1067 |
return_dropout_mask=False,
|
| 1068 |
-
out_dtype=None,
|
| 1069 |
out=None,
|
| 1070 |
-
residual_out=None
|
| 1071 |
):
|
| 1072 |
return LayerNormFn.apply(
|
| 1073 |
x,
|
|
@@ -1082,19 +981,16 @@ def rms_norm_fn(
|
|
| 1082 |
rowscale,
|
| 1083 |
prenorm,
|
| 1084 |
residual_in_fp32,
|
| 1085 |
-
zero_centered_weight,
|
| 1086 |
True,
|
| 1087 |
return_dropout_mask,
|
| 1088 |
-
out_dtype,
|
| 1089 |
out,
|
| 1090 |
-
residual_out
|
| 1091 |
)
|
| 1092 |
|
| 1093 |
|
| 1094 |
class RMSNorm(torch.nn.Module):
|
| 1095 |
|
| 1096 |
-
def __init__(self, hidden_size, eps=1e-5, dropout_p=0.0,
|
| 1097 |
-
device=None, dtype=None):
|
| 1098 |
factory_kwargs = {"device": device, "dtype": dtype}
|
| 1099 |
super().__init__()
|
| 1100 |
self.eps = eps
|
|
@@ -1102,16 +998,12 @@ class RMSNorm(torch.nn.Module):
|
|
| 1102 |
self.drop = torch.nn.Dropout(dropout_p)
|
| 1103 |
else:
|
| 1104 |
self.drop = None
|
| 1105 |
-
self.zero_centered_weight = zero_centered_weight
|
| 1106 |
self.weight = torch.nn.Parameter(torch.empty(hidden_size, **factory_kwargs))
|
| 1107 |
self.register_parameter("bias", None)
|
| 1108 |
self.reset_parameters()
|
| 1109 |
|
| 1110 |
def reset_parameters(self):
|
| 1111 |
-
|
| 1112 |
-
torch.nn.init.ones_(self.weight)
|
| 1113 |
-
else:
|
| 1114 |
-
torch.nn.init.zeros_(self.weight)
|
| 1115 |
|
| 1116 |
def forward(self, x, residual=None, prenorm=False, residual_in_fp32=False):
|
| 1117 |
return rms_norm_fn(
|
|
@@ -1123,14 +1015,12 @@ class RMSNorm(torch.nn.Module):
|
|
| 1123 |
dropout_p=self.drop.p if self.drop is not None and self.training else 0.0,
|
| 1124 |
prenorm=prenorm,
|
| 1125 |
residual_in_fp32=residual_in_fp32,
|
| 1126 |
-
zero_centered_weight=self.zero_centered_weight,
|
| 1127 |
)
|
| 1128 |
|
| 1129 |
|
| 1130 |
class LayerNormLinearFn(torch.autograd.Function):
|
| 1131 |
-
|
| 1132 |
@staticmethod
|
| 1133 |
-
@custom_fwd
|
| 1134 |
def forward(
|
| 1135 |
ctx,
|
| 1136 |
x,
|
|
@@ -1146,12 +1036,17 @@ class LayerNormLinearFn(torch.autograd.Function):
|
|
| 1146 |
):
|
| 1147 |
x_shape_og = x.shape
|
| 1148 |
# reshape input data into 2D tensor
|
| 1149 |
-
x =
|
|
|
|
|
|
|
| 1150 |
if residual is not None:
|
| 1151 |
assert residual.shape == x_shape_og
|
| 1152 |
-
residual =
|
|
|
|
|
|
|
| 1153 |
norm_weight = norm_weight.contiguous()
|
| 1154 |
-
norm_bias
|
|
|
|
| 1155 |
residual_dtype = (
|
| 1156 |
residual.dtype
|
| 1157 |
if residual is not None
|
|
@@ -1163,17 +1058,25 @@ class LayerNormLinearFn(torch.autograd.Function):
|
|
| 1163 |
norm_bias,
|
| 1164 |
eps,
|
| 1165 |
residual,
|
| 1166 |
-
out_dtype=
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1167 |
residual_dtype=residual_dtype,
|
| 1168 |
is_rms_norm=is_rms_norm,
|
| 1169 |
)
|
| 1170 |
y = y.reshape(x_shape_og)
|
| 1171 |
-
dtype =
|
|
|
|
|
|
|
| 1172 |
linear_weight = linear_weight.to(dtype)
|
| 1173 |
linear_bias = linear_bias.to(dtype) if linear_bias is not None else None
|
| 1174 |
out = F.linear(y.to(linear_weight.dtype), linear_weight, linear_bias)
|
| 1175 |
# We don't store y, will be recomputed in the backward pass to save memory
|
| 1176 |
-
ctx.save_for_backward(
|
|
|
|
|
|
|
| 1177 |
ctx.x_shape_og = x_shape_og
|
| 1178 |
ctx.eps = eps
|
| 1179 |
ctx.is_rms_norm = is_rms_norm
|
|
@@ -1184,17 +1087,20 @@ class LayerNormLinearFn(torch.autograd.Function):
|
|
| 1184 |
return out if not prenorm else (out, residual_out.reshape(x_shape_og))
|
| 1185 |
|
| 1186 |
@staticmethod
|
| 1187 |
-
@custom_bwd
|
| 1188 |
def backward(ctx, dout, *args):
|
| 1189 |
x, norm_weight, norm_bias, linear_weight, mean, rstd = ctx.saved_tensors
|
| 1190 |
dout = dout.reshape(-1, dout.shape[-1])
|
| 1191 |
dy = F.linear(dout, linear_weight.t())
|
| 1192 |
dlinear_bias = None if ctx.linear_bias_is_none else dout.sum(0)
|
| 1193 |
-
dy
|
|
|
|
| 1194 |
assert dy.shape == x.shape
|
| 1195 |
if ctx.prenorm:
|
| 1196 |
dresidual = args[0]
|
| 1197 |
-
dresidual =
|
|
|
|
|
|
|
| 1198 |
assert dresidual.shape == x.shape
|
| 1199 |
else:
|
| 1200 |
dresidual = None
|
|
|
|
| 7 |
# The models we train have hidden dim up to 8k anyway (e.g. Llama 70B), so this is fine.
|
| 8 |
|
| 9 |
import math
|
|
|
|
| 10 |
|
| 11 |
import torch
|
| 12 |
import torch.nn.functional as F
|
| 13 |
+
from torch.amp import custom_fwd, custom_bwd
|
| 14 |
|
| 15 |
import triton
|
| 16 |
import triton.language as tl
|
| 17 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 18 |
|
| 19 |
def layer_norm_ref(
|
| 20 |
x,
|
|
|
|
| 28 |
dropout_p=0.0,
|
| 29 |
rowscale=None,
|
| 30 |
prenorm=False,
|
|
|
|
| 31 |
dropout_mask=None,
|
| 32 |
dropout_mask1=None,
|
| 33 |
upcast=False,
|
|
|
|
| 41 |
x1 = x1.float() if x1 is not None else None
|
| 42 |
weight1 = weight1.float() if weight1 is not None else None
|
| 43 |
bias1 = bias1.float() if bias1 is not None else None
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
if x1 is not None:
|
| 45 |
assert rowscale is None, "rowscale is not supported with parallel LayerNorm"
|
| 46 |
if rowscale is not None:
|
|
|
|
| 59 |
x = x + x1
|
| 60 |
if residual is not None:
|
| 61 |
x = (x + residual).to(x.dtype)
|
| 62 |
+
out = F.layer_norm(
|
| 63 |
+
x.to(weight.dtype), x.shape[-1:], weight=weight, bias=bias, eps=eps
|
| 64 |
+
).to(dtype)
|
| 65 |
if weight1 is None:
|
| 66 |
return out if not prenorm else (out, x)
|
| 67 |
else:
|
|
|
|
| 83 |
dropout_p=0.0,
|
| 84 |
rowscale=None,
|
| 85 |
prenorm=False,
|
|
|
|
| 86 |
dropout_mask=None,
|
| 87 |
dropout_mask1=None,
|
| 88 |
upcast=False,
|
|
|
|
| 96 |
x1 = x1.float() if x1 is not None else None
|
| 97 |
weight1 = weight1.float() if weight1 is not None else None
|
| 98 |
bias1 = bias1.float() if bias1 is not None else None
|
|
|
|
|
|
|
|
|
|
|
|
|
| 99 |
if x1 is not None:
|
| 100 |
assert rowscale is None, "rowscale is not supported with parallel LayerNorm"
|
| 101 |
if rowscale is not None:
|
|
|
|
| 115 |
if residual is not None:
|
| 116 |
x = (x + residual).to(x.dtype)
|
| 117 |
rstd = 1 / torch.sqrt((x.square()).mean(dim=-1, keepdim=True) + eps)
|
| 118 |
+
out = ((x * rstd * weight) + bias if bias is not None else (x * rstd * weight)).to(
|
| 119 |
+
dtype
|
| 120 |
+
)
|
| 121 |
if weight1 is None:
|
| 122 |
return out if not prenorm else (out, x)
|
| 123 |
else:
|
| 124 |
+
out1 = (
|
| 125 |
+
(x * rstd * weight1) + bias1 if bias1 is not None else (x * rstd * weight1)
|
| 126 |
+
).to(dtype)
|
| 127 |
return (out, out1) if not prenorm else (out, out1, x)
|
| 128 |
|
| 129 |
|
| 130 |
@triton.autotune(
|
| 131 |
+
configs=[
|
| 132 |
+
triton.Config({}, num_warps=1),
|
| 133 |
+
triton.Config({}, num_warps=2),
|
| 134 |
+
triton.Config({}, num_warps=4),
|
| 135 |
+
triton.Config({}, num_warps=8),
|
| 136 |
+
triton.Config({}, num_warps=16),
|
| 137 |
+
triton.Config({}, num_warps=32),
|
| 138 |
+
],
|
| 139 |
+
key=["N", "HAS_RESIDUAL", "STORE_RESIDUAL_OUT", "IS_RMS_NORM", "HAS_BIAS"],
|
| 140 |
)
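For readers comparing with the removed `triton_autotune_configs()` helper, the fixed sweep above can also be written programmatically; a minimal equivalent sketch (same warp counts and key as the hunk above):

```python
import triton

# Autotune over warp counts 1..32; re-tune when the feature dim or the kernel's
# boolean specializations change.
fwd_autotune = triton.autotune(
    configs=[triton.Config({}, num_warps=w) for w in (1, 2, 4, 8, 16, 32)],
    key=["N", "HAS_RESIDUAL", "STORE_RESIDUAL_OUT", "IS_RMS_NORM", "HAS_BIAS"],
)
```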
|
|
|
|
| 141 |
# @triton.heuristics({"HAS_BIAS": lambda args: args["B"] is not None})
|
| 142 |
# @triton.heuristics({"HAS_RESIDUAL": lambda args: args["RESIDUAL"] is not None})
|
| 143 |
+
@triton.heuristics({"HAS_X1": lambda args: args["X1"] is not None})
|
| 144 |
+
@triton.heuristics({"HAS_W1": lambda args: args["W1"] is not None})
|
| 145 |
+
@triton.heuristics({"HAS_B1": lambda args: args["B1"] is not None})
|
| 146 |
@triton.jit
|
| 147 |
def _layer_norm_fwd_1pass_kernel(
|
| 148 |
X, # pointer to the input
|
|
|
|
| 158 |
ROWSCALE,
|
| 159 |
SEEDS, # Dropout seeds for each row
|
| 160 |
DROPOUT_MASK,
|
|
|
|
| 161 |
Mean, # pointer to the mean
|
| 162 |
Rstd, # pointer to the 1/std
|
| 163 |
stride_x_row, # how much to increase the pointer when moving by 1 row
|
|
|
|
| 170 |
N, # number of columns in X
|
| 171 |
eps, # epsilon to avoid division by zero
|
| 172 |
dropout_p, # Dropout probability
|
|
|
|
| 173 |
IS_RMS_NORM: tl.constexpr,
|
| 174 |
BLOCK_N: tl.constexpr,
|
| 175 |
HAS_RESIDUAL: tl.constexpr,
|
|
|
|
| 203 |
if HAS_DROPOUT:
|
| 204 |
# Compute dropout mask
|
| 205 |
# 7 rounds is good enough, and reduces register pressure
|
| 206 |
+
keep_mask = (
|
| 207 |
+
tl.rand(tl.load(SEEDS + row).to(tl.uint32), cols, n_rounds=7) > dropout_p
|
| 208 |
+
)
|
| 209 |
x = tl.where(keep_mask, x / (1.0 - dropout_p), 0.0)
|
| 210 |
if STORE_DROPOUT_MASK:
|
| 211 |
tl.store(DROPOUT_MASK + row * N + cols, keep_mask, mask=cols < N)
|
|
|
|
| 218 |
# Compute dropout mask
|
| 219 |
# 7 rounds is good enough, and reduces register pressure
|
| 220 |
keep_mask = (
|
| 221 |
+
tl.rand(tl.load(SEEDS + M + row).to(tl.uint32), cols, n_rounds=7)
|
| 222 |
+
> dropout_p
|
| 223 |
)
|
| 224 |
x1 = tl.where(keep_mask, x1 / (1.0 - dropout_p), 0.0)
|
| 225 |
if STORE_DROPOUT_MASK:
|
| 226 |
+
tl.store(DROPOUT_MASK + (M + row) * N + cols, keep_mask, mask=cols < N)
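A short sketch of the seed bookkeeping these two dropout branches rely on; only the indexing convention (`row` for `x`, `M + row` for `x1`) is taken from the kernel, the exact `torch.randint` arguments are an assumption:

```python
import torch

M, N, dropout_p = 4, 8, 0.1
# One seed per row; the x1 path indexes the same table at offset M, so backward
# can regenerate identical keep-masks from `seeds` alone.
seeds = torch.randint(0, 2**31 - 1, (2 * M,), dtype=torch.int64)

# Triton-side equivalent, as in the kernel above:
#   keep_mask = tl.rand(tl.load(SEEDS + row).to(tl.uint32), cols, n_rounds=7) > dropout_p
#   x = tl.where(keep_mask, x / (1.0 - dropout_p), 0.0)
```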
|
| 227 |
x += x1
|
| 228 |
if HAS_RESIDUAL:
|
| 229 |
residual = tl.load(RESIDUAL + cols, mask=cols < N, other=0.0).to(tl.float32)
|
|
|
|
| 243 |
# Normalize and apply linear transformation
|
| 244 |
mask = cols < N
|
| 245 |
w = tl.load(W + cols, mask=mask).to(tl.float32)
|
|
|
|
|
|
|
| 246 |
if HAS_BIAS:
|
| 247 |
b = tl.load(B + cols, mask=mask).to(tl.float32)
|
| 248 |
x_hat = (x - mean) * rstd if not IS_RMS_NORM else x * rstd
|
|
|
|
| 251 |
tl.store(Y + cols, y, mask=mask)
|
| 252 |
if HAS_W1:
|
| 253 |
w1 = tl.load(W1 + cols, mask=mask).to(tl.float32)
|
|
|
|
|
|
|
| 254 |
if HAS_B1:
|
| 255 |
b1 = tl.load(B1 + cols, mask=mask).to(tl.float32)
|
| 256 |
y1 = x_hat * w1 + b1 if HAS_B1 else x_hat * w1
|
|
|
|
| 258 |
|
| 259 |
|
| 260 |
def _layer_norm_fwd(
|
| 261 |
+
x,
|
| 262 |
+
weight,
|
| 263 |
+
bias,
|
| 264 |
+
eps,
|
| 265 |
+
residual=None,
|
| 266 |
+
x1=None,
|
| 267 |
+
weight1=None,
|
| 268 |
+
bias1=None,
|
| 269 |
+
dropout_p=0.0,
|
| 270 |
+
rowscale=None,
|
| 271 |
+
out_dtype=None,
|
| 272 |
+
residual_dtype=None,
|
| 273 |
+
is_rms_norm=False,
|
| 274 |
+
return_dropout_mask=False,
|
| 275 |
+
out=None,
|
| 276 |
+
residual_out=None,
|
| 277 |
+
):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 278 |
if residual is not None:
|
| 279 |
residual_dtype = residual.dtype
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 280 |
M, N = x.shape
|
| 281 |
assert x.stride(-1) == 1
|
| 282 |
if residual is not None:
|
|
|
|
| 300 |
if rowscale is not None:
|
| 301 |
assert rowscale.is_contiguous()
|
| 302 |
assert rowscale.shape == (M,)
|
| 303 |
+
# allocate output
|
| 304 |
+
if out is None:
|
| 305 |
+
out = torch.empty_like(x, dtype=x.dtype if out_dtype is None else out_dtype)
|
| 306 |
+
else:
|
| 307 |
+
assert out.shape == x.shape
|
| 308 |
assert out.stride(-1) == 1
|
|
|
|
|
|
|
|
|
|
| 309 |
if weight1 is not None:
|
| 310 |
y1 = torch.empty_like(out)
|
| 311 |
assert y1.stride(-1) == 1
|
| 312 |
else:
|
| 313 |
y1 = None
|
| 314 |
+
if (
|
| 315 |
+
residual is not None
|
| 316 |
+
or (residual_dtype is not None and residual_dtype != x.dtype)
|
| 317 |
+
or dropout_p > 0.0
|
| 318 |
+
or rowscale is not None
|
| 319 |
+
or x1 is not None
|
| 320 |
+
):
|
| 321 |
+
if residual_out is None:
|
| 322 |
+
residual_out = torch.empty(
|
| 323 |
+
M,
|
| 324 |
+
N,
|
| 325 |
+
device=x.device,
|
| 326 |
+
dtype=residual_dtype if residual_dtype is not None else x.dtype,
|
| 327 |
+
)
|
| 328 |
+
else:
|
| 329 |
+
assert residual_out.shape == x.shape
|
| 330 |
+
assert residual_out.stride(-1) == 1
|
| 331 |
+
else:
|
| 332 |
+
residual_out = None
|
| 333 |
+
mean = (
|
| 334 |
+
torch.empty((M,), dtype=torch.float32, device=x.device)
|
| 335 |
+
if not is_rms_norm
|
| 336 |
+
else None
|
| 337 |
+
)
|
| 338 |
rstd = torch.empty((M,), dtype=torch.float32, device=x.device)
|
| 339 |
if dropout_p > 0.0:
|
| 340 |
seeds = torch.randint(
|
|
|
|
| 343 |
else:
|
| 344 |
seeds = None
|
| 345 |
if return_dropout_mask and dropout_p > 0.0:
|
| 346 |
+
dropout_mask = torch.empty(
|
| 347 |
+
M if x1 is None else 2 * M, N, device=x.device, dtype=torch.bool
|
| 348 |
+
)
|
|
|
|
|
|
|
| 349 |
else:
|
| 350 |
+
dropout_mask = None
|
| 351 |
# Less than 64KB per feature: enqueue fused kernel
|
| 352 |
MAX_FUSED_SIZE = 65536 // x.element_size()
|
| 353 |
BLOCK_N = min(MAX_FUSED_SIZE, triton.next_power_of_2(N))
|
| 354 |
if N > BLOCK_N:
|
| 355 |
raise RuntimeError("This layer norm doesn't support feature dim >= 64KB.")
|
| 356 |
with torch.cuda.device(x.device.index):
|
| 357 |
+
_layer_norm_fwd_1pass_kernel[(M,)](
|
| 358 |
x,
|
| 359 |
out,
|
| 360 |
weight,
|
|
|
|
| 368 |
rowscale,
|
| 369 |
seeds,
|
| 370 |
dropout_mask,
|
|
|
|
| 371 |
mean,
|
| 372 |
rstd,
|
| 373 |
x.stride(0),
|
|
|
|
| 380 |
N,
|
| 381 |
eps,
|
| 382 |
dropout_p,
|
|
|
|
|
|
|
| 383 |
is_rms_norm,
|
| 384 |
BLOCK_N,
|
| 385 |
residual is not None,
|
|
|
|
| 388 |
dropout_p > 0.0,
|
| 389 |
dropout_mask is not None,
|
| 390 |
rowscale is not None,
|
|
|
|
|
|
|
|
|
|
| 391 |
)
|
| 392 |
+
# residual_out is None if residual is None and residual_dtype == input_dtype and dropout_p == 0.0
|
| 393 |
+
if dropout_mask is not None and x1 is not None:
|
| 394 |
+
dropout_mask, dropout_mask1 = dropout_mask.tensor_split(2, dim=0)
|
| 395 |
+
else:
|
| 396 |
+
dropout_mask1 = None
|
| 397 |
+
return (
|
| 398 |
+
out,
|
| 399 |
+
y1,
|
| 400 |
+
mean,
|
| 401 |
+
rstd,
|
| 402 |
+
residual_out if residual_out is not None else x,
|
| 403 |
+
seeds,
|
| 404 |
+
dropout_mask,
|
| 405 |
+
dropout_mask1,
|
| 406 |
+
)
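For readers following the forward path, a hedged sketch of unpacking the tuple returned just above; the internal import path and the call-site arguments are illustrative assumptions:

```python
import torch
from triton_layer_norm.layer_norm import _layer_norm_fwd  # internal helper, path assumed

x = torch.randn(32, 1024, device="cuda", dtype=torch.float16)
weight = torch.ones(1024, device="cuda", dtype=torch.float16)
bias = torch.zeros(1024, device="cuda", dtype=torch.float16)

out, y1, mean, rstd, residual_out, seeds, dropout_mask, dropout_mask1 = _layer_norm_fwd(
    x, weight, bias, 1e-6, dropout_p=0.1, return_dropout_mask=True
)
# y1 is None here (no second weight set); mean/rstd are fp32 per-row statistics,
# residual_out falls back to x only when no separate buffer was needed, and the
# masks are boolean because return_dropout_mask is set.
```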
|
| 407 |
|
| 408 |
|
| 409 |
@triton.autotune(
|
| 410 |
+
configs=[
|
| 411 |
+
triton.Config({}, num_warps=1),
|
| 412 |
+
triton.Config({}, num_warps=2),
|
| 413 |
+
triton.Config({}, num_warps=4),
|
| 414 |
+
triton.Config({}, num_warps=8),
|
| 415 |
+
triton.Config({}, num_warps=16),
|
| 416 |
+
triton.Config({}, num_warps=32),
|
| 417 |
+
],
|
| 418 |
+
key=[
|
| 419 |
+
"N",
|
| 420 |
+
"HAS_DRESIDUAL",
|
| 421 |
+
"STORE_DRESIDUAL",
|
| 422 |
+
"IS_RMS_NORM",
|
| 423 |
+
"HAS_BIAS",
|
| 424 |
+
"HAS_DROPOUT",
|
| 425 |
+
],
|
| 426 |
)
|
|
|
|
| 427 |
# @triton.heuristics({"HAS_BIAS": lambda args: args["B"] is not None})
|
| 428 |
# @triton.heuristics({"HAS_DRESIDUAL": lambda args: args["DRESIDUAL"] is not None})
|
| 429 |
# @triton.heuristics({"STORE_DRESIDUAL": lambda args: args["DRESIDUAL_IN"] is not None})
|
| 430 |
+
@triton.heuristics({"HAS_ROWSCALE": lambda args: args["ROWSCALE"] is not None})
|
| 431 |
+
@triton.heuristics({"HAS_DY1": lambda args: args["DY1"] is not None})
|
| 432 |
+
@triton.heuristics({"HAS_DX1": lambda args: args["DX1"] is not None})
|
| 433 |
+
@triton.heuristics({"HAS_B1": lambda args: args["DB1"] is not None})
|
| 434 |
+
@triton.heuristics({"RECOMPUTE_OUTPUT": lambda args: args["Y"] is not None})
|
| 435 |
@triton.jit
|
| 436 |
def _layer_norm_bwd_kernel(
|
| 437 |
X, # pointer to the input
|
|
|
|
| 465 |
N, # number of columns in X
|
| 466 |
eps, # epsilon to avoid division by zero
|
| 467 |
dropout_p,
|
|
|
|
| 468 |
rows_per_program,
|
| 469 |
IS_RMS_NORM: tl.constexpr,
|
| 470 |
BLOCK_N: tl.constexpr,
|
|
|
|
| 498 |
if RECOMPUTE_OUTPUT:
|
| 499 |
Y += row_start * stride_y_row
|
| 500 |
w = tl.load(W + cols, mask=mask).to(tl.float32)
|
|
|
|
|
|
|
| 501 |
if RECOMPUTE_OUTPUT and HAS_BIAS:
|
| 502 |
b = tl.load(B + cols, mask=mask, other=0.0).to(tl.float32)
|
| 503 |
if HAS_DY1:
|
| 504 |
w1 = tl.load(W1 + cols, mask=mask).to(tl.float32)
|
|
|
|
|
|
|
| 505 |
dw = tl.zeros((BLOCK_N,), dtype=tl.float32)
|
| 506 |
if HAS_BIAS:
|
| 507 |
db = tl.zeros((BLOCK_N,), dtype=tl.float32)
|
|
|
|
| 550 |
if HAS_DX1:
|
| 551 |
if HAS_DROPOUT:
|
| 552 |
keep_mask = (
|
| 553 |
+
tl.rand(tl.load(SEEDS + M + row).to(tl.uint32), cols, n_rounds=7)
|
| 554 |
+
> dropout_p
|
| 555 |
)
|
| 556 |
dx1 = tl.where(keep_mask, dx / (1.0 - dropout_p), 0.0)
|
| 557 |
else:
|
| 558 |
dx1 = dx
|
| 559 |
tl.store(DX1 + cols, dx1, mask=mask)
|
| 560 |
if HAS_DROPOUT:
|
| 561 |
+
keep_mask = (
|
| 562 |
+
tl.rand(tl.load(SEEDS + row).to(tl.uint32), cols, n_rounds=7)
|
| 563 |
+
> dropout_p
|
| 564 |
+
)
|
| 565 |
dx = tl.where(keep_mask, dx / (1.0 - dropout_p), 0.0)
|
| 566 |
if HAS_ROWSCALE:
|
| 567 |
rowscale = tl.load(ROWSCALE + row).to(tl.float32)
|
|
|
|
| 591 |
|
| 592 |
|
| 593 |
def _layer_norm_bwd(
|
| 594 |
+
dy,
|
| 595 |
+
x,
|
| 596 |
+
weight,
|
| 597 |
+
bias,
|
| 598 |
+
eps,
|
| 599 |
+
mean,
|
| 600 |
+
rstd,
|
| 601 |
+
dresidual=None,
|
| 602 |
+
dy1=None,
|
| 603 |
+
weight1=None,
|
| 604 |
+
bias1=None,
|
| 605 |
+
seeds=None,
|
| 606 |
+
dropout_p=0.0,
|
| 607 |
+
rowscale=None,
|
| 608 |
+
has_residual=False,
|
| 609 |
+
has_x1=False,
|
| 610 |
+
is_rms_norm=False,
|
| 611 |
+
x_dtype=None,
|
| 612 |
+
recompute_output=False,
|
| 613 |
+
):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
     M, N = x.shape
     assert x.stride(-1) == 1
     assert dy.stride(-1) == 1
     assert dy.shape == (M, N)
     if dresidual is not None:
         assert dresidual.stride(-1) == 1
         assert dresidual.shape == (M, N)
     assert weight.shape == (N,)
@@ ... @@
         assert bias.stride(-1) == 1
         assert bias.shape == (N,)
     if dy1 is not None:
         assert weight1 is not None
         assert dy1.shape == dy.shape
         assert dy1.stride(-1) == 1
@@ ... @@
         else None
     )
     dx1 = torch.empty_like(dx) if (has_x1 and dropout_p > 0.0) else None
+    y = (
+        torch.empty(M, N, dtype=dy.dtype, device=dy.device)
+        if recompute_output
+        else None
+    )
     if recompute_output:
+        assert (
+            weight1 is None
+        ), "recompute_output is not supported with parallel LayerNorm"
 
     # Less than 64KB per feature: enqueue fused kernel
     MAX_FUSED_SIZE = 65536 // x.element_size()
     BLOCK_N = min(MAX_FUSED_SIZE, triton.next_power_of_2(N))
     if N > BLOCK_N:
         raise RuntimeError("This layer norm doesn't support feature dim >= 64KB.")
+    sm_count = torch.cuda.get_device_properties(x.device).multi_processor_count
     _dw = torch.empty((sm_count, N), dtype=torch.float32, device=weight.device)
     _db = (
         torch.empty((sm_count, N), dtype=torch.float32, device=bias.device)
@@ ... @@
     rows_per_program = math.ceil(M / sm_count)
     grid = (sm_count,)
     with torch.cuda.device(x.device.index):
+        _layer_norm_bwd_kernel[grid](
             x,
             weight,
             bias,
@@ ... @@
             N,
             eps,
             dropout_p,
             rows_per_program,
             is_rms_norm,
             BLOCK_N,
@@ ... @@
             dresidual_in is not None,
             bias is not None,
             dropout_p > 0.0,
         )
     dw = _dw.sum(0).to(weight.dtype)
     db = _db.sum(0).to(bias.dtype) if bias is not None else None
     dw1 = _dw1.sum(0).to(weight1.dtype) if weight1 is not None else None
     db1 = _db1.sum(0).to(bias1.dtype) if bias1 is not None else None
+    # Don't need to compute dresidual_in separately in this case
+    if has_residual and dx.dtype == x.dtype and dropout_p == 0.0 and rowscale is None:
+        dresidual_in = dx
+    if has_x1 and dropout_p == 0.0:
+        dx1 = dx
+    return (
+        (dx, dw, db, dresidual_in, dx1, dw1, db1)
+        if not recompute_output
+        else (dx, dw, db, dresidual_in, dx1, dw1, db1, y)
+    )
 
 
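The weight and bias gradients above are reduced in two stages: each program writes one fp32 partial row into the `_dw`/`_db` buffers of shape `(sm_count, N)`, and the host finishes the reduction with `_dw.sum(0)`. The snippet below is a minimal eager-mode sketch of that split-then-sum pattern, for reference only; it is not the Triton kernel, and the sizes (`M`, `N`, `sm_count`) are illustrative.

```python
import math
import torch

# Illustrative sizes only; `sm_count` in the real code comes from the device properties.
M, N, sm_count = 1024, 768, 108
dy = torch.randn(M, N)
x_hat = torch.randn(M, N)  # stand-in for the normalized activations

rows_per_program = math.ceil(M / sm_count)
_dw = torch.zeros(sm_count, N, dtype=torch.float32)  # one partial row per "program"
for pid in range(sm_count):
    r0, r1 = pid * rows_per_program, min((pid + 1) * rows_per_program, M)
    _dw[pid] = (dy[r0:r1] * x_hat[r0:r1]).sum(dim=0)  # partial dL/dweight for this row block

dw = _dw.sum(0)  # final reduction, mirroring `_dw.sum(0).to(weight.dtype)` above
assert torch.allclose(dw, (dy * x_hat).sum(0), atol=1e-3)
```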
 class LayerNormFn(torch.autograd.Function):
     @staticmethod
     def forward(
         ctx,
@@ ... @@
         rowscale=None,
         prenorm=False,
         residual_in_fp32=False,
         is_rms_norm=False,
         return_dropout_mask=False,
         out=None,
+        residual_out=None,
     ):
         x_shape_og = x.shape
         # reshape input data into 2D tensor
+        x = x.reshape(-1, x.shape[-1])
+        if x.stride(-1) != 1:
+            x = x.contiguous()
         if residual is not None:
             assert residual.shape == x_shape_og
+            residual = residual.reshape(-1, residual.shape[-1])
+            if residual.stride(-1) != 1:
+                residual = residual.contiguous()
         if x1 is not None:
             assert x1.shape == x_shape_og
             assert rowscale is None, "rowscale is not supported with parallel LayerNorm"
+            x1 = x1.reshape(-1, x1.shape[-1])
+            if x1.stride(-1) != 1:
+                x1 = x1.contiguous()
         weight = weight.contiguous()
+        if bias is not None:
+            bias = bias.contiguous()
+        if weight1 is not None:
+            weight1 = weight1.contiguous()
+        if bias1 is not None:
+            bias1 = bias1.contiguous()
         if rowscale is not None:
             rowscale = rowscale.reshape(-1).contiguous()
         residual_dtype = (
@@ ... @@
             out = out.reshape(-1, out.shape[-1])
         if residual_out is not None:
             residual_out = residual_out.reshape(-1, residual_out.shape[-1])
+        y, y1, mean, rstd, residual_out, seeds, dropout_mask, dropout_mask1 = (
+            _layer_norm_fwd(
+                x,
+                weight,
+                bias,
+                eps,
+                residual,
+                x1,
+                weight1,
+                bias1,
+                dropout_p=dropout_p,
+                rowscale=rowscale,
+                residual_dtype=residual_dtype,
+                is_rms_norm=is_rms_norm,
+                return_dropout_mask=return_dropout_mask,
+                out=out,
+                residual_out=residual_out,
+            )
+        )
         ctx.save_for_backward(
             residual_out, weight, bias, weight1, bias1, rowscale, seeds, mean, rstd
@@ ... @@
         ctx.has_x1 = x1 is not None
         ctx.prenorm = prenorm
         ctx.x_dtype = x.dtype
         y = y.reshape(x_shape_og)
         y1 = y1.reshape(x_shape_og) if y1 is not None else None
+        residual_out = (
+            residual_out.reshape(x_shape_og) if residual_out is not None else None
+        )
+        dropout_mask = (
+            dropout_mask.reshape(x_shape_og) if dropout_mask is not None else None
+        )
+        dropout_mask1 = (
+            dropout_mask1.reshape(x_shape_og) if dropout_mask1 is not None else None
+        )
         if not return_dropout_mask:
             if weight1 is None:
                 return y if not prenorm else (y, residual_out)
@@ ... @@
     def backward(ctx, dy, *args):
         x, weight, bias, weight1, bias1, rowscale, seeds, mean, rstd = ctx.saved_tensors
         dy = dy.reshape(-1, dy.shape[-1])
+        if dy.stride(-1) != 1:
+            dy = dy.contiguous()
+        assert dy.shape == x.shape
         if weight1 is not None:
             dy1, args = args[0], args[1:]
             dy1 = dy1.reshape(-1, dy1.shape[-1])
+            if dy1.stride(-1) != 1:
+                dy1 = dy1.contiguous()
             assert dy1.shape == x.shape
         else:
             dy1 = None
         if ctx.prenorm:
             dresidual = args[0]
             dresidual = dresidual.reshape(-1, dresidual.shape[-1])
+            if dresidual.stride(-1) != 1:
+                dresidual = dresidual.contiguous()
             assert dresidual.shape == x.shape
         else:
             dresidual = None
+        dx, dw, db, dresidual_in, dx1, dw1, db1 = _layer_norm_bwd(
             dy,
             x,
             weight,
@@ ... @@
             rowscale,
             ctx.has_residual,
             ctx.has_x1,
             ctx.is_rms_norm,
             x_dtype=ctx.x_dtype,
         )
         return (
             dx.reshape(ctx.x_shape_og),
@@ ... @@
             None,
             None,
             None,
         )
 
 
@@ ... @@
     rowscale=None,
     prenorm=False,
     residual_in_fp32=False,
     is_rms_norm=False,
     return_dropout_mask=False,
     out=None,
+    residual_out=None,
 ):
     return LayerNormFn.apply(
         x,
@@ ... @@
         rowscale,
         prenorm,
         residual_in_fp32,
         is_rms_norm,
         return_dropout_mask,
         out,
+        residual_out,
     )
 
 
@@ ... @@
     rowscale=None,
     prenorm=False,
     residual_in_fp32=False,
     return_dropout_mask=False,
     out=None,
+    residual_out=None,
 ):
     return LayerNormFn.apply(
         x,
@@ ... @@
         rowscale,
         prenorm,
         residual_in_fp32,
         True,
         return_dropout_mask,
         out,
+        residual_out,
     )
 
 
 class RMSNorm(torch.nn.Module):
 
+    def __init__(self, hidden_size, eps=1e-5, dropout_p=0.0, device=None, dtype=None):
         factory_kwargs = {"device": device, "dtype": dtype}
         super().__init__()
         self.eps = eps
@@ ... @@
             self.drop = torch.nn.Dropout(dropout_p)
         else:
             self.drop = None
         self.weight = torch.nn.Parameter(torch.empty(hidden_size, **factory_kwargs))
         self.register_parameter("bias", None)
         self.reset_parameters()
 
     def reset_parameters(self):
+        torch.nn.init.ones_(self.weight)
 
     def forward(self, x, residual=None, prenorm=False, residual_in_fp32=False):
         return rms_norm_fn(
@@ ... @@
             dropout_p=self.drop.p if self.drop is not None and self.training else 0.0,
             prenorm=prenorm,
             residual_in_fp32=residual_in_fp32,
         )
 
 
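For orientation, a minimal usage sketch of the `RMSNorm` module defined above. Sizes and dtypes are illustrative, a CUDA device is assumed, and the import path assumes the package is installed as `triton_layer_norm` (it is also re-exported from `layers`, see the `layers.py` change further below).

```python
import torch
from triton_layer_norm.layer_norm import RMSNorm

norm = RMSNorm(hidden_size=4096, eps=1e-5, dropout_p=0.0, device="cuda", dtype=torch.float16)
x = torch.randn(2, 512, 4096, device="cuda", dtype=torch.float16)
res = torch.randn_like(x)

y = norm(x)                                                                # plain RMS norm
y, residual = norm(x, residual=res, prenorm=True, residual_in_fp32=True)   # fused residual-add + norm
```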
 class LayerNormLinearFn(torch.autograd.Function):
     @staticmethod
+    @custom_fwd(device_type="cuda")
     def forward(
         ctx,
         x,
@@ ... @@
     ):
         x_shape_og = x.shape
         # reshape input data into 2D tensor
+        x = x.reshape(-1, x.shape[-1])
+        if x.stride(-1) != 1:
+            x = x.contiguous()
         if residual is not None:
             assert residual.shape == x_shape_og
+            residual = residual.reshape(-1, residual.shape[-1])
+            if residual.stride(-1) != 1:
+                residual = residual.contiguous()
         norm_weight = norm_weight.contiguous()
+        if norm_bias is not None:
+            norm_bias = norm_bias.contiguous()
         residual_dtype = (
             residual.dtype
             if residual is not None
@@ ... @@
             norm_bias,
             eps,
             residual,
+            out_dtype=(
+                None
+                if not torch.is_autocast_enabled()
+                else torch.get_autocast_gpu_dtype()
+            ),
             residual_dtype=residual_dtype,
             is_rms_norm=is_rms_norm,
         )
         y = y.reshape(x_shape_og)
+        dtype = (
+            torch.get_autocast_gpu_dtype() if torch.is_autocast_enabled() else y.dtype
+        )
         linear_weight = linear_weight.to(dtype)
         linear_bias = linear_bias.to(dtype) if linear_bias is not None else None
         out = F.linear(y.to(linear_weight.dtype), linear_weight, linear_bias)
         # We don't store y, will be recomputed in the backward pass to save memory
+        ctx.save_for_backward(
+            residual_out, norm_weight, norm_bias, linear_weight, mean, rstd
+        )
         ctx.x_shape_og = x_shape_og
         ctx.eps = eps
         ctx.is_rms_norm = is_rms_norm
@@ ... @@
         return out if not prenorm else (out, residual_out.reshape(x_shape_og))
 
     @staticmethod
+    @custom_bwd(device_type="cuda")
     def backward(ctx, dout, *args):
         x, norm_weight, norm_bias, linear_weight, mean, rstd = ctx.saved_tensors
         dout = dout.reshape(-1, dout.shape[-1])
         dy = F.linear(dout, linear_weight.t())
         dlinear_bias = None if ctx.linear_bias_is_none else dout.sum(0)
+        if dy.stride(-1) != 1:
+            dy = dy.contiguous()
         assert dy.shape == x.shape
         if ctx.prenorm:
             dresidual = args[0]
+            dresidual = dresidual.reshape(-1, dresidual.shape[-1])
+            if dresidual.stride(-1) != 1:
+                dresidual = dresidual.contiguous()
             assert dresidual.shape == x.shape
         else:
             dresidual = None
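Below is a minimal sketch of driving the fused norm-then-linear path through `layer_norm_linear_fn` (the autograd wrapper defined by `LayerNormLinearFn` above), mirroring how the deleted test file further below calls it. Shapes and dtypes are illustrative, and a CUDA device plus an importable `triton_layer_norm` package are assumed.

```python
import torch
from triton_layer_norm import layer_norm_linear_fn

hidden = 2048
x = torch.randn(4, 512, hidden, device="cuda", requires_grad=True)
norm_weight = torch.randn(hidden, device="cuda", requires_grad=True)
norm_bias = torch.randn(hidden, device="cuda", requires_grad=True)
linear_weight = torch.randn(2 * hidden, hidden, device="cuda", requires_grad=True)

with torch.autocast(device_type="cuda", dtype=torch.float16):
    out = layer_norm_linear_fn(
        x, norm_weight, norm_bias, linear_weight, None,  # None = no linear bias
        eps=1e-6, prenorm=False,
    )
out.sum().backward()  # gradients flow back through the linear layer and the fused norm
```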
build/torch-universal/triton_layer_norm/layers.py
CHANGED
@@ -1,46 +1,4 @@
-import torch
-from torch import nn
-
-from .layer_norm import rms_norm_fn
-
-
-class LlamaRMSNorm(nn.Module):
-    """
-    RMS Layer Norm for Llama models.
-
-    Triton-optimized RMS layer norm. The interface is compatible with `LLamaRMSNorm` in
-    `transformers`.
-
-    Attributes:
-        weight (`torch.Tensor`): The learnable scaling parameter.
-        variance_epsilon (`float`): The epsilon value for numerical stability.
-    """
-    weight: torch.Tensor
-    variance_epsilon: float
-
-    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
-        """
-        Apply RMS normalization to the input hidden states.
-
-        Args:
-            hidden_states (`torch.Tensor`):
-                Input tensor of shape `(batch_size, sequence_length, hidden_size)` or any shape
-                where the last dimension is the feature dimension to be normalized.
-
-        Returns:
-            `torch.Tensor`:
-                The normalized tensor with the same shape as the input `hidden_states`.
-        """
-        return rms_norm_fn(
-            hidden_states,
-            self.weight,
-            bias=None,
-            residual=None,
-            eps=self.variance_epsilon,
-            dropout_p=0.0,
-            prenorm=False,
-            residual_in_fp32=False,
-        )
-
-
-__all__ = ["LlamaRMSNorm"]
+from .layer_norm import RMSNorm
+
+
+__all__ = ["RMSNorm"]
build/torch-universal/triton_layer_norm/utils/__init__.py
DELETED
File without changes
build/torch-universal/triton_layer_norm/utils/library.py
DELETED
@@ -1,66 +0,0 @@
-# Adapted from https://github.com/pytorch/pytorch/blob/v2.7.0/torch/_library/triton.py
-# The PyTorch implementation simply ignores the schema argument, we simply modify it to use schema.
-
-from typing import Optional, Callable, Iterable, Union
-
-from torch.library import custom_op, CustomOpDef
-from torch._library.triton import set_wrap_triton_enabled
-
-
-def triton_op(
-    name: str,
-    fn: Optional[Callable] = None,
-    /,
-    *,
-    mutates_args: Union[str, Iterable[str]],
-    schema: Optional[str] = None,
-    # If allow_decomposition=True, this matches torch.library.triton_op behavior. If set to False,
-    # then it behaves like torch.library.custom_op instead, which doesn't decompose the operator
-    # and so inductor can't trace inside.
-    allow_decomposition=True,
-) -> Callable:
-    def dec(fn: Callable[..., object]) -> CustomOpDef:
-        def backend_fn(*args, **kwargs):  # type: ignore[no-untyped-def]
-            # Optimization: we're passing regular Tensors into the triton kernel, so
-            # no need to go through HOP dispatch
-            with set_wrap_triton_enabled(False):
-                return fn(*args, **kwargs)
-
-        result = custom_op(
-            name,
-            backend_fn,
-            mutates_args=mutates_args,
-            # This is the only difference with the PyTorch implementation
-            schema=schema,
-        )
-        from torch._subclasses.functional_tensor import FunctionalTensorMode
-
-        # We require that the user pass us a function that is make_fx traceable,
-        # so we can just register it as the Fake/meta kernel.
-        result.register_fake(fn)
-
-        if allow_decomposition:
-            # We decompose the operator when FunctionalTensorMode is active.
-            # The goal is to decompose the operator in AOTDispatcher.
-            # - With torch.compile, this means that the backend (usually Inductor)
-            #   can see a call to the triton kernel(s) and so it can directly optimize
-            #   them by inlining them into the lowering process.
-            def functional_decomp(  # type: ignore[no-untyped-def]
-                mode, op, types, args, kwargs
-            ):
-                from torch.export._trace import custom_triton_ops_decomposition_disabled
-
-                if custom_triton_ops_decomposition_disabled():
-                    return mode.__torch_dispatch__(op, types, args, kwargs)
-                else:
-                    with mode:
-                        return fn(*args, **kwargs)
-
-            result.register_torch_dispatch(FunctionalTensorMode, functional_decomp)
-
-        return result
-
-    if fn is None:
-        return dec
-    else:
-        return dec(fn)
build/torch-universal/triton_layer_norm/utils/torch.py
DELETED
@@ -1,21 +0,0 @@
-import torch
-from typing import Callable
-
-
-def custom_amp_decorator(dec: Callable, cuda_amp_deprecated: bool):
-    def decorator(*args, **kwargs):
-        if cuda_amp_deprecated:
-            kwargs["device_type"] = "cuda"
-        return dec(*args, **kwargs)
-    return decorator
-
-
-if hasattr(torch.amp, "custom_fwd"):  # type: ignore[attr-defined]
-    deprecated = True
-    from torch.amp import custom_fwd, custom_bwd  # type: ignore[attr-defined]
-else:
-    deprecated = False
-    from torch.cuda.amp import custom_fwd, custom_bwd
-
-custom_fwd = custom_amp_decorator(custom_fwd, deprecated)
-custom_bwd = custom_amp_decorator(custom_bwd, deprecated)
flake.lock
DELETED
|
@@ -1,168 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"nodes": {
|
| 3 |
-
"flake-compat": {
|
| 4 |
-
"locked": {
|
| 5 |
-
"lastModified": 1747046372,
|
| 6 |
-
"narHash": "sha256-CIVLLkVgvHYbgI2UpXvIIBJ12HWgX+fjA8Xf8PUmqCY=",
|
| 7 |
-
"owner": "edolstra",
|
| 8 |
-
"repo": "flake-compat",
|
| 9 |
-
"rev": "9100a0f413b0c601e0533d1d94ffd501ce2e7885",
|
| 10 |
-
"type": "github"
|
| 11 |
-
},
|
| 12 |
-
"original": {
|
| 13 |
-
"owner": "edolstra",
|
| 14 |
-
"repo": "flake-compat",
|
| 15 |
-
"type": "github"
|
| 16 |
-
}
|
| 17 |
-
},
|
| 18 |
-
"flake-compat_2": {
|
| 19 |
-
"locked": {
|
| 20 |
-
"lastModified": 1733328505,
|
| 21 |
-
"narHash": "sha256-NeCCThCEP3eCl2l/+27kNNK7QrwZB1IJCrXfrbv5oqU=",
|
| 22 |
-
"owner": "edolstra",
|
| 23 |
-
"repo": "flake-compat",
|
| 24 |
-
"rev": "ff81ac966bb2cae68946d5ed5fc4994f96d0ffec",
|
| 25 |
-
"type": "github"
|
| 26 |
-
},
|
| 27 |
-
"original": {
|
| 28 |
-
"owner": "edolstra",
|
| 29 |
-
"repo": "flake-compat",
|
| 30 |
-
"type": "github"
|
| 31 |
-
}
|
| 32 |
-
},
|
| 33 |
-
"flake-utils": {
|
| 34 |
-
"inputs": {
|
| 35 |
-
"systems": "systems"
|
| 36 |
-
},
|
| 37 |
-
"locked": {
|
| 38 |
-
"lastModified": 1731533236,
|
| 39 |
-
"narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=",
|
| 40 |
-
"owner": "numtide",
|
| 41 |
-
"repo": "flake-utils",
|
| 42 |
-
"rev": "11707dc2f618dd54ca8739b309ec4fc024de578b",
|
| 43 |
-
"type": "github"
|
| 44 |
-
},
|
| 45 |
-
"original": {
|
| 46 |
-
"owner": "numtide",
|
| 47 |
-
"repo": "flake-utils",
|
| 48 |
-
"type": "github"
|
| 49 |
-
}
|
| 50 |
-
},
|
| 51 |
-
"flake-utils_2": {
|
| 52 |
-
"inputs": {
|
| 53 |
-
"systems": "systems_2"
|
| 54 |
-
},
|
| 55 |
-
"locked": {
|
| 56 |
-
"lastModified": 1731533236,
|
| 57 |
-
"narHash": "sha256-l0KFg5HjrsfsO/JpG+r7fRrqm12kzFHyUHqHCVpMMbI=",
|
| 58 |
-
"owner": "numtide",
|
| 59 |
-
"repo": "flake-utils",
|
| 60 |
-
"rev": "11707dc2f618dd54ca8739b309ec4fc024de578b",
|
| 61 |
-
"type": "github"
|
| 62 |
-
},
|
| 63 |
-
"original": {
|
| 64 |
-
"owner": "numtide",
|
| 65 |
-
"repo": "flake-utils",
|
| 66 |
-
"type": "github"
|
| 67 |
-
}
|
| 68 |
-
},
|
| 69 |
-
"hf-nix": {
|
| 70 |
-
"inputs": {
|
| 71 |
-
"flake-compat": "flake-compat_2",
|
| 72 |
-
"flake-utils": "flake-utils_2",
|
| 73 |
-
"nixpkgs": "nixpkgs"
|
| 74 |
-
},
|
| 75 |
-
"locked": {
|
| 76 |
-
"lastModified": 1754038838,
|
| 77 |
-
"narHash": "sha256-oHigCT4z0ayyLyEuxdZooSXRAZP8lfOkZHzY1lx1U50=",
|
| 78 |
-
"owner": "huggingface",
|
| 79 |
-
"repo": "hf-nix",
|
| 80 |
-
"rev": "336f781fa284e193baa3d4c3ce3f95fb34e9ffad",
|
| 81 |
-
"type": "github"
|
| 82 |
-
},
|
| 83 |
-
"original": {
|
| 84 |
-
"owner": "huggingface",
|
| 85 |
-
"repo": "hf-nix",
|
| 86 |
-
"type": "github"
|
| 87 |
-
}
|
| 88 |
-
},
|
| 89 |
-
"kernel-builder": {
|
| 90 |
-
"inputs": {
|
| 91 |
-
"flake-compat": "flake-compat",
|
| 92 |
-
"flake-utils": "flake-utils",
|
| 93 |
-
"hf-nix": "hf-nix",
|
| 94 |
-
"nixpkgs": [
|
| 95 |
-
"kernel-builder",
|
| 96 |
-
"hf-nix",
|
| 97 |
-
"nixpkgs"
|
| 98 |
-
]
|
| 99 |
-
},
|
| 100 |
-
"locked": {
|
| 101 |
-
"lastModified": 1756320464,
|
| 102 |
-
"narHash": "sha256-x9LI4h87/Z9UgTQjgeG0fRcdeXl91xIqBlTauGKZM70=",
|
| 103 |
-
"owner": "huggingface",
|
| 104 |
-
"repo": "kernel-builder",
|
| 105 |
-
"rev": "b4accba4496b28faef19a0487fbcf9686b14e2ef",
|
| 106 |
-
"type": "github"
|
| 107 |
-
},
|
| 108 |
-
"original": {
|
| 109 |
-
"owner": "huggingface",
|
| 110 |
-
"repo": "kernel-builder",
|
| 111 |
-
"type": "github"
|
| 112 |
-
}
|
| 113 |
-
},
|
| 114 |
-
"nixpkgs": {
|
| 115 |
-
"locked": {
|
| 116 |
-
"lastModified": 1752785354,
|
| 117 |
-
"narHash": "sha256-Y33ryUz7MPqKrZwlbQcsYCUz2jAJCacRf8jbs0tYUlA=",
|
| 118 |
-
"owner": "nixos",
|
| 119 |
-
"repo": "nixpkgs",
|
| 120 |
-
"rev": "d38025438a6ee456758dc03188ca6873a415463b",
|
| 121 |
-
"type": "github"
|
| 122 |
-
},
|
| 123 |
-
"original": {
|
| 124 |
-
"owner": "nixos",
|
| 125 |
-
"repo": "nixpkgs",
|
| 126 |
-
"rev": "d38025438a6ee456758dc03188ca6873a415463b",
|
| 127 |
-
"type": "github"
|
| 128 |
-
}
|
| 129 |
-
},
|
| 130 |
-
"root": {
|
| 131 |
-
"inputs": {
|
| 132 |
-
"kernel-builder": "kernel-builder"
|
| 133 |
-
}
|
| 134 |
-
},
|
| 135 |
-
"systems": {
|
| 136 |
-
"locked": {
|
| 137 |
-
"lastModified": 1681028828,
|
| 138 |
-
"narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
|
| 139 |
-
"owner": "nix-systems",
|
| 140 |
-
"repo": "default",
|
| 141 |
-
"rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
|
| 142 |
-
"type": "github"
|
| 143 |
-
},
|
| 144 |
-
"original": {
|
| 145 |
-
"owner": "nix-systems",
|
| 146 |
-
"repo": "default",
|
| 147 |
-
"type": "github"
|
| 148 |
-
}
|
| 149 |
-
},
|
| 150 |
-
"systems_2": {
|
| 151 |
-
"locked": {
|
| 152 |
-
"lastModified": 1681028828,
|
| 153 |
-
"narHash": "sha256-Vy1rq5AaRuLzOxct8nz4T6wlgyUR7zLU309k9mBC768=",
|
| 154 |
-
"owner": "nix-systems",
|
| 155 |
-
"repo": "default",
|
| 156 |
-
"rev": "da67096a3b9bf56a91d16901293e51ba5b49a27e",
|
| 157 |
-
"type": "github"
|
| 158 |
-
},
|
| 159 |
-
"original": {
|
| 160 |
-
"owner": "nix-systems",
|
| 161 |
-
"repo": "default",
|
| 162 |
-
"type": "github"
|
| 163 |
-
}
|
| 164 |
-
}
|
| 165 |
-
},
|
| 166 |
-
"root": "root",
|
| 167 |
-
"version": 7
|
| 168 |
-
}
|
|
|
|
|
|
|
flake.nix
CHANGED
@@ -10,11 +10,5 @@
     self,
     kernel-builder,
   }:
-  kernel-builder.lib.genFlakeOutputs {
-    path = ./.;
-    rev = self.shortRev or self.dirtyShortRev or self.lastModifiedDate;
-    # Import-time autotune.
-    doGetKernelCheck = false;
-    pythonCheckInputs = pkgs: with pkgs; [ einops ];
-  };
+  kernel-builder.lib.genFlakeOutputs ./.;
 }
tests/test_layer_norm.py
DELETED
|
@@ -1,373 +0,0 @@
|
|
| 1 |
-
# Copyright (c) 2024, Tri Dao.
|
| 2 |
-
|
| 3 |
-
import pytest
|
| 4 |
-
import torch
|
| 5 |
-
import torch.nn.functional as F
|
| 6 |
-
from einops import rearrange, repeat
|
| 7 |
-
|
| 8 |
-
from triton_layer_norm import (
|
| 9 |
-
layer_norm_fn,
|
| 10 |
-
layer_norm_linear_fn,
|
| 11 |
-
)
|
| 12 |
-
from triton_layer_norm.layer_norm import layer_norm_ref, rms_norm_ref
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
is_sm8x = torch.cuda.get_device_capability("cuda")[0] >= 8
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
# @pytest.mark.parametrize("zero_centered_weight", [False, True])
|
| 19 |
-
@pytest.mark.parametrize("zero_centered_weight", [False])
|
| 20 |
-
@pytest.mark.parametrize("has_weight1", [False, True])
|
| 21 |
-
# @pytest.mark.parametrize("has_weight1", [False])
|
| 22 |
-
@pytest.mark.parametrize("has_x1", [False, True])
|
| 23 |
-
# @pytest.mark.parametrize("has_x1", [False])
|
| 24 |
-
@pytest.mark.parametrize("has_rowscale", [False, True])
|
| 25 |
-
# @pytest.mark.parametrize("has_rowscale", [False])
|
| 26 |
-
@pytest.mark.parametrize("dropout_p", [0.0, 0.27])
|
| 27 |
-
# @pytest.mark.parametrize("dropout_p", [0.0])
|
| 28 |
-
@pytest.mark.parametrize("prenorm", [True, False])
|
| 29 |
-
# @pytest.mark.parametrize("prenorm", [True])
|
| 30 |
-
@pytest.mark.parametrize("is_rms_norm", [False, True])
|
| 31 |
-
# @pytest.mark.parametrize("is_rms_norm", [True])
|
| 32 |
-
@pytest.mark.parametrize("has_residual", [True, False])
|
| 33 |
-
# @pytest.mark.parametrize("has_residual", [True])
|
| 34 |
-
@pytest.mark.parametrize(
|
| 35 |
-
"weight_dtype", [torch.float32, torch.float16] + ([torch.bfloat16] if is_sm8x else [])
|
| 36 |
-
)
|
| 37 |
-
# @pytest.mark.parametrize("weight_dtype", [torch.float32])
|
| 38 |
-
@pytest.mark.parametrize(
|
| 39 |
-
"input_dtype,residual_dtype",
|
| 40 |
-
[(torch.float16, torch.float16), (torch.float16, torch.float32), (torch.float32, torch.float32)]
|
| 41 |
-
+ ([(torch.bfloat16, torch.bfloat16), (torch.bfloat16, torch.float32)] if is_sm8x else []),
|
| 42 |
-
)
|
| 43 |
-
# @pytest.mark.parametrize("input_dtype,residual_dtype", [(torch.float16, torch.float16)])
|
| 44 |
-
@pytest.mark.parametrize("hidden_size", [192, 2048, 2560, 3000, 4096])
|
| 45 |
-
# @pytest.mark.parametrize("hidden_size", [1024])
|
| 46 |
-
def test_layer_norm(
|
| 47 |
-
hidden_size,
|
| 48 |
-
input_dtype,
|
| 49 |
-
residual_dtype,
|
| 50 |
-
weight_dtype,
|
| 51 |
-
has_residual,
|
| 52 |
-
is_rms_norm,
|
| 53 |
-
prenorm,
|
| 54 |
-
dropout_p,
|
| 55 |
-
has_rowscale,
|
| 56 |
-
has_x1,
|
| 57 |
-
has_weight1,
|
| 58 |
-
zero_centered_weight,
|
| 59 |
-
):
|
| 60 |
-
if has_rowscale and has_x1:
|
| 61 |
-
pytest.skip("Not supported")
|
| 62 |
-
device = "cuda"
|
| 63 |
-
if any(x == torch.bfloat16 for x in [input_dtype, residual_dtype, weight_dtype]):
|
| 64 |
-
atol = 5e-2
|
| 65 |
-
elif any(x == torch.float16 for x in [input_dtype, residual_dtype, weight_dtype]):
|
| 66 |
-
atol = 1e-2
|
| 67 |
-
else:
|
| 68 |
-
atol = 1e-4
|
| 69 |
-
# set seed
|
| 70 |
-
torch.random.manual_seed(0)
|
| 71 |
-
batch_size = 8
|
| 72 |
-
seqlen = 512
|
| 73 |
-
layer_norm_ref_fn = layer_norm_ref if not is_rms_norm else rms_norm_ref
|
| 74 |
-
allclose = (
|
| 75 |
-
# Sometimes x0_pt.grad is NaN
|
| 76 |
-
lambda x, x_pt, x_ref, atol=atol: (x - x_ref).abs().max()
|
| 77 |
-
<= 2 * (x_pt[~x_pt.isnan()] - x_ref[~x_pt.isnan()]).abs().max() + atol
|
| 78 |
-
or (
|
| 79 |
-
# Sometimes x_pt and x_ref are the same (e.g. bfloat16) so we want to perturb is a bit
|
| 80 |
-
# by multiply and divide by 0.3
|
| 81 |
-
(x_pt[~x_pt.isnan()] - x_ref[~x_pt.isnan()]).abs().max() == 0.0
|
| 82 |
-
and (x - x_ref).abs().max()
|
| 83 |
-
<= 2 * (x_pt[~x_pt.isnan()] * 0.3 / 0.3 - x_ref[~x_pt.isnan()]).abs().max() + atol
|
| 84 |
-
)
|
| 85 |
-
)
|
| 86 |
-
x0 = torch.randn(
|
| 87 |
-
batch_size, seqlen, hidden_size, device=device, dtype=input_dtype, requires_grad=True
|
| 88 |
-
)
|
| 89 |
-
x0_pt = x0.detach().clone().requires_grad_()
|
| 90 |
-
x0_ref = x0.detach().clone().requires_grad_()
|
| 91 |
-
if has_residual:
|
| 92 |
-
res = torch.randn_like(x0, dtype=residual_dtype, requires_grad=True)
|
| 93 |
-
res_pt = res.detach().clone().requires_grad_()
|
| 94 |
-
res_ref = res.detach().clone().requires_grad_()
|
| 95 |
-
else:
|
| 96 |
-
res, res_pt, res_ref = None, None, None
|
| 97 |
-
weight = torch.randn(hidden_size, device=device, dtype=weight_dtype, requires_grad=True)
|
| 98 |
-
if not is_rms_norm:
|
| 99 |
-
bias = torch.randn(hidden_size, device=device, dtype=weight_dtype, requires_grad=True)
|
| 100 |
-
else:
|
| 101 |
-
bias = None
|
| 102 |
-
weight_pt = weight.detach().clone().requires_grad_()
|
| 103 |
-
weight_ref = weight.detach().clone().requires_grad_()
|
| 104 |
-
bias_pt = bias.detach().clone().requires_grad_() if bias is not None else None
|
| 105 |
-
bias_ref = bias.detach().clone().requires_grad_() if bias is not None else None
|
| 106 |
-
if has_x1:
|
| 107 |
-
x1 = torch.randn_like(x0, dtype=input_dtype, requires_grad=True)
|
| 108 |
-
x1_pt = x1.detach().clone().requires_grad_()
|
| 109 |
-
x1_ref = x1.detach().clone().requires_grad_()
|
| 110 |
-
else:
|
| 111 |
-
x1, x1_pt, x1_ref = None, None, None
|
| 112 |
-
if has_weight1:
|
| 113 |
-
weight1 = torch.randn(
|
| 114 |
-
hidden_size, device=device, dtype=weight_dtype, requires_grad=True
|
| 115 |
-
)
|
| 116 |
-
weight1_pt = weight1.detach().clone().requires_grad_()
|
| 117 |
-
weight1_ref = weight1.detach().clone().requires_grad_()
|
| 118 |
-
if not is_rms_norm:
|
| 119 |
-
bias1 = torch.randn(
|
| 120 |
-
hidden_size, device=device, dtype=weight_dtype, requires_grad=True
|
| 121 |
-
)
|
| 122 |
-
else:
|
| 123 |
-
bias1 = None
|
| 124 |
-
bias1_pt = bias1.detach().clone().requires_grad_() if bias1 is not None else None
|
| 125 |
-
bias1_ref = bias1.detach().clone().requires_grad_() if bias1 is not None else None
|
| 126 |
-
else:
|
| 127 |
-
weight1, weight1_pt, weight1_ref = None, None, None
|
| 128 |
-
bias1, bias1_pt, bias1_ref = None, None, None
|
| 129 |
-
|
| 130 |
-
rowscale = (
|
| 131 |
-
torch.randn(batch_size, seqlen, dtype=input_dtype, device=device)
|
| 132 |
-
if has_rowscale
|
| 133 |
-
else None
|
| 134 |
-
)
|
| 135 |
-
|
| 136 |
-
residual_in_fp32 = (not has_residual) and residual_dtype == torch.float32
|
| 137 |
-
out, *rest = layer_norm_fn(
|
| 138 |
-
x0,
|
| 139 |
-
weight,
|
| 140 |
-
bias,
|
| 141 |
-
residual=res,
|
| 142 |
-
x1=x1,
|
| 143 |
-
weight1=weight1,
|
| 144 |
-
bias1=bias1,
|
| 145 |
-
eps=1e-6,
|
| 146 |
-
dropout_p=dropout_p,
|
| 147 |
-
rowscale=rowscale,
|
| 148 |
-
prenorm=prenorm,
|
| 149 |
-
residual_in_fp32=residual_in_fp32,
|
| 150 |
-
zero_centered_weight=zero_centered_weight,
|
| 151 |
-
is_rms_norm=is_rms_norm,
|
| 152 |
-
return_dropout_mask=True,
|
| 153 |
-
)
|
| 154 |
-
dropout_mask = rest[-2] if dropout_p > 0.0 else None
|
| 155 |
-
dropout_mask1 = rest[-1] if dropout_p > 0.0 and x1 is not None else None
|
| 156 |
-
out_pt = layer_norm_ref_fn(
|
| 157 |
-
x0_pt,
|
| 158 |
-
weight_pt,
|
| 159 |
-
bias_pt,
|
| 160 |
-
residual=res_pt,
|
| 161 |
-
x1=x1_pt,
|
| 162 |
-
weight1=weight1_pt,
|
| 163 |
-
bias1=bias1_pt,
|
| 164 |
-
eps=1e-6,
|
| 165 |
-
dropout_p=dropout_p,
|
| 166 |
-
rowscale=rowscale,
|
| 167 |
-
prenorm=prenorm,
|
| 168 |
-
zero_centered_weight=zero_centered_weight,
|
| 169 |
-
dropout_mask=dropout_mask,
|
| 170 |
-
dropout_mask1=dropout_mask1,
|
| 171 |
-
)
|
| 172 |
-
out_ref = layer_norm_ref_fn(
|
| 173 |
-
x0_ref,
|
| 174 |
-
weight_ref,
|
| 175 |
-
bias_ref,
|
| 176 |
-
residual=res_ref,
|
| 177 |
-
x1=x1_ref,
|
| 178 |
-
weight1=weight1_ref,
|
| 179 |
-
bias1=bias1_ref,
|
| 180 |
-
eps=1e-6,
|
| 181 |
-
dropout_p=dropout_p,
|
| 182 |
-
rowscale=rowscale,
|
| 183 |
-
prenorm=prenorm,
|
| 184 |
-
zero_centered_weight=zero_centered_weight,
|
| 185 |
-
dropout_mask=dropout_mask,
|
| 186 |
-
dropout_mask1=dropout_mask1,
|
| 187 |
-
upcast=True,
|
| 188 |
-
)
|
| 189 |
-
if not has_weight1:
|
| 190 |
-
if prenorm:
|
| 191 |
-
residual = rest[0]
|
| 192 |
-
out_pt, residual_pt = out_pt
|
| 193 |
-
out_ref, residual_ref = out_ref
|
| 194 |
-
out1, out1_pt, out1_ref = None, None, None
|
| 195 |
-
else:
|
| 196 |
-
out1 = rest.pop(0)
|
| 197 |
-
if prenorm:
|
| 198 |
-
residual = rest[0]
|
| 199 |
-
out_pt, out1_pt, residual_pt = out_pt
|
| 200 |
-
out_ref, out1_ref, residual_ref = out_ref
|
| 201 |
-
else:
|
| 202 |
-
out_pt, out1_pt = out_pt
|
| 203 |
-
out_ref, out1_ref = out_ref
|
| 204 |
-
assert out.dtype == input_dtype
|
| 205 |
-
if prenorm:
|
| 206 |
-
assert residual.dtype == residual_dtype
|
| 207 |
-
assert allclose(residual, residual_pt, residual_ref)
|
| 208 |
-
assert allclose(out, out_pt, out_ref)
|
| 209 |
-
if out1 is not None:
|
| 210 |
-
assert out1.dtype == input_dtype
|
| 211 |
-
assert allclose(out1, out1_pt, out1_ref)
|
| 212 |
-
if dropout_mask is not None:
|
| 213 |
-
dropout_fraction = 1.0 - dropout_mask.float().mean()
|
| 214 |
-
assert abs(dropout_fraction - dropout_p) < 0.01
|
| 215 |
-
if dropout_mask1 is not None:
|
| 216 |
-
dropout_fraction = 1.0 - dropout_mask1.float().mean()
|
| 217 |
-
assert abs(dropout_fraction - dropout_p) < 0.01
|
| 218 |
-
assert not torch.equal(dropout_mask, dropout_mask1)
|
| 219 |
-
|
| 220 |
-
g = torch.randn_like(out) / batch_size
|
| 221 |
-
if has_weight1:
|
| 222 |
-
out = out * F.gelu(out1)
|
| 223 |
-
out_pt = out_pt * F.gelu(out1_pt)
|
| 224 |
-
out_ref = out_ref * F.gelu(out1_ref)
|
| 225 |
-
if not prenorm:
|
| 226 |
-
out.backward(g)
|
| 227 |
-
out_pt.backward(g)
|
| 228 |
-
out_ref.backward(g)
|
| 229 |
-
else:
|
| 230 |
-
(out * F.sigmoid(residual)).backward(g)
|
| 231 |
-
(out_pt * F.sigmoid(residual_pt)).backward(g)
|
| 232 |
-
(out_ref * F.sigmoid(residual_ref.to(dtype=residual_dtype))).backward(g)
|
| 233 |
-
assert allclose(x0.grad, x0_pt.grad, x0_ref.grad)
|
| 234 |
-
if has_residual:
|
| 235 |
-
assert allclose(res.grad, res_pt.grad, res_ref.grad)
|
| 236 |
-
if has_x1:
|
| 237 |
-
assert allclose(x1.grad, x1_pt.grad, x1_ref.grad)
|
| 238 |
-
assert allclose(weight.grad, weight_pt.grad, weight_ref.grad)
|
| 239 |
-
if bias is not None:
|
| 240 |
-
assert allclose(bias.grad, bias_pt.grad, bias_ref.grad)
|
| 241 |
-
if has_weight1:
|
| 242 |
-
assert allclose(weight1.grad, weight1_pt.grad, weight1_ref.grad)
|
| 243 |
-
if bias1 is not None:
|
| 244 |
-
assert allclose(bias1.grad, bias1_pt.grad, bias1_ref.grad)
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
@pytest.mark.parametrize("prenorm", [True, False])
|
| 248 |
-
# @pytest.mark.parametrize("prenorm", [True])
|
| 249 |
-
@pytest.mark.parametrize("is_rms_norm", [False, True])
|
| 250 |
-
# @pytest.mark.parametrize("is_rms_norm", [True])
|
| 251 |
-
@pytest.mark.parametrize("has_residual", [True, False])
|
| 252 |
-
# @pytest.mark.parametrize("has_residual", [False])
|
| 253 |
-
@pytest.mark.parametrize("weight_dtype", [torch.float32])
|
| 254 |
-
@pytest.mark.parametrize(
|
| 255 |
-
"input_dtype,residual_dtype",
|
| 256 |
-
[(torch.float16, torch.float16), (torch.float16, torch.float32)]
|
| 257 |
-
+ ([(torch.bfloat16, torch.bfloat16), (torch.bfloat16, torch.float32)] if is_sm8x else []),
|
| 258 |
-
)
|
| 259 |
-
# @pytest.mark.parametrize("input_dtype,residual_dtype", [(torch.bfloat16, torch.float32)])
|
| 260 |
-
@pytest.mark.parametrize("hidden_size", [192, 2048, 2560, 3000])
|
| 261 |
-
# @pytest.mark.parametrize("hidden_size", [256])
|
| 262 |
-
def test_layer_norm_linear(
|
| 263 |
-
hidden_size, input_dtype, residual_dtype, weight_dtype, has_residual, is_rms_norm, prenorm
|
| 264 |
-
):
|
| 265 |
-
device = "cuda"
|
| 266 |
-
if any(x == torch.bfloat16 for x in [input_dtype, residual_dtype, weight_dtype]):
|
| 267 |
-
atol = 5e-2
|
| 268 |
-
elif any(x == torch.float16 for x in [input_dtype, residual_dtype, weight_dtype]):
|
| 269 |
-
atol = 1e-2
|
| 270 |
-
else:
|
| 271 |
-
atol = 1e-4
|
| 272 |
-
# set seed
|
| 273 |
-
torch.random.manual_seed(0)
|
| 274 |
-
batch_size = 4
|
| 275 |
-
seqlen = 512
|
| 276 |
-
# batch_size = 1
|
| 277 |
-
# seqlen = 1
|
| 278 |
-
layer_norm_ref_fn = layer_norm_ref if not is_rms_norm else rms_norm_ref
|
| 279 |
-
allclose = (
|
| 280 |
-
lambda x, x_pt, x_ref, atol=atol: (x - x_ref).abs().max()
|
| 281 |
-
<= 2 * (x_pt - x_ref).abs().max() + atol
|
| 282 |
-
)
|
| 283 |
-
x0 = torch.randn(
|
| 284 |
-
batch_size, seqlen, hidden_size, device=device, dtype=input_dtype, requires_grad=True
|
| 285 |
-
)
|
| 286 |
-
x0_pt = x0.detach().clone().requires_grad_()
|
| 287 |
-
x0_ref = x0.detach().clone().requires_grad_()
|
| 288 |
-
if has_residual:
|
| 289 |
-
res = torch.randn_like(x0, dtype=residual_dtype, requires_grad=True)
|
| 290 |
-
res_pt = res.detach().clone().requires_grad_()
|
| 291 |
-
res_ref = res.detach().clone().requires_grad_()
|
| 292 |
-
else:
|
| 293 |
-
res, res_pt, res_ref = None, None, None
|
| 294 |
-
norm_weight = torch.randn(hidden_size, device=device, dtype=weight_dtype, requires_grad=True)
|
| 295 |
-
if not is_rms_norm:
|
| 296 |
-
norm_bias = torch.randn(hidden_size, device=device, dtype=weight_dtype, requires_grad=True)
|
| 297 |
-
else:
|
| 298 |
-
norm_bias = None
|
| 299 |
-
norm_weight_pt = norm_weight.detach().clone().requires_grad_()
|
| 300 |
-
norm_weight_ref = norm_weight.detach().clone().requires_grad_()
|
| 301 |
-
norm_bias_pt = norm_bias.detach().clone().requires_grad_() if norm_bias is not None else None
|
| 302 |
-
norm_bias_ref = norm_bias.detach().clone().requires_grad_() if norm_bias is not None else None
|
| 303 |
-
linear_weight = torch.empty(
|
| 304 |
-
2 * hidden_size, hidden_size, device=device, dtype=weight_dtype, requires_grad=True
|
| 305 |
-
)
|
| 306 |
-
torch.nn.init.xavier_uniform_(linear_weight)
|
| 307 |
-
if not is_rms_norm:
|
| 308 |
-
linear_bias = torch.randn(
|
| 309 |
-
2 * hidden_size, device=device, dtype=weight_dtype, requires_grad=True
|
| 310 |
-
)
|
| 311 |
-
else:
|
| 312 |
-
linear_bias = None
|
| 313 |
-
linear_weight_pt = linear_weight.detach().clone().requires_grad_()
|
| 314 |
-
linear_weight_ref = linear_weight.detach().clone().requires_grad_()
|
| 315 |
-
linear_bias_pt = (
|
| 316 |
-
linear_bias.detach().clone().requires_grad_() if linear_bias is not None else None
|
| 317 |
-
)
|
| 318 |
-
linear_bias_ref = (
|
| 319 |
-
linear_bias.detach().clone().requires_grad_() if linear_bias is not None else None
|
| 320 |
-
)
|
| 321 |
-
|
| 322 |
-
residual_in_fp32 = (not has_residual) and residual_dtype == torch.float32
|
| 323 |
-
with torch.autocast(device_type="cuda", dtype=input_dtype):
|
| 324 |
-
out, *rest = layer_norm_linear_fn(
|
| 325 |
-
x0,
|
| 326 |
-
norm_weight,
|
| 327 |
-
norm_bias,
|
| 328 |
-
linear_weight,
|
| 329 |
-
linear_bias,
|
| 330 |
-
residual=res,
|
| 331 |
-
eps=1e-6,
|
| 332 |
-
prenorm=prenorm,
|
| 333 |
-
residual_in_fp32=residual_in_fp32,
|
| 334 |
-
is_rms_norm=is_rms_norm,
|
| 335 |
-
)
|
| 336 |
-
out_pt, *rest_pt = layer_norm_ref_fn(
|
| 337 |
-
x0_pt, norm_weight_pt, norm_bias_pt, residual=res_pt, eps=1e-6, prenorm=prenorm
|
| 338 |
-
)
|
| 339 |
-
with torch.autocast(device_type="cuda", dtype=input_dtype):
|
| 340 |
-
out_pt = F.linear(out_pt, linear_weight_pt, linear_bias_pt)
|
| 341 |
-
out_ref, *rest_ref = layer_norm_ref_fn(
|
| 342 |
-
x0_ref,
|
| 343 |
-
norm_weight_ref,
|
| 344 |
-
norm_bias_ref,
|
| 345 |
-
residual=res_ref,
|
| 346 |
-
eps=1e-6,
|
| 347 |
-
prenorm=prenorm,
|
| 348 |
-
upcast=True,
|
| 349 |
-
)
|
| 350 |
-
out_ref = F.linear(out_ref.to(linear_weight_ref.dtype), linear_weight_ref, linear_bias_ref)
|
| 351 |
-
if prenorm:
|
| 352 |
-
residual = rest[0]
|
| 353 |
-
residual_pt = rest_pt[0]
|
| 354 |
-
residual_ref = rest_ref[0]
|
| 355 |
-
assert out.dtype == input_dtype
|
| 356 |
-
if prenorm:
|
| 357 |
-
assert residual.dtype == residual_dtype
|
| 358 |
-
assert allclose(residual, residual_pt, residual_ref)
|
| 359 |
-
assert allclose(out, out_pt, out_ref)
|
| 360 |
-
|
| 361 |
-
g = torch.randn_like(out) / batch_size
|
| 362 |
-
out.backward(g)
|
| 363 |
-
out_pt.backward(g)
|
| 364 |
-
out_ref.backward(g)
|
| 365 |
-
assert allclose(x0.grad, x0_pt.grad, x0_ref.grad)
|
| 366 |
-
if has_residual:
|
| 367 |
-
assert allclose(res.grad, res_pt.grad, res_ref.grad)
|
| 368 |
-
assert allclose(norm_weight.grad, norm_weight_pt.grad, norm_weight_ref.grad)
|
| 369 |
-
if norm_bias is not None:
|
| 370 |
-
assert allclose(norm_bias.grad, norm_bias_pt.grad, norm_bias_ref.grad)
|
| 371 |
-
assert allclose(linear_weight.grad, linear_weight_pt.grad, linear_weight_ref.grad)
|
| 372 |
-
if linear_bias is not None:
|
| 373 |
-
assert allclose(linear_bias.grad, linear_bias_pt.grad, linear_bias_ref.grad)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
torch-ext/triton_layer_norm/__init__.py
CHANGED
@@ -1,117 +1,5 @@
-"""Triton layer normalization kernels
-
-This kernel implements layers normalization using Triton. This kernel is from
-the `flash-attention <https://github.com/Dao-AILab/flash-attention>`_ project.
-"""
-
-from typing import Optional
-
-import torch
-
-from . import layers
 from .layer_norm import layer_norm_fn, layer_norm_linear_fn, rms_norm_fn
 
-
-def layer_norm(
-    x: torch.Tensor,
-    weight: torch.Tensor,
-    bias: torch.Tensor,
-    residual: Optional[torch.Tensor] = None,
-    x1: Optional[torch.Tensor] = None,
-    weight1: Optional[torch.Tensor] = None,
-    bias1: Optional[torch.Tensor] = None,
-    eps: float = 1e-6,
-    dropout_p: float = 0.0,
-    rowscale=None,
-    prenorm: bool = False,
-    residual_in_fp32: bool = False,
-    zero_centered_weight: bool = False,
-    is_rms_norm: bool = False,
-    return_dropout_mask: bool = False,
-    out: Optional[torch.Tensor] = None,
-    residual_out: Optional[torch.Tensor] = None,
-):
-    """
-    Apply layer normalization to the input tensor with Triton acceleration.
-
-    Args:
-        x (`torch.Tensor`):
-            Input tensor to normalize.
-        weight (`torch.Tensor`):
-            Scale parameter for normalization.
-        bias (`torch.Tensor`):
-            Shift parameter for normalization.
-        residual (`torch.Tensor`, *optional*):
-            Optional residual tensor to add to the input before normalization.
-        x1 (`torch.Tensor`, *optional*):
-            Optional second input tensor to combine with `x`. When provided, the function
-            first adds `x1` to `x` and then applies normalization.
-        weight1 (`torch.Tensor`, *optional*):
-            Scale parameter for the second normalization.
-        bias1 (`torch.Tensor`, *optional*):
-            Shift parameter for the second normalization.
-        eps (`float`, *optional*, defaults to 1e-6):
-            Small constant added for numerical stability in normalization.
-        dropout_p (`float`, *optional*, defaults to 0.0):
-            Dropout probability. If greater than 0, applies dropout to the input before
-            normalization and residual addition.
-        rowscale (`torch.Tensor`, *optional*):
-            Optional scaling factor applied to each row of the input tensor.
-            Not compatible with the use of `x1`.
-        prenorm (`bool`, *optional*, defaults to False):
-            If True, returns both the normalized output and the unnormalized input+residual.
-        residual_in_fp32 (`bool`, *optional*, defaults to False):
-            If True, performs the residual connection in FP32 precision.
-        zero_centered_weight (`bool`, *optional*, defaults to False):
-            When set to true, 1.0 is added to the weight before applying it.
-        is_rms_norm (`bool`, *optional*, defaults to False):
-            If True, uses RMS normalization instead of layer normalization.
-        return_dropout_mask (`bool`, *optional*, defaults to False):
-            If True, returns the dropout mask used for the computation.
-        out (`torch.Tensor`, *optional*):
-            Output tensor for the normalized result. If `None`, a new tensor is allocated.
-        residual_out (`torch.Tensor`, *optional*):
-            Output tensor for the residual result when using prenorm. If `None`, a new tensor
-            is allocated when needed.
-
-    Returns:
-        `torch.Tensor` or tuple of `torch.Tensor`:
-            - The normalized input.
-            - The second normalization of the input if `weight1` is provided.
-            - The residual tensor if `prenorm` is set.
-            - The dropout mask if `return_dropout_mask` is set.
-            - The dropout mask for `x1` if `x1` is provided and `return_dropout_mask` is set.
-    """
-    return layer_norm_fn(
-        x,
-        weight,
-        bias,
-        residual,
-        x1,
-        weight1,
-        bias1,
-        eps,
-        dropout_p,
-        rowscale,
-        prenorm,
-        residual_in_fp32,
-        is_rms_norm,
-        return_dropout_mask,
-        out=out,
-        residual_out=residual_out,
-    )
-
-
-__kernel_metadata__ = {
-    "license": "bsd-3-clause",
-}
-
-
-__all__ = [
-    "__kernel_metadata__",
-    "layers",
-    "layer_norm",
-    "layer_norm_fn",
-    "layer_norm_linear_fn",
-    "rms_norm_fn",
-]
+from . import layers
+
+__all__ = ["layers", "layer_norm_fn", "layer_norm_linear_fn", "rms_norm_fn"]
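With the `layer_norm` convenience wrapper and its docstring removed, callers use `layer_norm_fn` directly. A minimal sketch of a call follows; sizes are illustrative, a CUDA device is assumed, and the keyword names follow the removed wrapper above.

```python
import torch
from triton_layer_norm import layer_norm_fn

hidden = 1024
x = torch.randn(8, 512, hidden, device="cuda", dtype=torch.float16)
weight = torch.ones(hidden, device="cuda", dtype=torch.float16)
bias = torch.zeros(hidden, device="cuda", dtype=torch.float16)

y = layer_norm_fn(x, weight, bias, eps=1e-6)                        # standard LayerNorm
y_rms = layer_norm_fn(x, weight, None, eps=1e-6, is_rms_norm=True)  # RMSNorm through the same kernel
```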
torch-ext/triton_layer_norm/layer_norm.py
CHANGED
@@ -7,40 +7,14 @@
 # The models we train have hidden dim up to 8k anyway (e.g. Llama 70B), so this is fine.
 
 import math
-from typing import Optional, List
 
 import torch
 import torch.nn.functional as F
-from torch import Tensor
 
 import triton
 import triton.language as tl
 
-from ._ops import add_op_namespace_prefix
-from .utils.torch import custom_fwd, custom_bwd
-from .utils.library import triton_op
-
-
-def maybe_contiguous_lastdim(x):
-    return x.contiguous() if x is not None and x.stride(-1) != 1 else x
-
-
-def maybe_contiguous(x):
-    return x.contiguous() if x is not None else None
-
-
-def triton_autotune_configs():
-    # Return configs with a valid warp count for the current device
-    configs = []
-    # Maximum threads per block is architecture-dependent in theory, but in reality all are 1024
-    max_threads_per_block = 1024
-    # Default to warp size 32 if not defined by device
-    warp_size = getattr(torch.cuda.get_device_properties(torch.cuda.current_device()), "warp_size", 32)
-    # Autotune for warp counts which are powers of 2 and do not exceed thread per block limit
-    return [triton.Config({}, num_warps=warp_count) for warp_count in [1, 2, 4, 8, 16, 32]
-            if warp_count * warp_size <= max_threads_per_block]
-    # return [triton.Config({}, num_warps=8)]
-
 
 def layer_norm_ref(
     x,
@@ -54,7 +28,6 @@ def layer_norm_ref(
     dropout_p=0.0,
     rowscale=None,
     prenorm=False,
-    zero_centered_weight=False,
     dropout_mask=None,
     dropout_mask1=None,
     upcast=False,
@@ -68,10 +41,6 @@ def layer_norm_ref(
     x1 = x1.float() if x1 is not None else None
     weight1 = weight1.float() if weight1 is not None else None
     bias1 = bias1.float() if bias1 is not None else None
-    if zero_centered_weight:
-        weight = weight + 1.0
-        if weight1 is not None:
-            weight1 = weight1 + 1.0
     if x1 is not None:
         assert rowscale is None, "rowscale is not supported with parallel LayerNorm"
     if rowscale is not None:
@@ -90,9 +59,9 @@ def layer_norm_ref(
         x = x + x1
     if residual is not None:
        x = (x + residual).to(x.dtype)
-    out = F.layer_norm(x.to(weight.dtype), x.shape[-1:], weight=weight, bias=bias, eps=eps).to(
-        dtype
-    )
     if weight1 is None:
         return out if not prenorm else (out, x)
     else:
@@ -114,7 +83,6 @@ def rms_norm_ref(
     dropout_p=0.0,
     rowscale=None,
     prenorm=False,
-    zero_centered_weight=False,
     dropout_mask=None,
     dropout_mask1=None,
     upcast=False,
@@ -128,10 +96,6 @@ def rms_norm_ref(
     x1 = x1.float() if x1 is not None else None
     weight1 = weight1.float() if weight1 is not None else None
     bias1 = bias1.float() if bias1 is not None else None
-    if zero_centered_weight:
-        weight = weight + 1.0
-        if weight1 is not None:
-            weight1 = weight1 + 1.0
     if x1 is not None:
         assert rowscale is None, "rowscale is not supported with parallel LayerNorm"
     if rowscale is not None:
@@ -151,26 +115,34 @@ def rms_norm_ref(
     if residual is not None:
         x = (x + residual).to(x.dtype)
     rstd = 1 / torch.sqrt((x.square()).mean(dim=-1, keepdim=True) + eps)
-    out = ((x * rstd * weight) + bias if bias is not None else (x * rstd * weight)).to(
     if weight1 is None:
         return out if not prenorm else (out, x)
     else:
-        out1 = (
-        )
         return (out, out1) if not prenorm else (out, out1, x)
 
 
 @triton.autotune(
-    configs=triton_autotune_configs(),
 )
-# torch compile doesn't like triton.heuristics, so we set these manually when calling the kernel
 # @triton.heuristics({"HAS_BIAS": lambda args: args["B"] is not None})
 # @triton.heuristics({"HAS_RESIDUAL": lambda args: args["RESIDUAL"] is not None})
 @triton.jit
 def _layer_norm_fwd_1pass_kernel(
     X,  # pointer to the input
@@ -186,7 +158,6 @@ def _layer_norm_fwd_1pass_kernel(
     ROWSCALE,
     SEEDS,  # Dropout seeds for each row
     DROPOUT_MASK,
-    DROPOUT_MASK1,
     Mean,  # pointer to the mean
     Rstd,  # pointer to the 1/std
     stride_x_row,  # how much to increase the pointer when moving by 1 row
@@ -199,7 +170,6 @@ def _layer_norm_fwd_1pass_kernel(
     N,  # number of columns in X
     eps,  # epsilon to avoid division by zero
     dropout_p,  # Dropout probability
-    zero_centered_weight,  # If true, add 1.0 to the weight
     IS_RMS_NORM: tl.constexpr,
     BLOCK_N: tl.constexpr,
     HAS_RESIDUAL: tl.constexpr,
@@ -233,7 +203,9 @@ def _layer_norm_fwd_1pass_kernel(
     if HAS_DROPOUT:
         # Compute dropout mask
         # 7 rounds is good enough, and reduces register pressure
-        keep_mask = tl.rand(tl.load(SEEDS + row).to(tl.uint32), cols, n_rounds=7) > dropout_p
         x = tl.where(keep_mask, x / (1.0 - dropout_p), 0.0)
         if STORE_DROPOUT_MASK:
             tl.store(DROPOUT_MASK + row * N + cols, keep_mask, mask=cols < N)
@@ -246,11 +218,12 @@ def _layer_norm_fwd_1pass_kernel(
             # Compute dropout mask
             # 7 rounds is good enough, and reduces register pressure
             keep_mask = (
-                tl.rand(tl.load(SEEDS + M + row).to(tl.uint32), cols, n_rounds=7)
             )
             x1 = tl.where(keep_mask, x1 / (1.0 - dropout_p), 0.0)
             if STORE_DROPOUT_MASK:
-                tl.store(DROPOUT_MASK1 + row * N + cols, keep_mask, mask=cols < N)
         x += x1
     if HAS_RESIDUAL:
         residual = tl.load(RESIDUAL + cols, mask=cols < N, other=0.0).to(tl.float32)
@@ -270,8 +243,6 @@ def _layer_norm_fwd_1pass_kernel(
     # Normalize and apply linear transformation
     mask = cols < N
     w = tl.load(W + cols, mask=mask).to(tl.float32)
-    if zero_centered_weight:
-        w += 1.0
     if HAS_BIAS:
         b = tl.load(B + cols, mask=mask).to(tl.float32)
     x_hat = (x - mean) * rstd if not IS_RMS_NORM else x * rstd
@@ -280,8 +251,6 @@ def _layer_norm_fwd_1pass_kernel(
     tl.store(Y + cols, y, mask=mask)
     if HAS_W1:
         w1 = tl.load(W1 + cols, mask=mask).to(tl.float32)
-        if zero_centered_weight:
-            w1 += 1.0
         if HAS_B1:
             b1 = tl.load(B1 + cols, mask=mask).to(tl.float32)
         y1 = x_hat * w1 + b1 if HAS_B1 else x_hat * w1
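The seeded dropout in the kernel hunks above keeps an element with probability `1 - dropout_p` and rescales survivors by `1 / (1 - dropout_p)` so the expected activation is unchanged. A small eager-mode reference of that formula (illustrative only; the kernel derives its mask from `tl.rand` with per-row seeds, which this sketch does not reproduce):

```python
import torch

dropout_p = 0.27
x = torch.randn(4, 1024)
keep_mask = torch.rand_like(x) > dropout_p  # analogous to tl.rand(seed, cols, n_rounds=7) > dropout_p
x_dropped = torch.where(keep_mask, x / (1.0 - dropout_p), torch.zeros_like(x))
# On average (1 - dropout_p) of the elements survive and are scaled up to preserve the mean.
```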
@@ -289,87 +258,25 @@ def _layer_norm_fwd_1pass_kernel(
|
|
| 289 |
|
| 290 |
|
| 291 |
def _layer_norm_fwd(
|
| 292 |
-
x
|
| 293 |
-
weight
|
| 294 |
-
bias
|
| 295 |
-
eps
|
| 296 |
-
residual
|
| 297 |
-
x1
|
| 298 |
-
weight1
|
| 299 |
-
bias1
|
| 300 |
-
dropout_p
|
| 301 |
-
rowscale
|
| 302 |
-
out_dtype
|
| 303 |
-
residual_dtype
|
| 304 |
-
|
| 305 |
-
|
| 306 |
-
|
| 307 |
-
|
| 308 |
-
|
| 309 |
-
) -> (Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor):
|
| 310 |
-
# Need to wrap to handle the case where residual_out is a alias of x, which makes torch.library
|
| 311 |
-
# and torch.compile unhappy. Also allocate memory for out and residual_out if they are None
|
| 312 |
-
# so that _layer_norm_fwd_impl doesn't have to return them.
|
| 313 |
-
if out is None:
|
| 314 |
-
out = torch.empty_like(x, dtype=x.dtype if out_dtype is None else out_dtype)
|
| 315 |
if residual is not None:
|
| 316 |
residual_dtype = residual.dtype
|
| 317 |
-
if residual_out is None and (
|
| 318 |
-
residual is not None
|
| 319 |
-
or (residual_dtype is not None and residual_dtype != x.dtype)
|
| 320 |
-
or dropout_p > 0.0
|
| 321 |
-
or rowscale is not None
|
| 322 |
-
or x1 is not None
|
| 323 |
-
):
|
| 324 |
-
residual_out = torch.empty_like(
|
| 325 |
-
x, dtype=residual_dtype if residual_dtype is not None else x.dtype
|
| 326 |
-
)
|
| 327 |
-
else:
|
| 328 |
-
residual_out = None
|
| 329 |
-
y1, mean, rstd, seeds, dropout_mask, dropout_mask1 = _layer_norm_fwd_impl(
|
| 330 |
-
x,
|
| 331 |
-
weight,
|
| 332 |
-
bias,
|
| 333 |
-
eps,
|
| 334 |
-
out,
|
| 335 |
-
residual=residual,
|
| 336 |
-
x1=x1,
|
| 337 |
-
weight1=weight1,
|
| 338 |
-
bias1=bias1,
|
| 339 |
-
dropout_p=dropout_p,
|
| 340 |
-
rowscale=rowscale,
|
| 341 |
-
zero_centered_weight=zero_centered_weight,
|
| 342 |
-
is_rms_norm=is_rms_norm,
|
| 343 |
-
return_dropout_mask=return_dropout_mask,
|
| 344 |
-
residual_out=residual_out,
|
| 345 |
-
)
|
| 346 |
-
# residual_out is None if residual is None and residual_dtype == input_dtype and dropout_p == 0.0
|
| 347 |
-
if residual_out is None:
|
| 348 |
-
residual_out = x
|
| 349 |
-
return out, y1, mean, rstd, residual_out, seeds, dropout_mask, dropout_mask1
|
| 350 |
-
|
| 351 |
-
|
| 352 |
-
# [2025-04-28] torch.library.triton_op ignores the schema argument, but here we need the schema
|
| 353 |
-
# since we're returning a tuple of tensors
|
| 354 |
-
@triton_op(add_op_namespace_prefix("layer_norm_fwd_impl"), mutates_args={"out", "residual_out"},
|
| 355 |
-
schema="(Tensor x, Tensor weight, Tensor bias, float eps, Tensor(a!) out, Tensor? residual, Tensor? x1, Tensor? weight1, Tensor? bias1, float dropout_p, Tensor? rowscale, bool zero_centered_weight, bool is_rms_norm, bool return_dropout_mask, Tensor(a!)? residual_out) -> (Tensor y1, Tensor mean, Tensor rstd, Tensor seeds, Tensor dropout_mask, Tensor dropout_mask1)")
|
| 356 |
-
def _layer_norm_fwd_impl(
|
| 357 |
-
x: Tensor,
|
| 358 |
-
weight: Tensor,
|
| 359 |
-
bias: Tensor,
|
| 360 |
-
eps: float,
|
| 361 |
-
out: Tensor,
|
| 362 |
-
residual: Optional[Tensor] = None,
|
| 363 |
-
x1: Optional[Tensor] = None,
|
| 364 |
-
weight1: Optional[Tensor] = None,
|
| 365 |
-
bias1: Optional[Tensor] = None,
|
| 366 |
-
dropout_p: float = 0.0,
|
| 367 |
-
rowscale: Optional[Tensor] = None,
|
| 368 |
-
zero_centered_weight: bool = False,
|
| 369 |
-
is_rms_norm: bool = False,
|
| 370 |
-
return_dropout_mask: bool = False,
|
| 371 |
-
residual_out: Optional[Tensor] = None
|
| 372 |
-
) -> (Tensor, Tensor, Tensor, Tensor, Tensor, Tensor):
|
| 373 |
M, N = x.shape
|
| 374 |
assert x.stride(-1) == 1
|
| 375 |
if residual is not None:
|
|
@@ -393,17 +300,41 @@ def _layer_norm_fwd_impl(
|
|
| 393 |
if rowscale is not None:
|
| 394 |
assert rowscale.is_contiguous()
|
| 395 |
assert rowscale.shape == (M,)
|
| 396 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 397 |
assert out.stride(-1) == 1
|
| 398 |
-
if residual_out is not None:
|
| 399 |
-
assert residual_out.shape == x.shape
|
| 400 |
-
assert residual_out.stride(-1) == 1
|
| 401 |
if weight1 is not None:
|
| 402 |
y1 = torch.empty_like(out)
|
| 403 |
assert y1.stride(-1) == 1
|
| 404 |
else:
|
| 405 |
y1 = None
|
| 406 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 407 |
rstd = torch.empty((M,), dtype=torch.float32, device=x.device)
|
| 408 |
if dropout_p > 0.0:
|
| 409 |
seeds = torch.randint(
|
|
@@ -412,20 +343,18 @@ def _layer_norm_fwd_impl(
    else:
        seeds = None
    if return_dropout_mask and dropout_p > 0.0:
-       dropout_mask = torch.empty(...)
-       ...
-       else:
-           dropout_mask1 = None
    else:
-       dropout_mask ...
    # Less than 64KB per feature: enqueue fused kernel
    MAX_FUSED_SIZE = 65536 // x.element_size()
    BLOCK_N = min(MAX_FUSED_SIZE, triton.next_power_of_2(N))
    if N > BLOCK_N:
        raise RuntimeError("This layer norm doesn't support feature dim >= 64KB.")
    with torch.cuda.device(x.device.index):
-       ...
        x,
        out,
        weight,
@@ -439,7 +368,6 @@ def _layer_norm_fwd_impl(
        rowscale,
        seeds,
        dropout_mask,
-       dropout_mask1,
        mean,
        rstd,
        x.stride(0),
@@ -452,8 +380,6 @@ def _layer_norm_fwd_impl(
        N,
        eps,
        dropout_p,
-       # Passing bool make torch inductor very unhappy since it then tries to compare to int_max
-       int(zero_centered_weight),
        is_rms_norm,
        BLOCK_N,
        residual is not None,
@@ -462,26 +388,50 @@ def _layer_norm_fwd_impl(
        dropout_p > 0.0,
        dropout_mask is not None,
        rowscale is not None,
-       HAS_X1=x1 is not None,
-       HAS_W1=weight1 is not None,
-       HAS_B1=bias1 is not None,
    )
-   ...


@triton.autotune(
-   configs=...
)
-# torch compile doesn't like triton.heuristics, so we set these manually when calling the kernel
# @triton.heuristics({"HAS_BIAS": lambda args: args["B"] is not None})
# @triton.heuristics({"HAS_DRESIDUAL": lambda args: args["DRESIDUAL"] is not None})
# @triton.heuristics({"STORE_DRESIDUAL": lambda args: args["DRESIDUAL_IN"] is not None})
-...
@triton.jit
def _layer_norm_bwd_kernel(
    X,  # pointer to the input
@@ -515,7 +465,6 @@ def _layer_norm_bwd_kernel(
    N,  # number of columns in X
    eps,  # epsilon to avoid division by zero
    dropout_p,
-   zero_centered_weight,
    rows_per_program,
    IS_RMS_NORM: tl.constexpr,
    BLOCK_N: tl.constexpr,
@@ -549,14 +498,10 @@ def _layer_norm_bwd_kernel(
    if RECOMPUTE_OUTPUT:
        Y += row_start * stride_y_row
    w = tl.load(W + cols, mask=mask).to(tl.float32)
-   if zero_centered_weight:
-       w += 1.0
    if RECOMPUTE_OUTPUT and HAS_BIAS:
        b = tl.load(B + cols, mask=mask, other=0.0).to(tl.float32)
    if HAS_DY1:
        w1 = tl.load(W1 + cols, mask=mask).to(tl.float32)
-       if zero_centered_weight:
-           w1 += 1.0
    dw = tl.zeros((BLOCK_N,), dtype=tl.float32)
    if HAS_BIAS:
        db = tl.zeros((BLOCK_N,), dtype=tl.float32)
@@ -605,14 +550,18 @@ def _layer_norm_bwd_kernel(
        if HAS_DX1:
            if HAS_DROPOUT:
                keep_mask = (
-                   tl.rand(tl.load(SEEDS + M + row).to(tl.uint32), cols, n_rounds=7) ...
                )
                dx1 = tl.where(keep_mask, dx / (1.0 - dropout_p), 0.0)
            else:
                dx1 = dx
            tl.store(DX1 + cols, dx1, mask=mask)
        if HAS_DROPOUT:
-           keep_mask = ...
            dx = tl.where(keep_mask, dx / (1.0 - dropout_p), 0.0)
        if HAS_ROWSCALE:
            rowscale = tl.load(ROWSCALE + row).to(tl.float32)
@@ -642,93 +591,31 @@ def _layer_norm_bwd_kernel(


def _layer_norm_bwd(
-   dy,
-   x,
-   weight,
-   bias,
-   eps,
-   mean,
-   rstd,
-   dresidual,
-   dy1,
-   weight1,
-   bias1,
-   seeds,
-   dropout_p,
-   rowscale,
-   has_residual,
-   has_x1,
-   ...
-) -> (Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor):
-   # Need to wrap to handle the case where dresidual_in or dx1 are aliases of x,
-   # which makes torch.library unhappy
-   dx, dw, db, dresidual_in, dx1, dw1, db1, y = _layer_norm_bwd_impl(
-       dy,
-       x,
-       weight,
-       bias,
-       eps,
-       mean,
-       rstd,
-       dresidual,
-       dy1,
-       weight1,
-       bias1,
-       seeds,
-       dropout_p,
-       rowscale,
-       has_residual,
-       has_x1,
-       zero_centered_weight,
-       is_rms_norm,
-       x_dtype=x_dtype,
-       recompute_output=recompute_output,
-   )
-   # Don't need to compute dresidual_in separately in this case
-   if has_residual and dx.dtype == x.dtype and dropout_p == 0.0 and rowscale is None:
-       dresidual_in = dx
-   if has_x1 and dropout_p == 0.0:
-       dx1 = dx
-   return dx, dw, db, dresidual_in, dx1, dw1, db1, y
-
-
-@triton_op(add_op_namespace_prefix("layer_norm_bwd_impl"), mutates_args={},
-           schema="(Tensor dy, Tensor x, Tensor weight, Tensor bias, float eps, Tensor mean, Tensor rstd, Tensor? dresidual, Tensor? dy1, Tensor? weight1, Tensor? bias1, Tensor? seeds, float dropout_p, Tensor? rowscale, bool has_residual, bool has_x1, bool zero_centered_weight, bool is_rms_norm, ScalarType? x_dtype, bool recompute_output) -> (Tensor dx, Tensor dw, Tensor db, Tensor dresidual_in, Tensor dx1, Tensor dw1, Tensor db1, Tensor y)",
-           allow_decomposition=False,  # Don't let torch.compile trace inside
-)
-def _layer_norm_bwd_impl(
-   dy: Tensor,
-   x: Tensor,
-   weight: Tensor,
-   bias: Tensor,
-   eps: float,
-   mean: Tensor,
-   rstd: Tensor,
-   dresidual: Optional[Tensor] = None,
-   dy1: Optional[Tensor] = None,
-   weight1: Optional[Tensor] = None,
-   bias1: Optional[Tensor] = None,
-   seeds: Optional[Tensor] = None,
-   dropout_p: float = 0.0,
-   rowscale: Optional[Tensor] = None,
-   has_residual: bool = False,
-   has_x1: bool = False,
-   zero_centered_weight: bool = False,
-   is_rms_norm: bool = False,
-   x_dtype: Optional[torch.dtype] = None,
-   recompute_output: bool = False,
-) -> (Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, Tensor):
    M, N = x.shape
    assert x.stride(-1) == 1
-   dy = maybe_contiguous_lastdim(dy)
    assert dy.stride(-1) == 1
    assert dy.shape == (M, N)
    if dresidual is not None:
-       dresidual = maybe_contiguous_lastdim(dresidual)
        assert dresidual.stride(-1) == 1
        assert dresidual.shape == (M, N)
    assert weight.shape == (N,)
@@ -737,7 +624,6 @@ def _layer_norm_bwd_impl(
        assert bias.stride(-1) == 1
        assert bias.shape == (N,)
    if dy1 is not None:
-       dy1 = maybe_contiguous_lastdim(dy1)
        assert weight1 is not None
        assert dy1.shape == dy.shape
        assert dy1.stride(-1) == 1
@@ -766,18 +652,22 @@ def _layer_norm_bwd_impl(
        else None
    )
    dx1 = torch.empty_like(dx) if (has_x1 and dropout_p > 0.0) else None
-   y = ...
    if recompute_output:
-       assert ...

    # Less than 64KB per feature: enqueue fused kernel
    MAX_FUSED_SIZE = 65536 // x.element_size()
    BLOCK_N = min(MAX_FUSED_SIZE, triton.next_power_of_2(N))
    if N > BLOCK_N:
        raise RuntimeError("This layer norm doesn't support feature dim >= 64KB.")
-   ...
-   # latency of the gmem reads/writes, but will increase the time of summing up dw / db.
-   sm_count = torch.cuda.get_device_properties(x.device).multi_processor_count * 8
    _dw = torch.empty((sm_count, N), dtype=torch.float32, device=weight.device)
    _db = (
        torch.empty((sm_count, N), dtype=torch.float32, device=bias.device)
@@ -789,7 +679,7 @@ def _layer_norm_bwd_impl(
    rows_per_program = math.ceil(M / sm_count)
    grid = (sm_count,)
    with torch.cuda.device(x.device.index):
-       ...
        x,
        weight,
        bias,
@@ -821,8 +711,6 @@ def _layer_norm_bwd_impl(
        N,
        eps,
        dropout_p,
-       # Passing bool make torch inductor very unhappy since it then tries to compare to int_max
-       int(zero_centered_weight),
        rows_per_program,
        is_rms_norm,
        BLOCK_N,
@@ -830,22 +718,24 @@ def _layer_norm_bwd_impl(
        dresidual_in is not None,
        bias is not None,
        dropout_p > 0.0,
-       HAS_ROWSCALE=rowscale is not None,
-       HAS_DY1=dy1 is not None,
-       HAS_DX1=dx1 is not None,
-       HAS_B1=bias1 is not None,
-       RECOMPUTE_OUTPUT=y is not None,
    )
    dw = _dw.sum(0).to(weight.dtype)
    db = _db.sum(0).to(bias.dtype) if bias is not None else None
    dw1 = _dw1.sum(0).to(weight1.dtype) if weight1 is not None else None
    db1 = _db1.sum(0).to(bias1.dtype) if bias1 is not None else None
-   # ...


class LayerNormFn(torch.autograd.Function):
-
    @staticmethod
    def forward(
        ctx,
@@ -861,27 +751,34 @@ class LayerNormFn(torch.autograd.Function):
        rowscale=None,
        prenorm=False,
        residual_in_fp32=False,
-       zero_centered_weight=False,
        is_rms_norm=False,
        return_dropout_mask=False,
-       out_dtype=None,
        out=None,
-       residual_out=None
    ):
        x_shape_og = x.shape
        # reshape input data into 2D tensor
-       x = ...
        if residual is not None:
            assert residual.shape == x_shape_og
-           residual = ...
        if x1 is not None:
            assert x1.shape == x_shape_og
            assert rowscale is None, "rowscale is not supported with parallel LayerNorm"
-           x1 = ...
        weight = weight.contiguous()
-       bias ...
-       ...
        if rowscale is not None:
            rowscale = rowscale.reshape(-1).contiguous()
        residual_dtype = (
@@ -893,24 +790,24 @@ class LayerNormFn(torch.autograd.Function):
            out = out.reshape(-1, out.shape[-1])
        if residual_out is not None:
            residual_out = residual_out.reshape(-1, residual_out.shape[-1])
-       y, y1, mean, rstd, residual_out, seeds, dropout_mask, dropout_mask1 = ...
-       ...
        )
        ctx.save_for_backward(
            residual_out, weight, bias, weight1, bias1, rowscale, seeds, mean, rstd
@@ -923,12 +820,17 @@ class LayerNormFn(torch.autograd.Function):
        ctx.has_x1 = x1 is not None
        ctx.prenorm = prenorm
        ctx.x_dtype = x.dtype
-       ctx.zero_centered_weight = zero_centered_weight
        y = y.reshape(x_shape_og)
        y1 = y1.reshape(x_shape_og) if y1 is not None else None
-       residual_out = ...
-       ...
        if not return_dropout_mask:
            if weight1 is None:
                return y if not prenorm else (y, residual_out)
@@ -952,19 +854,26 @@ class LayerNormFn(torch.autograd.Function):
    def backward(ctx, dy, *args):
        x, weight, bias, weight1, bias1, rowscale, seeds, mean, rstd = ctx.saved_tensors
        dy = dy.reshape(-1, dy.shape[-1])
        if weight1 is not None:
            dy1, args = args[0], args[1:]
            dy1 = dy1.reshape(-1, dy1.shape[-1])
            assert dy1.shape == x.shape
        else:
            dy1 = None
        if ctx.prenorm:
            dresidual = args[0]
            dresidual = dresidual.reshape(-1, dresidual.shape[-1])
            assert dresidual.shape == x.shape
        else:
            dresidual = None
-       dx, dw, db, dresidual_in, dx1, dw1, db1 ...
            dy,
            x,
            weight,
@@ -981,10 +890,8 @@ class LayerNormFn(torch.autograd.Function):
            rowscale,
            ctx.has_residual,
            ctx.has_x1,
-           ctx.zero_centered_weight,
            ctx.is_rms_norm,
            x_dtype=ctx.x_dtype,
-           recompute_output=False,
        )
        return (
            dx.reshape(ctx.x_shape_og),
@@ -1003,8 +910,6 @@ class LayerNormFn(torch.autograd.Function):
            None,
            None,
            None,
-           None,
-           None,
        )
@@ -1021,12 +926,10 @@ def layer_norm_fn(
    rowscale=None,
    prenorm=False,
    residual_in_fp32=False,
-   zero_centered_weight=False,
    is_rms_norm=False,
    return_dropout_mask=False,
-   out_dtype=None,
    out=None,
-   residual_out=None
):
    return LayerNormFn.apply(
        x,
@@ -1041,12 +944,10 @@ def layer_norm_fn(
        rowscale,
        prenorm,
        residual_in_fp32,
-       zero_centered_weight,
        is_rms_norm,
        return_dropout_mask,
-       out_dtype,
        out,
-       residual_out
    )


@@ -1063,11 +964,9 @@ def rms_norm_fn(
    rowscale=None,
    prenorm=False,
    residual_in_fp32=False,
-   zero_centered_weight=False,
    return_dropout_mask=False,
-   out_dtype=None,
    out=None,
-   residual_out=None
):
    return LayerNormFn.apply(
        x,
@@ -1082,19 +981,16 @@ def rms_norm_fn(
        rowscale,
        prenorm,
        residual_in_fp32,
-       zero_centered_weight,
        True,
        return_dropout_mask,
-       out_dtype,
        out,
-       residual_out
    )
class RMSNorm(torch.nn.Module):

-   def __init__(self, hidden_size, eps=1e-5, dropout_p=0.0,
-                device=None, dtype=None):
        factory_kwargs = {"device": device, "dtype": dtype}
        super().__init__()
        self.eps = eps
@@ -1102,16 +998,12 @@ class RMSNorm(torch.nn.Module):
            self.drop = torch.nn.Dropout(dropout_p)
        else:
            self.drop = None
-       self.zero_centered_weight = zero_centered_weight
        self.weight = torch.nn.Parameter(torch.empty(hidden_size, **factory_kwargs))
        self.register_parameter("bias", None)
        self.reset_parameters()

    def reset_parameters(self):
-       ...
-           torch.nn.init.ones_(self.weight)
-       else:
-           torch.nn.init.zeros_(self.weight)

    def forward(self, x, residual=None, prenorm=False, residual_in_fp32=False):
        return rms_norm_fn(
@@ -1123,14 +1015,12 @@ class RMSNorm(torch.nn.Module):
            dropout_p=self.drop.p if self.drop is not None and self.training else 0.0,
            prenorm=prenorm,
            residual_in_fp32=residual_in_fp32,
-           zero_centered_weight=self.zero_centered_weight,
        )


class LayerNormLinearFn(torch.autograd.Function):
-
    @staticmethod
-   @custom_fwd
    def forward(
        ctx,
        x,
@@ -1146,12 +1036,17 @@ class LayerNormLinearFn(torch.autograd.Function):
    ):
        x_shape_og = x.shape
        # reshape input data into 2D tensor
-       x = ...
        if residual is not None:
            assert residual.shape == x_shape_og
-           residual = ...
        norm_weight = norm_weight.contiguous()
-       norm_bias ...
        residual_dtype = (
            residual.dtype
            if residual is not None
@@ -1163,17 +1058,25 @@ class LayerNormLinearFn(torch.autograd.Function):
            norm_bias,
            eps,
            residual,
-           out_dtype=...
            residual_dtype=residual_dtype,
            is_rms_norm=is_rms_norm,
        )
        y = y.reshape(x_shape_og)
-       dtype = ...
        linear_weight = linear_weight.to(dtype)
        linear_bias = linear_bias.to(dtype) if linear_bias is not None else None
        out = F.linear(y.to(linear_weight.dtype), linear_weight, linear_bias)
        # We don't store y, will be recomputed in the backward pass to save memory
-       ctx.save_for_backward(...)
        ctx.x_shape_og = x_shape_og
        ctx.eps = eps
        ctx.is_rms_norm = is_rms_norm
@@ -1184,17 +1087,20 @@ class LayerNormLinearFn(torch.autograd.Function):
        return out if not prenorm else (out, residual_out.reshape(x_shape_og))

    @staticmethod
-   @custom_bwd
    def backward(ctx, dout, *args):
        x, norm_weight, norm_bias, linear_weight, mean, rstd = ctx.saved_tensors
        dout = dout.reshape(-1, dout.shape[-1])
        dy = F.linear(dout, linear_weight.t())
        dlinear_bias = None if ctx.linear_bias_is_none else dout.sum(0)
-       dy ...
        assert dy.shape == x.shape
        if ctx.prenorm:
            dresidual = args[0]
-           dresidual = ...
            assert dresidual.shape == x.shape
        else:
            dresidual = None
     # The models we train have hidden dim up to 8k anyway (e.g. Llama 70B), so this is fine.

     import math

     import torch
     import torch.nn.functional as F
+    from torch.amp import custom_fwd, custom_bwd

     import triton
     import triton.language as tl


     def layer_norm_ref(
         x,
     ...
         dropout_p=0.0,
         rowscale=None,
         prenorm=False,
         dropout_mask=None,
         dropout_mask1=None,
         upcast=False,
     ...
         x1 = x1.float() if x1 is not None else None
         weight1 = weight1.float() if weight1 is not None else None
         bias1 = bias1.float() if bias1 is not None else None
         if x1 is not None:
             assert rowscale is None, "rowscale is not supported with parallel LayerNorm"
         if rowscale is not None:
     ...
             x = x + x1
         if residual is not None:
             x = (x + residual).to(x.dtype)
+        out = F.layer_norm(
+            x.to(weight.dtype), x.shape[-1:], weight=weight, bias=bias, eps=eps
+        ).to(dtype)
         if weight1 is None:
             return out if not prenorm else (out, x)
         else:
     ...
         dropout_p=0.0,
         rowscale=None,
         prenorm=False,
         dropout_mask=None,
         dropout_mask1=None,
         upcast=False,
     ...
         x1 = x1.float() if x1 is not None else None
         weight1 = weight1.float() if weight1 is not None else None
         bias1 = bias1.float() if bias1 is not None else None
         if x1 is not None:
             assert rowscale is None, "rowscale is not supported with parallel LayerNorm"
         if rowscale is not None:
     ...
         if residual is not None:
             x = (x + residual).to(x.dtype)
         rstd = 1 / torch.sqrt((x.square()).mean(dim=-1, keepdim=True) + eps)
+        out = ((x * rstd * weight) + bias if bias is not None else (x * rstd * weight)).to(
+            dtype
+        )
         if weight1 is None:
             return out if not prenorm else (out, x)
         else:
+            out1 = (
+                (x * rstd * weight1) + bias1 if bias1 is not None else (x * rstd * weight1)
+            ).to(dtype)
             return (out, out1) if not prenorm else (out, out1, x)

     @triton.autotune(
+        configs=[
+            triton.Config({}, num_warps=1),
+            triton.Config({}, num_warps=2),
+            triton.Config({}, num_warps=4),
+            triton.Config({}, num_warps=8),
+            triton.Config({}, num_warps=16),
+            triton.Config({}, num_warps=32),
+        ],
+        key=["N", "HAS_RESIDUAL", "STORE_RESIDUAL_OUT", "IS_RMS_NORM", "HAS_BIAS"],
     )
     # @triton.heuristics({"HAS_BIAS": lambda args: args["B"] is not None})
     # @triton.heuristics({"HAS_RESIDUAL": lambda args: args["RESIDUAL"] is not None})
+    @triton.heuristics({"HAS_X1": lambda args: args["X1"] is not None})
+    @triton.heuristics({"HAS_W1": lambda args: args["W1"] is not None})
+    @triton.heuristics({"HAS_B1": lambda args: args["B1"] is not None})
     @triton.jit
     def _layer_norm_fwd_1pass_kernel(
         X,  # pointer to the input
     ...
         ROWSCALE,
         SEEDS,  # Dropout seeds for each row
         DROPOUT_MASK,
         Mean,  # pointer to the mean
         Rstd,  # pointer to the 1/std
         stride_x_row,  # how much to increase the pointer when moving by 1 row
     ...
         N,  # number of columns in X
         eps,  # epsilon to avoid division by zero
         dropout_p,  # Dropout probability
         IS_RMS_NORM: tl.constexpr,
         BLOCK_N: tl.constexpr,
         HAS_RESIDUAL: tl.constexpr,
     ...
         if HAS_DROPOUT:
             # Compute dropout mask
             # 7 rounds is good enough, and reduces register pressure
+            keep_mask = (
+                tl.rand(tl.load(SEEDS + row).to(tl.uint32), cols, n_rounds=7) > dropout_p
+            )
             x = tl.where(keep_mask, x / (1.0 - dropout_p), 0.0)
             if STORE_DROPOUT_MASK:
                 tl.store(DROPOUT_MASK + row * N + cols, keep_mask, mask=cols < N)
     ...
             # Compute dropout mask
             # 7 rounds is good enough, and reduces register pressure
             keep_mask = (
+                tl.rand(tl.load(SEEDS + M + row).to(tl.uint32), cols, n_rounds=7)
+                > dropout_p
             )
             x1 = tl.where(keep_mask, x1 / (1.0 - dropout_p), 0.0)
             if STORE_DROPOUT_MASK:
+                tl.store(DROPOUT_MASK + (M + row) * N + cols, keep_mask, mask=cols < N)
             x += x1
         if HAS_RESIDUAL:
             residual = tl.load(RESIDUAL + cols, mask=cols < N, other=0.0).to(tl.float32)
     ...
         # Normalize and apply linear transformation
         mask = cols < N
         w = tl.load(W + cols, mask=mask).to(tl.float32)
         if HAS_BIAS:
             b = tl.load(B + cols, mask=mask).to(tl.float32)
         x_hat = (x - mean) * rstd if not IS_RMS_NORM else x * rstd
     ...
         tl.store(Y + cols, y, mask=mask)
         if HAS_W1:
             w1 = tl.load(W1 + cols, mask=mask).to(tl.float32)
             if HAS_B1:
                 b1 = tl.load(B1 + cols, mask=mask).to(tl.float32)
             y1 = x_hat * w1 + b1 if HAS_B1 else x_hat * w1
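The `@triton.heuristics` decorators above encode the pattern used throughout this kernel: an optional pointer argument is turned into a `tl.constexpr` flag, so Triton compiles a specialization per flag combination and the unused branches disappear at compile time. A small, self-contained sketch of that idea (the kernel and wrapper names are illustrative, not part of the PR):

```python
import torch
import triton
import triton.language as tl


@triton.heuristics({"HAS_BIAS": lambda args: args["B"] is not None})
@triton.jit
def _add_bias_kernel(X, B, OUT, N, BLOCK_N: tl.constexpr, HAS_BIAS: tl.constexpr):
    cols = tl.arange(0, BLOCK_N)
    mask = cols < N
    x = tl.load(X + cols, mask=mask, other=0.0)
    if HAS_BIAS:  # compile-time branch: one specialization with bias, one without
        x += tl.load(B + cols, mask=mask, other=0.0)
    tl.store(OUT + cols, x, mask=mask)


def add_bias(x: torch.Tensor, b: torch.Tensor | None) -> torch.Tensor:
    out = torch.empty_like(x)
    N = x.numel()
    _add_bias_kernel[(1,)](x, b, out, N, BLOCK_N=triton.next_power_of_2(N))
    return out
```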

     def _layer_norm_fwd(
+        x,
+        weight,
+        bias,
+        eps,
+        residual=None,
+        x1=None,
+        weight1=None,
+        bias1=None,
+        dropout_p=0.0,
+        rowscale=None,
+        out_dtype=None,
+        residual_dtype=None,
+        is_rms_norm=False,
+        return_dropout_mask=False,
+        out=None,
+        residual_out=None,
+    ):
         if residual is not None:
             residual_dtype = residual.dtype
         M, N = x.shape
         assert x.stride(-1) == 1
         if residual is not None:
     ...
         if rowscale is not None:
             assert rowscale.is_contiguous()
             assert rowscale.shape == (M,)
+        # allocate output
+        if out is None:
+            out = torch.empty_like(x, dtype=x.dtype if out_dtype is None else out_dtype)
+        else:
+            assert out.shape == x.shape
         assert out.stride(-1) == 1
         if weight1 is not None:
             y1 = torch.empty_like(out)
             assert y1.stride(-1) == 1
         else:
             y1 = None
+        if (
+            residual is not None
+            or (residual_dtype is not None and residual_dtype != x.dtype)
+            or dropout_p > 0.0
+            or rowscale is not None
+            or x1 is not None
+        ):
+            if residual_out is None:
+                residual_out = torch.empty(
+                    M,
+                    N,
+                    device=x.device,
+                    dtype=residual_dtype if residual_dtype is not None else x.dtype,
+                )
+            else:
+                assert residual_out.shape == x.shape
+                assert residual_out.stride(-1) == 1
+        else:
+            residual_out = None
+        mean = (
+            torch.empty((M,), dtype=torch.float32, device=x.device)
+            if not is_rms_norm
+            else None
+        )
         rstd = torch.empty((M,), dtype=torch.float32, device=x.device)
         if dropout_p > 0.0:
             seeds = torch.randint(
     ...
         else:
             seeds = None
         if return_dropout_mask and dropout_p > 0.0:
+            dropout_mask = torch.empty(
+                M if x1 is None else 2 * M, N, device=x.device, dtype=torch.bool
+            )
         else:
+            dropout_mask = None
         # Less than 64KB per feature: enqueue fused kernel
         MAX_FUSED_SIZE = 65536 // x.element_size()
         BLOCK_N = min(MAX_FUSED_SIZE, triton.next_power_of_2(N))
         if N > BLOCK_N:
             raise RuntimeError("This layer norm doesn't support feature dim >= 64KB.")
         with torch.cuda.device(x.device.index):
+            _layer_norm_fwd_1pass_kernel[(M,)](
                 x,
                 out,
                 weight,
     ...
                 rowscale,
                 seeds,
                 dropout_mask,
                 mean,
                 rstd,
                 x.stride(0),
     ...
                 N,
                 eps,
                 dropout_p,
                 is_rms_norm,
                 BLOCK_N,
                 residual is not None,
     ...
                 dropout_p > 0.0,
                 dropout_mask is not None,
                 rowscale is not None,
             )
+        # residual_out is None if residual is None and residual_dtype == input_dtype and dropout_p == 0.0
+        if dropout_mask is not None and x1 is not None:
+            dropout_mask, dropout_mask1 = dropout_mask.tensor_split(2, dim=0)
+        else:
+            dropout_mask1 = None
+        return (
+            out,
+            y1,
+            mean,
+            rstd,
+            residual_out if residual_out is not None else x,
+            seeds,
+            dropout_mask,
+            dropout_mask1,
+        )
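The `MAX_FUSED_SIZE` check above is the sizing rule for the whole file: one program processes one full row, so the padded feature dimension must fit in 64KB of the input's element size. A tiny helper restating the same arithmetic (the helper name is made up, the numbers are from the code above):

```python
import torch
import triton


def pick_block_n(x: torch.Tensor) -> int:
    # One kernel program handles an entire row of x, so the row padded to the
    # next power of two must stay under 64KB in x's element size.
    N = x.shape[-1]
    max_fused = 65536 // x.element_size()  # e.g. 32768 columns for fp16/bf16
    block_n = min(max_fused, triton.next_power_of_2(N))
    if N > block_n:
        raise RuntimeError("This layer norm doesn't support feature dim >= 64KB.")
    return block_n
```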

     @triton.autotune(
+        configs=[
+            triton.Config({}, num_warps=1),
+            triton.Config({}, num_warps=2),
+            triton.Config({}, num_warps=4),
+            triton.Config({}, num_warps=8),
+            triton.Config({}, num_warps=16),
+            triton.Config({}, num_warps=32),
+        ],
+        key=[
+            "N",
+            "HAS_DRESIDUAL",
+            "STORE_DRESIDUAL",
+            "IS_RMS_NORM",
+            "HAS_BIAS",
+            "HAS_DROPOUT",
+        ],
     )
     # @triton.heuristics({"HAS_BIAS": lambda args: args["B"] is not None})
     # @triton.heuristics({"HAS_DRESIDUAL": lambda args: args["DRESIDUAL"] is not None})
     # @triton.heuristics({"STORE_DRESIDUAL": lambda args: args["DRESIDUAL_IN"] is not None})
+    @triton.heuristics({"HAS_ROWSCALE": lambda args: args["ROWSCALE"] is not None})
+    @triton.heuristics({"HAS_DY1": lambda args: args["DY1"] is not None})
+    @triton.heuristics({"HAS_DX1": lambda args: args["DX1"] is not None})
+    @triton.heuristics({"HAS_B1": lambda args: args["DB1"] is not None})
+    @triton.heuristics({"RECOMPUTE_OUTPUT": lambda args: args["Y"] is not None})
     @triton.jit
     def _layer_norm_bwd_kernel(
         X,  # pointer to the input
     ...
         N,  # number of columns in X
         eps,  # epsilon to avoid division by zero
         dropout_p,
         rows_per_program,
         IS_RMS_NORM: tl.constexpr,
         BLOCK_N: tl.constexpr,
     ...
         if RECOMPUTE_OUTPUT:
             Y += row_start * stride_y_row
         w = tl.load(W + cols, mask=mask).to(tl.float32)
         if RECOMPUTE_OUTPUT and HAS_BIAS:
             b = tl.load(B + cols, mask=mask, other=0.0).to(tl.float32)
         if HAS_DY1:
             w1 = tl.load(W1 + cols, mask=mask).to(tl.float32)
         dw = tl.zeros((BLOCK_N,), dtype=tl.float32)
         if HAS_BIAS:
             db = tl.zeros((BLOCK_N,), dtype=tl.float32)
     ...
         if HAS_DX1:
             if HAS_DROPOUT:
                 keep_mask = (
+                    tl.rand(tl.load(SEEDS + M + row).to(tl.uint32), cols, n_rounds=7)
+                    > dropout_p
                 )
                 dx1 = tl.where(keep_mask, dx / (1.0 - dropout_p), 0.0)
             else:
                 dx1 = dx
             tl.store(DX1 + cols, dx1, mask=mask)
         if HAS_DROPOUT:
+            keep_mask = (
+                tl.rand(tl.load(SEEDS + row).to(tl.uint32), cols, n_rounds=7)
+                > dropout_p
+            )
             dx = tl.where(keep_mask, dx / (1.0 - dropout_p), 0.0)
         if HAS_ROWSCALE:
             rowscale = tl.load(ROWSCALE + row).to(tl.float32)

     def _layer_norm_bwd(
+        dy,
+        x,
+        weight,
+        bias,
+        eps,
+        mean,
+        rstd,
+        dresidual=None,
+        dy1=None,
+        weight1=None,
+        bias1=None,
+        seeds=None,
+        dropout_p=0.0,
+        rowscale=None,
+        has_residual=False,
+        has_x1=False,
+        is_rms_norm=False,
+        x_dtype=None,
+        recompute_output=False,
+    ):
         M, N = x.shape
         assert x.stride(-1) == 1
         assert dy.stride(-1) == 1
         assert dy.shape == (M, N)
         if dresidual is not None:
             assert dresidual.stride(-1) == 1
             assert dresidual.shape == (M, N)
         assert weight.shape == (N,)
     ...
             assert bias.stride(-1) == 1
             assert bias.shape == (N,)
         if dy1 is not None:
             assert weight1 is not None
             assert dy1.shape == dy.shape
             assert dy1.stride(-1) == 1
     ...
             else None
         )
         dx1 = torch.empty_like(dx) if (has_x1 and dropout_p > 0.0) else None
+        y = (
+            torch.empty(M, N, dtype=dy.dtype, device=dy.device)
+            if recompute_output
+            else None
+        )
         if recompute_output:
+            assert (
+                weight1 is None
+            ), "recompute_output is not supported with parallel LayerNorm"

         # Less than 64KB per feature: enqueue fused kernel
         MAX_FUSED_SIZE = 65536 // x.element_size()
         BLOCK_N = min(MAX_FUSED_SIZE, triton.next_power_of_2(N))
         if N > BLOCK_N:
             raise RuntimeError("This layer norm doesn't support feature dim >= 64KB.")
+        sm_count = torch.cuda.get_device_properties(x.device).multi_processor_count
         _dw = torch.empty((sm_count, N), dtype=torch.float32, device=weight.device)
         _db = (
             torch.empty((sm_count, N), dtype=torch.float32, device=bias.device)
     ...
         rows_per_program = math.ceil(M / sm_count)
         grid = (sm_count,)
         with torch.cuda.device(x.device.index):
+            _layer_norm_bwd_kernel[grid](
                 x,
                 weight,
                 bias,
     ...
                 N,
                 eps,
                 dropout_p,
                 rows_per_program,
                 is_rms_norm,
                 BLOCK_N,
     ...
                 dresidual_in is not None,
                 bias is not None,
                 dropout_p > 0.0,
             )
         dw = _dw.sum(0).to(weight.dtype)
         db = _db.sum(0).to(bias.dtype) if bias is not None else None
         dw1 = _dw1.sum(0).to(weight1.dtype) if weight1 is not None else None
         db1 = _db1.sum(0).to(bias1.dtype) if bias1 is not None else None
+        # Don't need to compute dresidual_in separately in this case
+        if has_residual and dx.dtype == x.dtype and dropout_p == 0.0 and rowscale is None:
+            dresidual_in = dx
+        if has_x1 and dropout_p == 0.0:
+            dx1 = dx
+        return (
+            (dx, dw, db, dresidual_in, dx1, dw1, db1)
+            if not recompute_output
+            else (dx, dw, db, dresidual_in, dx1, dw1, db1, y)
+        )
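The backward pass launches exactly `sm_count` programs, gives each one `rows_per_program` rows, and lets every program write its weight/bias gradient contribution into its own fp32 row of a `(sm_count, N)` scratch buffer; the final `dw`/`db` are just a sum over that first axis, avoiding atomics across programs. A host-side sketch of the bookkeeping (the kernel call itself is elided; names mirror the code above, the shapes are illustrative):

```python
import math
import torch

M, N = 8192, 4096
weight = torch.empty(N, dtype=torch.float16, device="cuda")

# One program per streaming multiprocessor, each covering a contiguous slab of rows.
sm_count = torch.cuda.get_device_properties(0).multi_processor_count
rows_per_program = math.ceil(M / sm_count)

# Per-program fp32 partial sums for the weight gradient.
_dw = torch.zeros((sm_count, N), dtype=torch.float32, device="cuda")
# ... _layer_norm_bwd_kernel[(sm_count,)](...) would fill _dw here ...

dw = _dw.sum(0).to(weight.dtype)  # final weight gradient
```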

     class LayerNormFn(torch.autograd.Function):

         @staticmethod
         def forward(
             ctx,
     ...
             rowscale=None,
             prenorm=False,
             residual_in_fp32=False,
             is_rms_norm=False,
             return_dropout_mask=False,
             out=None,
+            residual_out=None,
         ):
             x_shape_og = x.shape
             # reshape input data into 2D tensor
+            x = x.reshape(-1, x.shape[-1])
+            if x.stride(-1) != 1:
+                x = x.contiguous()
             if residual is not None:
                 assert residual.shape == x_shape_og
+                residual = residual.reshape(-1, residual.shape[-1])
+                if residual.stride(-1) != 1:
+                    residual = residual.contiguous()
             if x1 is not None:
                 assert x1.shape == x_shape_og
                 assert rowscale is None, "rowscale is not supported with parallel LayerNorm"
+                x1 = x1.reshape(-1, x1.shape[-1])
+                if x1.stride(-1) != 1:
+                    x1 = x1.contiguous()
             weight = weight.contiguous()
+            if bias is not None:
+                bias = bias.contiguous()
+            if weight1 is not None:
+                weight1 = weight1.contiguous()
+            if bias1 is not None:
+                bias1 = bias1.contiguous()
             if rowscale is not None:
                 rowscale = rowscale.reshape(-1).contiguous()
             residual_dtype = (
     ...
                 out = out.reshape(-1, out.shape[-1])
             if residual_out is not None:
                 residual_out = residual_out.reshape(-1, residual_out.shape[-1])
+            y, y1, mean, rstd, residual_out, seeds, dropout_mask, dropout_mask1 = (
+                _layer_norm_fwd(
+                    x,
+                    weight,
+                    bias,
+                    eps,
+                    residual,
+                    x1,
+                    weight1,
+                    bias1,
+                    dropout_p=dropout_p,
+                    rowscale=rowscale,
+                    residual_dtype=residual_dtype,
+                    is_rms_norm=is_rms_norm,
+                    return_dropout_mask=return_dropout_mask,
+                    out=out,
+                    residual_out=residual_out,
+                )
             )
             ctx.save_for_backward(
                 residual_out, weight, bias, weight1, bias1, rowscale, seeds, mean, rstd
     ...
             ctx.has_x1 = x1 is not None
             ctx.prenorm = prenorm
             ctx.x_dtype = x.dtype
             y = y.reshape(x_shape_og)
             y1 = y1.reshape(x_shape_og) if y1 is not None else None
+            residual_out = (
+                residual_out.reshape(x_shape_og) if residual_out is not None else None
+            )
+            dropout_mask = (
+                dropout_mask.reshape(x_shape_og) if dropout_mask is not None else None
+            )
+            dropout_mask1 = (
+                dropout_mask1.reshape(x_shape_og) if dropout_mask1 is not None else None
+            )
             if not return_dropout_mask:
                 if weight1 is None:
                     return y if not prenorm else (y, residual_out)
     ...
         def backward(ctx, dy, *args):
             x, weight, bias, weight1, bias1, rowscale, seeds, mean, rstd = ctx.saved_tensors
             dy = dy.reshape(-1, dy.shape[-1])
+            if dy.stride(-1) != 1:
+                dy = dy.contiguous()
+            assert dy.shape == x.shape
             if weight1 is not None:
                 dy1, args = args[0], args[1:]
                 dy1 = dy1.reshape(-1, dy1.shape[-1])
+                if dy1.stride(-1) != 1:
+                    dy1 = dy1.contiguous()
                 assert dy1.shape == x.shape
             else:
                 dy1 = None
             if ctx.prenorm:
                 dresidual = args[0]
                 dresidual = dresidual.reshape(-1, dresidual.shape[-1])
+                if dresidual.stride(-1) != 1:
+                    dresidual = dresidual.contiguous()
                 assert dresidual.shape == x.shape
             else:
                 dresidual = None
+            dx, dw, db, dresidual_in, dx1, dw1, db1 = _layer_norm_bwd(
                 dy,
                 x,
                 weight,
     ...
                 rowscale,
                 ctx.has_residual,
                 ctx.has_x1,
                 ctx.is_rms_norm,
                 x_dtype=ctx.x_dtype,
             )
             return (
                 dx.reshape(ctx.x_shape_og),
     ...
                 None,
                 None,
                 None,
             )

     def layer_norm_fn(
     ...
         rowscale=None,
         prenorm=False,
         residual_in_fp32=False,
         is_rms_norm=False,
         return_dropout_mask=False,
         out=None,
+        residual_out=None,
     ):
         return LayerNormFn.apply(
             x,
     ...
             rowscale,
             prenorm,
             residual_in_fp32,
             is_rms_norm,
             return_dropout_mask,
             out,
+            residual_out,
         )


     def rms_norm_fn(
     ...
         rowscale=None,
         prenorm=False,
         residual_in_fp32=False,
         return_dropout_mask=False,
         out=None,
+        residual_out=None,
     ):
         return LayerNormFn.apply(
             x,
     ...
             rowscale,
             prenorm,
             residual_in_fp32,
             True,
             return_dropout_mask,
             out,
+            residual_out,
         )
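As a usage sketch of the functional API kept by this PR (the import path and tensor shapes are assumptions based on the repo layout, not stated in the diff):

```python
import torch
from triton_layer_norm.layer_norm import rms_norm_fn  # assumed import path

x = torch.randn(4, 1024, 4096, device="cuda", dtype=torch.float16)
w = torch.ones(4096, device="cuda", dtype=torch.float16)

# Plain RMSNorm over the last dimension.
y = rms_norm_fn(x, w, bias=None, eps=1e-6)

# Fused add + norm: pass the residual stream and ask for the pre-norm sum back.
residual = torch.randn_like(x)
y, new_residual = rms_norm_fn(
    x, w, bias=None, residual=residual, prenorm=True, residual_in_fp32=True, eps=1e-6
)
```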

     class RMSNorm(torch.nn.Module):

+        def __init__(self, hidden_size, eps=1e-5, dropout_p=0.0, device=None, dtype=None):
             factory_kwargs = {"device": device, "dtype": dtype}
             super().__init__()
             self.eps = eps
     ...
                 self.drop = torch.nn.Dropout(dropout_p)
             else:
                 self.drop = None
             self.weight = torch.nn.Parameter(torch.empty(hidden_size, **factory_kwargs))
             self.register_parameter("bias", None)
             self.reset_parameters()

         def reset_parameters(self):
+            torch.nn.init.ones_(self.weight)

         def forward(self, x, residual=None, prenorm=False, residual_in_fp32=False):
             return rms_norm_fn(
     ...
                 dropout_p=self.drop.p if self.drop is not None and self.training else 0.0,
                 prenorm=prenorm,
                 residual_in_fp32=residual_in_fp32,
             )
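And the module wrapper, used like any other `nn.Module` (again a sketch; the import path is an assumption):

```python
import torch
from triton_layer_norm.layer_norm import RMSNorm  # assumed import path

norm = RMSNorm(4096, eps=1e-5, device="cuda", dtype=torch.float16)
x = torch.randn(2, 128, 4096, device="cuda", dtype=torch.float16)

y = norm(x)  # same shape as x
y, res = norm(x, residual=x, prenorm=True, residual_in_fp32=True)
```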

     class LayerNormLinearFn(torch.autograd.Function):

         @staticmethod
+        @custom_fwd(device_type="cuda")
         def forward(
             ctx,
             x,
     ...
         ):
             x_shape_og = x.shape
             # reshape input data into 2D tensor
+            x = x.reshape(-1, x.shape[-1])
+            if x.stride(-1) != 1:
+                x = x.contiguous()
             if residual is not None:
                 assert residual.shape == x_shape_og
+                residual = residual.reshape(-1, residual.shape[-1])
+                if residual.stride(-1) != 1:
+                    residual = residual.contiguous()
             norm_weight = norm_weight.contiguous()
+            if norm_bias is not None:
+                norm_bias = norm_bias.contiguous()
             residual_dtype = (
                 residual.dtype
                 if residual is not None
     ...
                 norm_bias,
                 eps,
                 residual,
+                out_dtype=(
+                    None
+                    if not torch.is_autocast_enabled()
+                    else torch.get_autocast_gpu_dtype()
+                ),
                 residual_dtype=residual_dtype,
                 is_rms_norm=is_rms_norm,
             )
             y = y.reshape(x_shape_og)
+            dtype = (
+                torch.get_autocast_gpu_dtype() if torch.is_autocast_enabled() else y.dtype
+            )
             linear_weight = linear_weight.to(dtype)
             linear_bias = linear_bias.to(dtype) if linear_bias is not None else None
             out = F.linear(y.to(linear_weight.dtype), linear_weight, linear_bias)
             # We don't store y, will be recomputed in the backward pass to save memory
+            ctx.save_for_backward(
+                residual_out, norm_weight, norm_bias, linear_weight, mean, rstd
+            )
             ctx.x_shape_og = x_shape_og
             ctx.eps = eps
             ctx.is_rms_norm = is_rms_norm
     ...
             return out if not prenorm else (out, residual_out.reshape(x_shape_og))

         @staticmethod
+        @custom_bwd(device_type="cuda")
         def backward(ctx, dout, *args):
             x, norm_weight, norm_bias, linear_weight, mean, rstd = ctx.saved_tensors
             dout = dout.reshape(-1, dout.shape[-1])
             dy = F.linear(dout, linear_weight.t())
             dlinear_bias = None if ctx.linear_bias_is_none else dout.sum(0)
+            if dy.stride(-1) != 1:
+                dy = dy.contiguous()
             assert dy.shape == x.shape
             if ctx.prenorm:
                 dresidual = args[0]
+                dresidual = dresidual.reshape(-1, dresidual.shape[-1])
+                if dresidual.stride(-1) != 1:
+                    dresidual = dresidual.contiguous()
                 assert dresidual.shape == x.shape
             else:
                 dresidual = None
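The autocast handling in `LayerNormLinearFn.forward` reduces to one rule: when autocast is active, both the norm output and the linear weight are cast to the autocast (GPU) dtype before the matmul. A tiny restatement of that selection (the helper name is made up):

```python
import torch


def autocast_out_dtype(default: torch.dtype) -> torch.dtype:
    # Same choice as in LayerNormLinearFn.forward: prefer the autocast dtype
    # when autocast is enabled, otherwise keep the given default dtype.
    return torch.get_autocast_gpu_dtype() if torch.is_autocast_enabled() else default
```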
torch-ext/triton_layer_norm/layers.py CHANGED
@@ -1,46 +1,4 @@
-import torch
-from torch import nn
-
-from .layer_norm import rms_norm_fn
-
-
-class LlamaRMSNorm(nn.Module):
-    """
-    RMS Layer Norm for Llama models.
-
-    Triton-optimized RMS layer norm. The interface is compatible with `LLamaRMSNorm` in
-    `transformers`.
-
-    Attributes:
-        weight (`torch.Tensor`): The learnable scaling parameter.
-        variance_epsilon (`float`): The epsilon value for numerical stability.
-    """
-    weight: torch.Tensor
-    variance_epsilon: float
-
-    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
-        """
-        Apply RMS normalization to the input hidden states.
-
-        Args:
-            hidden_states (`torch.Tensor`):
-                Input tensor of shape `(batch_size, sequence_length, hidden_size)` or any shape
-                where the last dimension is the feature dimension to be normalized.
-
-        Returns:
-            `torch.Tensor`:
-                The normalized tensor with the same shape as the input `hidden_states`.
-        """
-        return rms_norm_fn(
-            hidden_states,
-            self.weight,
-            bias=None,
-            residual=None,
-            eps=self.variance_epsilon,
-            dropout_p=0.0,
-            prenorm=False,
-            residual_in_fp32=False,
-        )
-
-
-__all__ = ["LlamaRMSNorm"]
+from .layer_norm import RMSNorm
+
+
+__all__ = ["RMSNorm"]
torch-ext/triton_layer_norm/utils/__init__.py DELETED
File without changes
torch-ext/triton_layer_norm/utils/library.py DELETED
@@ -1,66 +0,0 @@
-# Adapted from https://github.com/pytorch/pytorch/blob/v2.7.0/torch/_library/triton.py
-# The PyTorch implementation simply ignores the schema argument, we simply modify it to use schema.
-
-from typing import Optional, Callable, Iterable, Union
-
-from torch.library import custom_op, CustomOpDef
-from torch._library.triton import set_wrap_triton_enabled
-
-
-def triton_op(
-    name: str,
-    fn: Optional[Callable] = None,
-    /,
-    *,
-    mutates_args: Union[str, Iterable[str]],
-    schema: Optional[str] = None,
-    # If allow_decomposition=True, this matches torch.library.triton_op behavior. If set to False,
-    # then it behaves like torch.library.custom_op instead, which doesn't decompose the operator
-    # and so inductor can't trace inside.
-    allow_decomposition=True,
-) -> Callable:
-    def dec(fn: Callable[..., object]) -> CustomOpDef:
-        def backend_fn(*args, **kwargs):  # type: ignore[no-untyped-def]
-            # Optimization: we're passing regular Tensors into the triton kernel, so
-            # no need to go through HOP dispatch
-            with set_wrap_triton_enabled(False):
-                return fn(*args, **kwargs)
-
-        result = custom_op(
-            name,
-            backend_fn,
-            mutates_args=mutates_args,
-            # This is the only difference with the PyTorch implementation
-            schema=schema,
-        )
-        from torch._subclasses.functional_tensor import FunctionalTensorMode
-
-        # We require that the user pass us a function that is make_fx traceable,
-        # so we can just register it as the Fake/meta kernel.
-        result.register_fake(fn)
-
-        if allow_decomposition:
-            # We decompose the operator when FunctionalTensorMode is active.
-            # The goal is to decompose the operator in AOTDispatcher.
-            # - With torch.compile, this means that the backend (usually Inductor)
-            #   can see a call to the triton kernel(s) and so it can directly optimize
-            #   them by inlining them into the lowering process.
-            def functional_decomp(  # type: ignore[no-untyped-def]
-                mode, op, types, args, kwargs
-            ):
-                from torch.export._trace import custom_triton_ops_decomposition_disabled
-
-                if custom_triton_ops_decomposition_disabled():
-                    return mode.__torch_dispatch__(op, types, args, kwargs)
-                else:
-                    with mode:
-                        return fn(*args, **kwargs)
-
-            result.register_torch_dispatch(FunctionalTensorMode, functional_decomp)
-
-        return result
-
-    if fn is None:
-        return dec
-    else:
-        return dec(fn)
torch-ext/triton_layer_norm/utils/torch.py DELETED
@@ -1,21 +0,0 @@
-import torch
-from typing import Callable
-
-
-def custom_amp_decorator(dec: Callable, cuda_amp_deprecated: bool):
-    def decorator(*args, **kwargs):
-        if cuda_amp_deprecated:
-            kwargs["device_type"] = "cuda"
-        return dec(*args, **kwargs)
-    return decorator
-
-
-if hasattr(torch.amp, "custom_fwd"):  # type: ignore[attr-defined]
-    deprecated = True
-    from torch.amp import custom_fwd, custom_bwd  # type: ignore[attr-defined]
-else:
-    deprecated = False
-    from torch.cuda.amp import custom_fwd, custom_bwd
-
-custom_fwd = custom_amp_decorator(custom_fwd, deprecated)
-custom_bwd = custom_amp_decorator(custom_bwd, deprecated)
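With that shim removed, layer_norm.py now imports the decorators from `torch.amp` and passes `device_type` explicitly. A minimal sketch of an autograd Function using them the same way (illustrative example, assuming a PyTorch version that provides the `torch.amp` decorators):

```python
import torch
from torch.amp import custom_fwd, custom_bwd


class Square(torch.autograd.Function):
    @staticmethod
    @custom_fwd(device_type="cuda")  # marks the forward for CUDA autocast handling
    def forward(ctx, x):
        ctx.save_for_backward(x)
        return x * x

    @staticmethod
    @custom_bwd(device_type="cuda")  # keeps backward consistent with the forward
    def backward(ctx, grad_out):
        (x,) = ctx.saved_tensors
        return 2 * x * grad_out
```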