YongganFu committed
Commit fb7c603 · verified · 1 Parent(s): 9afe10b

Upload FastSLMForCausalLM
README.md ADDED
@@ -0,0 +1,199 @@
1
+ ---
2
+ library_name: transformers
3
+ tags: []
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+ This is the model card of a 🤗 transformers model that has been pushed to the Hub. This model card has been automatically generated.
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
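+ Until official usage instructions are filled in, here is a minimal sketch. It assumes the checkpoint lives in a Hub repo (the repo id below is a placeholder), that a tokenizer is uploaded alongside it, and that the custom `FastSLMConfig`/`FastSLMForCausalLM` classes from this commit are loaded via `trust_remote_code=True`:
+
+ ```python
+ import torch
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+
+ repo_id = "<org>/<fastslm-repo>"  # placeholder: replace with the actual Hub repo id
+
+ # trust_remote_code pulls in configuration_fast_slm.py / modeling_fast_slm.py from the repo.
+ tokenizer = AutoTokenizer.from_pretrained(repo_id)
+ model = AutoModelForCausalLM.from_pretrained(
+     repo_id,
+     trust_remote_code=True,
+     torch_dtype=torch.bfloat16,  # config.json declares bfloat16 weights
+ ).cuda()                         # flash_attention_2 and the mamba kernels expect a CUDA device
+
+ inputs = tokenizer("Hello, FastSLM!", return_tensors="pt").to(model.device)
+ outputs = model.generate(**inputs, max_new_tokens=32)
+ print(tokenizer.decode(outputs[0], skip_special_tokens=True))
+ ```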
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
config.json ADDED
@@ -0,0 +1,112 @@
1
+ {
2
+ "architectures": [
3
+ "FastSLMForCausalLM"
4
+ ],
5
+ "attention_dropout": 0.0,
6
+ "attn_hidden_size": -1,
7
+ "attn_implementation": "flash_attention_2",
8
+ "attn_implementation_new": "flash_attention_2",
9
+ "auto_map": {
10
+ "AutoConfig": "configuration_fast_slm.FastSLMConfig",
11
+ "AutoModelForCausalLM": "modeling_fast_slm.FastSLMForCausalLM"
12
+ },
13
+ "bos_token_id": 1,
14
+ "calc_logits_for_entire_prompt": false,
15
+ "d_conv": 4,
16
+ "dtype": "bfloat16",
17
+ "eos_token_id": 2,
18
+ "ffn_expand_ratio": 3,
19
+ "global_attn_idx": [],
20
+ "hidden_act": "silu",
21
+ "hidden_size": 2048,
22
+ "hybrid_decoder_layer": "mamba",
23
+ "initializer_range": 0.02,
24
+ "intermediate_size": 0,
25
+ "kq_head_dim": -1,
26
+ "kq_norm": "none",
27
+ "layer_type": [
28
+ "m",
29
+ "a",
30
+ "m",
31
+ "a",
32
+ "a",
33
+ "a",
34
+ "m",
35
+ "a",
36
+ "m",
37
+ "a",
38
+ "m",
39
+ "a",
40
+ "a",
41
+ "a",
42
+ "m",
43
+ "a",
44
+ "m",
45
+ "a",
46
+ "m",
47
+ "a",
48
+ "m",
49
+ "a",
50
+ "m",
51
+ "a"
52
+ ],
53
+ "layer_types": [
54
+ "deltanet",
55
+ "f",
56
+ "m2",
57
+ "f",
58
+ "a",
59
+ "f",
60
+ "m2",
61
+ "f",
62
+ "deltanet",
63
+ "f",
64
+ "m2",
65
+ "f",
66
+ "a",
67
+ "f",
68
+ "m2",
69
+ "f",
70
+ "deltanet",
71
+ "f",
72
+ "m2",
73
+ "f",
74
+ "deltanet",
75
+ "f",
76
+ "m2",
77
+ "f"
78
+ ],
79
+ "mamba2_headdim": 64,
80
+ "mamba_conv_bias": true,
81
+ "mamba_d_conv": 4,
82
+ "mamba_d_state": 128,
83
+ "mamba_dt_rank": 128,
84
+ "mamba_expand": 2,
85
+ "mamba_inner_layernorms": true,
86
+ "mamba_proj_bias": false,
87
+ "max_position_embeddings": 36000,
88
+ "mlp_hidden_act": "silu",
89
+ "model_type": "jamba",
90
+ "new_seq_length": 2048,
91
+ "num_attention_heads": 16,
92
+ "num_experts": 1,
93
+ "num_experts_per_tok": 1,
94
+ "num_hidden_layers": 24,
95
+ "num_key_value_heads": 4,
96
+ "num_memory_tokens": 256,
97
+ "orig_max_position_embeddings": 4096,
98
+ "output_router_logits": false,
99
+ "pad_token_id": 0,
100
+ "rms_norm_eps": 1e-06,
101
+ "rope": true,
102
+ "rope_theta": 10000.0,
103
+ "rope_type": "ntk",
104
+ "router_aux_loss_coef": 0.001,
105
+ "sliding_window": null,
106
+ "tie_word_embeddings": true,
107
+ "transformers_version": "4.56.2",
108
+ "use_cache": false,
109
+ "use_mamba_kernels": true,
110
+ "v_head_dim": -1,
111
+ "vocab_size": 131072
112
+ }
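The config above encodes a 24-layer hybrid schedule. A minimal sketch for inspecting it, assuming config.json has been downloaded locally; the short codes are inferred from the accompanying modules ("deltanet" for the DeltaNet layer, "m2" presumably Mamba2, "a" attention, "f" a feed-forward block, with "m"/"a" in `layer_type` giving a coarser mamba/attention split):

```python
import json
from collections import Counter

with open("config.json") as f:
    cfg = json.load(f)

# Fine-grained schedule: Counter({'f': 12, 'm2': 6, 'deltanet': 4, 'a': 2})
print(Counter(cfg["layer_types"]))

# Coarse mamba/attention view: Counter({'a': 14, 'm': 10})
print(Counter(cfg["layer_type"]))
```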
configuration_fast_slm.py ADDED
@@ -0,0 +1,246 @@
1
+ # coding=utf-8
2
+ # Copyright 2024 AI21 Labs Ltd. and the HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """ Jamba model configuration"""
16
+ import math
17
+
18
+ from transformers.configuration_utils import PretrainedConfig
19
+ from transformers.utils import logging
20
+
21
+
22
+ logger = logging.get_logger(__name__)
23
+
24
+
25
+ class FastSLMConfig(PretrainedConfig):
26
+ r"""
27
+ This is the configuration class to store the configuration of a [`JambaModel`]. It is used to instantiate a
28
+ Jamba model according to the specified arguments, defining the model architecture. Instantiating a configuration
29
+ with the defaults will yield a similar configuration to that of the jamba-small architecture.
30
+
31
+ [ai21labs/jamba-small](https://huggingface.co/ai21labs/Jamba-v0.1)
32
+
33
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
34
+ documentation from [`PretrainedConfig`] for more information.
35
+
36
+
37
+ Args:
38
+ vocab_size (`int`, *optional*, defaults to 65536):
39
+ Vocabulary size of the Jamba model. Defines the number of different tokens that can be represented by the
40
+ `inputs_ids` passed when calling [`JambaModel`]
41
+ tie_word_embeddings (`bool`, *optional*, defaults to `False`):
42
+ Whether the model's input and output word embeddings should be tied. Note that this is only relevant if the
43
+ model has a output word embedding layer.
44
+ hidden_size (`int`, *optional*, defaults to 4096):
45
+ Dimension of the hidden representations.
46
+ intermediate_size (`int`, *optional*, defaults to 14336):
47
+ Dimension of the MLP representations.
48
+ num_hidden_layers (`int`, *optional*, defaults to 32):
49
+ Number of hidden layers in the Transformer encoder.
50
+ num_attention_heads (`int`, *optional*, defaults to 32):
51
+ Number of attention heads for each attention layer in the Transformer encoder.
52
+ num_key_value_heads (`int`, *optional*, defaults to 8):
53
+ This is the number of key_value heads that should be used to implement Grouped Query Attention. If
54
+ `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
55
+ `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When
56
+ converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
57
+ by meanpooling all the original heads within that group. For more details checkout [this
58
+ paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `8`.
59
+ hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
60
+ The non-linear activation function (function or string) in the decoder.
61
+ initializer_range (`float`, *optional*, defaults to 0.02):
62
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
63
+ rms_norm_eps (`float`, *optional*, defaults to 1e-06):
64
+ The epsilon used by the rms normalization layers.
65
+ use_cache (`bool`, *optional*, defaults to `True`):
66
+ Whether or not the model should return the last key/values attentions (not used by all models). Only
67
+ relevant if `config.is_decoder=True`.
68
+ calc_logits_for_entire_prompt (`bool`, *optional*, defaults to `False`):
69
+ Whether or not to calculate logits for entire prompt during generation. If `False`, only the logits of the
70
+ last prompt token will be calculated, which are the only logits needed for generation. For long sequences,
71
+ the logits for the entire sequence may use a lot of memory so setting `calc_logits_for_entire_prompt=False`
72
+ will reduce memory footprint significantly.
73
+ Note: some generation features may not be available if this is set to `False`.
74
+ output_router_logits (`bool`, *optional*, defaults to `False`):
75
+ Whether or not the router logits should be returned by the model. Enabling this will also
76
+ allow the model to output the auxiliary loss. See [here]() for more details
77
+ router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
78
+ The aux loss factor for the total loss.
79
+ pad_token_id (`int`, *optional*, defaults to 0):
80
+ The id of the padding token.
81
+ bos_token_id (`int`, *optional*, defaults to 1):
82
+ The id of the "beginning-of-sequence" token.
83
+ eos_token_id (`int`, *optional*, defaults to 2):
84
+ The id of the "end-of-sequence" token.
85
+ sliding_window (`int`, *optional*):
86
+ Sliding window attention window size. If not specified, will default to `None`.
87
+ n_ctx (`int`, *optional*, defaults to 262144):
88
+ This value doesn't have any real effect. The maximum sequence length that this model is intended to be
89
+ used with. It can be used with longer sequences, but performance may degrade.
90
+ attention_dropout (`float`, *optional*, defaults to 0.0):
91
+ The dropout ratio for the attention probabilities.
92
+ num_experts_per_tok (`int`, *optional*, defaults to 2):
93
+ The number of experts to root per-token, can be also interpreted as the `top-p` routing
94
+ parameter
95
+ num_experts (`int`, *optional*, defaults to 16):
96
+ Number of experts per Sparse MLP layer.
97
+ use_mamba_kernels (`bool`, *optional*, defaults to `True`):
98
+ Flag indicating whether or not to use the fast mamba kernels. These are available only if `mamba-ssm` and
99
+ `causal-conv1d` are installed, and the mamba modules are running on a CUDA device. Raises ValueError if
100
+ `True` and kernels are not available
101
+ mamba_d_state (`int`, *optional*, defaults to 16):
102
+ The dimension the mamba state space latents
103
+ mamba_d_conv (`int`, *optional*, defaults to 4):
104
+ The size of the mamba convolution kernel
105
+ mamba_expand (`int`, *optional*, defaults to 2):
106
+ Expanding factor (relative to hidden_size) used to determine the mamba intermediate size
107
+ mamba_dt_rank (`Union[int,str]`, *optional*, defaults to `"auto"`):
108
+ Rank of the the mamba discretization projection matrix. `"auto"` means that it will default to `math.ceil(self.hidden_size / 16)`
109
+ mamba_conv_bias (`bool`, *optional*, defaults to `True`):
110
+ Flag indicating whether or not to use bias in the convolution layer of the mamba mixer block.
111
+ mamba_proj_bias (`bool`, *optional*, defaults to `False`):
112
+ Flag indicating whether or not to use bias in the input and output projections (["in_proj", "out_proj"]) of the mamba mixer block
113
+ mamba_inner_layernorms (`bool`, *optional*, defaults to `True`):
114
+ Flag indicating whether or not to apply layernorms to internal mamba activations
115
+
116
+ """
117
+
118
+ model_type = "jamba"
119
+ keys_to_ignore_at_inference = ["past_key_values"]
120
+
121
+ def __init__(
122
+ self,
123
+ vocab_size=65536,
124
+ tie_word_embeddings=False,
125
+ hidden_size=4096,
126
+ intermediate_size=14336,
127
+ num_hidden_layers=32,
128
+ num_attention_heads=32,
129
+ num_key_value_heads=8,
130
+ hidden_act="silu",
131
+ initializer_range=0.02,
132
+ rms_norm_eps=1e-6,
133
+ use_cache=True,
134
+ calc_logits_for_entire_prompt=False,
135
+ output_router_logits=False,
136
+ router_aux_loss_coef=0.001,
137
+ pad_token_id=0,
138
+ bos_token_id=1,
139
+ eos_token_id=2,
140
+ sliding_window=None,
141
+ max_position_embeddings=262144,
142
+ orig_max_position_embeddings=None,
143
+ attention_dropout=0.0,
144
+ num_experts_per_tok=2,
145
+ num_experts=16,
146
+ use_mamba_kernels=True,
147
+ mamba_d_state=16,
148
+ mamba_d_conv=4,
149
+ mamba_expand=2,
150
+ mamba_dt_rank="auto",
151
+ mamba_conv_bias=True,
152
+ mamba_proj_bias=False,
153
+ mamba_inner_layernorms=True,
154
+
155
+ hybrid_decoder_layer='mamba',
156
+
157
+ global_attn_idx=None,
158
+
159
+ attn_implementation_new='flash_attention_2',
160
+
161
+ mamba2_headdim=64,
162
+
163
+ rope_type=None,
164
+
165
+ layer_types=None,
166
+
167
+ ffn_expand_ratio=None,
168
+
169
+ d_conv=4,
170
+
171
+ **kwargs,
172
+ ):
173
+ self.vocab_size = vocab_size
174
+ self.tie_word_embeddings = tie_word_embeddings
175
+ self.hidden_size = hidden_size
176
+ self.intermediate_size = intermediate_size
177
+ self.num_hidden_layers = num_hidden_layers
178
+ self.num_attention_heads = num_attention_heads
179
+ self.sliding_window = sliding_window
180
+ self.max_position_embeddings = max_position_embeddings
181
+ self.orig_max_position_embeddings = orig_max_position_embeddings
182
+ self.attention_dropout = attention_dropout
183
+
184
+ # for backward compatibility
185
+ if num_key_value_heads is None:
186
+ num_key_value_heads = num_attention_heads
187
+
188
+ self.num_key_value_heads = num_key_value_heads
189
+ self.hidden_act = hidden_act
190
+ self.initializer_range = initializer_range
191
+ self.rms_norm_eps = rms_norm_eps
192
+
193
+ self.use_cache = use_cache
194
+ self.calc_logits_for_entire_prompt = calc_logits_for_entire_prompt
195
+ self.output_router_logits = output_router_logits
196
+ self.router_aux_loss_coef = router_aux_loss_coef
197
+
198
+ self.num_experts_per_tok = num_experts_per_tok
199
+ self.num_experts = num_experts
200
+
201
+ self.use_mamba_kernels = use_mamba_kernels
202
+ self.mamba_d_state = mamba_d_state
203
+ self.mamba_d_conv = mamba_d_conv
204
+ self.mamba_expand = mamba_expand
205
+ self.mamba_dt_rank = math.ceil(self.hidden_size / 16) if mamba_dt_rank == "auto" else mamba_dt_rank
206
+ self.mamba_conv_bias = mamba_conv_bias
207
+ self.mamba_proj_bias = mamba_proj_bias
208
+ self.mamba_inner_layernorms = mamba_inner_layernorms
209
+
210
+ # added by Xin
211
+ self.kq_norm = kwargs.pop("kq_norm", None)
212
+ self.rope = kwargs.pop("rope", False)
213
+ self.rope_theta = kwargs.pop("rope_theta", 10000.0)
214
+ self.num_memory_tokens = kwargs.pop("num_memory_tokens", 0)
215
+ self.attn_hidden_size = kwargs.pop("attn_hidden_size", -1)
216
+ self.kq_head_dim = kwargs.pop("kq_head_dim", -1)
217
+ self.v_head_dim = kwargs.pop("v_head_dim", -1)
218
+
219
+ #! adhoc change
220
+ self.new_seq_length = 2048
221
+
222
+ self.hybrid_decoder_layer = hybrid_decoder_layer
223
+
224
+ self.global_attn_idx = global_attn_idx
225
+
226
+ self.attn_implementation_new = attn_implementation_new
227
+
228
+ self.mamba2_headdim = mamba2_headdim
229
+
230
+ self.rope_type = rope_type
231
+
232
+ self.layer_types = layer_types
233
+
234
+ self.ffn_expand_ratio = ffn_expand_ratio
235
+
236
+ self.d_conv = d_conv
237
+
238
+ self.mlp_hidden_act = kwargs.pop("mlp_hidden_act", "silu")
239
+
240
+ super().__init__(
241
+ pad_token_id=pad_token_id,
242
+ bos_token_id=bos_token_id,
243
+ eos_token_id=eos_token_id,
244
+ tie_word_embeddings=tie_word_embeddings,
245
+ **kwargs,
246
+ )
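As a quick check of the `mamba_dt_rank="auto"` behaviour documented above, a minimal sketch (assuming `configuration_fast_slm.py` is importable from the working directory; the values mirror config.json from this commit):

```python
import math
from configuration_fast_slm import FastSLMConfig

cfg = FastSLMConfig(
    hidden_size=2048,
    num_hidden_layers=24,
    num_attention_heads=16,
    num_key_value_heads=4,
    vocab_size=131072,
    mamba_d_state=128,
    mamba_dt_rank="auto",
)

# "auto" resolves to math.ceil(hidden_size / 16) = 128, matching "mamba_dt_rank": 128 in config.json.
assert cfg.mamba_dt_rank == math.ceil(2048 / 16) == 128
print(cfg.model_type)  # "jamba"
```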
delta_net.py ADDED
@@ -0,0 +1,472 @@
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
3
+
4
+ from __future__ import annotations
5
+
6
+ from typing import TYPE_CHECKING, Dict, Optional, Tuple
7
+
8
+ import torch
9
+ import torch.nn as nn
10
+ from einops import rearrange
11
+ from torch.nn import functional as F
12
+
13
+ from fla.modules import FusedRMSNormGated, RMSNorm, ShortConvolution
14
+ from fla.ops.delta_rule import chunk_delta_rule, fused_recurrent_delta_rule
15
+
16
+ from typing import Any, Dict, List, Optional, Tuple
17
+
18
+ import torch
19
+ import transformers
20
+
21
+ if TYPE_CHECKING:
22
+ from transformers.processing_utils import Unpack
23
+
24
+ from fla.models.utils import Cache
25
+
26
+
27
+ def elu_p1(x):
28
+ return (F.elu(x, 1., False) + 1.).to(x)
29
+
30
+
31
+ def sum_norm(x):
32
+ return (x / x.sum(-1, keepdim=True)).to(x)
33
+
34
+
35
+ class DeltaNet(nn.Module):
36
+ r"""
37
+ The layer implementation for [Parallelizing Linear Transformers with the Delta Rule over Sequence Length](https://arxiv.org/abs/2406.06484). # noqa:
38
+ DeltaNet was originally proposed in [Linear Transformers Are Secretly Fast Weight Programmers](https://arxiv.org/abs/2102.11174). # noqa
39
+
40
+ Args:
41
+ mode (str, Optional):
42
+ Which DeltaNet kernel to use.
43
+ Currently available: `chunk`, `fused_recurrent`, and `fused_chunk`.
44
+ Default: `chunk`.
45
+ hidden_size (int, Optional):
46
+ The hidden size of the input. Default: 1024.
47
+ expand_k (float, Optional):
48
+ The expansion ratio for the key dim. Default: 1.0.
49
+ expand_v (float, Optional):
50
+ The expansion ratio for the value dim. Default: 1.0.
51
+ num_heads (int, Optional):
52
+ The number of heads. Default: 4.
53
+ use_beta (bool, Optional):
54
+ Whether to use beta. Default: `True`.
55
+ use_gate (bool, Optional):
56
+ Whether to use output gate. Default: `False`.
57
+ use_short_conv (bool, Optional):
58
+ Whether to use short convolutions. Default: `True`.
59
+ conv_size (int, Optional):
60
+ The kernel size of the short convolution, only used when `use_short_conv` is `True`. Default: 4.
61
+ conv_bias (bool, Optional):
62
+ Whether to use bias in the short convolution, only used when `use_short_conv` is `True`. Default: `False`.
63
+ allow_neg_eigval (bool, Optional):
64
+ Allow negative eigenvalues. Default: `False`. If set to `True`, the beta will be multiplied by 2.
65
+ See reference: [Unlocking State-Tracking in Linear RNNs Through Negative Eigenvalues](https://arxiv.org/abs/2411.12537)
66
+ layer_idx (int, Optional):
67
+ The index of the layer. Default: None.
68
+ norm_eps (float, Optional):
69
+ The epsilon value for the layernorm/rmsnorm layer. Default: 1e-5.
70
+ qk_activation (str, Optional):
71
+ The activation function for the query and key. Default: `silu`.
72
+ qk_norm (str, Optional):
73
+ The normalization method for the query and key. Default: `l2`.
74
+ """
75
+
76
+ def __init__(
77
+ self,
78
+ mode: str = 'chunk',
79
+ d_model: int = None,
80
+ hidden_size: int = 1024,
81
+ expand_k: float = 1.0,
82
+ expand_v: float = 1.0,
83
+ num_heads: int = 4,
84
+ use_beta: bool = True,
85
+ use_gate: bool = False,
86
+ use_short_conv: bool = True,
87
+ conv_size: int = 4,
88
+ conv_bias: bool = False,
89
+ allow_neg_eigval: bool = False,
90
+ layer_idx: int = None,
91
+ qk_activation: str = 'silu',
92
+ qk_norm: str = 'l2',
93
+ norm_eps: float = 1e-5,
94
+ config = None,
95
+ **kwargs
96
+ ) -> DeltaNet:
97
+ super().__init__()
98
+
99
+ self.mode = mode
100
+ self.qk_activation = qk_activation
101
+ self.qk_norm = qk_norm
102
+
103
+ assert self.qk_activation in ['silu', 'relu', 'elu', 'identity']
104
+ assert self.qk_norm in ['l2', 'sum']
105
+
106
+ if d_model is not None:
107
+ hidden_size = d_model
108
+ self.hidden_size = hidden_size
109
+ self.expand_k = expand_k
110
+ self.expand_v = expand_v
111
+ self.num_heads = num_heads
112
+ self.use_gate = use_gate
113
+ self.use_short_conv = use_short_conv
114
+ self.conv_size = conv_size
115
+ self.conv_bias = conv_bias
116
+ self.allow_neg_eigval = allow_neg_eigval
117
+
118
+ self.key_dim = int(hidden_size * expand_k)
119
+ self.value_dim = int(hidden_size * expand_v)
120
+ self.head_k_dim = self.key_dim // num_heads
121
+ self.head_v_dim = self.value_dim // num_heads
122
+ self.layer_idx = layer_idx
123
+
124
+ self.silu = nn.SiLU()
125
+ if mode == 'fused_chunk':
126
+ raise NotImplementedError("fused_chunk_delta_rule is now deprecated. Please use `chunk_delta_rule` instead.")
127
+ assert mode in ['chunk', 'fused_recurrent'], f"Not supported mode `{mode}`."
128
+ assert self.key_dim % num_heads == 0, f"key dim must be divisible by num_heads of {num_heads}"
129
+ assert self.value_dim % num_heads == 0, f"value dim must be divisible by num_heads of {num_heads}"
130
+
131
+ self.q_proj = nn.Linear(hidden_size, self.key_dim, bias=False)
132
+ self.k_proj = nn.Linear(hidden_size, self.key_dim, bias=False)
133
+ self.v_proj = nn.Linear(hidden_size, self.value_dim, bias=False)
134
+
135
+ self.use_beta = use_beta
136
+ if self.use_beta:
137
+ self.b_proj = nn.Linear(hidden_size, self.num_heads, bias=False)
138
+ if use_short_conv:
139
+ self.conv_size = conv_size
140
+ self.q_conv1d = ShortConvolution(
141
+ hidden_size=self.key_dim,
142
+ kernel_size=conv_size,
143
+ activation='silu' if qk_activation == 'silu' else None
144
+ )
145
+ self.k_conv1d = ShortConvolution(
146
+ hidden_size=self.key_dim,
147
+ kernel_size=conv_size,
148
+ activation='silu' if qk_activation == 'silu' else None
149
+ )
150
+ self.v_conv1d = ShortConvolution(
151
+ hidden_size=self.value_dim,
152
+ kernel_size=conv_size,
153
+ activation='silu'
154
+ )
155
+ else:
156
+ raise UserWarning(
157
+ "ShortConvolution is crucial to the performance. "
158
+ "Do not turn it off, i.e., setting `use_short_conv=False` unless you know what you are doing."
159
+ )
160
+ if use_gate:
161
+ self.g_proj = nn.Linear(hidden_size, self.value_dim, bias=False)
162
+ self.o_norm = FusedRMSNormGated(self.head_v_dim, eps=norm_eps)
163
+ else:
164
+ self.o_norm = RMSNorm(self.head_v_dim, eps=norm_eps)
165
+
166
+ self.o_proj = nn.Linear(self.value_dim, hidden_size, bias=False)
167
+
168
+ self.apply(self._initialize_weights)
169
+
170
+ def _initialize_weights(self, module: nn.Module):
171
+ if getattr(module, "_is_hf_initialized", False):
172
+ return
173
+ if isinstance(module, nn.Linear):
174
+ nn.init.xavier_uniform_(module.weight, gain=2 ** -2.5)
175
+ if module.bias is not None:
176
+ nn.init.zeros_(module.bias)
177
+ module._is_hf_initialized = True
178
+
179
+ def forward(
180
+ self,
181
+ hidden_states: torch.Tensor,
182
+ attention_mask: Optional[torch.Tensor] = None,
183
+ past_key_values: Optional[Cache] = None,
184
+ use_cache: Optional[bool] = False,
185
+ output_attentions: Optional[bool] = False,
186
+ **kwargs: Unpack[Dict]
187
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Cache]]:
188
+ if attention_mask is not None:
189
+ assert len(attention_mask.shape) == 2, (
190
+ "Expected attention_mask as a 0-1 matrix with shape [batch_size, seq_len] "
191
+ "for padding purposes (0 indicating padding). "
192
+ "Arbitrary attention masks of shape [batch_size, seq_len, seq_len] are not allowed."
193
+ )
194
+
195
+ # change to inference mode.
196
+ mode = 'fused_recurrent' if hidden_states.shape[1] <= 64 else self.mode
197
+
198
+ last_state = None
199
+ if past_key_values is not None and len(past_key_values) > self.layer_idx:
200
+ last_state = past_key_values[self.layer_idx]
201
+
202
+ if self.use_short_conv:
203
+ conv_state_q, conv_state_k, conv_state_v = None, None, None
204
+ if last_state is not None:
205
+ conv_state_q, conv_state_k, conv_state_v = last_state['conv_state']
206
+ conv_mask = attention_mask[:, -hidden_states.shape[1]:] if attention_mask is not None else None
207
+ position_ids = kwargs.get('position_ids', None)
208
+
209
+ q = self.q_proj(hidden_states)
210
+
211
+ q, conv_state_q = self.q_conv1d(x=q,
212
+ mask=conv_mask,
213
+ cache=conv_state_q,
214
+ output_final_state=use_cache,
215
+ seq_idx=position_ids)
216
+
217
+ k = self.k_proj(hidden_states)
218
+
219
+ k, conv_state_k = self.k_conv1d(x=k,
220
+ mask=conv_mask,
221
+ cache=conv_state_k,
222
+ output_final_state=use_cache,
223
+ seq_idx=position_ids)
224
+
225
+ v = self.v_proj(hidden_states)
226
+
227
+ v, conv_state_v = self.v_conv1d(x=v,
228
+ mask=conv_mask,
229
+ cache=conv_state_v,
230
+ output_final_state=use_cache,
231
+ seq_idx=position_ids)
232
+ else:
233
+ q = self.q_proj(hidden_states)
234
+ k = self.k_proj(hidden_states)
235
+ v = self.v_proj(hidden_states)
236
+
237
+ if self.qk_activation == 'silu':
238
+ q, k = self.silu(q), self.silu(k)
239
+
240
+ v = self.silu(v)
241
+
242
+ q, k = map(lambda x: rearrange(x, '... (h d) -> ... h d', d=self.head_k_dim), (q, k))
243
+ v = rearrange(v, '... (h d) -> ... h d', d=self.head_v_dim)
244
+ if self.qk_activation != 'silu':
245
+ if self.qk_activation == 'relu':
246
+ q, k = q.relu(), k.relu()
247
+ elif self.qk_activation == 'elu':
248
+ q, k = elu_p1(q), elu_p1(k)
249
+ elif self.qk_activation == 'identity':
250
+ pass
251
+ else:
252
+ raise NotImplementedError
253
+
254
+ if self.qk_norm == 'sum':
255
+ q = sum_norm(q).to(q)
256
+ k = sum_norm(k).to(k)
257
+
258
+ if self.use_beta:
259
+ beta = self.b_proj(hidden_states)
260
+ beta = beta.sigmoid()
261
+ else:
262
+ beta = q.new_ones(q.shape[0], q.shape[1], q.shape[2])
263
+
264
+ if self.allow_neg_eigval:
265
+ beta = beta * 2.
266
+
267
+ # dealing with padding
268
+ if attention_mask is not None:
269
+ beta = beta.mul(attention_mask[:, -beta.shape[-2]:, None])
270
+
271
+ recurrent_state = last_state['recurrent_state'] if last_state is not None else None
272
+
273
+ cu_seqlens = kwargs.get('cu_seqlens', None)
274
+ if mode == 'fused_recurrent':
275
+ o, recurrent_state = fused_recurrent_delta_rule(
276
+ q=q,
277
+ k=k,
278
+ v=v,
279
+ beta=beta,
280
+ initial_state=recurrent_state,
281
+ output_final_state=use_cache,
282
+ cu_seqlens=cu_seqlens,
283
+ use_qk_l2norm_in_kernel=True if self.qk_norm == 'l2' else False
284
+ )
285
+ elif mode == 'chunk':
286
+ o, recurrent_state = chunk_delta_rule(
287
+ q=q,
288
+ k=k,
289
+ v=v,
290
+ beta=beta,
291
+ initial_state=recurrent_state,
292
+ output_final_state=use_cache,
293
+ cu_seqlens=cu_seqlens,
294
+ use_qk_l2norm_in_kernel=True if self.qk_norm == 'l2' else False
295
+ )
296
+ else:
297
+ raise NotImplementedError(f"Not supported mode `{mode}`.")
298
+
299
+ if past_key_values is not None:
300
+ past_key_values.update(
301
+ recurrent_state=recurrent_state,
302
+ conv_state=(conv_state_q, conv_state_k, conv_state_v) if self.use_short_conv else None,
303
+ layer_idx=self.layer_idx,
304
+ offset=q.shape[1]
305
+ )
306
+
307
+ if self.use_gate:
308
+ g = rearrange(self.g_proj(hidden_states), '... (h d) -> ... h d', d=self.head_v_dim)
309
+ o = self.o_norm(o, g)
310
+ else:
311
+ o = self.o_norm(o)
312
+ o = rearrange(o, 'b t h d -> b t (h d)')
313
+ o = self.o_proj(o)
314
+
315
+ return o, None, past_key_values
316
+
317
+
318
+ class Cache(transformers.cache_utils.Cache):
319
+ """
320
+ A cache used for storing hidden states produced by flash linear attention models.
321
+
322
+ It stores the states of each layer as the tensor of shape `[batch_size, key_dim, value_dim]`.
323
+ """
324
+
325
+ is_compileable = True
326
+
327
+ def __init__(
328
+ self,
329
+ seen_tokens: int = 0
330
+ ) -> Cache:
331
+ super().__init__(layers=[0])
332
+
333
+ self.states: List[Dict[str, Any]] = []
334
+
335
+ self._seen_tokens = seen_tokens # Used in `generate` to keep tally of how many tokens the cache has seen
336
+
337
+ def __getitem__(self, layer_idx: int) -> Dict[str, Any]:
338
+ if layer_idx < len(self):
339
+ return self.states[layer_idx]
340
+ else:
341
+ raise KeyError(f"Cache only has {len(self)} layers, attempted to access layer with index {layer_idx}")
342
+
343
+ def __iter__(self):
344
+ for state in self.states:
345
+ yield state
346
+
347
+ def __len__(self):
348
+ return len(self.states)
349
+
350
+ def reset(self):
351
+ for state in self.states:
352
+ for key in state:
353
+ if state[key] is not None:
354
+ if type(state[key]) == tuple:
355
+ for subkey in state[key]:
356
+ subkey.zero_()
357
+ else:
358
+ state[key].zero_()
359
+ self._seen_tokens = 0
360
+
361
+
362
+ def update(
363
+ self,
364
+ recurrent_state: Optional[Tuple[torch.Tensor]] = None,
365
+ attn_state: Optional[Tuple[torch.Tensor]] = None,
366
+ conv_state: Optional[Tuple[torch.Tensor]] = None,
367
+ ffn_state: Optional[Tuple[torch.Tensor]] = None,
368
+ layer_idx: int = 0,
369
+ offset: Optional[int] = 1,
370
+ cache_kwargs: Optional[Dict[str, Any]] = None,
371
+ ) -> Dict[str, Any]:
372
+ """
373
+ Args:
374
+ recurrent_state (`torch.Tensor`):
375
+ The new recurrent state to cache.
376
+ attn_state (`Tuple[torch.Tensor]`):
377
+ The new attention key/value states to cache.
378
+ conv_state (`Tuple[torch.Tensor]`):
379
+ The new convolution state to cache.
380
+ ffn_state (`Tuple[torch.Tensor]`):
381
+ The new feed-forward state to cache.
382
+ layer_idx (`int`, defaults to 0):
383
+ The index of the layer to cache the states for.
384
+ offset (`int`, defaults to 1):
385
+ The number of new tokens being processed.
386
+ cache_kwargs (`Dict[str, Any]`):
387
+ Additional arguments for the cache subclass.
388
+
389
+ Return:
390
+ Dictionary of the updated state.
391
+ """
392
+
393
+ if cache_kwargs is None:
394
+ cache_kwargs = {}
395
+ if attn_state is not None:
396
+ input_size = attn_state[0].shape[1]
397
+ window_size = cache_kwargs.get('window_size', None)
398
+ if not (isinstance(attn_state, Tuple) or isinstance(attn_state, List)):
399
+ raise ValueError("`attn_state` must be a tuple of tensors for key/value states")
400
+ if len(self.states) <= layer_idx:
401
+ # update the number of seen tokens
402
+ if layer_idx == 0:
403
+ self._seen_tokens += offset
404
+ if attn_state is not None:
405
+ if window_size is not None and input_size > window_size:
406
+ attn_state = [state[:, -window_size:].contiguous() for state in attn_state]
407
+ state = dict(
408
+ recurrent_state=recurrent_state,
409
+ attn_state=attn_state,
410
+ conv_state=conv_state,
411
+ ffn_state=ffn_state
412
+ )
413
+ self.states.append(state)
414
+ else:
415
+ # update the number of seen tokens
416
+ if layer_idx == len(self.states) - 1:
417
+ self._seen_tokens += offset
418
+ state = self.states[layer_idx]
419
+ if recurrent_state is not None:
420
+ state['recurrent_state'].copy_(recurrent_state)
421
+ if attn_state is not None:
422
+ if window_size is not None and state['attn_state'][0].shape[1] == window_size:
423
+ for i, (old_state, new_state) in enumerate(zip(state['attn_state'], attn_state)):
424
+ # DO NOT allocate new memory if the cache is full
425
+ # roll the key/value states to the left by `input_size`
426
+ old_state = old_state.roll(-input_size, 1)
427
+ # replace the last `input_size` tokens with the new key/value states
428
+ old_state[:, -input_size:] = new_state
429
+ state['attn_state'][i].copy_(old_state)
430
+ else:
431
+ attn_state = [
432
+ torch.cat([old_state, new_state], 1)
433
+ for old_state, new_state in zip(state['attn_state'], attn_state)
434
+ ]
435
+ state['attn_state'].copy_(attn_state)
436
+ if conv_state is not None:
437
+ conv_state_q, conv_state_k, conv_state_v = state['conv_state']
438
+ conv_state_q.copy_(conv_state[0])
439
+ conv_state_k.copy_(conv_state[1])
440
+ conv_state_v.copy_(conv_state[2])
441
+ if ffn_state is not None:
442
+ state['ffn_state'].copy_(ffn_state)
443
+
444
+ return state
445
+
446
+ def get_seq_length(self, layer_idx: Optional[int] = 0) -> int:
447
+ """Returns the sequence length of the cached states. A layer index can be optionally passed."""
448
+ if len(self.states) <= layer_idx:
449
+ return 0
450
+ return self._seen_tokens
451
+
452
+ def get_max_length(self) -> Optional[int]:
453
+ """Returns the maximum sequence length of the cached states. Cache does not have a maximum length."""
454
+ return None
455
+
456
+ def to_legacy_cache(self) -> Tuple:
457
+ return tuple(self.states)
458
+
459
+ @classmethod
460
+ @torch.compiler.disable
461
+ def from_legacy_cache(
462
+ cls,
463
+ past_key_values: Optional[Tuple] = None,
464
+ seen_tokens: int = 0
465
+ ) -> Cache:
466
+ """Converts a cache in the legacy cache format into an equivalent `Cache`."""
467
+
468
+ cache = cls(seen_tokens)
469
+ if isinstance(past_key_values, list):
470
+ for layer_idx in range(len(past_key_values)):
471
+ cache.states.append(past_key_values[layer_idx])
472
+ return cache
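The fused kernels used above (`chunk_delta_rule`, `fused_recurrent_delta_rule`) come from the `fla` package. As a sanity reference for what the DeltaNet layer computes, here is a naive per-token version of the delta-rule recurrence S_t = S_{t-1} + beta_t (v_t - S_{t-1} k_t) k_t^T with output o_t = S_t q_t; this is a plain-PyTorch sketch for illustration, not the production code path:

```python
import torch
import torch.nn.functional as F

def delta_rule_reference(q, k, v, beta):
    """q, k: [B, T, H, Dk]; v: [B, T, H, Dv]; beta: [B, T, H]."""
    B, T, H, Dk = q.shape
    Dv = v.shape[-1]
    # L2-normalize q/k, mirroring the layer's default qk_norm='l2'.
    q, k = F.normalize(q, p=2, dim=-1), F.normalize(k, p=2, dim=-1)
    S = q.new_zeros(B, H, Dv, Dk)  # per-head fast-weight state
    outs = []
    for t in range(T):
        q_t, k_t, v_t, b_t = q[:, t], k[:, t], v[:, t], beta[:, t]
        pred = torch.einsum('bhvk,bhk->bhv', S, k_t)                    # S_{t-1} k_t
        S = S + torch.einsum('bh,bhv,bhk->bhvk', b_t, v_t - pred, k_t)  # delta-rule update
        outs.append(torch.einsum('bhvk,bhk->bhv', S, q_t))              # o_t = S_t q_t
    return torch.stack(outs, dim=1)  # [B, T, H, Dv]

o = delta_rule_reference(
    torch.randn(2, 8, 4, 16), torch.randn(2, 8, 4, 16),
    torch.randn(2, 8, 4, 16), torch.rand(2, 8, 4),
)
print(o.shape)  # torch.Size([2, 8, 4, 16])
```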
fused_mha_with_cache.py ADDED
@@ -0,0 +1,126 @@
1
+ import torch
2
+ from typing import Optional, Tuple
3
+
4
+ from .triton_attention import (
5
+ fused_mha_with_paged_cache, fused_mha_with_cache
6
+ )
7
+
8
+ dtype_int = torch.int32
9
+
10
+ def fused_mha_interface(
11
+ query_states: torch.Tensor, # [batch, q_len, heads, head_dim]
12
+ key_states: torch.Tensor, # [batch, kv_len, heads, head_dim]
13
+ value_states: torch.Tensor, # [batch, kv_len, heads, head_dim]
14
+ k_cache: torch.Tensor, # [MAX_BATCH_SIZE, MAX_SEQ_LEN, N_HEADS, D_HEAD] or [num_pages, page_size, n, d] for paged attn
15
+ v_cache: torch.Tensor, # [MAX_BATCH_SIZE, MAX_SEQ_LEN, N_HEADS, D_HEAD]
16
+ position_ids: torch.Tensor=None,
17
+ page_table: torch.Tensor=None, # [b, max_num_pages_per_seq] # loc of the block page in the cache.
18
+ max_seq_len = None,
19
+ ) -> torch.Tensor:
20
+ """
21
+ Replacement for _flash_attention_forward(...) that uses
22
+ Triton’s fused_mha_with_paged_cache under the hood.
23
+ Returns: [batch, q_len, heads, head_dim]
24
+ """
25
+ # unpack shapes
26
+ b, ql, n_heads, head_dim = query_states.shape
27
+ _, kvl, n_kv_heads, _ = key_states.shape
28
+
29
+ q = query_states.reshape(b, ql, n_heads * head_dim)
30
+ k = key_states.reshape(b, kvl, n_kv_heads * head_dim)
31
+ v = value_states.reshape(b, kvl, n_kv_heads * head_dim)
32
+
33
+ if position_ids is not None:
34
+ if ql == 1: # Generate phase - single token
35
+ input_pos = position_ids[:, -1] # Use the last position for each sequence
36
+ else: # Context phase - multiple tokens
37
+ input_pos = position_ids[:, 0] # Use the starting position for each sequence
38
+ else:
39
+ # Fallback: assume starting from 0 for all sequences
40
+ input_pos = torch.zeros(b, device=q.device, dtype=torch.int32)
41
+
42
+ freqs_cis = None
43
+
44
+ if page_table is None:
45
+ y = torch.ops.attention.fused_mha_with_cache(
46
+ q, k, v,
47
+ input_pos,
48
+ k_cache, v_cache,
49
+ freqs_cis,
50
+ )
51
+
52
+
53
+ else:
54
+ batch_size = b
55
+
56
+ # cache_loc: identity mapping [0, 1, ..., b-1]
57
+ cache_loc = torch.arange(batch_size, device=q.device, dtype=dtype_int)
58
+
59
+ # input_positions: assume pure context (all start from 0)
60
+ input_positions = torch.zeros(batch_size, device=q.device, dtype=dtype_int)
61
+
62
+ # seq_len: each sequence length is kvl
63
+ seq_len = torch.full((batch_size,), kvl, device=q.device, dtype=dtype_int)
64
+
65
+ # seq_start: flattened starting index for each sequence
66
+ seq_start = (seq_len.cumsum(0) - seq_len).to(dtype=dtype_int)
67
+
68
+ assert max_seq_len is not None, "max_seq_len must be provided when using paged attention."
69
+
70
+ y = torch.ops.attention.fused_mha_with_paged_cache(
71
+ q, k, v,
72
+ input_positions, cache_loc,
73
+ seq_len, seq_start,
74
+ page_table, max_seq_len,
75
+ k_cache, v_cache,
76
+ freqs_cis,
77
+ )
78
+
79
+ y = y.view(b, ql, n_heads, head_dim)
80
+
81
+ return y
82
+
83
+
84
+
85
+ def main():
86
+ #––– Test hyperparameters –––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––
87
+ batch_size = 1
88
+ q_len = 1
89
+ kv_len = 1
90
+ num_heads = 16
91
+ n_kv_heads = 16
92
+ head_dim = 128
93
+
94
+ max_batch_size = 1
95
+ max_seq_len = 1024
96
+
97
+ page_size = 256
98
+
99
+ device = "cuda"
100
+
101
+ #––– Random query, key, value tensors –––––––––––––––––––––––––––––––––––––––––––––––––––
102
+ query_states = torch.randn(batch_size, q_len, num_heads, head_dim, device=device)
103
+ key_states = torch.randn(batch_size, kv_len, num_heads, head_dim, device=device)
104
+ value_states = torch.randn(batch_size, kv_len, num_heads, head_dim, device=device)
105
+
106
+ k_cache = torch.randn(max_batch_size, max_seq_len, num_heads, head_dim, device=device)
107
+ v_cache = torch.randn(max_batch_size, max_seq_len, num_heads, head_dim, device=device)
108
+
109
+ attn_out = fused_mha_interface(
110
+ query_states,
111
+ key_states,
112
+ value_states,
113
+ k_cache=k_cache,
114
+ v_cache=v_cache,
115
+ )
116
+
117
+ expected_shape = (batch_size, q_len, num_heads, head_dim)
118
+ print(f"[test] output shape: {attn_out.shape} (expected {expected_shape})")
119
+
120
+ if attn_out.shape == expected_shape:
121
+ print("[test] ✅ Success: output tensor has correct shape.")
122
+ else:
123
+ print("[test] ❌ Failure: shape mismatch.")
124
+
125
+ if __name__ == "__main__":
126
+ main()
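For the paged path, the only layout information given is in the shape comments of `fused_mha_interface` (`page_table`: `[b, max_num_pages_per_seq]`, caches: `[num_pages, page_size, n, d]`). Below is a hedged sketch of one way those inputs could be laid out; the actual Triton op `torch.ops.attention.fused_mha_with_paged_cache` may impose further constraints:

```python
import torch

batch_size, max_seq_len, page_size = 2, 1024, 256
num_heads, head_dim = 16, 128

pages_per_seq = max_seq_len // page_size        # 4 pages of 256 tokens per sequence
num_pages = batch_size * pages_per_seq

# Paged KV caches: [num_pages, page_size, n_heads, head_dim]
k_cache = torch.zeros(num_pages, page_size, num_heads, head_dim, dtype=torch.float16)
v_cache = torch.zeros_like(k_cache)

# Identity block assignment: sequence b owns pages [b * pages_per_seq, (b + 1) * pages_per_seq).
page_table = torch.arange(num_pages, dtype=torch.int32).view(batch_size, pages_per_seq)
print(page_table)
# tensor([[0, 1, 2, 3],
#         [4, 5, 6, 7]], dtype=torch.int32)
```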
mamba2.py ADDED
@@ -0,0 +1,464 @@
1
+ # Copyright (c) 2024, Tri Dao, Albert Gu.
2
+
3
+ import math
4
+
5
+ import torch
6
+ import torch.nn as nn
7
+ import torch.nn.functional as F
8
+
9
+ from einops import rearrange, repeat, pack, unpack
10
+
11
+ try:
12
+ from causal_conv1d import causal_conv1d_fn, causal_conv1d_update
13
+ except ImportError:
14
+ causal_conv1d_fn, causal_conv1d_update = None, None
15
+
16
+ try:
17
+ from causal_conv1d.causal_conv1d_varlen import causal_conv1d_varlen_states
18
+ except ImportError:
19
+ causal_conv1d_varlen_states = None
20
+
21
+ from mamba_ssm.ops.triton.selective_state_update import selective_state_update
22
+ from mamba_ssm.ops.triton.layernorm_gated import RMSNorm as RMSNormGated
23
+
24
+
25
+ from mamba_ssm.distributed.tensor_parallel import ColumnParallelLinear, RowParallelLinear
26
+ from mamba_ssm.distributed.distributed_utils import all_reduce, reduce_scatter
27
+
28
+ from mamba_ssm.ops.triton.ssd_combined import mamba_chunk_scan_combined
29
+ from mamba_ssm.ops.triton.ssd_combined import mamba_split_conv1d_scan_combined
30
+
31
+
32
+ class Mamba2(nn.Module):
33
+ def __init__(
34
+ self,
35
+ config,
36
+ conv_init=None,
37
+ d_ssm=None, # If not None, we only apply SSM on this many dimensions, the rest uses gated MLP
38
+ ngroups=1,
39
+ A_init_range=(1, 16),
40
+ D_has_hdim=False,
41
+ rmsnorm=True,
42
+ norm_before_gate=False,
43
+ dt_min=0.001,
44
+ dt_max=0.1,
45
+ dt_init_floor=1e-4,
46
+ dt_limit=(0.0, float("inf")),
47
+ bias=False,
48
+ conv_bias=True,
49
+ # Fused kernel and sharding options
50
+ chunk_size=256,
51
+ use_mem_eff_path=False, # True,
52
+ layer_idx=None, # Absorb kwarg for general module
53
+ process_group=None,
54
+ sequence_parallel=True,
55
+ device=None,
56
+ dtype=None,
57
+ ):
58
+ factory_kwargs = {"device": device, "dtype": dtype}
59
+ super().__init__()
60
+
61
+ self.config = config
62
+ self.d_model = config.hidden_size
63
+ self.d_state = config.mamba_d_state
64
+ self.d_conv = config.mamba_d_conv
65
+
66
+ self.conv_init = conv_init
67
+ self.expand = config.mamba_expand
68
+ self.process_group = process_group
69
+ self.sequence_parallel = sequence_parallel
70
+ self.world_size = 1 if process_group is None else process_group.size()
71
+ self.local_rank = 0 if process_group is None else process_group.rank()
72
+ self.d_inner = (self.expand * self.d_model) // self.world_size
73
+ assert self.d_inner * self.world_size == self.expand * self.d_model
74
+ self.headdim = config.mamba2_headdim
75
+ self.d_ssm = self.d_inner if d_ssm is None else d_ssm // self.world_size
76
+ assert ngroups % self.world_size == 0
77
+ self.ngroups = ngroups // self.world_size
78
+ assert self.d_ssm % self.headdim == 0
79
+ self.nheads = self.d_ssm // self.headdim
80
+ self.D_has_hdim = D_has_hdim
81
+ self.rmsnorm = rmsnorm
82
+ self.norm_before_gate = norm_before_gate
83
+ self.dt_limit = dt_limit
84
+ self.activation = "silu"
85
+ self.chunk_size = chunk_size
86
+ self.use_mem_eff_path = use_mem_eff_path
87
+ self.layer_idx = layer_idx
88
+
89
+ assert (self.d_model * self.expand / self.headdim) % 8 == 0
90
+
91
+ # Order: [z, x, B, C, dt]
92
+ d_in_proj = 2 * self.d_inner + 2 * self.ngroups * self.d_state + self.nheads
93
+ if self.process_group is None:
94
+ self.in_proj = nn.Linear(self.d_model, d_in_proj, bias=bias, **factory_kwargs)
95
+ else:
96
+ self.in_proj = ColumnParallelLinear(self.d_model, d_in_proj * self.world_size, bias=bias,
97
+ process_group=self.process_group, sequence_parallel=self.sequence_parallel,
98
+ **factory_kwargs)
99
+
100
+ conv_dim = self.d_ssm + 2 * self.ngroups * self.d_state
101
+ self.conv1d = nn.Conv1d(
102
+ in_channels=conv_dim,
103
+ out_channels=conv_dim,
104
+ bias=conv_bias,
105
+ kernel_size=self.d_conv,
106
+ groups=conv_dim,
107
+ padding=self.d_conv - 1,
108
+ **factory_kwargs,
109
+ )
110
+ if self.conv_init is not None:
111
+ nn.init.uniform_(self.conv1d.weight, -self.conv_init, self.conv_init)
112
+
113
+ self.act = nn.SiLU()
114
+
115
+ # Initialize log dt bias
116
+ dt = torch.exp(
117
+ torch.rand(self.nheads, **factory_kwargs) * (math.log(dt_max) - math.log(dt_min))
118
+ + math.log(dt_min)
119
+ )
120
+ dt = torch.clamp(dt, min=dt_init_floor)
121
+ # Inverse of softplus: https://github.com/pytorch/pytorch/issues/72759
122
+ inv_dt = dt + torch.log(-torch.expm1(-dt))
123
+
124
+ self.dt_bias = nn.Parameter(inv_dt)
125
+ # Just to be explicit. Without this we already don't put wd on dt_bias because of the check
126
+ # name.endswith("bias") in param_grouping.py
127
+ self.dt_bias._no_weight_decay = True
128
+
129
+ assert A_init_range[0] > 0 and A_init_range[1] >= A_init_range[0]
130
+ A = torch.empty(self.nheads, dtype=torch.float32, device=device).uniform_(*A_init_range)
131
+ A_log = torch.log(A).to(dtype=dtype)
132
+ self.A_log = nn.Parameter(A_log)
133
+ self.A_log._no_weight_decay = True
134
+
135
+ # D "skip" parameter
136
+ self.D = nn.Parameter(torch.ones(self.d_ssm if self.D_has_hdim else self.nheads, device=device))
137
+ self.D._no_weight_decay = True
138
+
139
+ if self.rmsnorm:
140
+ assert RMSNormGated is not None
141
+ self.norm = RMSNormGated(self.d_ssm, eps=1e-5, norm_before_gate=self.norm_before_gate,
142
+ group_size=self.d_ssm // ngroups, **factory_kwargs)
143
+
144
+ if self.process_group is None:
145
+ self.out_proj = nn.Linear(self.d_inner, self.d_model, bias=bias, **factory_kwargs)
146
+ else:
147
+ self.out_proj = RowParallelLinear(self.d_inner * self.world_size, self.d_model, bias=bias,
148
+ process_group=self.process_group, sequence_parallel=self.sequence_parallel,
149
+ **factory_kwargs)
150
+
151
+
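+ # For reference, with the values shipped in config.json (hidden_size=2048, mamba_expand=2,
+ # mamba2_headdim=64, mamba_d_state=128) and the defaults ngroups=1, world_size=1:
+ #   d_inner = 2 * 2048 = 4096, nheads = 4096 / 64 = 64
+ #   d_in_proj (order [z, x, B, C, dt]) = 2 * 4096 + 2 * 1 * 128 + 64 = 8512
+ #   conv_dim  (x, B, C)                = 4096 + 2 * 1 * 128 = 4352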
152
+ def forward(self, hidden_states, attention_mask=None, past_key_value=None, seqlen=None, seq_idx=None, cu_seqlens=None, inference_params=None):
153
+ """
154
+ hidden_states: (batch, seqlen, hidden_dim) if seqlen=None.
155
+ If seqlen is not None, hidden_states is (batch * seqlen, hidden_dim). This is so that when we
156
+ split hidden_states during sequence parallel, we split the batch * seqlen dimension
157
+ (in case batch is small).
158
+ Returns: same shape as u
159
+ """
160
+ # assert past_key_value is None, "Not implemented yet!!!"
161
+
162
+ seqlen_og = seqlen
163
+ if seqlen is None:
164
+ batch, seqlen, dim = hidden_states.shape
165
+ else:
166
+ batch_seqlen, dim = hidden_states.shape
167
+ batch = batch_seqlen // seqlen
168
+
169
+ conv_state, ssm_state = None, None
170
+
171
+ if inference_params is not None:
172
+ inference_batch = cu_seqlens.shape[0] - 1 if cu_seqlens is not None else batch
173
+ conv_state, ssm_state = self._get_states_from_cache(inference_params, inference_batch)
174
+
175
+ if inference_params.seqlen_offset > 0:
176
+ # The states are updated inplace
177
+ out, _, _ = self.step(hidden_states, conv_state, ssm_state)
178
+ return out, past_key_value
179
+
180
+ zxbcdt = self.in_proj(hidden_states) # (B, L, d_in_proj) or (B * L, d_in_proj)
181
+
182
+ if seqlen_og is not None:
183
+ zxbcdt = rearrange(zxbcdt, "(b l) d -> b l d", l=seqlen)
184
+ # If the model is loaded in fp16, without the .float() here, A might be -inf
185
+ A = -torch.exp(self.A_log.float()) # (nheads) or (d_inner, d_state)
186
+ dt_limit_kwargs = {} if self.dt_limit == (0.0, float("inf")) else dict(dt_limit=self.dt_limit)
187
+ if self.use_mem_eff_path and inference_params is None:
188
+ out = mamba_split_conv1d_scan_combined(
189
+ zxbcdt,
190
+ rearrange(self.conv1d.weight, "d 1 w -> d w"),
191
+ self.conv1d.bias,
192
+ self.dt_bias,
193
+ A,
194
+ D=rearrange(self.D, "(h p) -> h p", p=self.headdim) if self.D_has_hdim else self.D,
195
+ chunk_size=self.chunk_size,
196
+ seq_idx=seq_idx,
197
+ activation=self.activation,
198
+ rmsnorm_weight=self.norm.weight if self.rmsnorm else None,
199
+ rmsnorm_eps=self.norm.eps if self.rmsnorm else 1e-6,
200
+ outproj_weight=self.out_proj.weight,
201
+ outproj_bias=self.out_proj.bias,
202
+ headdim=None if self.D_has_hdim else self.headdim,
203
+ ngroups=self.ngroups,
204
+ norm_before_gate=self.norm_before_gate,
205
+ **dt_limit_kwargs,
206
+ )
207
+ if seqlen_og is not None:
208
+ out = rearrange(out, "b l d -> (b l) d")
209
+ if self.process_group is not None:
210
+ reduce_fn = reduce_scatter if self.sequence_parallel else all_reduce
211
+ out = reduce_fn(out, self.process_group)
212
+ else:
213
+ d_mlp = (zxbcdt.shape[-1] - 2 * self.d_ssm - 2 * self.ngroups * self.d_state - self.nheads) // 2
214
+ z0, x0, z, xBC, dt = torch.split(
215
+ zxbcdt,
216
+ [d_mlp, d_mlp, self.d_ssm, self.d_ssm + 2 * self.ngroups * self.d_state, self.nheads],
217
+ dim=-1
218
+ )
219
+
220
+ if conv_state is not None:
221
+ if cu_seqlens is None:
222
+ # If we just take xBC[:, :, -self.d_conv :], it will error if seqlen < self.d_conv
223
+ # Instead F.pad will pad with zeros if seqlen < self.d_conv, and truncate otherwise.
224
+ xBC_t = rearrange(xBC, "b l d -> b d l")
225
+ conv_state.copy_(F.pad(xBC_t, (self.d_conv - xBC_t.shape[-1], 0))) # Update state (B D W)
226
+ else:
227
+ assert causal_conv1d_varlen_states is not None, "varlen inference requires causal_conv1d package"
228
+ assert batch == 1, "varlen inference only supports batch dimension 1"
229
+ conv_varlen_states = causal_conv1d_varlen_states(
230
+ xBC.squeeze(0), cu_seqlens, state_len=conv_state.shape[-1]
231
+ )
232
+ conv_state.copy_(conv_varlen_states)
233
+ assert self.activation in ["silu", "swish"]
234
+ if causal_conv1d_fn is None or self.activation not in ["silu", "swish"]:
235
+ assert seq_idx is None, "varlen conv1d requires the causal_conv1d package"
236
+ xBC = self.act(
237
+ self.conv1d(xBC.transpose(1, 2)).transpose(1, 2)[:, :-(self.d_conv - 1)]
238
+ ) # (B, L, self.d_ssm + 2 * ngroups * d_state)
239
+ else:
240
+ xBC = causal_conv1d_fn(
241
+ xBC.transpose(1, 2),
242
+ rearrange(self.conv1d.weight, "d 1 w -> d w"),
243
+ bias=self.conv1d.bias,
244
+ activation=self.activation,
245
+ # seq_idx=seq_idx,
246
+ ).transpose(1, 2)
247
+
248
+ x, B, C = torch.split(xBC, [self.d_ssm, self.ngroups * self.d_state, self.ngroups * self.d_state], dim=-1)
249
+
250
+
251
+ y = mamba_chunk_scan_combined(
252
+ rearrange(x, "b l (h p) -> b l h p", p=self.headdim),
253
+ dt,
254
+ A,
255
+ rearrange(B, "b l (g n) -> b l g n", g=self.ngroups),
256
+ rearrange(C, "b l (g n) -> b l g n", g=self.ngroups),
257
+ chunk_size=self.chunk_size,
258
+ # D=rearrange(self.D, "(h p) -> h p", p=self.headdim) if self.D_has_hdim else self.D,
259
+ D=self.D,
260
+ z=rearrange(z, "b l (h p) -> b l h p", p=self.headdim) if not self.rmsnorm else None,
261
+ dt_bias=self.dt_bias,
262
+ dt_softplus=True,
263
+ seq_idx=seq_idx,
264
+ cu_seqlens=cu_seqlens,
265
+ **dt_limit_kwargs,
266
+ return_final_states=ssm_state is not None,
267
+ return_varlen_states=cu_seqlens is not None and inference_params is not None,
268
+ )
269
+ if ssm_state is not None:
270
+ y, last_state, *rest = y
271
+ if cu_seqlens is None:
272
+ ssm_state.copy_(last_state)
273
+ else:
274
+ varlen_states = rest[0]
275
+ ssm_state.copy_(varlen_states)
276
+ y = rearrange(y, "b l h p -> b l (h p)")
277
+ if self.rmsnorm:
278
+ y_full = y
279
+ z_full = z
280
+
281
+ y = self.norm(y_full, z_full)
282
+ if d_mlp > 0:
283
+ y = torch.cat([F.silu(z0) * x0, y], dim=-1)
284
+ if seqlen_og is not None:
285
+ y = rearrange(y, "b l d -> (b l) d")
286
+
287
+ out = self.out_proj(y)
288
+
289
+ return out, past_key_value
290
+
291
+
292
+ def step(self, hidden_states, conv_state, ssm_state):
293
+ dtype = hidden_states.dtype
294
+ # Remove single token limitation - now supports hidden_states.shape[1] > 1
295
+ batch_size, seq_len, _ = hidden_states.shape
296
+
297
+ if seq_len == 1:
298
+ # Single token case - keep existing optimized path
299
+ zxbcdt = self.in_proj(hidden_states.squeeze(1)) # (B 2D)
300
+ else:
301
+ # Multi-token case - process without squeezing
302
+ zxbcdt = self.in_proj(hidden_states) # (B L 2D)
303
+
304
+ d_mlp = (zxbcdt.shape[-1] - 2 * self.d_ssm - 2 * self.ngroups * self.d_state - self.nheads) // 2
305
+
306
+ if seq_len == 1:
307
+ z0, x0, z, xBC, dt = torch.split(
308
+ zxbcdt,
309
+ [d_mlp, d_mlp, self.d_ssm, self.d_ssm + 2 * self.ngroups * self.d_state, self.nheads],
310
+ dim=-1
311
+ )
312
+ else:
313
+ z0, x0, z, xBC, dt = torch.split(
314
+ zxbcdt,
315
+ [d_mlp, d_mlp, self.d_ssm, self.d_ssm + 2 * self.ngroups * self.d_state, self.nheads],
316
+ dim=-1
317
+ )
318
+
319
+ # Conv step - handle both single and multi-token cases
320
+ if seq_len == 1:
321
+ # Single token optimized path
322
+ if causal_conv1d_update is None:
323
+ conv_state.copy_(torch.roll(conv_state, shifts=-1, dims=-1)) # Update state (B D W)
324
+ conv_state[:, :, -1] = xBC
325
+ xBC = torch.sum(conv_state * rearrange(self.conv1d.weight, "d 1 w -> d w"), dim=-1) # (B D)
326
+ if self.conv1d.bias is not None:
327
+ xBC = xBC + self.conv1d.bias
328
+ xBC = self.act(xBC).to(dtype=dtype)
329
+ else:
330
+ xBC = causal_conv1d_update(
331
+ xBC,
332
+ conv_state,
333
+ rearrange(self.conv1d.weight, "d 1 w -> d w"),
334
+ self.conv1d.bias,
335
+ self.activation,
336
+ )
337
+ else:
338
+ # Multi-token case - update conv_state and process sequence
339
+ # Update conv_state with the new sequence
340
+ xBC_t = rearrange(xBC, "b l d -> b d l")
341
+ conv_state.copy_(F.pad(xBC_t, (self.d_conv - xBC_t.shape[-1], 0))) # Update state (B D W)
342
+
343
+ # Process convolution for the full sequence
344
+ if causal_conv1d_fn is None or self.activation not in ["silu", "swish"]:
345
+ xBC = self.act(
346
+ self.conv1d(xBC.transpose(1, 2)).transpose(1, 2)[:, -(self.d_conv - 1):]
347
+ ) # (B, L, self.d_ssm + 2 * ngroups * d_state)
348
+ else:
349
+ xBC = causal_conv1d_fn(
350
+ xBC.transpose(1, 2),
351
+ rearrange(self.conv1d.weight, "d 1 w -> d w"),
352
+ bias=self.conv1d.bias,
353
+ activation=self.activation,
354
+ ).transpose(1, 2)
355
+
356
+ x, B, C = torch.split(xBC, [self.d_ssm, self.ngroups * self.d_state, self.ngroups * self.d_state], dim=-1)
357
+ A = -torch.exp(self.A_log.float()) # (nheads,)
358
+
359
+ # SSM step - handle both single and multi-token cases
360
+ if seq_len == 1:
361
+ # Single token optimized path
362
+ if selective_state_update is None:
363
+ assert self.ngroups == 1, "Only support ngroups=1 for this inference code path"
364
+ # Discretize A and B
365
+ dt = F.softplus(dt + self.dt_bias.to(dtype=dt.dtype)) # (batch, nheads)
366
+ dA = torch.exp(dt * A) # (batch, nheads)
367
+ x = rearrange(x, "b (h p) -> b h p", p=self.headdim)
368
+ dBx = torch.einsum("bh,bn,bhp->bhpn", dt, B, x)
369
+ ssm_state.copy_(ssm_state * rearrange(dA, "b h -> b h 1 1") + dBx)
370
+ y = torch.einsum("bhpn,bn->bhp", ssm_state.to(dtype), C)
371
+ y = y + rearrange(self.D.to(dtype), "h -> h 1") * x
372
+ y = rearrange(y, "b h p -> b (h p)")
373
+ if not self.rmsnorm:
374
+ y = y * self.act(z) # (B D)
375
+ else:
376
+ A = repeat(A, "h -> h p n", p=self.headdim, n=self.d_state).to(dtype=torch.float32)
377
+ dt = repeat(dt, "b h -> b h p", p=self.headdim)
378
+ dt_bias = repeat(self.dt_bias, "h -> h p", p=self.headdim)
379
+ D = repeat(self.D, "h -> h p", p=self.headdim)
380
+ B = rearrange(B, "b (g n) -> b g n", g=self.ngroups)
381
+ C = rearrange(C, "b (g n) -> b g n", g=self.ngroups)
382
+ x_reshaped = rearrange(x, "b (h p) -> b h p", p=self.headdim)
383
+ if not self.rmsnorm:
384
+ z = rearrange(z, "b (h p) -> b h p", p=self.headdim)
385
+ y = selective_state_update(
386
+ ssm_state, x_reshaped, dt, A, B, C, D, z=z if not self.rmsnorm else None,
387
+ dt_bias=dt_bias, dt_softplus=True
388
+ )
389
+ y = rearrange(y, "b h p -> b (h p)")
390
+ else:
391
+ # Multi-token case - use mamba_chunk_scan_combined similar to forward method
392
+ dt_limit_kwargs = {} if self.dt_limit == (0.0, float("inf")) else dict(dt_limit=self.dt_limit)
393
+
394
+ y = mamba_chunk_scan_combined(
395
+ rearrange(x, "b l (h p) -> b l h p", p=self.headdim),
396
+ dt,
397
+ A,
398
+ rearrange(B, "b l (g n) -> b l g n", g=self.ngroups),
399
+ rearrange(C, "b l (g n) -> b l g n", g=self.ngroups),
400
+ chunk_size=self.chunk_size,
401
+ D=rearrange(self.D, "(h p) -> h p", p=self.headdim) if self.D_has_hdim else self.D,
402
+ z=rearrange(z, "b l (h p) -> b l h p", p=self.headdim) if not self.rmsnorm else None,
403
+ dt_bias=self.dt_bias,
404
+ dt_softplus=True,
405
+ **dt_limit_kwargs,
406
+ return_final_states=True,
407
+ )
408
+ # Extract final state and update ssm_state
409
+ y, final_ssm_state = y
410
+ ssm_state.copy_(final_ssm_state)
411
+ y = rearrange(y, "b l h p -> b l (h p)")
412
+
413
+ if self.rmsnorm:
414
+ y = self.norm(y, z)
415
+ if d_mlp > 0:
416
+ y = torch.cat([F.silu(z0) * x0, y], dim=-1)
417
+ out = self.out_proj(y)
418
+
419
+ # Ensure output shape consistency
420
+ if seq_len == 1 and out.dim() == 2:
421
+ out = out.unsqueeze(1) # (B, 1, D)
422
+
423
+ return out, conv_state, ssm_state
424
+
425
+
426
+ def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs):
427
+ device = self.out_proj.weight.device
428
+ conv_dtype = self.conv1d.weight.dtype if dtype is None else dtype
429
+ conv_state = torch.zeros(
430
+ batch_size, self.d_conv, self.conv1d.weight.shape[0], device=device, dtype=conv_dtype
431
+ ).transpose(1, 2)
432
+ ssm_dtype = self.in_proj.weight.dtype if dtype is None else dtype
433
+ ssm_state = torch.zeros(
434
+ batch_size, self.nheads, self.headdim, self.d_state, device=device, dtype=ssm_dtype
435
+ )
436
+ return conv_state, ssm_state
437
+
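For orientation, here is a hedged sketch of how the states returned by `allocate_inference_cache` could feed `step` during incremental decoding. The `mixer` object and the stream of hidden states are stand-ins; the real model wires these calls up through its inference-params machinery.

```python
import torch

# Hedged usage sketch: per-layer conv/ssm states from allocate_inference_cache feed step().
# `mixer` stands for an instance of this Mamba mixer module; its construction is omitted
# because the constructor arguments are defined elsewhere in this repository.
def decode_with_states(mixer, hidden_states_stream, max_seqlen=4096):
    batch_size = hidden_states_stream[0].shape[0]
    conv_state, ssm_state = mixer.allocate_inference_cache(batch_size, max_seqlen)
    outputs = []
    for hidden_states in hidden_states_stream:
        # hidden_states: (batch, 1, d_model) for single-token steps, or (batch, L, d_model)
        # for the multi-token path added above.
        out, conv_state, ssm_state = mixer.step(hidden_states, conv_state, ssm_state)
        outputs.append(out)
    return torch.cat(outputs, dim=1)
```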
438
+ def _get_states_from_cache(self, inference_params, batch_size, initialize_states=False):
439
+ assert self.layer_idx is not None
440
+ if self.layer_idx not in inference_params.key_value_memory_dict:
441
+ batch_shape = (batch_size,)
442
+ conv_state = torch.zeros(
443
+ batch_size,
444
+ self.d_conv,
445
+ self.conv1d.weight.shape[0],
446
+ device=self.conv1d.weight.device,
447
+ dtype=self.conv1d.weight.dtype,
448
+ ).transpose(1, 2)
449
+ ssm_state = torch.zeros(
450
+ batch_size,
451
+ self.nheads,
452
+ self.headdim,
453
+ self.d_state,
454
+ device=self.in_proj.weight.device,
455
+ dtype=self.in_proj.weight.dtype,
456
+ )
457
+ inference_params.key_value_memory_dict[self.layer_idx] = (conv_state, ssm_state)
458
+ else:
459
+ conv_state, ssm_state = inference_params.key_value_memory_dict[self.layer_idx]
460
+ # TODO: What if batch size changes between generation, and we reuse the same states?
461
+ if initialize_states:
462
+ conv_state.zero_()
463
+ ssm_state.zero_()
464
+ return conv_state, ssm_state
model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4fdb6a7f8e726e8e00f5f63dde79d18b499722f6ce7199027679cd01c9f71a87
3
+ size 1930804728
modeling_fast_slm.py ADDED
The diff for this file is too large to render. See raw diff
 
triton_attention.py ADDED
@@ -0,0 +1,2714 @@
1
+ """Custom ops for MHA/XQA attention."""
2
+
3
+ import math
4
+ from dataclasses import astuple
5
+ from typing import List, Optional
6
+
7
+ import torch
8
+ import torch.nn.functional as F
9
+ import triton
10
+
11
+ from triton import language as tl
12
+
13
+ from abc import ABC, abstractmethod
14
+ from dataclasses import dataclass, field, fields
15
+ from typing import Dict, List, Literal, Optional, Protocol, Sequence, Tuple, Type, Union
16
+
17
+ import torch
18
+ from torch.export import Dim
19
+
20
+
21
+ @triton.jit
22
+ def update_kv_cache(
23
+ k_ptr, # [B*S, N, D]
24
+ v_ptr, # [B*S, N, D]
25
+ seq_len_ptr, # [b] # length of each sequence in a batch
26
+ seq_start_indices_ptr, # [b] # start indices of a sequence in flattened q/k/v.
27
+ k_cache_ptr, # [MAX_BATCH_SIZE, MAX_SEQ_LEN, N_HEADS, D_HEAD]
28
+ v_cache_ptr, # [MAX_BATCH_SIZE, MAX_SEQ_LEN, N_HEADS, D_HEAD]
29
+ input_pos_ptr, # Specifies the sequence index in the caches at which to write the provided kv
30
+ cache_loc_ptr, # Specifies the batch index for each of the input sequences
31
+ MAX_SEQ_LENGTH: tl.constexpr,
32
+ N_KV_HEADS: tl.constexpr,
33
+ Q_D_HEAD: tl.constexpr,
34
+ V_D_HEAD: tl.constexpr,
35
+ SEQ_BLOCK: tl.constexpr,
36
+ GENERATE_ONLY: tl.constexpr,
37
+ ):
38
+ batch_id = tl.program_id(axis=0)
39
+ head_id = tl.program_id(axis=1)
40
+ seq_block_id = tl.program_id(axis=2)
41
+
42
+ # Each program is responsible for a block of tokens in a single batch.
43
+ if GENERATE_ONLY:
44
+ seq_start_index = batch_id
45
+ seq_len: tl.constexpr = 1
46
+ else:
47
+ seq_start_index = tl.load(seq_start_indices_ptr + batch_id)
48
+ seq_len = tl.load(seq_len_ptr + batch_id)
49
+
50
+ # cache is [bsnd]
51
+ # cache_loc_ptr stores the batch index for the sequences provided to the kernel.
52
+ cache_loc = tl.load(cache_loc_ptr + batch_id)
53
+
54
+ kv_position = tl.load(input_pos_ptr + batch_id)
55
+
56
+ K_D_HEAD: tl.constexpr = Q_D_HEAD
57
+ k_cache_batch_offset = cache_loc * N_KV_HEADS * MAX_SEQ_LENGTH * K_D_HEAD
58
+ v_cache_batch_offset = cache_loc * N_KV_HEADS * MAX_SEQ_LENGTH * V_D_HEAD
59
+
60
+ k_dhead_offsets = tl.arange(0, triton.next_power_of_2(K_D_HEAD))
61
+ k_dhead_mask = k_dhead_offsets < K_D_HEAD
62
+
63
+ v_dhead_offsets = tl.arange(0, triton.next_power_of_2(V_D_HEAD))
64
+ v_dhead_mask = v_dhead_offsets < V_D_HEAD
65
+
66
+ seq_offsets = seq_block_id * SEQ_BLOCK + tl.arange(0, SEQ_BLOCK)
67
+ seq_mask = seq_offsets < seq_len
68
+
69
+ k_load_mask = seq_mask[:, None] * k_dhead_mask[None, :]
70
+ v_load_mask = seq_mask[:, None] * v_dhead_mask[None, :]
71
+
72
+ k_batch_offset = seq_start_index * N_KV_HEADS * K_D_HEAD
73
+ v_batch_offset = seq_start_index * N_KV_HEADS * V_D_HEAD
74
+ # Write back to kv-caches
75
+ ks = tl.load(
76
+ k_ptr
77
+ + k_batch_offset
78
+ + seq_offsets[:, None] * N_KV_HEADS * K_D_HEAD
79
+ + head_id * K_D_HEAD
80
+ + k_dhead_offsets[None, :],
81
+ mask=k_load_mask,
82
+ )
83
+ vs = tl.load(
84
+ v_ptr
85
+ + v_batch_offset
86
+ + seq_offsets[:, None] * N_KV_HEADS * V_D_HEAD
87
+ + head_id * V_D_HEAD
88
+ + v_dhead_offsets[None, :],
89
+ mask=v_load_mask,
90
+ )
91
+
92
+ kv_writeback_seq_offsets = seq_offsets + kv_position
93
+
94
+ k_cache_offset = (
95
+ k_cache_batch_offset
96
+ + kv_writeback_seq_offsets[:, None] * K_D_HEAD * N_KV_HEADS
97
+ + head_id * K_D_HEAD
98
+ + k_dhead_offsets[None, :]
99
+ )
100
+
101
+ v_cache_offset = (
102
+ v_cache_batch_offset
103
+ + kv_writeback_seq_offsets[:, None] * V_D_HEAD * N_KV_HEADS
104
+ + head_id * V_D_HEAD
105
+ + v_dhead_offsets[None, :]
106
+ )
107
+ tl.store(k_cache_ptr + k_cache_offset, ks, k_load_mask)
108
+ tl.store(v_cache_ptr + v_cache_offset, vs, v_load_mask)
109
+
110
+
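A host-side launch for `update_kv_cache` is not shown in this file. The sketch below illustrates one plausible way to call it, assuming contiguous `[B*S, N, D]` inputs and `[max_batch, max_seq, n_kv_heads, d_head]` caches; the grid layout and `SEQ_BLOCK` value are illustrative choices, not values taken from this repository.

```python
import triton

# Hedged sketch: launch update_kv_cache over (batch, kv_head, seq_block) programs.
def launch_update_kv_cache(k, v, seq_len, seq_start, k_cache, v_cache,
                           input_pos, cache_loc, seq_block=64):
    batch = seq_len.shape[0]
    max_seq_len, n_kv_heads = k_cache.shape[1], k_cache.shape[2]
    num_seq_blocks = triton.cdiv(int(seq_len.max()), seq_block)
    grid = (batch, n_kv_heads, num_seq_blocks)
    update_kv_cache[grid](
        k, v, seq_len, seq_start, k_cache, v_cache, input_pos, cache_loc,
        MAX_SEQ_LENGTH=max_seq_len,
        N_KV_HEADS=n_kv_heads,
        Q_D_HEAD=k_cache.shape[3],
        V_D_HEAD=v_cache.shape[3],
        SEQ_BLOCK=seq_block,
        GENERATE_ONLY=False,
    )
```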
111
+ @triton.jit
112
+ def gqa_attention_kv_stage1(
113
+ q_ptr, # [Batch, 1, N_HEADS, D_HEAD]
114
+ k_cache_ptr, # [MAX_BATCH_SIZE, MAX_SEQ_LEN, N_HEADS, D_HEAD]
115
+ v_cache_ptr, # [MAX_BATCH_SIZE, MAX_SEQ_LEN, N_HEADS, D_HEAD]
116
+ cache_loc_ptr, # [Batch] # Specifies the batch index for each of the generate tokens.
117
+ input_pos_ptr, # [Batch]
118
+ output_values_ptr, # [Batch, N_HEADS, num_blocks, D_HEAD]
119
+ output_logsumexp_ptr, # [Batch, N_HEADS, num_blocks]
120
+ num_blocks,
121
+ MAX_SEQ_LEN: tl.constexpr, # Maximum supported sequence length
122
+ N_HEADS: tl.constexpr, # Number of heads
123
+ N_KV_HEADS: tl.constexpr, # Number of KV heads.
124
+ Q_D_HEAD: tl.constexpr, # Dimension of each query head.
125
+ V_D_HEAD: tl.constexpr, # Dimension of each key/value head
126
+ SEQ_BLOCK_SIZE: tl.constexpr, # Block size used for tiling the sequence dim.
127
+ HEAD_BLOCK_SIZE: tl.constexpr, # pad to 16 if HEAD_RATIO is < 16 to invoke tensor cores.
128
+ ):
129
+ """Attention kernel to be used for generate-only batches.
130
+
131
+ Specialized for GQA.
132
+
133
+ Assumes that kv caches have been updated.
134
+
135
+ Supports non-power-of-2 D_HEAD
136
+
137
+ Uses flash decoding.
138
+ KV-cache layout is assumed to be [Batch, Seq, Head, Dim]
139
+ 1. Fetch the K-cache from 0 to input_pos
140
+ 2. Fetch the V-cache from 0 to input_pos
141
+ 3. A = Q*K^T [1,D_HEAD] * [1,seq_len,D_HEAD] -> [1, seq_len]
142
+ 4. S = softmax(A)
143
+ 5. O = S*V [1, seq_len] * [1, seq_len, D_HEAD] -> [1, D_HEAD]
144
+ """
145
+ # Assume KV-cache layout: [Batch, Seq, Head, Dim]
146
+ # A program is responsible for 1 batch, 1 head and a block of sequences.
147
+ batch_id = tl.program_id(axis=0)
148
+ kv_head_id = tl.program_id(axis=1)
149
+ seq_block_id = tl.program_id(axis=2)
150
+
151
+ kv_position = tl.load(input_pos_ptr + batch_id)
152
+ kv_batch_id = tl.load(cache_loc_ptr + batch_id)
153
+ K_D_HEAD: tl.constexpr = Q_D_HEAD
154
+ batch_offset = kv_batch_id * N_KV_HEADS * MAX_SEQ_LEN
155
+
156
+ # Offsets for the block of sequences this program processes.
157
+ seq_start_pos = seq_block_id * SEQ_BLOCK_SIZE
158
+
159
+ # The number of Q heads that map to each KV head.
160
+ HEAD_RATIO: tl.constexpr = N_HEADS // N_KV_HEADS # This needs to be a power-of-2
161
+ if seq_start_pos > kv_position:
162
+ return
163
+ seq_offsets = seq_start_pos + tl.arange(0, SEQ_BLOCK_SIZE)
164
+ seq_mask = seq_offsets <= kv_position
165
+
166
+ # Need to pad the head dim to 16 if HEAD_RATIO is < 16 so that tensor cores can be invoked
167
+ #
168
+ head_offsets = kv_head_id * HEAD_RATIO + tl.arange(0, HEAD_BLOCK_SIZE)
169
+ head_mask = head_offsets < (kv_head_id * HEAD_RATIO + HEAD_RATIO)
170
+ # Assuming D_HEAD is a power of 2
171
+ q_dhead_offsets = tl.arange(0, triton.next_power_of_2(Q_D_HEAD))
172
+ q_dhead_mask = q_dhead_offsets < Q_D_HEAD
173
+
174
+ v_dhead_offsets = tl.arange(0, triton.next_power_of_2(V_D_HEAD))
175
+ v_dhead_mask = v_dhead_offsets < V_D_HEAD
176
+
177
+ sm_scale: tl.constexpr = 1.0 / (Q_D_HEAD**0.5)
178
+
179
+ # Program loads the entire Q for the head assigned to it.
180
+ # [NUM_HEADS, Q_D_HEAD]
181
+ q_batch_offset = batch_id * N_HEADS * Q_D_HEAD
182
+ q_head_offsets = head_offsets * Q_D_HEAD
183
+
184
+ # Q layout : BSND
185
+ q = tl.load(
186
+ q_ptr + q_batch_offset + q_head_offsets[:, None] + q_dhead_offsets[None, :],
187
+ mask=head_mask[:, None] * q_dhead_mask[None, :],
188
+ other=0.0,
189
+ )
190
+
191
+ # [BSND]
192
+ k_block_offsets = (
193
+ batch_offset * K_D_HEAD
194
+ + seq_offsets[:, None] * K_D_HEAD * N_KV_HEADS
195
+ + kv_head_id * K_D_HEAD
196
+ + q_dhead_offsets[None, :]
197
+ )
198
+ k_mask = seq_mask[:, None] * q_dhead_mask[None, :] # K and Q share the same head dim
199
+ k = tl.load(k_cache_ptr + k_block_offsets, mask=k_mask, other=0.0)
200
+
201
+ v_block_offsets = (
202
+ batch_offset * V_D_HEAD
203
+ + seq_offsets[:, None] * V_D_HEAD * N_KV_HEADS
204
+ + kv_head_id * V_D_HEAD
205
+ + v_dhead_offsets[None, :]
206
+ )
207
+ v_mask = seq_mask[:, None] * v_dhead_mask[None, :]
208
+
209
+ # [seq_block, V_D_HEAD]
210
+ v = tl.load(v_cache_ptr + v_block_offsets, mask=v_mask, other=0.0)
211
+
212
+ # Note: check the output precision of the sum.
213
+ # compute q*K^T
214
+ # [NUM_HEADS, Q_D_HEAD] * [seq_block, Q_D_HEAD], sum along axis 1
215
+ attn = tl.dot(q, k.trans()) # [N, seq_block]
216
+ attn = attn.to(tl.float32)
217
+ attn *= sm_scale
218
+ max_attn = tl.max(attn, axis=1) # [N, 1]
219
+ # Set attn values to -inf where the mask is not set. This forces exp(attn) to 0.
220
+ attn = tl.where(head_mask[:, None] * seq_mask[None, :], attn, float("-inf"))
221
+ exp_attn = tl.exp(attn - max_attn[:, None])
222
+
223
+ sumexp = tl.sum(exp_attn, axis=1) # [N, 1]
224
+
225
+ # [NUM_HEADS, seq_len] * [seq_len, V_D_HEAD], sum along axis 0
226
+ output = tl.dot(exp_attn.to(v.dtype), v)
227
+
228
+ output = output / sumexp[:, None] # [N, D_HEAD]
229
+
230
+ # We store the log-sum-exp after removing the max.
231
+ logsumexp = tl.log(sumexp) + max_attn
232
+ # when seq_mask is all false, max_attn will be -inf and sumexp is zero
233
+
234
+ tl.store(
235
+ output_values_ptr
236
+ + batch_id * N_HEADS * V_D_HEAD * num_blocks
237
+ + head_offsets[:, None] * V_D_HEAD * num_blocks
238
+ + seq_block_id * V_D_HEAD
239
+ + v_dhead_offsets[None, :],
240
+ output,
241
+ mask=head_mask[:, None] * v_dhead_mask[None, :],
242
+ )
243
+ tl.store(
244
+ output_logsumexp_ptr
245
+ + batch_id * N_HEADS * num_blocks
246
+ + head_offsets * num_blocks
247
+ + seq_block_id,
248
+ logsumexp,
249
+ mask=head_mask,
250
+ )
251
+
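The kernel above pads the group of query heads per KV head (`HEAD_RATIO`) up to `HEAD_BLOCK_SIZE` so that `tl.dot` can use tensor cores, which need tile dimensions of at least 16. A plausible host-side choice for this constant (an assumption, not taken from this repository) is:

```python
import triton

# Hedged sketch: pick HEAD_BLOCK_SIZE >= 16 so tl.dot hits tensor cores even when
# the number of query heads per KV head (HEAD_RATIO) is small.
def pick_head_block_size(n_heads: int, n_kv_heads: int) -> int:
    head_ratio = n_heads // n_kv_heads
    return max(16, triton.next_power_of_2(head_ratio))

print(pick_head_block_size(32, 8))  # 16 (ratio 4, padded up to the tensor-core minimum)
print(pick_head_block_size(64, 2))  # 32 (ratio already >= 16)
```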
252
+
253
+ @triton.jit
254
+ def attention_kv_stage1(
255
+ q_ptr, # [Batch, 1, N_HEADS, D_HEAD]
256
+ k_cache_ptr, # [MAX_BATCH_SIZE, MAX_SEQ_LEN, N_HEADS, D_HEAD]
257
+ v_cache_ptr, # [MAX_BATCH_SIZE, MAX_SEQ_LEN, N_HEADS, D_HEAD]
258
+ cache_loc_ptr, # [Batch] # Specifies the batch index for each of the generate tokens.
259
+ input_pos_ptr, # [Batch]
260
+ output_values_ptr, # [Batch, N_HEADS, num_blocks, D_HEAD]
261
+ output_logsumexp_ptr, # [Batch, N_HEADS, num_blocks]
262
+ num_blocks,
263
+ MAX_SEQ_LEN: tl.constexpr, # Maximum supported sequence length
264
+ N_HEADS: tl.constexpr, # Number of heads
265
+ N_KV_HEADS: tl.constexpr, # Number of KV heads.
266
+ D_HEAD: tl.constexpr, # Dimension of each head.
267
+ SEQ_BLOCK_SIZE: tl.constexpr, # Block size used for tiling the sequence dim.
268
+ ):
269
+ """Attention kernel to be used for generate-only batches.
270
+
271
+ Assumes that kv caches have been updated.
272
+
273
+ Uses flash decoding.
274
+ KV-cache layout is assumed to be [Batch, Seq, Head, Dim]
275
+ 1. Fetch the K-cache from 0 to input_pos
276
+ 2. Fetch the V-cache from 0 to input_pos
277
+ 3. A = Q*K^T [1,D_HEAD] * [1,seq_len,D_HEAD] -> [1, seq_len]
278
+ 4. S = softmax(A)
279
+ 5. O = S*V [1, seq_len] * [1, seq_len, D_HEAD] -> [1, D_HEAD]
280
+ """
281
+ # Assume KV-cache layout: [Batch, Seq, Head, Dim]
282
+ # A program is responsible for 1 batch, 1 head and a block of sequences.
283
+ batch_id = tl.program_id(axis=0)
284
+ head_id = tl.program_id(axis=1)
285
+ seq_block_id = tl.program_id(axis=2)
286
+ epsilon: tl.constexpr = 1e-38 # float32 smallest positive number
287
+
288
+ kv_position = tl.load(input_pos_ptr + batch_id)
289
+ kv_batch_id = tl.load(cache_loc_ptr + batch_id)
290
+ kv_batch_offset = kv_batch_id * N_KV_HEADS * MAX_SEQ_LEN * D_HEAD
291
+ # Offsets for the block of sequences this program processes.
292
+ seq_start_pos = seq_block_id * SEQ_BLOCK_SIZE
293
+
294
+ if seq_start_pos > kv_position:
295
+ return
296
+ seq_offsets = seq_start_pos + tl.arange(0, SEQ_BLOCK_SIZE)
297
+ seq_mask = seq_offsets <= kv_position
298
+ # Assuming D_HEAD is a power of 2
299
+ dhead_offsets = tl.arange(0, triton.next_power_of_2(D_HEAD))
300
+ dhead_mask = dhead_offsets < D_HEAD
301
+
302
+ HEAD_RATIO: tl.constexpr = N_HEADS // N_KV_HEADS
303
+ kv_head_offset = (head_id // HEAD_RATIO) * D_HEAD
304
+
305
+ sm_scale: tl.constexpr = 1.0 / (D_HEAD**0.5)
306
+
307
+ # Program loads the entire Q for the head assigned to it.
308
+ # [D_HEAD]
309
+ q_batch_offset = batch_id * N_HEADS * D_HEAD
310
+ q_head_offset = head_id * D_HEAD
311
+ q = tl.load(q_ptr + q_batch_offset + q_head_offset + dhead_offsets, mask=dhead_mask)
312
+
313
+ kv_block_offsets = (
314
+ kv_batch_offset
315
+ + seq_offsets[:, None] * D_HEAD * N_KV_HEADS
316
+ + kv_head_offset
317
+ + dhead_offsets[None, :]
318
+ ) # [BSND]
319
+ kv_mask = seq_mask[:, None] * dhead_mask[None, :]
320
+
321
+ # [seq_block, D_HEAD]
322
+ k = tl.load(k_cache_ptr + kv_block_offsets, mask=kv_mask, other=0.0)
323
+ v = tl.load(v_cache_ptr + kv_block_offsets, mask=kv_mask, other=0.0)
324
+
325
+ # Note: check the output precision of the sum.
326
+ # compute q*K^T
327
+ # [D_HEAD] * [seq_block, D_HEAD], sum along axis 1
328
+ attn = tl.sum(q[None, :].to(tl.float32) * k.to(tl.float32), axis=1) # [seq_block]
329
+
330
+ attn *= sm_scale
331
+ max_attn = tl.max(attn)
332
+ # Set attn values to -inf where the mask is not set. This forces exp(attn) to 0.
333
+ attn = tl.where(seq_mask, attn, float("-inf"))
334
+ exp_attn = tl.exp(attn - max_attn)
335
+ exp_attn = tl.where(exp_attn == 0, epsilon, exp_attn)
336
+ sumexp = tl.sum(exp_attn, axis=0) # scalar.
337
+
338
+ # [seq_len] * [seq_len, D_HEAD], sum along axis 0
339
+ output = tl.sum(exp_attn[:, None] * v, axis=0) # [D_HEAD]
340
+
341
+ output = output / sumexp
342
+
343
+ # We store the log-sum-exp after removing the max.
344
+ logsumexp = tl.log(sumexp) + max_attn
345
+ # when seq_mask is all false, max_attn will be -inf and sumexp is zero
346
+
347
+ tl.store(
348
+ output_values_ptr
349
+ + batch_id * N_HEADS * D_HEAD * num_blocks
350
+ + head_id * D_HEAD * num_blocks
351
+ + seq_block_id * D_HEAD
352
+ + dhead_offsets,
353
+ output,
354
+ mask=dhead_mask,
355
+ )
356
+ tl.store(
357
+ output_logsumexp_ptr
358
+ + batch_id * N_HEADS * num_blocks
359
+ + head_id * num_blocks
360
+ + seq_block_id,
361
+ logsumexp,
362
+ )
363
+
364
+
365
+ @triton.jit
366
+ def attention_kv_stage2(
367
+ values_ptr, # [Batch, N_HEADS, num_blocks, D_HEAD]
368
+ logsumexp_ptr, # [Batch, N_HEADS, num_blocks]
369
+ output_ptr, # [Batch, N_HEADS, D_HEAD]
370
+ input_pos_ptr,
371
+ NUM_BLOCKS: tl.constexpr,
372
+ N_HEADS: tl.constexpr,
373
+ D_HEAD: tl.constexpr,
374
+ SEQ_BLOCK_SIZE: tl.constexpr, # Nearest power of 2 for num_blocks
375
+ ):
376
+ # There are batch * N_HEADS programs
377
+ batch_id = tl.program_id(axis=0)
378
+ head_id = tl.program_id(axis=1)
379
+
380
+ dhead_offsets = tl.arange(0, triton.next_power_of_2(D_HEAD))
381
+ dhead_mask = dhead_offsets < D_HEAD
382
+
383
+ kv_position = tl.load(input_pos_ptr + batch_id)
384
+ block_id = kv_position // SEQ_BLOCK_SIZE + 1
385
+
386
+ NUM_BLOCKS_POW2: tl.constexpr = triton.next_power_of_2(NUM_BLOCKS)
387
+ block_offsets = tl.arange(0, NUM_BLOCKS_POW2)
388
+
389
+ block_mask = block_offsets < block_id
390
+ logsumexp = tl.load(
391
+ logsumexp_ptr + batch_id * N_HEADS * NUM_BLOCKS + head_id * NUM_BLOCKS + block_offsets,
392
+ mask=block_mask,
393
+ other=float("-inf"),
394
+ )
395
+ max_logsumexp = tl.max(logsumexp)
396
+ sumexp = tl.exp(logsumexp - max_logsumexp) # [NUM_BLOCKS_POW2]
397
+
398
+ aggregate_sumexp = tl.sum(sumexp, axis=0)
399
+
400
+ values_offsets = block_offsets[:, None] * D_HEAD + dhead_offsets[None, :]
401
+ values_mask = block_mask[:, None] * dhead_mask[None, :]
402
+
403
+ values = tl.load(
404
+ values_ptr
405
+ + batch_id * N_HEADS * D_HEAD * NUM_BLOCKS
406
+ + head_id * D_HEAD * NUM_BLOCKS
407
+ + values_offsets,
408
+ mask=values_mask,
409
+ other=0.0,
410
+ ) # [BLOCK_SIZE, D_HEAD]
411
+ values *= sumexp[:, None]
412
+ values /= aggregate_sumexp
413
+
414
+ output = tl.sum(values, axis=0) # [DHEAD]
415
+
416
+ tl.store(
417
+ output_ptr + batch_id * N_HEADS * D_HEAD + head_id * D_HEAD + dhead_offsets,
418
+ output,
419
+ mask=dhead_mask,
420
+ )
421
+
422
+
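Stage 2 merges the per-block partial results produced by stage 1 using the stored log-sum-exp values. A small PyTorch reference of that merge, written for a single batch element and head with illustrative tensor names, looks roughly like this:

```python
import torch

# Reference for the stage-2 combine: each block i contributed a normalized partial
# output o[i] (D_HEAD) and its logsumexp l[i]; the exact softmax result is their
# weighted average with weights exp(l[i]), renormalized in a numerically stable way.
def combine_blocks(partial_out: torch.Tensor, logsumexp: torch.Tensor) -> torch.Tensor:
    # partial_out: [num_blocks, d_head], logsumexp: [num_blocks]
    m = logsumexp.max()
    w = torch.exp(logsumexp - m)                     # [num_blocks]
    return (w[:, None] * partial_out).sum(0) / w.sum()

# Sanity check against a direct softmax attention over two concatenated blocks.
torch.manual_seed(0)
q = torch.randn(64); k = torch.randn(48, 64); v = torch.randn(48, 64)
scores = (k @ q) / 64**0.5
full = torch.softmax(scores, 0) @ v
parts, lses = [], []
for blk in range(2):
    s = scores[blk * 24:(blk + 1) * 24]
    e = torch.exp(s - s.max())
    parts.append((e[:, None] * v[blk * 24:(blk + 1) * 24]).sum(0) / e.sum())
    lses.append(torch.log(e.sum()) + s.max())
assert torch.allclose(combine_blocks(torch.stack(parts), torch.stack(lses)), full, atol=1e-5)
```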
423
+ @triton.jit
424
+ def context_attention_kv(
425
+ q_ptr, # [bsnd]
426
+ k_ptr, # [bsnd]
427
+ v_ptr, # [bsnd]
428
+ k_cache_ptr, # [bsnd]
429
+ v_cache_ptr, # [bsnd]
430
+ seq_len,
431
+ o_ptr,
432
+ softmax_scale,
433
+ N_HEADS: tl.constexpr, # Number of heads
434
+ N_KV_HEADS: tl.constexpr, # Number of KV heads.
435
+ Q_D_HEAD: tl.constexpr, # Dimension of each query head.
436
+ V_D_HEAD: tl.constexpr, # Dimension of each value head.
437
+ SEQ_BLOCK: tl.constexpr,
438
+ MAX_SEQ_LENGTH: tl.constexpr,
439
+ ):
440
+ """Kernel for context phase.
441
+
442
+ Assuming:
443
+ 1. Self-attention [seqlen(Q) == seqlen(K)]
444
+ 2. Causal attention
445
+ 3. QKV layout: [bsnd]
446
+ """
447
+ batch_id = tl.program_id(axis=0)
448
+ head_id = tl.program_id(axis=1)
449
+ seq_block_id = tl.program_id(axis=2)
450
+
451
+ HEAD_RATIO: tl.constexpr = N_HEADS // N_KV_HEADS
452
+ K_D_HEAD: tl.constexpr = Q_D_HEAD
453
+
454
+ q_dhead_offsets = tl.arange(0, triton.next_power_of_2(Q_D_HEAD))
455
+ q_dhead_mask = q_dhead_offsets < Q_D_HEAD
456
+
457
+ v_dhead_offsets = tl.arange(0, triton.next_power_of_2(V_D_HEAD))
458
+ v_dhead_mask = v_dhead_offsets < V_D_HEAD
459
+
460
+ seq_offsets = seq_block_id * SEQ_BLOCK + tl.arange(0, SEQ_BLOCK)
461
+ seq_mask = seq_offsets < seq_len
462
+
463
+ q_load_mask = seq_mask[:, None] * q_dhead_mask[None, :]
464
+
465
+ q_batch_offset = batch_id * seq_len * N_HEADS
466
+ kv_batch_offset = batch_id * seq_len * N_KV_HEADS
467
+
468
+ k_head_offset = (head_id // HEAD_RATIO) * K_D_HEAD
469
+ v_head_offset = (head_id // HEAD_RATIO) * V_D_HEAD
470
+
471
+ # Q will stay in SRAM
472
+ q = tl.load(
473
+ q_ptr
474
+ + q_batch_offset * Q_D_HEAD
475
+ + seq_offsets[:, None] * N_HEADS * Q_D_HEAD
476
+ + head_id * Q_D_HEAD
477
+ + q_dhead_offsets[None, :],
478
+ mask=q_load_mask,
479
+ )
480
+ acc = tl.zeros([SEQ_BLOCK, triton.next_power_of_2(V_D_HEAD)], dtype=tl.float32)
481
+ lse_i = tl.zeros([SEQ_BLOCK], dtype=tl.float32) - float("inf")
482
+ m_i = tl.zeros([SEQ_BLOCK], dtype=tl.float32) - float("inf")
483
+
484
+ for s in range(0, seq_block_id + 1, 1):
485
+ kv_seq_offsets = s * SEQ_BLOCK + tl.arange(0, SEQ_BLOCK)
486
+ kv_seq_mask = kv_seq_offsets < seq_len
487
+ k_load_mask = kv_seq_mask[:, None] * q_dhead_mask[None, :]
488
+
489
+ k = tl.load(
490
+ k_ptr
491
+ + kv_batch_offset * K_D_HEAD
492
+ + kv_seq_offsets[:, None] * N_KV_HEADS * K_D_HEAD
493
+ + k_head_offset
494
+ + q_dhead_offsets[None, :],
495
+ mask=k_load_mask,
496
+ )
497
+ qk = tl.zeros([SEQ_BLOCK, SEQ_BLOCK], dtype=tl.float32)
498
+ qk += tl.dot(q, k.trans())
499
+ # causal mask
500
+ qk = tl.where(seq_offsets[:, None] >= kv_seq_offsets[None, :], qk, float("-inf"))
501
+ qk *= softmax_scale
502
+ # rowmax
503
+ m_ij = tl.maximum(tl.max(qk, 1), lse_i)
504
+ p = tl.exp(qk - m_ij[:, None]) # [S,S]
505
+ v = tl.load(
506
+ v_ptr
507
+ + kv_batch_offset * V_D_HEAD
508
+ + kv_seq_offsets[:, None] * N_KV_HEADS * V_D_HEAD
509
+ + v_head_offset
510
+ + v_dhead_offsets[None, :],
511
+ mask=kv_seq_mask[:, None] * v_dhead_mask[None, :],
512
+ )
513
+
514
+ l_ij = tl.sum(p, 1)
515
+ acc_scale = tl.exp(m_i - m_ij)
516
+ acc = acc * acc_scale[:, None]
517
+ p = p.to(v.dtype)
518
+ acc += tl.dot(p, v)
519
+ m_i = m_ij
520
+ l_i_new = tl.exp(lse_i - m_ij) + l_ij
521
+ lse_i = m_ij + tl.log(l_i_new)
522
+
523
+ o_scale = tl.exp(m_i - lse_i)
524
+
525
+ acc = acc * o_scale[:, None]
526
+
527
+ tl.store(
528
+ o_ptr
529
+ + batch_id * seq_len * N_HEADS * V_D_HEAD
530
+ + seq_offsets[:, None] * N_HEADS * V_D_HEAD
531
+ + head_id * V_D_HEAD
532
+ + v_dhead_offsets[None, :],
533
+ acc,
534
+ mask=seq_mask[:, None] * v_dhead_mask[None, :],
535
+ )
536
+
537
+ # Write back to kv-caches
538
+
539
+ ks = tl.load(
540
+ k_ptr
541
+ + kv_batch_offset * K_D_HEAD
542
+ + seq_offsets[:, None] * N_KV_HEADS * K_D_HEAD
543
+ + k_head_offset
544
+ + q_dhead_offsets[None, :],
545
+ mask=seq_mask[:, None] * q_dhead_mask[None, :],
546
+ )
547
+ vs = tl.load(
548
+ v_ptr
549
+ + kv_batch_offset * V_D_HEAD
550
+ + seq_offsets[:, None] * N_KV_HEADS * V_D_HEAD
551
+ + v_head_offset
552
+ + v_dhead_offsets[None, :],
553
+ mask=seq_mask[:, None] * v_dhead_mask[None, :],
554
+ )
555
+ # cache is [bsnd]
556
+ k_cache_offset = (
557
+ batch_id * N_KV_HEADS * MAX_SEQ_LENGTH * K_D_HEAD
558
+ + seq_offsets[:, None] * K_D_HEAD * N_KV_HEADS
559
+ + k_head_offset
560
+ + q_dhead_offsets[None, :]
561
+ )
562
+
563
+ v_cache_offset = (
564
+ batch_id * N_KV_HEADS * MAX_SEQ_LENGTH * V_D_HEAD
565
+ + seq_offsets[:, None] * V_D_HEAD * N_KV_HEADS
566
+ + v_head_offset
567
+ + v_dhead_offsets[None, :]
568
+ )
569
+ tl.store(k_cache_ptr + k_cache_offset, ks, seq_mask[:, None] * q_dhead_mask[None, :])
570
+ tl.store(v_cache_ptr + v_cache_offset, vs, seq_mask[:, None] * v_dhead_mask[None, :])
571
+
572
+
573
+ @triton.jit
574
+ def context_attention_kv_flattened(
575
+ q_ptr, # [b*s,nd]
576
+ seq_len_ptr, # [b] # length of each sequence in a batch
577
+ seq_start_indices_ptr, # [b] # start indices of a sequence in flattened q/k/v.
578
+ k_cache_ptr, # [bsnd]
579
+ v_cache_ptr, # [bsnd]
580
+ input_pos_ptr, # [b] # specifies the location in the sequence where kv must be written back.
581
+ cache_loc_ptr, # [b] # location of the sequence in the cache.
582
+ o_ptr,
583
+ softmax_scale: tl.constexpr,
584
+ N_HEADS: tl.constexpr, # Number of heads
585
+ N_KV_HEADS: tl.constexpr, # Number of KV heads.
586
+ Q_D_HEAD: tl.constexpr, # Dimension of each query head.
587
+ V_D_HEAD: tl.constexpr, # Dimension of each value head.
588
+ SEQ_BLOCK: tl.constexpr,
589
+ MAX_SEQ_LENGTH: tl.constexpr,
590
+ ):
591
+ """Kernel for context phase.
592
+
593
+ Assumes that kv caches have been updated.
594
+ Assuming QKV layout: [b*s,n,d]
595
+ """
596
+ batch_id = tl.program_id(axis=0)
597
+ head_id = tl.program_id(axis=1)
598
+ seq_block_id = tl.program_id(axis=2)
599
+
600
+ # Each program is responsible for a block of tokens in a single batch.
601
+ seq_start_index = tl.load(seq_start_indices_ptr + batch_id)
602
+ seq_len = tl.load(seq_len_ptr + batch_id)
603
+ K_D_HEAD: tl.constexpr = Q_D_HEAD
604
+ HEAD_RATIO: tl.constexpr = N_HEADS // N_KV_HEADS
605
+
606
+ # cache is [bsnd]
607
+ # cache_loc_ptr stores the batch index for the sequences provided to the kernel.
608
+ cache_loc = tl.load(cache_loc_ptr + batch_id)
609
+
610
+ cache_batch_offset = cache_loc * N_KV_HEADS * MAX_SEQ_LENGTH
611
+ cache_head_offset = head_id // HEAD_RATIO
612
+
613
+ q_dhead_offsets = tl.arange(0, triton.next_power_of_2(Q_D_HEAD))
614
+ q_dhead_mask = q_dhead_offsets < Q_D_HEAD
615
+
616
+ v_dhead_offsets = tl.arange(0, triton.next_power_of_2(V_D_HEAD))
617
+ v_dhead_mask = v_dhead_offsets < V_D_HEAD
618
+
619
+ seq_offsets = seq_block_id * SEQ_BLOCK + tl.arange(0, SEQ_BLOCK)
620
+ seq_mask = seq_offsets < seq_len
621
+
622
+ # Q will stay in SRAM
623
+ q = tl.load(
624
+ q_ptr
625
+ + seq_start_index * N_HEADS * Q_D_HEAD
626
+ + seq_offsets[:, None] * N_HEADS * Q_D_HEAD
627
+ + head_id * Q_D_HEAD
628
+ + q_dhead_offsets[None, :],
629
+ mask=seq_mask[:, None] * q_dhead_mask[None, :],
630
+ )
631
+
632
+ acc = tl.zeros([SEQ_BLOCK, triton.next_power_of_2(V_D_HEAD)], dtype=tl.float32)
633
+ lse_i = tl.zeros([SEQ_BLOCK], dtype=tl.float32) - float("inf")
634
+ m_i = tl.zeros([SEQ_BLOCK], dtype=tl.float32) - float("inf")
635
+
636
+ # Loop over the entire KV-history
637
+ # input_pos_ptr stores the location at which kv must be written back for the given batch.
638
+ kv_position = tl.load(input_pos_ptr + batch_id)
639
+ num_blocks = (kv_position + seq_len + SEQ_BLOCK - 1) // SEQ_BLOCK
640
+ for s in range(0, num_blocks + 1, 1):
641
+ kv_seq_offsets = s * SEQ_BLOCK + tl.arange(0, SEQ_BLOCK)
642
+ kv_seq_mask = kv_seq_offsets < (kv_position + seq_len)
643
+
644
+ k = tl.load(
645
+ k_cache_ptr
646
+ + cache_batch_offset * K_D_HEAD
647
+ + kv_seq_offsets[:, None] * K_D_HEAD * N_KV_HEADS
648
+ + cache_head_offset * K_D_HEAD
649
+ + q_dhead_offsets[None, :],
650
+ mask=kv_seq_mask[:, None] * q_dhead_mask[None, :],
651
+ )
652
+ qk = tl.zeros([SEQ_BLOCK, SEQ_BLOCK], dtype=tl.float32)
653
+ qk += tl.dot(q, k.trans())
654
+ qk = tl.where(
655
+ (seq_offsets[:, None] + kv_position) >= kv_seq_offsets[None, :], qk, float("-inf")
656
+ )
657
+ qk *= softmax_scale
658
+ # rowmax
659
+ m_ij = tl.maximum(tl.max(qk, 1), lse_i)
660
+ p = tl.exp(qk - m_ij[:, None])
661
+ v = tl.load(
662
+ v_cache_ptr
663
+ + cache_batch_offset * V_D_HEAD
664
+ + kv_seq_offsets[:, None] * V_D_HEAD * N_KV_HEADS
665
+ + cache_head_offset * V_D_HEAD
666
+ + v_dhead_offsets[None, :],
667
+ mask=kv_seq_mask[:, None] * v_dhead_mask[None, :],
668
+ )
669
+
670
+ l_ij = tl.sum(p, 1)
671
+ acc_scale = tl.exp(m_i - m_ij)
672
+ acc = acc * acc_scale[:, None]
673
+ p = p.to(v.dtype)
674
+ acc += tl.dot(p, v)
675
+ m_i = m_ij
676
+ l_i_new = tl.exp(lse_i - m_ij) + l_ij
677
+ lse_i = m_ij + tl.log(l_i_new)
678
+
679
+ o_scale = tl.exp(m_i - lse_i)
680
+
681
+ acc = acc * o_scale[:, None]
682
+
683
+ tl.store(
684
+ o_ptr
685
+ + seq_start_index * N_HEADS * V_D_HEAD
686
+ + seq_offsets[:, None] * N_HEADS * V_D_HEAD
687
+ + head_id * V_D_HEAD
688
+ + v_dhead_offsets[None, :],
689
+ acc,
690
+ mask=seq_mask[:, None] * v_dhead_mask[None, :],
691
+ )
692
+
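Both context kernels above use the standard online-softmax recurrence (`m_i`, `lse_i`, and a rescaled accumulator) while streaming over KV blocks. A compact PyTorch reference of that recurrence, for a single query vector and illustrative shapes, is sketched below.

```python
import torch

# Online-softmax attention over KV blocks, mirroring the m_i / lse_i / acc updates
# in the context kernels (single query vector, no masking, illustrative only).
def online_softmax_attention(q, k, v, block=16):
    scale = q.shape[-1] ** -0.5
    acc = torch.zeros(v.shape[-1])
    lse = torch.tensor(float("-inf"))
    m = torch.tensor(float("-inf"))
    for s in range(0, k.shape[0], block):
        qk = (k[s:s + block] @ q) * scale            # [block] of scaled scores
        m_new = torch.maximum(qk.max(), lse)         # running max (kernel also folds in lse)
        p = torch.exp(qk - m_new)
        acc = acc * torch.exp(m - m_new) + p @ v[s:s + block]
        lse = m_new + torch.log(torch.exp(lse - m_new) + p.sum())
        m = m_new
    return acc * torch.exp(m - lse)                  # final o_scale, as in the kernels

torch.manual_seed(0)
q, k, v = torch.randn(32), torch.randn(40, 32), torch.randn(40, 64)
ref = torch.softmax((k @ q) * 32**-0.5, 0) @ v
assert torch.allclose(online_softmax_attention(q, k, v), ref, atol=1e-5)
```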
693
+
694
+ @triton.jit
695
+ def update_kv_cache_rope_fusion(
696
+ q_ptr, # [B*S, N, D]
697
+ k_ptr, # [B*S, N, D]
698
+ v_ptr, # [B*S, N, D]
699
+ seq_len_ptr, # [b] # length of each sequence in a batch
700
+ seq_start_indices_ptr, # [b] # start indices of a sequence in flattened q/k/v.
701
+ q_rope_ptr, # [B*S, N, D], roped q result
702
+ k_cache_ptr, # [MAX_BATCH_SIZE, MAX_SEQ_LEN, N_HEADS, D_HEAD]
703
+ v_cache_ptr, # [MAX_BATCH_SIZE, MAX_SEQ_LEN, N_HEADS, D_HEAD]
704
+ input_pos_ptr, # Specifies the sequence index in the caches at which to write the provided kv
705
+ cache_loc_ptr, # Specifies the batch index for each of the input sequences
706
+ f_ptr, # [MAX_SEQ_LEN, D_HEAD//2, 2] # frequencies for rope embedding.
707
+ MAX_SEQ_LENGTH: tl.constexpr,
708
+ N_HEADS: tl.constexpr,
709
+ N_KV_HEADS: tl.constexpr,
710
+ D_HEAD: tl.constexpr,
711
+ SEQ_BLOCK: tl.constexpr,
712
+ HEAD_BLOCK_SIZE: tl.constexpr, # pad to 16 if HEAD_RATIO is < 16 to invoke tensor cores.
713
+ GENERATE_ONLY: tl.constexpr,
714
+ ):
715
+ """Fuse q and k rope with update_kv_cache kernel.
716
+
717
+ The input is interleaved as [2, D//2] in D_HEAD dim.
718
+ Update q_rope with the post-rope-embedding q values.
719
+ Update k_cache with the post-rope-embedding k values.
720
+ For rope computation, q and k are loaded and stored as pairs of two [D//2] tensors.
721
+ Update v_cache with v.
722
+ """
723
+ batch_id = tl.program_id(axis=0)
724
+ kv_head_id = tl.program_id(axis=1)
725
+ seq_block_id = tl.program_id(axis=2)
726
+
727
+ # Each program is responsible for a block of tokens in a single batch.
728
+ if GENERATE_ONLY:
729
+ seq_start_index = batch_id
730
+ seq_len: tl.constexpr = 1
731
+ else:
732
+ seq_start_index = tl.load(seq_start_indices_ptr + batch_id)
733
+ seq_len = tl.load(seq_len_ptr + batch_id)
734
+
735
+ # cache is [bsnd]
736
+ # cache_loc_ptr stores the batch index for the sequences provided to the kernel.
737
+ cache_loc = tl.load(cache_loc_ptr + batch_id)
738
+
739
+ kv_position = tl.load(input_pos_ptr + batch_id)
740
+
741
+ cache_batch_offset = cache_loc * N_KV_HEADS * MAX_SEQ_LENGTH * D_HEAD
742
+ cache_head_offset = kv_head_id * D_HEAD
743
+
744
+ # Assuming D_HEAD is a power of 2
745
+ dhead_offsets = tl.arange(0, D_HEAD)
746
+ dhead_mask = dhead_offsets < D_HEAD
747
+
748
+ seq_offsets = seq_block_id * SEQ_BLOCK + tl.arange(0, SEQ_BLOCK)
749
+ seq_mask = seq_offsets < seq_len
750
+
751
+ load_mask = seq_mask[:, None] * dhead_mask[None, :]
752
+
753
+ HEAD_RATIO: tl.constexpr = N_HEADS // N_KV_HEADS # This needs to be a power-of-2
754
+ q_head_offsets = kv_head_id * HEAD_RATIO + tl.arange(0, HEAD_BLOCK_SIZE)
755
+ q_head_mask = q_head_offsets < (kv_head_id * HEAD_RATIO + HEAD_RATIO)
756
+
757
+ q_batch_offset = seq_start_index * N_HEADS * D_HEAD
758
+
759
+ kv_batch_offset = seq_start_index * N_KV_HEADS * D_HEAD
760
+ kv_head_offset = cache_head_offset
761
+
762
+ D2: tl.constexpr = D_HEAD // 2
763
+ # input is interleaved as [2, D//2] in dim [D_HEAD].
764
+ d2_offsets = tl.arange(0, D2)
765
+ dhead_offsets1 = d2_offsets
766
+ dhead_offsets2 = d2_offsets + D2
767
+ d2_mask = dhead_offsets2 < D_HEAD
768
+ d2_load_mask = seq_mask[:, None] * d2_mask[None, :]
769
+
770
+ # offsets of [bsn]
771
+ q_offsets_base = (
772
+ q_batch_offset
773
+ + seq_offsets[:, None, None] * N_HEADS * D_HEAD
774
+ + q_head_offsets[None, :, None] * D_HEAD
775
+ )
776
+ q_offsets1 = q_offsets_base + dhead_offsets1[None, None, :]
777
+ q_offsets2 = q_offsets_base + dhead_offsets2[None, None, :]
778
+ q_mask = d2_load_mask[:, None, :] * q_head_mask[None, :, None]
779
+
780
+ q1 = tl.load(q_ptr + q_offsets1, mask=q_mask).to(tl.float32)
781
+ q2 = tl.load(q_ptr + q_offsets2, mask=q_mask).to(tl.float32)
782
+
783
+ k_offsets_base = kv_batch_offset + seq_offsets[:, None] * N_KV_HEADS * D_HEAD + kv_head_offset
784
+ k_offsets1 = k_offsets_base + dhead_offsets1[None, :]
785
+ k_offsets2 = k_offsets_base + dhead_offsets2[None, :]
786
+
787
+ k1 = tl.load(k_ptr + k_offsets1, mask=d2_load_mask).to(tl.float32)
788
+ k2 = tl.load(k_ptr + k_offsets2, mask=d2_load_mask).to(tl.float32)
789
+
790
+ # -----------------------------------
791
+ # torch version sin/cos
792
+ # cos and sin values are interleaved in frequencies tensor.
793
+ f_offsets = seq_offsets[:, None] * D2 + d2_offsets[None, :]
794
+ cos_ref = tl.load(f_ptr + kv_position * D_HEAD + f_offsets * 2, mask=d2_load_mask).to(
795
+ dtype=tl.float32
796
+ )
797
+ sin_ref = tl.load(f_ptr + kv_position * D_HEAD + f_offsets * 2 + 1, mask=d2_load_mask).to(
798
+ dtype=tl.float32
799
+ )
800
+
801
+ qs1 = cos_ref[:, None, :] * q1 - sin_ref[:, None, :] * q2
802
+ qs2 = sin_ref[:, None, :] * q1 + cos_ref[:, None, :] * q2
803
+
804
+ tl.store(q_rope_ptr + q_offsets1, qs1, mask=q_mask)
805
+ tl.store(q_rope_ptr + q_offsets2, qs2, mask=q_mask)
806
+
807
+ ks1 = cos_ref * k1 - sin_ref * k2
808
+ ks2 = sin_ref * k1 + cos_ref * k2
809
+
810
+ # Write back to kv-caches
811
+ vs = tl.load(
812
+ v_ptr
813
+ + kv_batch_offset
814
+ + seq_offsets[:, None] * N_KV_HEADS * D_HEAD
815
+ + kv_head_offset
816
+ + dhead_offsets[None, :],
817
+ mask=load_mask,
818
+ )
819
+
820
+ kv_writeback_seq_offsets = seq_offsets + kv_position
821
+
822
+ cache_offset_base = (
823
+ cache_batch_offset
824
+ + kv_writeback_seq_offsets[:, None] * D_HEAD * N_KV_HEADS
825
+ + cache_head_offset
826
+ )
827
+
828
+ k_cache_offset1 = cache_offset_base + dhead_offsets1[None, :]
829
+ k_cache_offset2 = cache_offset_base + dhead_offsets2[None, :]
830
+ tl.store(k_cache_ptr + k_cache_offset1, ks1, mask=d2_load_mask)
831
+ tl.store(k_cache_ptr + k_cache_offset2, ks2, mask=d2_load_mask)
832
+
833
+ v_cache_offset = cache_offset_base + dhead_offsets[None, :]
834
+ tl.store(v_cache_ptr + v_cache_offset, vs, load_mask)
835
+
836
+
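The fused kernel rotates q and k by splitting the head dimension into two halves (`[..., :D//2]` and `[..., D//2:]`) and applying per-position cos/sin values that are interleaved in the frequency table. A plain PyTorch reference of the same rotation, with illustrative tensor names and the usual theta-based frequency convention (which may differ from how this repository builds its frequency table), is:

```python
import torch

# Reference for the rotation used above: the head dim is split into halves (x1, x2)
# and rotated by per-position cos/sin values.
def rotate_half_rope(x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor) -> torch.Tensor:
    # x: [..., d_head], cos/sin: [..., d_head // 2]
    d2 = x.shape[-1] // 2
    x1, x2 = x[..., :d2], x[..., d2:]
    return torch.cat([cos * x1 - sin * x2, sin * x1 + cos * x2], dim=-1)

# Example with illustrative shapes: one token at position 3, head dim 8.
pos, d_head, theta = 3, 8, 10000.0
inv_freq = theta ** (-torch.arange(0, d_head, 2).float() / d_head)   # [d_head // 2]
angles = pos * inv_freq
x = torch.randn(d_head)
x_rot = rotate_half_rope(x, torch.cos(angles), torch.sin(angles))
```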
837
+
838
+ """
839
+ Kernels based on paged KV Cache.
840
+ Parameter infos:
841
+ tensors:
842
+ - q: [b*s, n, d], flattened queries.
843
+ - k/v: [b*s, n, d], flattened key/value.
844
+ - seq_len: [b], length of each sequence in the batch.
845
+ `seq_len` can be 1 (generate) or larger (context).
846
+ - seq_start: [b], start index of each sequence in b*s dim of q/k/v.
847
+ - k_cache/v_cache: [num_pages, PAGE_SIZE, n, d], paged KV Cache.
848
+ Incoming k/v is split into groups of PAGE_SIZE tokens, which are then
849
+ mapped to non-contiguous pages in the KV cache.
850
+ - page_table: [b, max_num_pages_per_seq], page indices assigned to each sequence.
851
+ - cache_loc: [b], maps the `batch_id` in q/k/v to a row index in `page_table`.
852
+ - cache_len: [b], existing cached k/v length of each sequence.
853
+
854
+ constexpr:
855
+ - N_HEADS/N_KV_HEADS: shape of dim [n] in q or k/v.
856
+ - D_HEAD: shape of dim [d] in q/k/v.
857
+ Assuming power of 2.
858
+ - SEQ_BLOCK: block size to split dim [s].
859
+ Assuming power of 2.
860
+ Split k/v in update kernel and split q in context/generate kernel.
861
+ - MAX_SEQ_LENGTH: seq_len <= MAX_SEQ_LENGTH.
862
+ - PAGE_SIZE: number of tokens per kv cache page.
863
+ Assumed to be a power of 2 with SEQ_BLOCK % PAGE_SIZE == 0.
864
+ - PAGE_TABLE_STRIDE: stride of dim [b] in `page_table`.
865
+
866
+ KV Cache access logic in update kernel:
867
+ 1. Batch entry i accesses k[seq_start[i] : seq_start[i] + seq_len[i]]
868
+ and can be split into pages [a:b] in the sequence.
869
+ 2. Look up cache_len[i] to find if the sequence has cached k/v.
870
+ 3. Look up page_table[cache_loc[i], cache_len[i] + a : cache_len[i] + b]
871
+ to get the corresponding pages in the k_cache, with result [c:d].
872
+ 4. Then update k_cache[c:d] with the k value.
873
+
874
+ """
875
+
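The lookup described in steps 1-4 can be written out in plain Python to make the index math concrete. This is a hedged reference for a single token, not code from this repository.

```python
# Reference for the page-table lookup (steps 1-4 above): map the t-th new token of
# batch entry i to its physical slot index in the paged cache.
def physical_slot(page_table, cache_loc, cache_len, i, t, page_size):
    logical_pos = cache_len[i] + t                       # position within the full sequence
    page = page_table[cache_loc[i]][logical_pos // page_size]
    return page * page_size + logical_pos % page_size

# Example: PAGE_SIZE=4, sequence i=0 already has 6 cached tokens on pages [7, 2, ...].
page_table = [[7, 2, 5, 9]]
print(physical_slot(page_table, cache_loc=[0], cache_len=[6], i=0, t=0, page_size=4))  # 2*4 + 2 = 10
print(physical_slot(page_table, cache_loc=[0], cache_len=[6], i=0, t=3, page_size=4))  # 5*4 + 1 = 21
```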
876
+
877
+ @triton.jit
878
+ def update_paged_kv_cache(
879
+ k_ptr, # [B*S, N, D]
880
+ v_ptr, # [B*S, N, D]
881
+ seq_len_ptr, # [b] # length of each sequence in a batch
882
+ seq_start_indices_ptr, # [b] # start indices of a sequence in flattened q/k/v.
883
+ k_cache_ptr, # [num_pages, page_size, n, d]
884
+ v_cache_ptr, # [num_pages, page_size, n, d]
885
+ cache_loc_ptr, # [b] # index of the sequence in the page table.
886
+ cache_len_ptr, # [b] # length of the sequence already in kv cache.
887
+ page_table_ptr, # [b, max_num_pages_per_seq] # loc of the block page in the cache.
888
+ N_KV_HEADS: tl.constexpr, # Number of KV heads.
889
+ D_HEAD: tl.constexpr, # Dimension of each head.
890
+ SEQ_BLOCK: tl.constexpr,
891
+ MAX_SEQ_LENGTH: tl.constexpr,
892
+ PAGE_SIZE: tl.constexpr,
893
+ PAGE_TABLE_STRIDE: tl.constexpr,
894
+ GENERATE_ONLY: tl.constexpr,
895
+ ):
896
+ batch_id = tl.program_id(axis=0)
897
+ head_id = tl.program_id(axis=1)
898
+ seq_block_id = tl.program_id(axis=2)
899
+
900
+ # Each program is responsible for a block of tokens in a single batch.
901
+ if GENERATE_ONLY:
902
+ seq_start_index = batch_id
903
+ seq_len: tl.constexpr = 1
904
+ else:
905
+ seq_start_index = tl.load(seq_start_indices_ptr + batch_id)
906
+ seq_len = tl.load(seq_len_ptr + batch_id)
907
+
908
+ cache_len = tl.load(cache_len_ptr + batch_id)
909
+
910
+ # cache is [num_pages, page_size, n, d]
911
+ # cache_loc_ptr stores the batch index for the sequences provided to the kernel.
912
+ cache_loc = tl.load(cache_loc_ptr + batch_id)
913
+ cache_head_offset = head_id * D_HEAD
914
+
915
+ # Assuming D_HEAD is a power of 2
916
+ dhead_offsets = tl.arange(0, D_HEAD)
917
+ dhead_mask = dhead_offsets < D_HEAD
918
+
919
+ seq_offsets = seq_block_id * SEQ_BLOCK + tl.arange(0, SEQ_BLOCK)
920
+ seq_mask = seq_offsets < seq_len
921
+
922
+ load_mask = seq_mask[:, None] * dhead_mask[None, :]
923
+
924
+ kv_batch_offset = seq_start_index * N_KV_HEADS * D_HEAD
925
+ kv_head_offset = cache_head_offset
926
+
927
+ # Write back to kv-caches
928
+ ks = tl.load(
929
+ k_ptr
930
+ + kv_batch_offset
931
+ + seq_offsets[:, None] * N_KV_HEADS * D_HEAD
932
+ + kv_head_offset
933
+ + dhead_offsets[None, :],
934
+ mask=load_mask,
935
+ )
936
+ vs = tl.load(
937
+ v_ptr
938
+ + kv_batch_offset
939
+ + seq_offsets[:, None] * N_KV_HEADS * D_HEAD
940
+ + kv_head_offset
941
+ + dhead_offsets[None, :],
942
+ mask=load_mask,
943
+ )
944
+
945
+ # assuming SEQ_BLOCK can be divided by PAGE_SIZE and PAGE_SIZE is a power of 2.
946
+ SEQ_BLOCK_PAGE: tl.constexpr = SEQ_BLOCK // PAGE_SIZE
947
+ MAX_NUM_PAGES: tl.constexpr = (MAX_SEQ_LENGTH + PAGE_SIZE - 1) // PAGE_SIZE
948
+ # cache_len // PAGE_SIZE means history pages
949
+ # if decode sequence, then seq_len = 1 and only seq_block_id = 0 works,
950
+ kv_pages = seq_block_id * SEQ_BLOCK_PAGE + tl.arange(0, SEQ_BLOCK_PAGE) + cache_len // PAGE_SIZE
951
+ cache_pages = tl.load(
952
+ page_table_ptr + cache_loc * PAGE_TABLE_STRIDE + kv_pages, mask=kv_pages < MAX_NUM_PAGES
953
+ )
954
+
955
+ page_offsets = tl.arange(0, PAGE_SIZE)
956
+ # shape [SEQ_BLOCK], means [cache_pages, page_offsets]
957
+ cache_seq_offset = tl.reshape(
958
+ cache_pages[:, None] * PAGE_SIZE + page_offsets[None, :], [SEQ_BLOCK]
959
+ )
960
+ # write offset inside the page
961
+ cache_seq_offset += cache_len % PAGE_SIZE
962
+
963
+ cache_offsets = (
964
+ cache_seq_offset[:, None] * N_KV_HEADS * D_HEAD + kv_head_offset + dhead_offsets[None, :]
965
+ )
966
+ tl.store(k_cache_ptr + cache_offsets, ks, load_mask)
967
+ tl.store(v_cache_ptr + cache_offsets, vs, load_mask)
968
+
969
+
970
+ # TODO: Write a doc describing the 2 stage algorithm
971
+ @triton.jit
972
+ def attention_kv_paged_stage1(
973
+ q_ptr, # [Batch, 1, N_HEADS, D_HEAD]
974
+ k_cache_ptr, # [NUM_PAGES, PAGE_SIZE, N_HEADS, D_HEAD]
975
+ v_cache_ptr, # [NUM_PAGES, PAGE_SIZE, N_HEADS, D_HEAD]
976
+ cache_loc_ptr, # [Batch] # Specifies the batch index for each of the generate tokens.
977
+ page_table_ptr, # [Batch, num_pages_per_seq]
978
+ cache_len_ptr, # [Batch] # Number of tokens in kv cache.
979
+ output_values_ptr, # [Batch, N_HEADS, num_blocks, D_HEAD]
980
+ output_logsumexp_ptr, # [Batch, N_HEADS, num_blocks]
981
+ num_blocks,
982
+ MAX_SEQ_LEN: tl.constexpr, # Maximum supported sequence length
983
+ N_HEADS: tl.constexpr, # Number of heads
984
+ N_KV_HEADS: tl.constexpr, # Number of KV heads.
985
+ D_HEAD: tl.constexpr, # Dimension of each head.
986
+ # Block size used for tiling the sequence dim.
987
+ SEQ_BLOCK_SIZE: tl.constexpr,
988
+ PAGE_SIZE: tl.constexpr,
989
+ PAGE_TABLE_STRIDE: tl.constexpr,
990
+ ):
991
+ """Attention kernel to be used during the generate phase.
992
+
993
+ Uses flash decoding.
994
+ KV-cache layout is paged: [num_pages, PAGE_SIZE, N_KV_HEADS, D_HEAD]
995
+ 1. Fetch the K-cache from 0 to input_pos
996
+ 2. Fetch the V-cache from 0 to input_pos
997
+ 3. A = Q*K^T [1,D_HEAD] * [1,seq_len,D_HEAD] -> [1, seq_len]
998
+ 4. S = softmax(A)
999
+ 5. O = S*V [1, seq_len] * [1, seq_len, D_HEAD] -> [1, D_HEAD]
1000
+ """
1001
+ # Assume a paged KV-cache layout: [num_pages, PAGE_SIZE, N_KV_HEADS, D_HEAD]
1002
+ # A program is responsible for 1 batch, 1 head and a block of sequences.
1003
+ batch_id = tl.program_id(axis=0)
1004
+ head_id = tl.program_id(axis=1)
1005
+ seq_block_id = tl.program_id(axis=2)
1006
+
1007
+ SEQ_BLOCK_PAGE: tl.constexpr = SEQ_BLOCK_SIZE // PAGE_SIZE
1008
+ MAX_NUM_PAGES: tl.constexpr = MAX_SEQ_LEN // PAGE_SIZE
1009
+
1010
+ cache_loc = tl.load(cache_loc_ptr + batch_id)
1011
+ seq_len = tl.load(cache_len_ptr + batch_id)
1012
+ # Offsets for the block of sequences this program processes.
1013
+ seq_start_pos = seq_block_id * SEQ_BLOCK_SIZE
1014
+
1015
+ if seq_start_pos > seq_len:
1016
+ return
1017
+ seq_offsets = seq_start_pos + tl.arange(0, SEQ_BLOCK_SIZE)
1018
+ seq_mask = seq_offsets <= seq_len
1019
+ # Assuming D_HEAD is a power of 2
1020
+ dhead_offsets = tl.arange(0, D_HEAD)
1021
+ dhead_mask = dhead_offsets < D_HEAD
1022
+
1023
+ HEAD_RATIO: tl.constexpr = N_HEADS // N_KV_HEADS
1024
+ cache_head_offset = (head_id // HEAD_RATIO) * D_HEAD
1025
+
1026
+ sm_scale: tl.constexpr = 1 / (D_HEAD**0.5)
1027
+
1028
+ # Program loads the entire Q for the head assigned to it.
1029
+ # [D_HEAD]
1030
+ q_batch_offset = batch_id * N_HEADS * D_HEAD
1031
+ q_head_offset = head_id * D_HEAD
1032
+ q = tl.load(q_ptr + q_batch_offset + q_head_offset + dhead_offsets)
1033
+
1034
+ kv_mask = seq_mask[:, None] * dhead_mask[None, :]
1035
+
1036
+ kv_pages = seq_block_id * SEQ_BLOCK_PAGE + tl.arange(0, SEQ_BLOCK_PAGE)
1037
+ cache_pages = tl.load(
1038
+ page_table_ptr + cache_loc * PAGE_TABLE_STRIDE + kv_pages, mask=kv_pages < MAX_NUM_PAGES
1039
+ )
1040
+
1041
+ page_offsets = tl.arange(0, PAGE_SIZE)
1042
+ # shape [SEQ_BLOCK], means [cache_pages, page_offsets]
1043
+ # token offsets in the paged kv cache
1044
+ cache_seq_offset = tl.reshape(
1045
+ cache_pages[:, None] * PAGE_SIZE + page_offsets[None, :], [SEQ_BLOCK_SIZE]
1046
+ )
1047
+
1048
+ cache_offsets = (
1049
+ cache_seq_offset[:, None] * N_KV_HEADS * D_HEAD + cache_head_offset + dhead_offsets[None, :]
1050
+ )
1051
+
1052
+ k = tl.load(k_cache_ptr + cache_offsets, mask=kv_mask)
1053
+ v = tl.load(v_cache_ptr + cache_offsets, mask=kv_mask)
1054
+
1055
+ # Note: check the output precision of the sum.
1056
+ # compute q*K^T
1057
+ # [D_HEAD] * [seq_block, D_HEAD], sum along axis 1
1058
+ attn = tl.sum(q[None, :] * k, axis=1) # [seq_block]
1059
+ attn = attn.to(tl.float32)
1060
+ attn *= sm_scale
1061
+ max_attn = tl.max(attn)
1062
+ # Set attn values to -inf where the mask is not set. This forces exp(attn) to 0.
1063
+ attn = tl.where(seq_mask, attn, float("-inf"))
1064
+ exp_attn = tl.exp(attn - max_attn)
1065
+
1066
+ sumexp = tl.sum(exp_attn, axis=0) # scalar.
1067
+
1068
+ # [seq_len] * [seq_len, D_HEAD], sum along axis 0
1069
+ output = tl.sum(exp_attn[:, None] * v, axis=0) # [D_HEAD]
1070
+
1071
+ output = output / sumexp
1072
+
1073
+ # We store the log-sum-exp after removing the max.
1074
+ logsumexp = tl.log(sumexp) + max_attn
1075
+ # when seq_mask is all false, max_attn will be -inf and sumexp is zero
1076
+
1077
+ tl.store(
1078
+ output_values_ptr
1079
+ + batch_id * N_HEADS * D_HEAD * num_blocks
1080
+ + head_id * D_HEAD * num_blocks
1081
+ + seq_block_id * D_HEAD
1082
+ + dhead_offsets,
1083
+ output,
1084
+ )
1085
+ tl.store(
1086
+ output_logsumexp_ptr
1087
+ + batch_id * N_HEADS * num_blocks
1088
+ + head_id * num_blocks
1089
+ + seq_block_id,
1090
+ logsumexp,
1091
+ )
1092
+
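For generate-only batches, stage 1 above is typically paired with `attention_kv_stage2` to reduce the per-block partials. One plausible launch pattern is sketched below; the grid shapes follow the pointer comments in the kernels, while the block size and the use of the cached positions as `input_pos` are illustrative assumptions, not values taken from this repository.

```python
import torch
import triton

# Hedged two-stage launch sketch for paged flash decoding.
def paged_decode_attention(q, k_cache, v_cache, cache_loc, cache_len, page_table,
                           input_pos, max_seq_len, n_heads, n_kv_heads, d_head,
                           page_size, seq_block_size=256):
    batch = q.shape[0]
    num_blocks = triton.cdiv(max_seq_len, seq_block_size)
    partial = torch.empty(batch, n_heads, num_blocks, d_head, device=q.device, dtype=torch.float32)
    lse = torch.empty(batch, n_heads, num_blocks, device=q.device, dtype=torch.float32)
    attention_kv_paged_stage1[(batch, n_heads, num_blocks)](
        q, k_cache, v_cache, cache_loc, page_table, cache_len, partial, lse, num_blocks,
        MAX_SEQ_LEN=max_seq_len, N_HEADS=n_heads, N_KV_HEADS=n_kv_heads, D_HEAD=d_head,
        SEQ_BLOCK_SIZE=seq_block_size, PAGE_SIZE=page_size,
        PAGE_TABLE_STRIDE=page_table.stride(0),
    )
    out = torch.empty(batch, n_heads, d_head, device=q.device, dtype=q.dtype)
    attention_kv_stage2[(batch, n_heads)](
        partial, lse, out, input_pos,
        NUM_BLOCKS=num_blocks, N_HEADS=n_heads, D_HEAD=d_head, SEQ_BLOCK_SIZE=seq_block_size,
    )
    return out
```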
1093
+
1094
+ @triton.jit
1095
+ def context_attention_kv_paged(
1096
+ q_ptr, # [b*s,nd]
1097
+ seq_len_ptr, # [b] # length of each sequence in a batch
1098
+ seq_start_ptr, # [b] # start indices of a sequence in flattened q/k/v.
1099
+ k_cache_ptr, # [num_pages, page_size, n, d]
1100
+ v_cache_ptr, # [num_pages, page_size, n, d]
1101
+ cache_loc_ptr, # [b] # index of the sequence in the page table.
1102
+ cache_len_ptr, # [Batch] # Number of tokens in kv cache.
1103
+ page_table_ptr, # [b, max_num_pages_per_seq] # loc of the block page in the cache.
1104
+ softmax_scale,
1105
+ o_ptr,
1106
+ N_HEADS: tl.constexpr, # Number of heads
1107
+ N_KV_HEADS: tl.constexpr, # Number of KV heads.
1108
+ D_HEAD: tl.constexpr, # Dimension of each head.
1109
+ SEQ_BLOCK: tl.constexpr,
1110
+ MAX_SEQ_LENGTH: tl.constexpr,
1111
+ PAGE_SIZE: tl.constexpr,
1112
+ PAGE_TABLE_STRIDE: tl.constexpr,
1113
+ ):
1114
+ """Kernel for context phase.
1115
+
1116
+ Fuses rope
1117
+ Assuming:
1118
+ 1. Self-attention [seqlen(Q) == seqlen(K)]
1119
+ 2. Causal attention
1120
+ 3. QKV layout: [b*s,n,d]
1121
+ """
1122
+ batch_id = tl.program_id(axis=0)
1123
+ head_id = tl.program_id(axis=1)
1124
+ seq_block_id = tl.program_id(axis=2)
1125
+
1126
+ # Each program is responsible for a block of tokens in a single batch.
1127
+ seq_start_index = tl.load(seq_start_ptr + batch_id)
1128
+ seq_len = tl.load(seq_len_ptr + batch_id)
1129
+
1130
+ HEAD_RATIO: tl.constexpr = N_HEADS // N_KV_HEADS
1131
+
1132
+ # assuming SEQ_BLOCK can be divided by PAGE_SIZE and PAGE_SIZE is a power of 2.
1133
+ SEQ_BLOCK_PAGE: tl.constexpr = SEQ_BLOCK // PAGE_SIZE
1134
+ MAX_NUM_PAGES: tl.constexpr = (MAX_SEQ_LENGTH + PAGE_SIZE - 1) // PAGE_SIZE
1135
+
1136
+ # cache is [num_pages, page_size, n, d]
1137
+ # cache_loc_ptr stores the batch index for the sequences provided to the kernel.
1138
+ cache_loc = tl.load(cache_loc_ptr + batch_id)
1139
+ table_batch_offset = cache_loc * PAGE_TABLE_STRIDE
1140
+
1141
+ # Assuming D_HEAD is a power of 2
1142
+ dhead_offsets = tl.arange(0, D_HEAD)
1143
+ dhead_mask = dhead_offsets < D_HEAD
1144
+
1145
+ seq_offsets = tl.arange(0, SEQ_BLOCK)
1146
+ q_seq_offsets = seq_block_id * SEQ_BLOCK + seq_offsets
1147
+ seq_mask = q_seq_offsets < seq_len
1148
+
1149
+ load_mask = seq_mask[:, None] * dhead_mask[None, :]
1150
+
1151
+ q_batch_offset = seq_start_index * N_HEADS * D_HEAD
1152
+ q_head_offset = head_id * D_HEAD
1153
+ cache_head_offset = (head_id // HEAD_RATIO) * D_HEAD
1154
+
1155
+ # Q will stay in SRAM
1156
+ q = tl.load(
1157
+ q_ptr
1158
+ + q_batch_offset
1159
+ + q_seq_offsets[:, None] * N_HEADS * D_HEAD
1160
+ + q_head_offset
1161
+ + dhead_offsets[None, :],
1162
+ mask=load_mask,
1163
+ )
1164
+ acc = tl.zeros([SEQ_BLOCK, D_HEAD], dtype=tl.float32)
1165
+ lse_i = tl.zeros([SEQ_BLOCK], dtype=tl.float32) - float("inf")
1166
+ m_i = tl.zeros([SEQ_BLOCK], dtype=tl.float32) - float("inf")
1167
+
1168
+ cache_len = tl.load(cache_len_ptr + batch_id)
1169
+ total_len = cache_len + seq_len
1170
+ num_blocks = (total_len + SEQ_BLOCK - 1) // SEQ_BLOCK
1171
+ for s in range(0, num_blocks + 1, 1):
1172
+ kv_pages = s * SEQ_BLOCK_PAGE + tl.arange(0, SEQ_BLOCK_PAGE)
1173
+ cache_pages = tl.load(
1174
+ page_table_ptr + table_batch_offset + kv_pages, mask=kv_pages < MAX_NUM_PAGES
1175
+ )
1176
+
1177
+ page_offsets = tl.arange(0, PAGE_SIZE)
1178
+ # shape [SEQ_BLOCK], means [cache_pages, page_offsets]
1179
+ # physical token offsets in the paged kv cache
1180
+ cache_seq_offset = tl.reshape(
1181
+ cache_pages[:, None] * PAGE_SIZE + page_offsets[None, :], [SEQ_BLOCK]
1182
+ )
1183
+ cache_offsets = (
1184
+ cache_seq_offset[:, None] * N_KV_HEADS * D_HEAD
1185
+ + cache_head_offset
1186
+ + dhead_offsets[None, :]
1187
+ )
1188
+
1189
+ # logical kv tokens offsets
1190
+ kv_seq_offsets = s * SEQ_BLOCK + seq_offsets
1191
+ kv_seq_mask = kv_seq_offsets < total_len
1192
+ kv_load_mask = kv_seq_mask[:, None] * dhead_mask[None, :]
1193
+
1194
+ k = tl.load(k_cache_ptr + cache_offsets, mask=kv_load_mask)
1195
+ qk = tl.zeros([SEQ_BLOCK, SEQ_BLOCK], dtype=tl.float32)
1196
+ qk += tl.dot(q, k.trans())
1197
+ # causal mask, need to use kv_seq_offsets
1198
+ qk = tl.where(
1199
+ (q_seq_offsets[:, None] + cache_len) >= kv_seq_offsets[None, :], qk, float("-inf")
1200
+ )
1201
+
1202
+ qk *= softmax_scale
1203
+ # rowmax
1204
+ m_ij = tl.maximum(tl.max(qk, 1), lse_i)
1205
+ p = tl.exp(qk - m_ij[:, None])
1206
+ v = tl.load(v_cache_ptr + cache_offsets, mask=kv_load_mask)
1207
+
1208
+ l_ij = tl.sum(p, 1)
1209
+ acc_scale = tl.exp(m_i - m_ij)
1210
+ acc = acc * acc_scale[:, None]
1211
+ p = p.to(v.dtype)
1212
+ acc += tl.dot(p, v)
1213
+ m_i = m_ij
1214
+ l_i_new = tl.exp(lse_i - m_ij) + l_ij
1215
+ lse_i = m_ij + tl.log(l_i_new)
1216
+
1217
+ o_scale = tl.exp(m_i - lse_i)
1218
+
1219
+ acc = acc * o_scale[:, None]
1220
+
1221
+ tl.store(
1222
+ o_ptr
1223
+ + q_batch_offset
1224
+ + q_seq_offsets[:, None] * N_HEADS * D_HEAD
1225
+ + q_head_offset
1226
+ + dhead_offsets[None, :],
1227
+ acc,
1228
+ mask=load_mask,
1229
+ )
1230
+
1231
+
1232
+
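+ # Illustrative reference only (not used by the kernel): the streaming-softmax update in the
+ # paged context-attention kernel above is numerically equivalent to the following eager
+ # PyTorch sketch for a single head and query block (causal masking omitted for brevity).
+ # Names mirror the kernel variables; this is a hedged sketch, not the production code path.
+ #
+ #   import torch
+ #
+ #   def streaming_softmax_reference(q, k_blocks, v_blocks, softmax_scale):
+ #       """q: [S, D]; k_blocks / v_blocks: lists of [B, D] tiles of the full K / V."""
+ #       acc = torch.zeros(q.shape[0], v_blocks[0].shape[-1])
+ #       lse = torch.full((q.shape[0],), float("-inf"))
+ #       m = torch.full((q.shape[0],), float("-inf"))
+ #       for k_blk, v_blk in zip(k_blocks, v_blocks):
+ #           qk = (q @ k_blk.T) * softmax_scale
+ #           m_new = torch.maximum(qk.max(dim=1).values, lse)
+ #           p = torch.exp(qk - m_new[:, None])
+ #           acc = acc * torch.exp(m - m_new)[:, None] + p @ v_blk
+ #           lse = m_new + torch.log(torch.exp(lse - m_new) + p.sum(dim=1))
+ #           m = m_new
+ #       return acc * torch.exp(m - lse)[:, None]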
1233
+ @dataclass
1234
+ class PositionalEmbeddingConfig:
1235
+ """A dataclass to hold positional embedding information."""
1236
+
1237
+ mode: Optional[Literal["rope"]] = None
1238
+ rope_theta: float = 10000.0
1239
+ rope_scale: float = 1.0
1240
+
1241
+ def __post_init__(self):
1242
+ assert self.mode in [None, "rope"], f"Invalid mode: {self.mode}."
1243
+ if self.mode == "rope":
1244
+ assert self.rope_theta > 0, f"Invalid rope theta: {self.rope_theta}."
1245
+
1246
+
1247
+ @dataclass
1248
+ class CacheConfig:
1249
+ """A dataclass to hold information how to configure the cache."""
1250
+
1251
+ dtype: Optional[torch.dtype] = None
1252
+
1253
+
1254
+ @dataclass
1255
+ class AttentionInfo:
1256
+ """Information about the attention op.
1257
+
1258
+ This is the dataclass collected by the kvcache transformation and passed in to the
1259
+ AttentionDescriptor methods to inform the attention op about the attention configuration.
1260
+ """
1261
+
1262
+ num_heads: int
1263
+ num_kv_heads: int
1264
+ head_dim: int # embedding size of each head
1265
+ dtype: torch.dtype
1266
+
1267
+ cache_config: CacheConfig
1268
+ pos_embd_config: PositionalEmbeddingConfig
1269
+ # rope_dim represents embedding size of decoupled q/k that carry rope information
1270
+ # when rope_dim != 0 the decoupled q/k tensor carrying rope information is the last part of the tensor [-rope_dim: ]
1271
+ rope_dim: Optional[int] = 0
1272
+
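+ # Hedged usage sketch (all values below are made-up illustrations): the kvcache
+ # transformation would populate an AttentionInfo for a GQA layer roughly as follows.
+ #
+ #   info = AttentionInfo(
+ #       num_heads=32,
+ #       num_kv_heads=8,
+ #       head_dim=128,
+ #       dtype=torch.float16,
+ #       cache_config=CacheConfig(dtype=torch.float16),
+ #       pos_embd_config=PositionalEmbeddingConfig(mode="rope", rope_theta=10000.0),
+ #   )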
1273
+
1274
+ @dataclass
1275
+ class SequenceInfo:
1276
+ """A dataclass to hold information about how the sequence is laid out and stored in cache.
1277
+
1278
+ We assume the sequence + cache is laid out in the following way:
1279
+
1280
+ - input_ids: [id_0, ..., id_{s_total-1}]
1281
+ flattened sequence of [b, 1] or [1, s_total]. We use [b, 1] to denote generate-only batches.
1282
+ - seq_len: [s_0, s_1, ..., s_{b-1}] such that s_total = sum(s_i)
1283
+ Describes how long each sequence is. For example,
1284
+ input_ids[:s_0] will correspond to sequence 0 in the batch and input_ids[s_0:s_0 + s_1] will
1285
+ correspond to sequence 1 in the batch.
1286
+ - input_pos: [pos_0, ..., pos_{b-1}]
1287
+ Corresponds to the total number of tokens that have already been cached for each sequence
1288
+ in the batch.
1289
+ - cache_loc: [c0, ...., c_{np-1}] where np is total number of pages allocated to describe all
1290
+ sequences in the batch.
1291
+ - pages_per_seq: [ps_0, ps_1, ..., ps_{b-1}] where ps_i is the number of pages allocated for
1292
+ sequence i. Note that, for example, cache_loc[ps_0:ps_0 + ps_1] will correspond to the pages associated
1293
+ with sequence 1 in the batch.
1294
+
1295
+ Here are a couple of notes to emphasize this notation:
1296
+
1297
+ - The total number of allocated token space for sequence i is given by ps_i * page_size. This is
1298
+ the total number of tokens that can be cached for each sequence.
1299
+
1300
+ - NOTE: It must hold that pos_i + s_i <= ps_i * page_size for all i in [0, b-1]. Moreover, it is
1301
+ the responsibility of the cache manager and/or runtime to ensure sufficient page allocation
1302
+ for each sequence.
1303
+
1304
+ """
1305
+
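+ # Worked example of the layout described above (illustrative numbers only): a batch of
+ # b=2 sequences with 3 and 2 input tokens, page_size=4, and no cache history:
+ #
+ #   input_ids     = [i0, i1, i2, j0, j1]   # s_total = 5, stored as shape [1, 5]
+ #   seq_len       = [3, 2]
+ #   input_pos     = [0, 0]                 # nothing cached yet
+ #   pages_per_seq = [1, 1]                 # ceil(3 / 4) and ceil(2 / 4)
+ #   cache_loc     = [7, 2]                 # page ids handed out by the cache manager
+ #
+ # so sequence 0 lives in page 7, sequence 1 in page 2, and the invariant
+ # input_pos[i] + seq_len[i] <= pages_per_seq[i] * page_size holds for both sequences.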
1306
+ ## USE TO INITIALIZE DATA CLASS ###############################################################
1307
+ # max_seq_len corresponds to the maximum number of tokens in any sequence. It includes the tokens in the
1308
+ # input sequence and the tokens generated by the model.
1309
+ max_seq_len: int = 1
1310
+ # max_batch_size corresponds to the maximum number of sequences (or requests) that the model can process.
1311
+ max_batch_size: int = 1
1312
+ # page_size is the granularity with which the cache pages are allocated for a paged kv cache.
1313
+ # For an unpaged cache, the page size should be set to max_seq_len.
1314
+ # Also note that two sequences in a batch cannot share a page.
1315
+ page_size: int = 0
1316
+ # max_num_tokens is the maximum number of tokens that the model can process across all sequences in the batch.
1317
+ # If a batch is composed of context-only requests of input sequence length ISL,
1318
+ # then the maximum number of sequences possible in the batch is min (max_batch_size, max_num_tokens // ISL).
1319
+ # Similarly, if a batch is composed of generate-only requests,
1320
+ # then the maximum number of sequences possible in the batch is min (max_batch_size, max_num_tokens).
1321
+ max_num_tokens: int = 0
1322
+
1323
+ ## [UPDATE WITH CARE] TENSOR FIELDS THAT WILL BE PASSED TO PREPARE_METADATA OP #################
1324
+ # input_ids MUST ALWAYS BE THE FIRST FIELD
1325
+ input_ids: torch.Tensor = field(default_factory=lambda: torch.zeros(1, 1, dtype=torch.int))
1326
+ seq_len: torch.Tensor = field(default_factory=lambda: torch.ones(1, dtype=torch.int))
1327
+ input_pos: torch.Tensor = field(default_factory=lambda: torch.zeros(1, dtype=torch.int))
1328
+ cache_loc: torch.Tensor = field(default_factory=lambda: torch.arange(1, dtype=torch.int))
1329
+ pages_per_seq: torch.Tensor = field(default_factory=lambda: torch.ones(1, dtype=torch.int))
1330
+ ################################################################################################
1331
+
1332
+ ## PRIVATE FIELDS ##############################################################################
1333
+ _sequence_lengths: List[int] = field(default_factory=list)
1334
+ _num_pages: int = 1
1335
+
1336
+ def __post_init__(self):
1337
+ if self.page_size < 1:
1338
+ self.page_size = self.max_seq_len
1339
+ if self.max_num_tokens < 1:
1340
+ self.max_num_tokens = self.max_batch_size * self.max_seq_len
1341
+ # if the provided max_num_tokens is less than the max_batch_size * max_seq_len,
1342
+ # we use the provided max_num_tokens to calculate the number of pages
1343
+ total_tokens = min(self.max_num_tokens, self.max_batch_size * self.max_seq_len)
1344
+ self._num_pages = (total_tokens) // self.page_size + (total_tokens % self.page_size > 0)
1345
+ self.input_ids = torch.ones(self.max_batch_size, 1, dtype=torch.int)
1346
+ self.seq_len = torch.empty(self.max_batch_size, dtype=torch.int)
1347
+ self.input_pos = torch.empty_like(self.seq_len)
1348
+ self.cache_loc = torch.empty(self.num_pages, dtype=torch.int)
1349
+ self.pages_per_seq = torch.empty_like(self.seq_len)
1350
+
1351
+ # dynamic shape descriptors for tensor args
1352
+ self._dynamic_shapes: Optional[Tuple[Dict[str, Dim]]] = None
1353
+
1354
+ # keep a list-like object of sequence lengths for simplicity as well
1355
+ self._sequence_lengths = [0] * self.max_batch_size
1356
+
1357
+ # call reset once to initialize the tensors
1358
+ self.reset()
1359
+
1360
+ @property
1361
+ def device(self) -> torch.device:
1362
+ return self.input_pos.device
1363
+
1364
+ @property
1365
+ def args(self) -> List[torch.Tensor]:
1366
+ args = []
1367
+ for f in fields(self):
1368
+ val = getattr(self, f.name)
1369
+ if isinstance(val, torch.Tensor):
1370
+ args.append(val)
1371
+ return args
1372
+
1373
+ @property
1374
+ def extra_arg_names(self) -> List[str]:
1375
+ """Return extra arg names for the prepare_metadata op beyond input_ids."""
1376
+ return [f.name for f in fields(self) if isinstance(getattr(self, f.name), torch.Tensor)][1:]
1377
+
1378
+ @property
1379
+ def dynamic_shapes(self) -> Tuple[Dict[str, Dim]]:
1380
+ """Return dynamic shapes of sequence info tensors.
1381
+
1382
+ NOTE: will be lazily initialized since the Dim object is not picklable for multi-processing.
1383
+ """
1384
+ if self._dynamic_shapes is None:
1385
+ dynamic_shapes = ({},)
1386
+ if self.max_batch_size > 1:
1387
+ dynamic_shapes[0][0] = Dim("batch_size", max=self.max_batch_size)
1388
+ dynamic_shapes[0][1] = Dim("seq_len", max=self.max_seq_len)
1389
+ dynamic_shapes += ({},) * len(self.extra_arg_names)
1390
+ self._dynamic_shapes = dynamic_shapes
1391
+ return self._dynamic_shapes
1392
+
1393
+ @property
1394
+ def num_sequences(self) -> int:
1395
+ return len(self._sequence_lengths)
1396
+
1397
+ @property
1398
+ def sequence_lengths(self) -> List[int]:
1399
+ return self._sequence_lengths
1400
+
1401
+ @property
1402
+ def input_positions(self) -> List[int]:
1403
+ return self.input_pos[: self.num_sequences].tolist()
1404
+
1405
+ @property
1406
+ def is_generate(self) -> bool:
1407
+ return all(sl == 1 for sl in self.sequence_lengths)
1408
+
1409
+ @property
1410
+ def num_pages(self) -> int:
1411
+ return self._num_pages
1412
+
1413
+ @num_pages.setter
1414
+ def num_pages(self, value):
1415
+ self._num_pages = value
1416
+ # update the cache_loc tensor
1417
+ self.cache_loc.resize_(value)
1418
+
1419
+ @property
1420
+ def is_paged(self) -> bool:
1421
+ return self.page_size < self.max_seq_len
1422
+
1423
+ @property
1424
+ def page_assignments(self) -> List[List[int]]:
1425
+ """Return the page assignments for each sequence."""
1426
+ pages_per_seq = self.pages_per_seq[: self.num_sequences].tolist()
1427
+ return [
1428
+ c_loc_one_seq.tolist()
1429
+ for c_loc_one_seq in torch.split(self.cache_loc[: sum(pages_per_seq)], pages_per_seq)
1430
+ ]
1431
+
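+ # Hedged example for page_assignments (illustrative values): with two active sequences,
+ # pages_per_seq[:2] == [1, 2], and cache_loc[:3] == [7, 2, 5], the property yields
+ #
+ #   >>> si.page_assignments
+ #   [[7], [2, 5]]
+ #
+ # i.e. the per-sequence view of the flat cache_loc tensor.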
1432
+ @classmethod
1433
+ def _get_sanitized_seq_len(cls, input_ids: torch.Tensor, seq_len: torch.Tensor) -> torch.Tensor:
1434
+ """Sanitize sequence lengths.
1435
+
1436
+ We want to cover the following scenarios with this function:
1437
+
1438
+ 1. Pre-fill:
1439
+ input_ids: [1, s_total, ...]
1440
+ seq_len: [s_0, s_1, ..., s_{b-1}, 0, 0, ..., 0]
1441
+ ---> returns [s_0, s_1, ..., s_{b-1}]
1442
+ 2. Decode:
1443
+ input_ids: [b, 1, ...]
1444
+ seq_len: [1, 1, ..., 1, 0, 0, ..., ..., ..., ..., 0]
1445
+ |---- b ----|--- (max_batch_size - b) ---|
1446
+ --> returns [1,] * b
1447
+ 3. Decode in Cudagraph:
1448
+ input_ids: [b_cudagraph, 1, ...]
1449
+ seq_len: [1, 1, ..., 1, 0, 0, ..., ..., ..., ..., 0]
1450
+ |---- b ----|--- (max_batch_size - b) ---|
1451
+
1452
+ --> returns [1,] * b_cudagraph
1453
+ Here b <= b_cudagraph. We want to make sure that the seq_len is one-padded to
1454
+ b_cudagraph.
1455
+
1456
+ # TODO: I could see one possible issue with this approach in the future.
1457
+ # If we have b < b_cudagraph we now one-pad. However, we don't pad the cache location
1458
+ # information. What could happen is that for the padded sequences the cache location
1459
+ # tensors point to allocated pages. This could lead to a situation where we write into
1460
+ # allocated cache pages polluting the cache of other sequences. Now this is not an issue
1461
+ # if we write the dummy sequences into unallocated cache pages... One fix could be to
1462
+ # pad not only the seq len but also pad the cache locations by just repeating the last
1463
+ # valid cache location in the batch. This would ensure that the dummy sequences just
1464
+ # repeat valid computation...
1465
+ """
1466
+ _, s = input_ids.shape[:2]
1467
+ num_seq = cls._get_sanitized_num_sequences(input_ids, seq_len)
1468
+ if s > 1:
1469
+ return seq_len[:num_seq].detach().clone()
1470
+ else:
1471
+ return torch.ones(num_seq, dtype=seq_len.dtype, device=seq_len.device)
1472
+
1473
+ @staticmethod
1474
+ def _get_sanitized_num_sequences(input_ids: torch.Tensor, seq_len: torch.Tensor) -> int:
1475
+ """Get number of sequences.
1476
+
1477
+ We make sure that this function is compatible with both torch graph capture and cudagraph.
1478
+ Both can be a bit temperamental when trying to extract the number of sequences from a tensor
1479
+ with max_batch_size or max_batch_size*max_seq_len.
1480
+ """
1481
+ b, s = input_ids.shape[:2]
1482
+ if s > 1:
1483
+ num_seq = torch.sum(seq_len > 0)
1484
+ assert seq_len[num_seq:].sum() == 0, "seq_len should be zero-padded"
1485
+ else:
1486
+ num_seq = b
1487
+ return num_seq
1488
+
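+ # Hedged examples for the two sanitizers above (shapes and values are illustrative):
+ #
+ #   # pre-fill: input_ids is [1, s_total] and seq_len is zero-padded to max_batch_size
+ #   >>> SequenceInfo._get_sanitized_seq_len(torch.zeros(1, 5), torch.tensor([3, 2, 0, 0]))
+ #   tensor([3, 2])
+ #
+ #   # decode (possibly cudagraph-padded): input_ids is [b, 1], so b alone determines num_seq
+ #   >>> SequenceInfo._get_sanitized_seq_len(torch.zeros(4, 1), torch.tensor([1, 1, 0, 0]))
+ #   tensor([1, 1, 1, 1])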
1489
+ def to(self, *args, **kwargs) -> None:
1490
+ for f in fields(self):
1491
+ val = getattr(self, f.name)
1492
+ if isinstance(val, torch.Tensor):
1493
+ setattr(self, f.name, val.to(*args, **kwargs))
1494
+
1495
+ def sync(self, other: "SequenceInfo") -> None:
1496
+ for f in fields(self):
1497
+ val = getattr(self, f.name)
1498
+ val_other = getattr(other, f.name)
1499
+ if f.name == "input_ids":
1500
+ setattr(self, f.name, val_other.to(self.device))
1501
+ elif f.name == "_sequence_lengths":
1502
+ self._sequence_lengths = val_other
1503
+ elif isinstance(val, torch.Tensor):
1504
+ val[: len(val_other)] = val_other.to(self.device)
1505
+ else:
1506
+ assert val == val_other, f"Field {f.name} mismatch: {val} != {val_other}."
1507
+
1508
+ def reset(self) -> None:
1509
+ """Reset the sequence information.
1510
+
1511
+ After reset the sequence information should correspond to a "generate-only" batch of
1512
+ sequences (b, s==1) without cache history.
1513
+ """
1514
+ # set a dummy sequence corresponding to a generate-only batch
1515
+ self.nest_sequences(torch.zeros(self.max_batch_size, 1, dtype=torch.int))
1516
+
1517
+ # reset cache information
1518
+ self.input_pos.zero_()
1519
+ self.cache_loc[:] = torch.arange(self.num_pages, dtype=torch.int, device=self.device)
1520
+ self.pages_per_seq.fill_(1)
1521
+
1522
+ def _set_example_sequence(self) -> None:
1523
+ """Set an example sequence for export purposes."""
1524
+ self.reset()
1525
+ input_ids = torch.ones(
1526
+ min(2, self.max_batch_size),
1527
+ min(4, self.max_seq_len),
1528
+ dtype=torch.int,
1529
+ device=self.device,
1530
+ )
1531
+ self.nest_sequences(input_ids)
1532
+ self.input_ids = input_ids
1533
+
1534
+ def _set_max_num_tokens_sample(self) -> None:
1535
+ """Set an example sequence with max_num_tokens."""
1536
+ self.reset()
1537
+ seq_len = self.max_num_tokens // self.max_batch_size
1538
+ input_ids = torch.ones(
1539
+ self.max_batch_size,
1540
+ seq_len,
1541
+ dtype=torch.int,
1542
+ device=self.device,
1543
+ )
1544
+ self.pages_per_seq.fill_(seq_len // self.page_size)
1545
+ self.nest_sequences(input_ids)
1546
+
1547
+ def _set_generate_only_batch(self) -> None:
1548
+ """Set an example sequence for generate-only batch."""
1549
+ self.reset()
1550
+ self.nest_sequences([[1]] * self.max_batch_size)
1551
+
1552
+ def nest_sequences(self, input_ids: Sequence[Sequence[int]]) -> None:
1553
+ """Create and store a flattened list of input_ids from the provided list of sequences.
1554
+
1555
+ This i/f will also update any relevant sequence information.
1556
+ """
1557
+ # set new sequence lengths
1558
+ seq_lens = [len(ids) for ids in input_ids]
1559
+ self.seq_len.zero_()
1560
+ self.seq_len[: len(seq_lens)].copy_(torch.tensor(seq_lens), non_blocking=True)
1561
+
1562
+ # set new input_ids as new tensor from flattened input_ids
1563
+ ids_tnsr_list = [
1564
+ lst.detach() if isinstance(lst, torch.Tensor) else torch.tensor(lst, dtype=torch.int)
1565
+ for lst in input_ids
1566
+ ]
1567
+ self.input_ids = torch.cat(ids_tnsr_list, dim=0).to(self.device)
1568
+
1569
+ # set derivative properties
1570
+ self._sequence_lengths = seq_lens
1571
+
1572
+ # use [b,1] shape to indicate generate-only batch, otherwise use [1,total_len]
1573
+ if self.is_generate:
1574
+ self.input_ids = self.input_ids.view(-1, 1, *self.input_ids.shape[1:])
1575
+ else:
1576
+ self.input_ids = self.input_ids.view(1, -1, *self.input_ids.shape[1:])
1577
+
1578
+ def unnest_sequences(self, t_nested: torch.Tensor) -> List[torch.Tensor]:
1579
+ t_squeezed = t_nested.squeeze(1) if self.is_generate else t_nested.squeeze(0)
1580
+ return list(torch.split(t_squeezed, self.sequence_lengths))
1581
+
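+ # Hedged usage sketch for nest_sequences/unnest_sequences (illustrative values, assuming a
+ # SequenceInfo `si` with max_batch_size >= 2 and max_seq_len >= 3):
+ #
+ #   >>> si.nest_sequences([[11, 12, 13], [21, 22]])   # context batch of 2 sequences
+ #   >>> si.input_ids.shape                            # flattened to [1, s_total]
+ #   torch.Size([1, 5])
+ #   >>> si.sequence_lengths
+ #   [3, 2]
+ #   >>> [t.tolist() for t in si.unnest_sequences(si.input_ids)]
+ #   [[11, 12, 13], [21, 22]]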
1582
+ def update_pos(self, seq_len: Union[torch.Tensor, List[int], int], reset: bool = False) -> None:
1583
+ """Update the starting position for each sequence in the cache.
1584
+
1585
+ If ``reset=True``, ``input_pos`` will be reset to zero before updating.
1586
+ """
1587
+ if not isinstance(seq_len, torch.Tensor):
1588
+ seq_len = torch.tensor(seq_len, dtype=torch.int)
1589
+ bs = len(seq_len) if seq_len.dim() > 0 else self.max_batch_size
1590
+
1591
+ if reset:
1592
+ self.input_pos[:bs] = seq_len.to(self.device)
1593
+ else:
1594
+ self.input_pos[:bs] += seq_len.to(self.device)
1595
+
1596
+ def assign_cache_loc(self, page_assignments: Sequence[Sequence[int]]) -> None:
1597
+ """Set the cache location and pages_per_seq tensors from page assignments."""
1598
+ cache_loc_flat = torch.tensor(
1599
+ [p_idx for pages in page_assignments for p_idx in pages], dtype=torch.int
1600
+ )
1601
+ self.cache_loc[: len(cache_loc_flat)].copy_(cache_loc_flat, non_blocking=True)
1602
+
1603
+ pages_per_seq = torch.tensor([len(p) for p in page_assignments], dtype=torch.int)
1604
+ self.pages_per_seq[: len(pages_per_seq)].copy_(pages_per_seq, non_blocking=True)
1605
+
1606
+
1607
+ Constant = Union[int, float, str, None]
1608
+
1609
+
1610
+ class MHACallable(Protocol):
1611
+ def __call__(
1612
+ self,
1613
+ *qkv_metadata_and_caches: Union[torch.Tensor, Constant],
1614
+ ) -> torch.Tensor: ...
1615
+
1616
+
1617
+ class PrepareMetadataCallable(Protocol):
1618
+ def __call__(
1619
+ self,
1620
+ input_ids: torch.Tensor,
1621
+ seq_len: torch.Tensor,
1622
+ input_pos: torch.Tensor,
1623
+ cache_loc: torch.Tensor,
1624
+ pages_per_seq: torch.Tensor,
1625
+ page_size: int,
1626
+ ) -> List[torch.Tensor]: ...
1627
+
1628
+
1629
+ class GetCacheCallable(Protocol):
1630
+ def __call__(self, sequence_info: SequenceInfo) -> torch.Tensor: ...
1631
+
1632
+
1633
+ class GetBufferCallable(GetCacheCallable):
1634
+ pass
1635
+
1636
+
1637
+ class GetAttentionInfo(Protocol):
1638
+ def __call__(self) -> AttentionInfo: ...
1639
+
1640
+
1641
+ CacheInitializerDict = Dict[str, GetCacheCallable]
1642
+ BufferInitializerDict = Dict[str, GetBufferCallable]
1643
+
1644
+
1645
+ class AttentionDescriptor(ABC):
1646
+ """An interface to define a functional attention operator.
1647
+
1648
+ The main logic is contained within the actual attention op as well as the prepare_metadata op. The
1649
+ prepare_metadata op is responsible for converting the standardized sequence info into metadata
1650
+ specific to the attention op.
1651
+ """
1652
+
1653
+ @classmethod
1654
+ @abstractmethod
1655
+ def is_paged(cls) -> bool:
1656
+ """Return if the attention op is paged or not."""
1657
+
1658
+ @classmethod
1659
+ def get_attention_op(cls) -> Tuple[MHACallable, int]:
1660
+ """Get the attention op and the number of arguments corresponding to qkv.
1661
+
1662
+ The attention_op should follow the below signature:
1663
+
1664
+ ```
1665
+ def attention_op(
1666
+ *qkv, # list of tensors corresponding to Q, K, V as in original op
1667
+ *metadata, # global info about the sequences as returned by the prepare_metadata op
1668
+ *caches, # contains layer-specific caches per provided cache initializers
1669
+ *buffers, # global buffers used by the attention op as provided by buffer initializers
1670
+ *constants, # basic arguments (int, float, str, None) added as CONSTANTS in the graph
1671
+ ) -> torch.Tensor: ...
1672
+ ```
1673
+
1674
+ **Note that the attention op should be a valid torch custom op, which comes with
1675
+ restrictions on the supported types in the signature.**
1676
+
1677
+ **Note that the `qkv` tuple should be consistent across both the cached attention
1678
+ op and the op that it is replacing.**
1679
+
1680
+ """
1681
+ raise NotImplementedError
1682
+
1683
+ @classmethod
1684
+ @abstractmethod
1685
+ def get_prepare_metadata_op(cls) -> Tuple[PrepareMetadataCallable, int]:
1686
+ """Get the prepare_metadata op.
1687
+
1688
+ The prepare_metadata op should follow the below signature:
1689
+
1690
+ ```
1691
+ def prepare_metadata(
1692
+ input_ids: torch.Tensor,
1693
+ seq_len: torch.Tensor,
1694
+ input_pos: torch.Tensor,
1695
+ cache_loc: torch.Tensor,
1696
+ ) -> List[torch.Tensor]: ...
1697
+ ```
1698
+ The metadata should contain all necessary global information required for the underlying
1699
+ attention op to process the input sequence and the returned list of tensors will be passed
1700
+ on to each invocation of the attention op in the graph.
1701
+
1702
+ prepare_metadata is called once at the beginning of the forward pass.
1703
+
1704
+ **Note that the prepare_metadata op should be a valid torch custom op, which comes with
1705
+ restrictions on the supported types in the signature.**
1706
+ """
1707
+ raise NotImplementedError
1708
+
1709
+ @classmethod
1710
+ @abstractmethod
1711
+ def get_cache_initializers(cls, get_info: GetAttentionInfo) -> CacheInitializerDict:
1712
+ """Provide a dictionary of function pointers that can be used to initialize the caches.
1713
+
1714
+ The key corresponds to the argument name used in the attention op signature. The function
1715
+ key doesn't need to be unique across multiple attention nodes in the graph. The key used to
1716
+ describe the cache in the graph will be patched with the attention node index to ensure
1717
+ uniqueness.
1718
+
1719
+ ``get_cache_initializers`` will be called *once* after the model initialization and before
1720
+ the initial forward pass for each attention op detected in the graph. The caches will be
1721
+ managed by the global CacheManager and passed back to the attention op during the forward
1722
+ pass.
1723
+
1724
+ If the cache initializer requires information about the attention op, the ``get_info``
1725
+ function can be called **inside** the cache initializer to retrieve the necessary
1726
+ information.
1727
+ """
1728
+ raise NotImplementedError
1729
+
1730
+ @classmethod
1731
+ def get_global_buffer_initializers(cls, get_info: GetAttentionInfo) -> BufferInitializerDict:
1732
+ """Provide a dictionary of function pointers that can be used to initialize buffers.
1733
+
1734
+ The key corresponds to the buffer name used in the graph module and will **not**
1735
+ be patched unlike a cache key. Hence, it is a **global** key that is shared across all
1736
+ attention ops in the model much like a regular buffer in an nn.Module. That means if this
1737
+ i/f is called for multiple attention ops, the same buffer will be shared across all of them
1738
+ if this function provides the same key multiple times.
1739
+
1740
+ Buffers are initialized *once* after the model initialization and before the initial forward
1741
+ pass for each attention op detected in the graph. The buffer will be managed by the global
1742
+ CacheManager and passed back to the attention op during the forward pass.
1743
+
1744
+ If the buffer initializer requires information about the attention op, the ``get_info``
1745
+ function can be called **inside** the buffer initializer to retrieve the necessary
1746
+ information.
1747
+ """
1748
+ return {}
1749
+
1750
+ @classmethod
1751
+ def get_constants(cls, attention_info: AttentionInfo) -> List[Constant]:
1752
+ """Provide a list of constant arguments to be passed to the attention op.
1753
+
1754
+ The constant arguments are passed to the attention op as additional arguments after the
1755
+ caches and buffers. The constants are expected to be of type int, float, str, or None.
1756
+ """
1757
+ return []
1758
+
1759
+
1760
+ class AttentionRegistry:
1761
+ """A simple registry to look up different attention implementations."""
1762
+
1763
+ _attention_registry: Dict[str, Type["AttentionDescriptor"]] = {}
1764
+
1765
+ @classmethod
1766
+ def register(cls, kernel_source: str) -> Type["AttentionDescriptor"]:
1767
+ def decorator(attention_cls: Type["AttentionDescriptor"]):
1768
+ assert kernel_source not in cls._attention_registry, (
1769
+ f"Attention source {kernel_source} already registered."
1770
+ )
1771
+ cls._attention_registry[kernel_source] = attention_cls
1772
+ return attention_cls
1773
+
1774
+ return decorator
1775
+
1776
+ @classmethod
1777
+ def get(cls, kernel_source: str) -> Type["AttentionDescriptor"]:
1778
+ assert cls.has(kernel_source), f"Attention source {kernel_source} not registered."
1779
+ return cls._attention_registry[kernel_source]
1780
+
1781
+ @classmethod
1782
+ def has(cls, kernel_source: str) -> bool:
1783
+ return kernel_source in cls._attention_registry
1784
+
1785
+
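+ # Hedged usage sketch of the registry (the kernel-source name "my_triton_mha" and the class
+ # are made up for illustration; real implementations must fill in all abstract methods):
+ #
+ #   @AttentionRegistry.register("my_triton_mha")
+ #   class MyTritonMHA(AttentionDescriptor):
+ #       @classmethod
+ #       def is_paged(cls):
+ #           return False
+ #       ...  # get_attention_op, get_prepare_metadata_op, get_cache_initializers, etc.
+ #
+ #   descriptor = AttentionRegistry.get("my_triton_mha")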
1786
+
1787
+ @torch.library.custom_op("attention::scaled_dot_product_attention", mutates_args=())
1788
+ def scaled_dot_product_attention(
1789
+ query: torch.Tensor,
1790
+ key: torch.Tensor,
1791
+ value: torch.Tensor,
1792
+ attn_mask: Optional[torch.Tensor] = None,
1793
+ dropout_p: float = 0.0,
1794
+ is_causal: bool = False,
1795
+ scale: Optional[float] = None,
1796
+ ) -> torch.Tensor:
1797
+ """A carbon copy of torch.nn.functional.scaled_dot_product_attention as custom op.
1798
+
1799
+ Using this custom op instead of using the functional directly ensures consistent representation
1800
+ of the vanilla sdpa in a graph.
1801
+ """
1802
+ return F.scaled_dot_product_attention(
1803
+ query,
1804
+ key,
1805
+ value,
1806
+ attn_mask=attn_mask,
1807
+ dropout_p=dropout_p,
1808
+ is_causal=is_causal,
1809
+ scale=scale,
1810
+ )
1811
+
1812
+
1813
+ @scaled_dot_product_attention.register_fake
1814
+ def scaled_dot_product_attention_fake(
1815
+ query, key, value, attn_mask=None, dropout_p=0.0, is_causal=False, scale=None
1816
+ ):
1817
+ """Fake implementation of scaled_dot_product_attention."""
1818
+ return torch.empty_like(query)
1819
+
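+ # Hedged usage note: the custom op is called exactly like the functional it wraps, e.g.
+ #
+ #   y = torch.ops.attention.scaled_dot_product_attention(q, k, v, is_causal=True)
+ #
+ # which keeps a single canonical sdpa node in traced/exported graphs.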
1820
+
1821
+ def _generate_mha(
1822
+ q: torch.Tensor,
1823
+ k: torch.Tensor,
1824
+ v: torch.Tensor,
1825
+ k_cache: torch.Tensor,
1826
+ v_cache: torch.Tensor,
1827
+ cache_locs: torch.Tensor,
1828
+ input_pos: torch.Tensor,
1829
+ out: torch.Tensor,
1830
+ ):
1831
+ b, (n_heads, q_d_head) = q.shape[0], q.shape[-2:]
1832
+ max_seq_len, n_kv_heads = k_cache.shape[1:3]
1833
+ v_d_head = v.shape[-1]
1834
+ device = q.device
1835
+
1836
+ HEAD_BLOCK_SIZE = max(16, triton.next_power_of_2(n_heads // n_kv_heads))
1837
+ SEQ_BLOCK_SIZE = 256
1838
+ num_blocks = (max_seq_len + SEQ_BLOCK_SIZE - 1) // SEQ_BLOCK_SIZE
1839
+
1840
+ stage1_output_values = torch.empty(
1841
+ b, n_heads, num_blocks, v_d_head, device=device, dtype=torch.float32
1842
+ )
1843
+ stage1_output_logsumexp = torch.empty(
1844
+ b, n_heads, num_blocks, device=device, dtype=torch.float32
1845
+ ) - float("inf")
1846
+
1847
+ (
1848
+ update_kv_cache[(b, n_kv_heads, 1)](
1849
+ k,
1850
+ v,
1851
+ None,
1852
+ None,
1853
+ k_cache,
1854
+ v_cache,
1855
+ input_pos,
1856
+ cache_locs,
1857
+ max_seq_len,
1858
+ n_kv_heads,
1859
+ q_d_head,
1860
+ v_d_head,
1861
+ 1,
1862
+ GENERATE_ONLY=True,
1863
+ ),
1864
+ )
1865
+
1866
+ gqa_attention_kv_stage1[
1867
+ (
1868
+ b,
1869
+ n_kv_heads,
1870
+ num_blocks,
1871
+ )
1872
+ ](
1873
+ q,
1874
+ k_cache,
1875
+ v_cache,
1876
+ cache_locs,
1877
+ input_pos,
1878
+ stage1_output_values,
1879
+ stage1_output_logsumexp,
1880
+ num_blocks,
1881
+ max_seq_len,
1882
+ n_heads,
1883
+ n_kv_heads,
1884
+ q_d_head,
1885
+ v_d_head,
1886
+ SEQ_BLOCK_SIZE,
1887
+ HEAD_BLOCK_SIZE,
1888
+ )
1889
+ attention_kv_stage2[(b, n_heads, 1)](
1890
+ stage1_output_values,
1891
+ stage1_output_logsumexp,
1892
+ out,
1893
+ input_pos,
1894
+ num_blocks,
1895
+ n_heads,
1896
+ v_d_head,
1897
+ SEQ_BLOCK_SIZE,
1898
+ )
1899
+
1900
+
1901
+ def _context_mha(
1902
+ q: torch.Tensor,
1903
+ k: torch.Tensor,
1904
+ v: torch.Tensor,
1905
+ k_cache: torch.Tensor,
1906
+ v_cache: torch.Tensor,
1907
+ out: torch.Tensor,
1908
+ ):
1909
+ b, s, n_heads, q_d_head = q.shape
1910
+ max_seq_len, n_kv_heads = k_cache.shape[1:3]
1911
+ v_d_head = v.shape[-1]
1912
+
1913
+ SEQ_BLOCK = 128
1914
+ softmax_scale = 1.0 / math.sqrt(q_d_head)
1915
+ grid = (b, n_heads, (s + SEQ_BLOCK - 1) // SEQ_BLOCK)
1916
+ context_attention_kv[grid](
1917
+ q,
1918
+ k,
1919
+ v,
1920
+ k_cache,
1921
+ v_cache,
1922
+ s,
1923
+ out,
1924
+ softmax_scale,
1925
+ n_heads,
1926
+ n_kv_heads,
1927
+ q_d_head,
1928
+ v_d_head,
1929
+ SEQ_BLOCK,
1930
+ max_seq_len,
1931
+ num_stages=2,
1932
+ )
1933
+
1934
+
1935
+ @torch.library.custom_op("attention::fused_mha_with_cache", mutates_args=())
1936
+ def fused_mha_with_cache(
1937
+ q: torch.Tensor,
1938
+ k: torch.Tensor,
1939
+ v: torch.Tensor,
1940
+ input_pos: torch.Tensor,
1941
+ k_cache: torch.Tensor,
1942
+ v_cache: torch.Tensor,
1943
+ freqs_cis: Optional[torch.Tensor],
1944
+ ) -> torch.Tensor:
1945
+ """Fused MHA with cache that takes raw input from q, k, v GEMMs."""
1946
+ # b, s info
1947
+ b, s = q.shape[:2]
1948
+ head_dim = k_cache.shape[-1]
1949
+
1950
+ # reshapes with num_heads and head_dim
1951
+ q = q.view(b, s, -1, head_dim)
1952
+ k = k.view(b, s, -1, head_dim)
1953
+ v = v.view(b, s, -1, head_dim)
1954
+
1955
+ # rope embedding
1956
+ if freqs_cis is not None:
1957
+ q = torch.ops.rope.apply_rope_with_input_pos(q, freqs_cis, input_pos, "bsnd")
1958
+ k = torch.ops.rope.apply_rope_with_input_pos(k, freqs_cis, input_pos, "bsnd")
1959
+
1960
+ # attention (assumed layout is bsnd)
1961
+ y = torch.empty_like(q)
1962
+ if s > 1:
1963
+ # context phase
1964
+ _context_mha(q, k, v, k_cache, v_cache, y)
1965
+ else:
1966
+ # generate phase
1967
+ cache_locs = torch.arange(0, b, device=q.device, dtype=torch.int32)
1968
+ _generate_mha(q, k, v, k_cache, v_cache, cache_locs, input_pos, y)
1969
+
1970
+ return y.view(b, s, -1) # [b,s,n*h_d]
1971
+
1972
+
1973
+ @fused_mha_with_cache.register_fake
1974
+ def fused_mha_fake(
1975
+ q: torch.Tensor,
1976
+ k: torch.Tensor,
1977
+ v: torch.Tensor,
1978
+ input_pos: torch.Tensor,
1979
+ k_cache: torch.Tensor,
1980
+ v_cache: torch.Tensor,
1981
+ freqs_cis: torch.Tensor,
1982
+ ):
1983
+ return torch.empty_like(q.contiguous())
1984
+
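+ # Hedged shape sketch for fused_mha_with_cache above (layout assumptions inferred from the
+ # code, not guaranteed):
+ #
+ #   q, k, v   : [b, s, n_(kv_)heads * head_dim]   raw outputs of the q/k/v GEMMs
+ #   k_cache   : [max_batch_size, max_seq_len, n_kv_heads, head_dim]   (v_cache likewise)
+ #   input_pos : [b]   number of tokens already cached per sequence
+ #   freqs_cis : packed cos/sin table, or None to skip rope
+ #
+ # s > 1 dispatches to the context kernel, s == 1 to the two-stage generate kernels.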
1985
+
1986
+ def _flattened_context_mha(
1987
+ q: torch.Tensor,
1988
+ k: torch.Tensor,
1989
+ v: torch.Tensor,
1990
+ input_pos: torch.Tensor,
1991
+ cache_loc: torch.Tensor,
1992
+ k_cache: torch.Tensor,
1993
+ v_cache: torch.Tensor,
1994
+ seq_len: torch.Tensor,
1995
+ seq_start: torch.Tensor,
1996
+ out: torch.Tensor,
1997
+ ) -> None:
1998
+ # NOTE: s_total == sum(seq_len)
1999
+ s_total, n_heads, q_d_head = q.shape
2000
+ max_cache_seq_len, n_kv_heads = k_cache.shape[1:3]
2001
+ v_d_head = v.shape[-1]
2002
+ BATCH_SIZE: int = len(input_pos)
2003
+ SEQ_BLOCK = 32
2004
+ (
2005
+ update_kv_cache[(BATCH_SIZE, n_kv_heads, (max(seq_len) + SEQ_BLOCK - 1) // SEQ_BLOCK)](
2006
+ k,
2007
+ v,
2008
+ seq_len,
2009
+ seq_start,
2010
+ k_cache,
2011
+ v_cache,
2012
+ input_pos,
2013
+ cache_loc,
2014
+ max_cache_seq_len,
2015
+ n_kv_heads,
2016
+ q_d_head,
2017
+ v_d_head,
2018
+ 32,
2019
+ GENERATE_ONLY=False,
2020
+ ),
2021
+ )
2022
+ # TODO: use input_pos to get the correct cache locations
2023
+ softmax_scale = 1.0 / math.sqrt(q_d_head)
2024
+ grid = (BATCH_SIZE, n_heads, (max(seq_len) + SEQ_BLOCK - 1) // SEQ_BLOCK)
2025
+ context_attention_kv_flattened[grid](
2026
+ q,
2027
+ seq_len,
2028
+ seq_start,
2029
+ k_cache,
2030
+ v_cache,
2031
+ input_pos,
2032
+ cache_loc,
2033
+ out,
2034
+ softmax_scale,
2035
+ n_heads,
2036
+ n_kv_heads,
2037
+ q_d_head,
2038
+ v_d_head,
2039
+ SEQ_BLOCK,
2040
+ max_cache_seq_len,
2041
+ num_stages=2,
2042
+ )
2043
+
2044
+
2045
+ @torch.library.custom_op("attention::fused_flattened_mha_with_cache", mutates_args=())
2046
+ def fused_flattened_mha_with_cache(
2047
+ # Q, K, V
2048
+ q: torch.Tensor,
2049
+ k: torch.Tensor,
2050
+ v: torch.Tensor,
2051
+ # METADATA
2052
+ seq_len: torch.Tensor,
2053
+ input_pos: torch.Tensor,
2054
+ cache_loc: torch.Tensor,
2055
+ seq_start: torch.Tensor,
2056
+ # CACHES
2057
+ k_cache: torch.Tensor,
2058
+ v_cache: torch.Tensor,
2059
+ # BUFFERS
2060
+ freqs_cis: torch.Tensor,
2061
+ # CONSTANTS
2062
+ # <none>
2063
+ ) -> torch.Tensor:
2064
+ """Flattened & fused MHA with cache that takes raw input from q, k, v GEMMs.
2065
+
2066
+ NOTE: this op can also handle seq_len==0, which might be useful for CUDAGRAPH.
2067
+ """
2068
+ # b, s info
2069
+ # NOTE: b, s are just the shapes of the input tensor q; not necessarily the number of sequences.
2070
+ # Generally speaking, we expect one of two cases here:
2071
+ # 1. b > 0, s==1: this indicates a generate-only batch of tokens.
2072
+ # 2. b==1, s > 0: this indicates a mixed context+generate phase. The actual number of sequences
2073
+ # and number of tokens per sequence are encoded in seq_len and seq_start.
2074
+ head_dim = k_cache.shape[-1]
2075
+ b, s, d = q.shape
2076
+
2077
+ # reshapes with num_heads and head_dim
2078
+ if s == 1:
2079
+ bs_view = (b, s)
2080
+ else:
2081
+ bs_view = (b * s,)
2082
+ q = q.view(*bs_view, q.shape[2] // head_dim, head_dim)
2083
+ k = k.view(*bs_view, k.shape[2] // head_dim, head_dim)
2084
+ v = v.view(*bs_view, v.shape[2] // head_dim, head_dim)
2085
+
2086
+ # rope embedding for generate-only or mixed
2087
+ if freqs_cis is not None and freqs_cis.numel() > 0:
2088
+ if s == 1:
2089
+ rope_args = (freqs_cis, input_pos, "bsnd")
2090
+ fn_rope = torch.ops.rope.apply_rope_with_input_pos
2091
+ else:
2092
+ rope_args = (freqs_cis, input_pos, seq_len, seq_start)
2093
+ fn_rope = torch.ops.rope.apply_rope_on_flattened_inputs
2094
+ q = fn_rope(q, *rope_args)
2095
+ k = fn_rope(k, *rope_args)
2096
+
2097
+ # run attention
2098
+ y = torch.empty_like(q)
2099
+ if s == 1:
2100
+ # generate-only phase
2101
+ _generate_mha(q, k, v, k_cache, v_cache, cache_loc, input_pos, y)
2102
+ else:
2103
+ # mixed context + generate phase
2104
+ _flattened_context_mha(
2105
+ q,
2106
+ k,
2107
+ v,
2108
+ input_pos,
2109
+ cache_loc,
2110
+ k_cache,
2111
+ v_cache,
2112
+ seq_len,
2113
+ seq_start,
2114
+ y,
2115
+ )
2116
+
2117
+ return y.view(b, s, d) # [b,s,n*h_d]
2118
+
2119
+
2120
+ @fused_flattened_mha_with_cache.register_fake
2121
+ def fused_flattened_mha_fake(
2122
+ q: torch.Tensor,
2123
+ k: torch.Tensor,
2124
+ v: torch.Tensor,
2125
+ seq_len: torch.Tensor,
2126
+ input_pos: torch.Tensor,
2127
+ cache_loc: torch.Tensor,
2128
+ seq_start: torch.Tensor,
2129
+ k_cache: torch.Tensor,
2130
+ v_cache: torch.Tensor,
2131
+ freqs_cis: torch.Tensor,
2132
+ ):
2133
+ return torch.empty_like(q.contiguous())
2134
+
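+ # Hedged call sketch for fused_flattened_mha_with_cache above (tensor contents illustrative):
+ #
+ #   # generate-only batch: q/k/v arrive as [b, 1, n*h_d]
+ #   y = torch.ops.attention.fused_flattened_mha_with_cache(
+ #       q, k, v, seq_len, input_pos, cache_loc, seq_start, k_cache, v_cache, freqs_cis
+ #   )
+ #
+ #   # mixed context batch: q/k/v arrive as [1, s_total, n*h_d] and seq_len/seq_start
+ #   # describe how s_total splits into the individual sequences.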
2135
+
2136
+ def _generate_mha_rope_fusion(
2137
+ q: torch.Tensor,
2138
+ k: torch.Tensor,
2139
+ v: torch.Tensor,
2140
+ freqs_cis: torch.Tensor,
2141
+ k_cache: torch.Tensor,
2142
+ v_cache: torch.Tensor,
2143
+ cache_locs: torch.Tensor,
2144
+ input_pos: torch.Tensor,
2145
+ out: torch.Tensor,
2146
+ ):
2147
+ b, (n_heads, d_head) = q.shape[0], q.shape[-2:]
2148
+ max_seq_len, n_kv_heads = k_cache.shape[1:3]
2149
+ device = q.device
2150
+
2151
+ SEQ_BLOCK_SIZE = 64
2152
+ num_blocks = (max_seq_len + SEQ_BLOCK_SIZE - 1) // SEQ_BLOCK_SIZE
2153
+ stage1_output_values = torch.empty(
2154
+ b, n_heads, num_blocks, d_head, device=device, dtype=torch.float32
2155
+ )
2156
+ stage1_output_logsumexp = torch.empty(
2157
+ b, n_heads, num_blocks, device=device, dtype=torch.float32
2158
+ ) - float("inf")
2159
+ q_rope = torch.empty_like(q)
2160
+ HEAD_BLOCK_SIZE = max(16, triton.next_power_of_2(n_heads // n_kv_heads))
2161
+
2162
+ (
2163
+ update_kv_cache_rope_fusion[(b, n_kv_heads, 1)](
2164
+ q,
2165
+ k,
2166
+ v,
2167
+ None,
2168
+ None,
2169
+ q_rope,
2170
+ k_cache,
2171
+ v_cache,
2172
+ input_pos,
2173
+ cache_locs,
2174
+ freqs_cis,
2175
+ max_seq_len,
2176
+ n_heads,
2177
+ n_kv_heads,
2178
+ d_head,
2179
+ 1,
2180
+ HEAD_BLOCK_SIZE,
2181
+ GENERATE_ONLY=True,
2182
+ ),
2183
+ )
2184
+
2185
+ HEAD_BLOCK_SIZE = max(16, triton.next_power_of_2(n_heads // n_kv_heads))
2186
+ gqa_attention_kv_stage1[
2187
+ (
2188
+ b,
2189
+ n_kv_heads,
2190
+ num_blocks,
2191
+ )
2192
+ ](
2193
+ q_rope,
2194
+ k_cache,
2195
+ v_cache,
2196
+ cache_locs,
2197
+ input_pos,
2198
+ stage1_output_values,
2199
+ stage1_output_logsumexp,
2200
+ num_blocks,
2201
+ max_seq_len,
2202
+ n_heads,
2203
+ n_kv_heads,
2204
+ d_head,
2205
+ d_head,
2206
+ SEQ_BLOCK_SIZE,
2207
+ HEAD_BLOCK_SIZE,
2208
+ )
2209
+ attention_kv_stage2[(b, n_heads, 1)](
2210
+ stage1_output_values,
2211
+ stage1_output_logsumexp,
2212
+ out,
2213
+ input_pos,
2214
+ num_blocks,
2215
+ n_heads,
2216
+ d_head,
2217
+ SEQ_BLOCK_SIZE,
2218
+ )
2219
+
2220
+
2221
+ def _flattened_context_mha_rope_fusion(
2222
+ q: torch.Tensor,
2223
+ k: torch.Tensor,
2224
+ v: torch.Tensor,
2225
+ freqs_cis: torch.Tensor,
2226
+ input_pos: torch.Tensor,
2227
+ cache_loc: torch.Tensor,
2228
+ k_cache: torch.Tensor,
2229
+ v_cache: torch.Tensor,
2230
+ seq_len: torch.Tensor,
2231
+ seq_start: torch.Tensor,
2232
+ out: torch.Tensor,
2233
+ ) -> None:
2234
+ # NOTE: s_total == sum(seq_len)
2235
+ s_total, n_heads, d_head = q.shape
2236
+ max_cache_seq_len, n_kv_heads = k_cache.shape[1:3]
2237
+ BATCH_SIZE: int = len(input_pos)
2238
+ SEQ_BLOCK = 32
2239
+ q_rope = torch.empty_like(q)
2240
+ HEAD_BLOCK_SIZE = max(16, triton.next_power_of_2(n_heads // n_kv_heads))
2241
+ (
2242
+ update_kv_cache_rope_fusion[
2243
+ (BATCH_SIZE, n_kv_heads, (max(seq_len) + SEQ_BLOCK - 1) // SEQ_BLOCK)
2244
+ ](
2245
+ q,
2246
+ k,
2247
+ v,
2248
+ seq_len,
2249
+ seq_start,
2250
+ q_rope,
2251
+ k_cache,
2252
+ v_cache,
2253
+ input_pos,
2254
+ cache_loc,
2255
+ freqs_cis,
2256
+ max_cache_seq_len,
2257
+ n_heads,
2258
+ n_kv_heads,
2259
+ d_head,
2260
+ 32,
2261
+ HEAD_BLOCK_SIZE,
2262
+ GENERATE_ONLY=False,
2263
+ ),
2264
+ )
2265
+ # TODO: use input_pos to get the correct cache locations
2266
+ softmax_scale = 1.0 / math.sqrt(d_head)
2267
+ grid = (BATCH_SIZE, n_heads, (max(seq_len) + SEQ_BLOCK - 1) // SEQ_BLOCK)
2268
+ context_attention_kv_flattened[grid](
2269
+ q_rope,
2270
+ seq_len,
2271
+ seq_start,
2272
+ k_cache,
2273
+ v_cache,
2274
+ input_pos,
2275
+ cache_loc,
2276
+ out,
2277
+ softmax_scale,
2278
+ n_heads,
2279
+ n_kv_heads,
2280
+ d_head,
2281
+ d_head,
2282
+ SEQ_BLOCK,
2283
+ max_cache_seq_len,
2284
+ num_stages=2,
2285
+ )
2286
+
2287
+
2288
+ @torch.library.custom_op("attention::fused_flattened_mha_with_cache_rope_fusion", mutates_args=())
2289
+ def fused_flattened_mha_with_cache_rope_fusion(
2290
+ q: torch.Tensor,
2291
+ k: torch.Tensor,
2292
+ v: torch.Tensor,
2293
+ input_pos: torch.Tensor,
2294
+ cache_loc: torch.Tensor,
2295
+ seq_len: torch.Tensor,
2296
+ seq_start: torch.Tensor,
2297
+ k_cache: torch.Tensor,
2298
+ v_cache: torch.Tensor,
2299
+ freqs_cis: Optional[torch.Tensor],
2300
+ ) -> torch.Tensor:
2301
+ """Flattened & fused MHA with cache that takes raw input from q, k, v GEMMs.
2302
+
2303
+ Fuses k rope in update_kv_cache and q rope in attention.
2304
+ NOTE: this op can also handle seq_len==0, which might be useful for CUDAGRAPH.
2305
+ """
2306
+ # this function only handles requests with rope embedding.
2307
+ if freqs_cis is None:
2308
+ return fused_flattened_mha_with_cache(
2309
+ q,
2310
+ k,
2311
+ v,
2312
+ seq_len,
2313
+ input_pos,
2314
+ cache_loc,
2315
+ seq_start,
2316
+ k_cache,
2317
+ v_cache,
2318
+ freqs_cis,
2319
+ )
2320
+
2321
+ # b, s info
2322
+ # NOTE: b, s are just the shapes of the input tensor q; not necessarily the number of sequences.
2323
+ # Generally speaking, we expect one of two cases here:
2324
+ # 1. b > 0, s==1: this indicates a generate-only batch of tokens.
2325
+ # 2. b==1, s > 0: this indicates a mixed context+generate phase. The actual number of sequences
2326
+ # and number of tokens per sequence are encoded in seq_len and seq_start.
2327
+ b, s, d = q.shape
2328
+ head_dim = k_cache.shape[-1]
2329
+
2330
+ # reshapes with num_heads and head_dim
2331
+ if s == 1:
2332
+ bs_view = (b, s)
2333
+ else:
2334
+ bs_view = (b * s,)
2335
+ q = q.view(*bs_view, q.shape[2] // head_dim, head_dim)
2336
+ k = k.view(*bs_view, k.shape[2] // head_dim, head_dim)
2337
+ v = v.view(*bs_view, v.shape[2] // head_dim, head_dim)
2338
+
2339
+ # run attention
2340
+ y = torch.empty_like(q)
2341
+ if s == 1:
2342
+ # generate-only phase
2343
+ _generate_mha_rope_fusion(q, k, v, freqs_cis, k_cache, v_cache, cache_loc, input_pos, y)
2344
+ else:
2345
+ # mixed context + generate phase
2346
+ _flattened_context_mha_rope_fusion(
2347
+ q,
2348
+ k,
2349
+ v,
2350
+ freqs_cis,
2351
+ input_pos,
2352
+ cache_loc,
2353
+ k_cache,
2354
+ v_cache,
2355
+ seq_len,
2356
+ seq_start,
2357
+ y,
2358
+ )
2359
+
2360
+ return y.view(b, s, d) # [b,s,n*h_d]
2361
+
2362
+
2363
+ @fused_flattened_mha_with_cache_rope_fusion.register_fake
2364
+ def fused_flattened_mha_with_cache_rope_fusion_fake(
2365
+ q: torch.Tensor,
2366
+ k: torch.Tensor,
2367
+ v: torch.Tensor,
2368
+ input_pos: torch.Tensor,
2369
+ cache_loc: torch.Tensor,
2370
+ seq_len: torch.Tensor,
2371
+ seq_start: torch.Tensor,
2372
+ k_cache: torch.Tensor,
2373
+ v_cache: torch.Tensor,
2374
+ freqs_cis: torch.Tensor,
2375
+ ):
2376
+ return torch.empty_like(q.contiguous())
2377
+
2378
+
2379
+ def _paged_generate_mha(
2380
+ q: torch.Tensor,
2381
+ k: torch.Tensor,
2382
+ v: torch.Tensor,
2383
+ page_table: torch.Tensor,
2384
+ k_cache: torch.Tensor,
2385
+ v_cache: torch.Tensor,
2386
+ cache_loc: torch.Tensor,
2387
+ input_pos: torch.Tensor,
2388
+ out: torch.Tensor,
2389
+ max_seq_len: int,
2390
+ ):
2391
+ b, (n_heads, d_head) = q.shape[0], q.shape[-2:]
2392
+ PAGE_SIZE, n_kv_heads = k_cache.shape[1:3]
2393
+ device = q.device
2394
+
2395
+ SEQ_BLOCK_SIZE = PAGE_SIZE # 256
2396
+ num_blocks = (max_seq_len + SEQ_BLOCK_SIZE - 1) // SEQ_BLOCK_SIZE
2397
+ stage1_output_values = torch.empty(
2398
+ b, n_heads, num_blocks, d_head, device=device, dtype=torch.float32
2399
+ )
2400
+ stage1_output_logsumexp = torch.empty(
2401
+ b, n_heads, num_blocks, device=device, dtype=torch.float32
2402
+ ) - float("inf")
2403
+
2404
+ (
2405
+ update_paged_kv_cache[(b, n_kv_heads, 1)](
2406
+ k,
2407
+ v,
2408
+ None,
2409
+ None,
2410
+ k_cache,
2411
+ v_cache,
2412
+ cache_loc,
2413
+ input_pos,
2414
+ page_table,
2415
+ n_kv_heads,
2416
+ d_head,
2417
+ SEQ_BLOCK_SIZE,
2418
+ max_seq_len,
2419
+ PAGE_SIZE,
2420
+ page_table.stride(0),
2421
+ GENERATE_ONLY=True,
2422
+ ),
2423
+ )
2424
+
2425
+ attention_kv_paged_stage1[
2426
+ (
2427
+ b,
2428
+ n_heads,
2429
+ num_blocks,
2430
+ )
2431
+ ](
2432
+ q,
2433
+ k_cache,
2434
+ v_cache,
2435
+ cache_loc,
2436
+ page_table,
2437
+ input_pos,
2438
+ stage1_output_values,
2439
+ stage1_output_logsumexp,
2440
+ num_blocks,
2441
+ max_seq_len,
2442
+ n_heads,
2443
+ n_kv_heads,
2444
+ d_head,
2445
+ SEQ_BLOCK_SIZE,
2446
+ PAGE_SIZE,
2447
+ page_table.stride(0),
2448
+ )
2449
+ attention_kv_stage2[(b, n_heads, 1)](
2450
+ stage1_output_values,
2451
+ stage1_output_logsumexp,
2452
+ out,
2453
+ input_pos,
2454
+ num_blocks,
2455
+ n_heads,
2456
+ d_head,
2457
+ SEQ_BLOCK_SIZE,
2458
+ )
2459
+
2460
+
2461
+ def _paged_context_mha(
2462
+ q: torch.Tensor,
2463
+ k: torch.Tensor,
2464
+ v: torch.Tensor,
2465
+ input_pos: torch.Tensor,
2466
+ cache_loc: torch.Tensor,
2467
+ page_table: torch.Tensor,
2468
+ k_cache: torch.Tensor,
2469
+ v_cache: torch.Tensor,
2470
+ seq_len: torch.Tensor,
2471
+ seq_start: torch.Tensor,
2472
+ out: torch.Tensor,
2473
+ max_seq_len: int, # max cache length of a sequence; the kv_cache shape doesn't provide this info.
2474
+ ) -> None:
2475
+ # NOTE: s_total == sum(seq_len)
2476
+ s_total, n_heads, d_head = q.shape
2477
+ PAGE_SIZE, n_kv_heads = k_cache.shape[1:3]
2478
+ BATCH_SIZE = len(input_pos)
2479
+ SEQ_BLOCK = PAGE_SIZE # 32
2480
+ (
2481
+ update_paged_kv_cache[
2482
+ (BATCH_SIZE, n_kv_heads, (max(seq_len) + SEQ_BLOCK - 1) // SEQ_BLOCK)
2483
+ ](
2484
+ k,
2485
+ v,
2486
+ seq_len,
2487
+ seq_start,
2488
+ k_cache,
2489
+ v_cache,
2490
+ cache_loc,
2491
+ input_pos,
2492
+ page_table,
2493
+ n_kv_heads,
2494
+ d_head,
2495
+ SEQ_BLOCK,
2496
+ max_seq_len,
2497
+ PAGE_SIZE,
2498
+ page_table.stride(0),
2499
+ GENERATE_ONLY=False,
2500
+ ),
2501
+ )
2502
+ softmax_scale = 1.0 / math.sqrt(d_head)
2503
+ grid = (BATCH_SIZE, n_heads, (max(seq_len) + SEQ_BLOCK - 1) // SEQ_BLOCK)
2504
+ context_attention_kv_paged[grid](
2505
+ q,
2506
+ seq_len,
2507
+ seq_start,
2508
+ k_cache,
2509
+ v_cache,
2510
+ cache_loc,
2511
+ input_pos,
2512
+ page_table,
2513
+ softmax_scale,
2514
+ out,
2515
+ n_heads,
2516
+ n_kv_heads,
2517
+ d_head,
2518
+ SEQ_BLOCK,
2519
+ max_seq_len,
2520
+ PAGE_SIZE,
2521
+ page_table.stride(0),
2522
+ num_stages=2,
2523
+ )
2524
+
2525
+
2526
+ @torch.library.custom_op("attention::fused_mha_with_paged_cache", mutates_args=())
2527
+ def fused_mha_with_paged_cache(
2528
+ q: torch.Tensor,
2529
+ k: torch.Tensor,
2530
+ v: torch.Tensor,
2531
+ input_pos: torch.Tensor,
2532
+ cache_loc: torch.Tensor,
2533
+ seq_len: torch.Tensor,
2534
+ seq_start: torch.Tensor,
2535
+ page_table: torch.Tensor,
2536
+ max_seq_len: int,
2537
+ k_cache: torch.Tensor,
2538
+ v_cache: torch.Tensor,
2539
+ freqs_cis: Optional[torch.Tensor],
2540
+ ) -> torch.Tensor:
2541
+ """Fused MHA with paged cache that takes raw input from q, k, v GEMMs.
2542
+
2543
+ NOTE: this op can also handle seq_len==0, which might be useful for CUDAGRAPH.
2544
+ """
2545
+ # b, s info
2546
+ # NOTE: b, s are just the shapes of the input tensor q; not necessarily the number of sequences.
2547
+ # Generally speaking, we expect one of two cases here:
2548
+ # 1. b > 0, s==1: this indicates a generate-only batch of tokens.
2549
+ # 2. b==1, s > 0: this indicates a mixed context+generate phase. The actual number of sequences
2550
+ # and number of tokens per sequence are encoded in seq_len and seq_start.
2551
+ # Assuming that context seq_len is always > 0.
2552
+ b, s, d = q.shape
2553
+ head_dim = k_cache.shape[-1]
2554
+
2555
+ # reshapes with num_heads and head_dim
2556
+ if s == 1:
2557
+ bs_view = (b, s)
2558
+ else:
2559
+ bs_view = (b * s,)
2560
+ q = q.view(*bs_view, q.shape[2] // head_dim, head_dim)
2561
+ k = k.view(*bs_view, k.shape[2] // head_dim, head_dim)
2562
+ v = v.view(*bs_view, v.shape[2] // head_dim, head_dim)
2563
+
2564
+ # rope embedding for generate-only or mixed
2565
+ if freqs_cis is not None:
2566
+ if s == 1:
2567
+ rope_args = (freqs_cis, input_pos, "bsnd")
2568
+ fn_rope = torch.ops.rope.apply_rope_with_input_pos
2569
+ else:
2570
+ rope_args = (freqs_cis, input_pos, seq_len, seq_start)
2571
+ fn_rope = torch.ops.rope.apply_rope_on_flattened_inputs
2572
+ q = fn_rope(q, *rope_args)
2573
+ k = fn_rope(k, *rope_args)
2574
+
2575
+ # run attention
2576
+ y = torch.empty_like(q)
2577
+ if s == 1:
2578
+ # generate-only phase
2579
+ _paged_generate_mha(
2580
+ q, k, v, page_table, k_cache, v_cache, cache_loc, input_pos, y, max_seq_len
2581
+ )
2582
+ else:
2583
+ # mixed context + generate phase
2584
+ _paged_context_mha(
2585
+ q,
2586
+ k,
2587
+ v,
2588
+ input_pos,
2589
+ cache_loc,
2590
+ page_table,
2591
+ k_cache,
2592
+ v_cache,
2593
+ seq_len,
2594
+ seq_start,
2595
+ y,
2596
+ max_seq_len,
2597
+ )
2598
+
2599
+ return y.view(b, s, d) # [b,s,n*h_d]
2600
+
2601
+
2602
+ @fused_mha_with_paged_cache.register_fake
2603
+ def fused_mha_with_paged_cache_fake(
2604
+ q: torch.Tensor,
2605
+ k: torch.Tensor,
2606
+ v: torch.Tensor,
2607
+ input_pos: torch.Tensor,
2608
+ cache_loc: torch.Tensor,
2609
+ seq_len: torch.Tensor,
2610
+ seq_start: torch.Tensor,
2611
+ page_table: torch.Tensor,
2612
+ max_seq_len: int,
2613
+ k_cache: torch.Tensor,
2614
+ v_cache: torch.Tensor,
2615
+ freqs_cis: Optional[torch.Tensor],
2616
+ ) -> torch.Tensor:
2617
+ return torch.empty_like(q.contiguous())
2618
+
2619
+
2620
+ @torch.library.custom_op("attention::prepare_fused_mha_metadata", mutates_args=())
2621
+ def prepare_fused_mha_metadata(
2622
+ input_ids: torch.Tensor,
2623
+ seq_len: torch.Tensor,
2624
+ input_pos: torch.Tensor,
2625
+ cache_loc: torch.Tensor,
2626
+ pages_per_seq: torch.Tensor,
2627
+ page_size: int,
2628
+ ) -> List[torch.Tensor]:
2629
+ num_seq = SequenceInfo._get_sanitized_num_sequences(input_ids, seq_len)
2630
+ seq_start = torch.zeros_like(seq_len[:num_seq])
2631
+ seq_start[1:] = torch.cumsum(seq_len[: num_seq - 1], 0)
2632
+ return (
2633
+ seq_len[:num_seq].clone(),
2634
+ input_pos[:num_seq].clone(),
2635
+ cache_loc[:num_seq].clone(),
2636
+ seq_start,
2637
+ )
2638
+
2639
+
2640
+ @prepare_fused_mha_metadata.register_fake
2641
+ def prepare_fused_mha_metadata_fake(
2642
+ input_ids, seq_len, input_pos, cache_loc, pages_per_seq, page_size
2643
+ ):
2644
+ return (
2645
+ torch.empty_like(seq_len),
2646
+ torch.empty_like(input_pos),
2647
+ torch.empty_like(cache_loc),
2648
+ torch.empty_like(seq_len),
2649
+ )
2650
+
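+ # Hedged example of the metadata computed above (illustrative values): for a context batch
+ # with seq_len[:num_seq] == [3, 2], the op returns seq_start == [0, 3] alongside the
+ # truncated seq_len, input_pos, and cache_loc tensors, i.e.
+ #
+ #   seq_start[0] = 0
+ #   seq_start[i] = seq_len[0] + ... + seq_len[i - 1]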
2651
+
2652
+ @AttentionRegistry.register("TritonWithFlattenedInputs")
2653
+ class TritonWithFlattenedInputs(AttentionDescriptor):
2654
+ @classmethod
2655
+ def is_paged(cls):
2656
+ """Return if the attention op is paged or not."""
2657
+ return False
2658
+
2659
+ @classmethod
2660
+ def get_attention_op(cls):
2661
+ return torch.ops.attention.fused_flattened_mha_with_cache, 3
2662
+
2663
+ @classmethod
2664
+ def get_prepare_metadata_op(cls):
2665
+ return torch.ops.attention.prepare_fused_mha_metadata, 4
2666
+
2667
+ @classmethod
2668
+ def get_cache_initializers(cls, get_info):
2669
+ def _get_cache(si: SequenceInfo):
2670
+ assert not si.is_paged, "Paged cache not supported for TritonWithFlattenedInputs"
2671
+ attention_info = get_info()
2672
+ return torch.empty(
2673
+ si.num_pages,
2674
+ si.page_size,
2675
+ attention_info.num_kv_heads,
2676
+ attention_info.head_dim,
2677
+ device=si.device,
2678
+ dtype=attention_info.cache_config.dtype or attention_info.dtype,
2679
+ )
2680
+
2681
+ return {"k_cache": _get_cache, "v_cache": _get_cache}
2682
+
2683
+ @classmethod
2684
+ def get_global_buffer_initializers(cls, get_info):
2685
+ attention_info = get_info()
2686
+ head_dim = attention_info.head_dim
2687
+ pos_embd_config = attention_info.pos_embd_config
2688
+
2689
+ def _get_freqs_cis(si: SequenceInfo):
2690
+ if pos_embd_config.mode is None:
2691
+ return torch.empty(0, device=si.device)
2692
+ assert pos_embd_config.mode == "rope", f"Mode {pos_embd_config.mode=} not supported"
2693
+ assert pos_embd_config.rope_scale == 1.0, f"{pos_embd_config.rope_scale=} not supported"
2694
+ rope_theta = pos_embd_config.rope_theta
2695
+ return cls._precompute_freqs_cis(2 * si.max_seq_len, head_dim, rope_theta).to(si.device)
2696
+
2697
+ k_full = "_".join(map(str, ["freqs_cis", *astuple(pos_embd_config)])).replace(".", "_")
2698
+ return {k_full: _get_freqs_cis}
2699
+
2700
+ @staticmethod
2701
+ def _precompute_freqs_cis(
2702
+ seq_len: int, head_dim: int, rope_theta: Optional[float] = None
2703
+ ) -> torch.Tensor:
2704
+ if rope_theta is None:
2705
+ rope_theta = 1e4
2706
+ freqs = 1.0 / (
2707
+ rope_theta ** (torch.arange(0, head_dim, 2)[: (head_dim // 2)].float() / head_dim)
2708
+ )
2709
+ t = torch.arange(seq_len)
2710
+ freqs = torch.outer(t, freqs)
2711
+ freqs_cis = torch.polar(torch.ones_like(freqs), freqs)
2712
+ # cos and sin (real and img) are packed
2713
+ cache = torch.stack([freqs_cis.real, freqs_cis.imag], dim=-1)
2714
+ return cache.to(dtype=torch.float16)
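+ # Hedged shape note for the cache computed above: for `seq_len` positions and `head_dim`
+ # channels the returned tensor is [seq_len, head_dim // 2, 2] in float16, with cos in
+ # [..., 0] and sin in [..., 1]; _get_freqs_cis requests 2 * si.max_seq_len positions, e.g.
+ #
+ #   cache = TritonWithFlattenedInputs._precompute_freqs_cis(8, 128)
+ #   assert cache.shape == (8, 64, 2) and cache.dtype == torch.float16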