# coding=utf-8
# Copyright 2024 Lightpost ApS. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under the License is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and limitations under the License.
"""Lightpost model configuration"""

from transformers.configuration_utils import PretrainedConfig
from transformers.modeling_rope_utils import rope_config_validation
from transformers.utils import logging


# Module-level logger obtained from the transformers logging utilities and named after this module.
logger = logging.get_logger(__name__)


class LightpostConfig(PretrainedConfig):
    r"""
    Configuration class for the Lightpost model. It stores the hyper-parameters that define the model architecture.

    Inherits from `PretrainedConfig`; any keyword argument not listed below is forwarded unchanged to
    `PretrainedConfig.__init__`. See the `PretrainedConfig` documentation for details.

    Args:
        vocab_size (int, optional, defaults to 151936):
            Size of model vocabulary. Determines number of unique tokens model can process.

        hidden_size (int, optional, defaults to 4096):
            Dimension of model's hidden states.

        intermediate_size (int, optional, defaults to 22016):
            Dimension of feed-forward network layers.

        num_hidden_layers (int, optional, defaults to 32):
            Number of transformer layers in model.

        num_attention_heads (int, optional, defaults to 32):
            Number of attention heads per layer.

        num_key_value_heads (int, optional, defaults to 32):
            Number of key/value heads for Grouped Query Attention (GQA).
            - If equal to num_attention_heads: Uses Multi-Head Attention (MHA)
            - If equal to 1: Uses Multi-Query Attention (MQA)
            - Otherwise: Uses GQA with specified number of groups
            If `None` is passed, the value of `num_attention_heads` is used.

        hidden_act (str or callable, optional, defaults to "silu"):
            Activation function used in feed-forward layers.

        max_position_embeddings (int, optional, defaults to 32768):
            Maximum sequence length model can handle.

        initializer_range (float, optional, defaults to 0.02):
            Standard deviation for weight initialization.

        rms_norm_eps (float, optional, defaults to 1e-06):
            Epsilon for RMSNorm layers.

        use_cache (bool, optional, defaults to True):
            Whether to use key/value cache for faster inference.

        tie_word_embeddings (bool, optional, defaults to False):
            Whether to tie input and output embeddings. Forwarded to `PretrainedConfig.__init__`.

        rope_theta (float, optional, defaults to 10000.0):
            Base frequency for rotary position embeddings.

        rope_scaling (dict, optional):
            Configuration for RoPE scaling. Supported types:
            - default: Original RoPE
            - linear: Linear scaling
            - dynamic: Dynamic scaling
            - yarn: YaRN scaling
            - longrope: LongRoPE scaling
            - llama3: Llama 3 style scaling

            The variant is selected with the `"rope_type"` key. A dict that only carries the legacy `"type"`
            key is accepted: its value is mirrored into `"rope_type"` on a copy of the dict before validation.
            See implementation docs for type-specific parameters.

        use_sliding_window (bool, optional, defaults to False):
            Whether to use sliding window attention.

        sliding_window (int, optional, defaults to 4096):
            Size of sliding attention window. Stored as `None` when `use_sliding_window` is `False`.

        max_window_layers (int, optional, defaults to 28):
            Number of bottom layers using sliding window attention.

        attention_dropout (float, optional, defaults to 0.0):
            Dropout probability for attention weights.

        mem_size (int, optional, defaults to 32):
            Size of the learnable memory.

        mem_layers (int or list[int], optional, defaults to None):
            Layers to apply memory attention to. Stored exactly as given.

    Example:
        >>> from transformers import LightpostModel, LightpostConfig
        >>> config = LightpostConfig()  # Initialize with defaults
        >>> model = LightpostModel(config)  # Create model
        >>> model.config  # Access configuration
    """

    model_type = "lightpost"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=151936,
        hidden_size=4096,
        intermediate_size=22016,
        num_hidden_layers=32,
        num_attention_heads=32,
        num_key_value_heads=32,
        hidden_act="silu",
        max_position_embeddings=32768,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        rope_scaling=None,
        use_sliding_window=False,
        sliding_window=4096,
        max_window_layers=28,
        attention_dropout=0.0,
        mem_size=32,
        mem_layers=None,
        **kwargs,
    ):
        # Backward compatibility: an unspecified key/value head count means plain multi-head attention.
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        # Backward compatibility: older configs name the RoPE variant with "type" instead of "rope_type".
        # Mirror it into "rope_type" so validation does not report the key as missing. Work on a copy so
        # the caller's dict is left untouched; an explicitly provided "rope_type" is never overwritten.
        if isinstance(rope_scaling, dict) and "type" in rope_scaling and "rope_type" not in rope_scaling:
            rope_scaling = {**rope_scaling, "rope_type": rope_scaling["type"]}

        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.use_sliding_window = use_sliding_window
        # The window size is only retained when sliding window attention is enabled.
        self.sliding_window = sliding_window if use_sliding_window else None
        self.max_window_layers = max_window_layers
        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.attention_dropout = attention_dropout
        self.mem_size = mem_size
        self.mem_layers = mem_layers
        # Validate the correctness of rotary position embeddings parameters
        rope_config_validation(self)

        super().__init__(
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )