# coding=utf-8
# Copyright 2024 Lightpost ApS. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under the License is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and limitations under the License.
"""Lightpost model configuration"""

from transformers.configuration_utils import PretrainedConfig
from transformers.modeling_rope_utils import rope_config_validation
from transformers.utils import logging


logger = logging.get_logger(__name__)


class LightpostConfig(PretrainedConfig):
    r"""
    Configuration for the Lightpost model.

    An instance holds every hyper-parameter that defines the model architecture. The class derives from
    PretrainedConfig and therefore offers the standard configuration functionality; refer to the
    PretrainedConfig documentation for details.

    Args:
        vocab_size (int, optional, defaults to 151936):
            Vocabulary size, i.e. the number of unique tokens the model can process.
        hidden_size (int, optional, defaults to 4096):
            Dimension of the model's hidden states.
        intermediate_size (int, optional, defaults to 22016):
            Dimension of the feed-forward network layers.
        num_hidden_layers (int, optional, defaults to 32):
            Number of transformer layers in the model.
        num_attention_heads (int, optional, defaults to 32):
            Number of attention heads per layer.
        num_key_value_heads (int, optional, defaults to 32):
            Number of key/value heads for Grouped Query Attention (GQA).
            - Equal to num_attention_heads: Multi-Head Attention (MHA) is used
            - Equal to 1: Multi-Query Attention (MQA) is used
            - Any other value: GQA with the specified number of groups is used
        hidden_act (str or callable, optional, defaults to "silu"):
            Activation function of the feed-forward layers.
        max_position_embeddings (int, optional, defaults to 32768):
            Maximum sequence length the model can handle.
        initializer_range (float, optional, defaults to 0.02):
            Standard deviation used for weight initialization.
        rms_norm_eps (float, optional, defaults to 1e-06):
            Epsilon of the RMSNorm layers.
        use_cache (bool, optional, defaults to True):
            Whether the key/value cache is used for faster inference.
        tie_word_embeddings (bool, optional, defaults to False):
            Whether input and output embeddings are tied. Forwarded to PretrainedConfig.
        rope_theta (float, optional, defaults to 10000.0):
            Base frequency of the rotary position embeddings.
        rope_scaling (dict, optional):
            RoPE scaling configuration. Supported types:
            - default: Original RoPE
            - linear: Linear scaling
            - dynamic: Dynamic scaling
            - yarn: YaRN scaling
            - longrope: LongRoPE scaling
            - llama3: Llama 3 style scaling
            The type-specific parameters are described in the implementation docs.
        use_sliding_window (bool, optional, defaults to False):
            Whether sliding window attention is used.
        sliding_window (int, optional, defaults to 4096):
            Size of the sliding attention window. The value is only kept when use_sliding_window is
            true; otherwise the sliding_window attribute is stored as None.
        max_window_layers (int, optional, defaults to 28):
            Number of bottom layers that use sliding window attention.
        attention_dropout (float, optional, defaults to 0.0):
            Dropout probability applied to the attention weights.
        mem_size (int, optional, defaults to 32):
            Size of the learnable memory.
        mem_layers (int or list[int], optional, defaults to None):
            Layers to which memory attention is applied.

    Example:
        >>> from transformers import LightpostModel, LightpostConfig

        >>> config = LightpostConfig()  # Initialize with defaults
        >>> model = LightpostModel(config)  # Create model
        >>> model.config  # Access configuration
    """

    model_type = "lightpost"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=151936,
        hidden_size=4096,
        intermediate_size=22016,
        num_hidden_layers=32,
        num_attention_heads=32,
        num_key_value_heads=32,
        hidden_act="silu",
        max_position_embeddings=32768,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        rope_scaling=None,
        use_sliding_window=False,
        sliding_window=4096,
        max_window_layers=28,
        attention_dropout=0.0,
        mem_size=32,
        mem_layers=None,
        **kwargs,
    ):
        # Core model dimensions.
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads

        # Sliding-window attention: the window size is retained only when the
        # feature is switched on, otherwise it is recorded as None.
        self.use_sliding_window = use_sliding_window
        if use_sliding_window:
            self.sliding_window = sliding_window
        else:
            self.sliding_window = None
        self.max_window_layers = max_window_layers

        # Attention layout, activation, initialization and normalization.
        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache

        # Rotary position embedding settings and attention dropout.
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.attention_dropout = attention_dropout

        # Learnable-memory settings.
        self.mem_size = mem_size
        self.mem_layers = mem_layers

        # Check the rotary position embedding parameters. The validator is
        # handed this config object, so the attributes above are set first.
        rope_config_validation(self)

        # Remaining keyword arguments are handled by the base class.
        super().__init__(
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )