# one-1.5B-instruct / configuration_lightpost.py
# wilstrup's picture
# Upload LightpostForCausalLM
# 9f6466e verified
# coding=utf-8
# Copyright 2024 Lightpost ApS. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under the License is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and limitations under the License.
"""Lightpost model configuration"""
from transformers.configuration_utils import PretrainedConfig
from transformers.modeling_rope_utils import rope_config_validation
from transformers.utils import logging
# Module-level logger, obtained through the `transformers.utils.logging` helper.
logger = logging.get_logger(__name__)
class LightpostConfig(PretrainedConfig):
    r"""
    Configuration class for the Lightpost model. This class stores all parameters needed to define the model
    architecture. Inherits from PretrainedConfig to provide standard configuration functionality. See
    PretrainedConfig docs for details.

    Args:
        vocab_size (int, optional, defaults to 151936):
            Size of model vocabulary. Determines number of unique tokens model can process.
        hidden_size (int, optional, defaults to 4096):
            Dimension of model's hidden states.
        intermediate_size (int, optional, defaults to 22016):
            Dimension of feed-forward network layers.
        num_hidden_layers (int, optional, defaults to 32):
            Number of transformer layers in model.
        num_attention_heads (int, optional, defaults to 32):
            Number of attention heads per layer.
        num_key_value_heads (int, optional, defaults to 32):
            Number of key/value heads for Grouped Query Attention (GQA).
            - If equal to num_attention_heads: Uses Multi-Head Attention (MHA)
            - If equal to 1: Uses Multi-Query Attention (MQA)
            - Otherwise: Uses GQA with specified number of groups
        hidden_act (str or callable, optional, defaults to "silu"):
            Activation function used in feed-forward layers.
        max_position_embeddings (int, optional, defaults to 32768):
            Maximum sequence length model can handle.
        initializer_range (float, optional, defaults to 0.02):
            Standard deviation for weight initialization.
        rms_norm_eps (float, optional, defaults to 1e-06):
            Epsilon for RMSNorm layers.
        use_cache (bool, optional, defaults to True):
            Whether to use key/value cache for faster inference.
        tie_word_embeddings (bool, optional, defaults to False):
            Whether to tie input and output embeddings. Forwarded to the PretrainedConfig constructor.
        rope_theta (float, optional, defaults to 10000.0):
            Base frequency for rotary position embeddings.
        rope_scaling (dict, optional):
            Configuration for RoPE scaling. Supported types:
            - default: Original RoPE
            - linear: Linear scaling
            - dynamic: Dynamic scaling
            - yarn: YaRN scaling
            - longrope: LongRoPE scaling
            - llama3: Llama 3 style scaling
            See implementation docs for type-specific parameters.
            The scaling type is expected under the "rope_type" key. For backward compatibility a legacy
            "type" key is also accepted: its value is copied to "rope_type" on a shallow copy of the dict,
            so the dict passed in by the caller is left unmodified.
        use_sliding_window (bool, optional, defaults to False):
            Whether to use sliding window attention.
        sliding_window (int, optional, defaults to 4096):
            Size of sliding attention window. Stored as None when use_sliding_window is False.
        max_window_layers (int, optional, defaults to 28):
            Number of bottom layers using sliding window attention.
        attention_dropout (float, optional, defaults to 0.0):
            Dropout probability for attention weights.
        mem_size (int, optional, defaults to 32):
            Size of the learnable memory.
        mem_layers (int or list[int], optional, defaults to None):
            Layers to apply memory attention to.

    Example:
        >>> from transformers import LightpostModel, LightpostConfig
        >>> config = LightpostConfig() # Initialize with defaults
        >>> model = LightpostModel(config) # Create model
        >>> model.config # Access configuration
    """

    model_type = "lightpost"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=151936,
        hidden_size=4096,
        intermediate_size=22016,
        num_hidden_layers=32,
        num_attention_heads=32,
        num_key_value_heads=32,
        hidden_act="silu",
        max_position_embeddings=32768,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        rope_scaling=None,
        use_sliding_window=False,
        sliding_window=4096,
        max_window_layers=28,
        attention_dropout=0.0,
        mem_size=32,
        mem_layers=None,
        **kwargs,
    ):
        # Core transformer dimensions.
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_key_value_heads = num_key_value_heads

        # Sliding-window attention: the window size is only retained when the
        # feature is switched on; otherwise it is stored as None.
        self.use_sliding_window = use_sliding_window
        self.sliding_window = sliding_window if use_sliding_window else None
        self.max_window_layers = max_window_layers

        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.attention_dropout = attention_dropout

        # Rotary position embeddings.
        # BC: older configs store the scaling type under "type" instead of
        # "rope_type". Mirror it onto "rope_type" before validation so such
        # configs are still accepted. Work on a shallow copy so the dict the
        # caller handed in is not mutated.
        if rope_scaling is not None and "type" in rope_scaling:
            rope_scaling = {**rope_scaling, "rope_type": rope_scaling["type"]}
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling

        # Learnable memory settings.
        self.mem_size = mem_size
        self.mem_layers = mem_layers

        # Validate the correctness of rotary position embeddings parameters
        rope_config_validation(self)

        super().__init__(
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )