one-1.5B-instruct / configuration_lightpost.py

Upload LightpostForCausalLM

9f6466e verified 18 days ago

6.37 kB

	# coding=utf-8
	# Copyright 2024 Lightpost ApS. All rights reserved.
	#
	# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in
	# compliance with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software distributed under the License is
	# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and limitations under the License.
	"""Lightpost model configuration"""

	from transformers.configuration_utils import PretrainedConfig
	from transformers.modeling_rope_utils import rope_config_validation
	from transformers.utils import logging


	# Module-level logger obtained from the transformers logging helper, keyed by this module's name.
	# It is not referenced anywhere in the visible part of this file.
	logger = logging.get_logger(__name__)


	class LightpostConfig(PretrainedConfig):
	    r"""
	    Configuration class for the Lightpost model. It stores the parameters that define the model architecture.

	    Inherits from PretrainedConfig, which provides the standard configuration functionality (see the
	    PretrainedConfig documentation for details).

	    Args:
	        vocab_size (int, optional, defaults to 151936):
	            Size of the model vocabulary, i.e. the number of distinct tokens the model can process.

	        hidden_size (int, optional, defaults to 4096):
	            Dimension of the model's hidden states.

	        intermediate_size (int, optional, defaults to 22016):
	            Dimension of the feed-forward network layers.

	        num_hidden_layers (int, optional, defaults to 32):
	            Number of transformer layers in the model.

	        num_attention_heads (int, optional, defaults to 32):
	            Number of attention heads per layer.

	        num_key_value_heads (int, optional, defaults to 32):
	            Number of key/value heads for Grouped Query Attention (GQA).
	            - If equal to num_attention_heads: Multi-Head Attention (MHA)
	            - If equal to 1: Multi-Query Attention (MQA)
	            - Otherwise: GQA with the specified number of groups

	        hidden_act (str or callable, optional, defaults to "silu"):
	            Activation function used in the feed-forward layers.

	        max_position_embeddings (int, optional, defaults to 32768):
	            Maximum sequence length the model can handle.

	        initializer_range (float, optional, defaults to 0.02):
	            Standard deviation used for weight initialization.

	        rms_norm_eps (float, optional, defaults to 1e-06):
	            Epsilon for the RMSNorm layers.

	        use_cache (bool, optional, defaults to True):
	            Whether to use the key/value cache for faster inference.

	        tie_word_embeddings (bool, optional, defaults to False):
	            Whether to tie the input and output embeddings. Forwarded to PretrainedConfig.

	        rope_theta (float, optional, defaults to 10000.0):
	            Base frequency for rotary position embeddings.

	        rope_scaling (dict, optional):
	            Configuration for RoPE scaling. Supported types:
	            - default: Original RoPE
	            - linear: Linear scaling
	            - dynamic: Dynamic scaling
	            - yarn: YaRN scaling
	            - longrope: LongRoPE scaling
	            - llama3: Llama 3 style scaling

	            The variant is named by the "rope_type" key. Dicts that use the legacy "type" key are
	            accepted as well: the value is mirrored under "rope_type" on a copy of the dict before
	            validation. See the implementation docs for type-specific parameters.

	        use_sliding_window (bool, optional, defaults to False):
	            Whether to use sliding window attention.

	        sliding_window (int, optional, defaults to 4096):
	            Size of the sliding attention window. The value is only kept when use_sliding_window is
	            True; otherwise the stored attribute is None.

	        max_window_layers (int, optional, defaults to 28):
	            Number of bottom layers using sliding window attention.

	        attention_dropout (float, optional, defaults to 0.0):
	            Dropout probability for attention weights.

	        mem_size (int, optional, defaults to 32):
	            Size of the learnable memory.

	        mem_layers (int or list[int], optional, defaults to None):
	            Layers to apply memory attention to. Stored as given.

	        kwargs:
	            Any additional keyword arguments; passed through unchanged to PretrainedConfig.__init__.

	    Example:
	        >>> from transformers import LightpostModel, LightpostConfig
	        >>> config = LightpostConfig() # Initialize with defaults
	        >>> model = LightpostModel(config) # Create model
	        >>> model.config # Access configuration
	    """

	    model_type = "lightpost"
	    keys_to_ignore_at_inference = ["past_key_values"]

	    def __init__(
	        self,
	        vocab_size=151936,
	        hidden_size=4096,
	        intermediate_size=22016,
	        num_hidden_layers=32,
	        num_attention_heads=32,
	        num_key_value_heads=32,
	        hidden_act="silu",
	        max_position_embeddings=32768,
	        initializer_range=0.02,
	        rms_norm_eps=1e-6,
	        use_cache=True,
	        tie_word_embeddings=False,
	        rope_theta=10000.0,
	        rope_scaling=None,
	        use_sliding_window=False,
	        sliding_window=4096,
	        max_window_layers=28,
	        attention_dropout=0.0,
	        mem_size=32,
	        mem_layers=None,
	        **kwargs,
	    ):
	        self.vocab_size = vocab_size
	        self.max_position_embeddings = max_position_embeddings
	        self.hidden_size = hidden_size
	        self.intermediate_size = intermediate_size
	        self.num_hidden_layers = num_hidden_layers
	        self.num_attention_heads = num_attention_heads
	        self.use_sliding_window = use_sliding_window
	        # The window size is only meaningful when sliding-window attention is enabled.
	        self.sliding_window = sliding_window if use_sliding_window else None
	        self.max_window_layers = max_window_layers
	        self.num_key_value_heads = num_key_value_heads
	        self.hidden_act = hidden_act
	        self.initializer_range = initializer_range
	        self.rms_norm_eps = rms_norm_eps
	        self.use_cache = use_cache
	        self.rope_theta = rope_theta

	        # Backward compatibility: older configs name the RoPE variant under "type" instead of
	        # "rope_type". Mirror it so such configs pass the validation below. A shallow copy is
	        # used so the dict supplied by the caller is left untouched.
	        if rope_scaling is not None and "type" in rope_scaling:
	            rope_scaling = {**rope_scaling, "rope_type": rope_scaling["type"]}
	        self.rope_scaling = rope_scaling

	        self.attention_dropout = attention_dropout
	        self.mem_size = mem_size
	        self.mem_layers = mem_layers

	        # Validate the correctness of rotary position embeddings parameters
	        rope_config_validation(self)

	        super().__init__(
	            tie_word_embeddings=tie_word_embeddings,
	            **kwargs,
	        )