from typing import Optional

from transformers import PretrainedConfig


class CALIConfig(PretrainedConfig):
    """Configuration container for the ``cali`` model type.

    Every hyperparameter below is stored verbatim as an instance attribute of
    the same name. The special-token ids and any additional keyword arguments
    are forwarded to ``PretrainedConfig.__init__``.

    With the default values, ``num_heads * head_dim`` (4 * 192) equals
    ``hidden_dim`` (768). No validation of the values is performed here.

    Args (meanings are inferred from the parameter names -- confirm against
    the model implementation):
        vocab_size: Presumably the number of token ids in the vocabulary.
        hidden_dim: Presumably the width of the residual stream.
        num_layers: Presumably the number of transformer blocks.
        num_heads: Presumably the number of attention (query) heads.
        num_kv_heads: Presumably the number of key/value heads.
        head_dim: Presumably the per-head dimension.
        seq_len: Presumably the sequence length the model is built for.
        ffn_multiplier: Presumably the feed-forward width as a multiple of
            ``hidden_dim``.
        dropout: Presumably a dropout probability.
        tie_embeddings: Presumably whether input and output embeddings share
            weights.
        rope_theta: Presumably the rotary position embedding base.
        rms_norm_eps: Presumably the epsilon used by RMS normalisation.
        initializer_range: Presumably the std-dev for weight initialisation.
        use_cache: Presumably whether key/value caching is enabled.
        pad_token_id: Padding token id, or ``None``; forwarded to the base
            class.
        bos_token_id: Beginning-of-sequence token id; forwarded to the base
            class.
        eos_token_id: End-of-sequence token id; forwarded to the base class.
        **kwargs: Forwarded unchanged to ``PretrainedConfig.__init__``.
    """

    model_type = "cali"

    # Alias the standardized Hugging Face attribute names onto this config's
    # own field names, so that generic library code reading (or writing)
    # e.g. ``config.hidden_size`` is redirected to ``config.hidden_dim``.
    # Only aliases are added; stored attributes and serialized keys keep
    # their original names.
    attribute_map = {
        "hidden_size": "hidden_dim",
        "num_hidden_layers": "num_layers",
        "num_attention_heads": "num_heads",
        "num_key_value_heads": "num_kv_heads",
    }

    def __init__(
        self,
        vocab_size: int = 32000,
        hidden_dim: int = 768,
        num_layers: int = 11,
        num_heads: int = 4,
        num_kv_heads: int = 1,
        head_dim: int = 192,
        seq_len: int = 1024,
        ffn_multiplier: float = 3.0,
        dropout: float = 0.0,
        tie_embeddings: bool = True,
        rope_theta: float = 10000.0,
        rms_norm_eps: float = 1e-6,
        initializer_range: float = 0.02,
        use_cache: bool = True,
        pad_token_id: Optional[int] = None,
        bos_token_id: int = 1,
        eos_token_id: int = 2,
        **kwargs,
    ):
        # Model shape.
        self.vocab_size = vocab_size
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.num_heads = num_heads
        self.num_kv_heads = num_kv_heads
        self.head_dim = head_dim
        self.seq_len = seq_len
        self.ffn_multiplier = ffn_multiplier

        # Regularisation, weight sharing and numerics.
        self.dropout = dropout
        # NOTE(review): this flag is stored separately from the base class's
        # own ``tie_word_embeddings`` setting and is not forwarded to it --
        # confirm the model code reads the intended one.
        self.tie_embeddings = tie_embeddings
        self.rope_theta = rope_theta
        self.rms_norm_eps = rms_norm_eps
        self.initializer_range = initializer_range
        self.use_cache = use_cache

        # The base initializer runs last, after the model-specific fields
        # are set; it receives the special-token ids and any extra kwargs.
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            **kwargs,
        )