from transformers import PretrainedConfig


class DearthConfig(PretrainedConfig):
    model_type = "dearth"

    def __init__(
        self,
        max_token_len: int = 8192,
        vocab_size: int = None,  # GPT-2 vocab size is 50257; pad up to the nearest multiple of 64 for efficiency
        n_layer: int = None,
        n_head: int = None,
        n_kv_head: int = None,
        dim: int = None,
        dim_qk_head: int = None,
        hidden_dim: int = None,
        multiple_of: int = None,
        dropout_rate: float = 0.0,
        layer_init_factor: float = None,
        residual_factor: float = None,  # should be > 1.0
        sliding_window_size: int = 4096,
        front_window_size: int = 256,
        use_rotary: bool = True,
        rope_theta: float = 10000.0,
        use_alibi: bool = False,
        mimic_attn_layer: int = None,  # 1-based, counted from the bottom; the first layer is 1, not 0
        mimic_n_head: int = None,
        mimic_n_kv_head: int = None,
        mimic_attn_dropout: float = None,
        mimic_dim_qk_head: int = None,
        mimic_use_rotary: bool = True,
        mimic_use_alibi: bool = False,
        pad_token_id=None,
        bos_token_id=1,
        eos_token_id=2,
        tie_word_embeddings=False,
        **kwargs,
    ):
        self.max_token_len = max_token_len
        self.vocab_size = vocab_size
        self.n_layer = n_layer
        self.n_head = n_head
        self.n_kv_head = n_kv_head
        self.dim = dim
        self.dim_qk_head = dim_qk_head
        self.hidden_dim = hidden_dim
        if hidden_dim is None:
            # Default the feed-forward width to 4x the model dimension.
            self.hidden_dim = dim * 4
        self.multiple_of = multiple_of
        self.dropout_rate = dropout_rate
        self.layer_init_factor = layer_init_factor
        self.residual_factor = residual_factor
        self.sliding_window_size = sliding_window_size
        self.front_window_size = front_window_size
        self.use_rotary = use_rotary
        self.rope_theta = rope_theta
        self.use_alibi = use_alibi
        self.mimic_attn_layer = mimic_attn_layer
        self.mimic_n_head = mimic_n_head
        self.mimic_n_kv_head = mimic_n_kv_head
        self.mimic_attn_dropout = mimic_attn_dropout
        self.mimic_dim_qk_head = mimic_dim_qk_head
        self.mimic_use_rotary = mimic_use_rotary
        self.mimic_use_alibi = mimic_use_alibi
        if "attn_window_size" in kwargs:
            # attn_window_size is deprecated; treat it as an alias for sliding_window_size.
            self.sliding_window_size = kwargs["attn_window_size"]

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
    def __str__(self) -> str:
        return f"""
        max_token_len = {self.max_token_len}
        vocab_size = {self.vocab_size}
        n_layer = {self.n_layer}
        n_head = {self.n_head}
        n_kv_head = {self.n_kv_head}
        dim = {self.dim}
        dim_qk_head = {self.dim_qk_head}
        hidden_dim = {self.hidden_dim}
        multiple_of = {self.multiple_of}
        dropout_rate = {self.dropout_rate}
        layer_init_factor = {self.layer_init_factor}
        residual_factor = {self.residual_factor}
        sliding_window_size = {self.sliding_window_size}
        front_window_size = {self.front_window_size}
        use_rotary = {self.use_rotary}
        rope_theta = {self.rope_theta}
        use_alibi = {self.use_alibi}
        mimic_attn_layer = {self.mimic_attn_layer}
        mimic_n_head = {self.mimic_n_head}
        mimic_n_kv_head = {self.mimic_n_kv_head}
        mimic_attn_dropout = {self.mimic_attn_dropout}
        mimic_dim_qk_head = {self.mimic_dim_qk_head}
        mimic_use_rotary = {self.mimic_use_rotary}
        mimic_use_alibi = {self.mimic_use_alibi}
        """