from packaging import version

import transformers

# Oldest transformers release accepted by this module; used both for the
# comparison and for the error message so the two cannot drift apart.
_MIN_TRANSFORMERS_VERSION = "4.31.0"

# Fail at import time with an actionable message when the installed
# transformers release is older than the required minimum.
if version.parse(transformers.__version__) < version.parse(_MIN_TRANSFORMERS_VERSION):
    raise ImportError(
        f"You are using transformers=={transformers.__version__}, but transformers>={_MIN_TRANSFORMERS_VERSION} "
        "is required to use DeciCoder. Please upgrade transformers."
    )

from transformers.models.llama.configuration_llama import LlamaConfig
from transformers.utils import logging


# Module-level logger obtained through the transformers logging utilities.
logger = logging.get_logger(__name__)

# Intentionally empty mapping: no pretrained config archives are listed here.
LLAMA_PRETRAINED_CONFIG_ARCHIVE_MAP = {}


class DeciCoderConfig(LlamaConfig):
    r"""
    This is the configuration class to store the configuration of a DeciCoder model. It extends [`LlamaConfig`] with
    three boolean flags that choose between naive matmul attention and scaled dot product attention for the prefill
    and decode phases. Every other keyword argument is forwarded unchanged to [`LlamaConfig`].

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        naive_attention_prefill (`bool`, *optional*, defaults to `False`):
            Whether to use naive matmul or scaled dot product attention during prefill.
        naive_attention_decode_batched (`bool`, *optional*, defaults to `True`):
            Whether to use naive matmul or scaled dot product attention during decode for batch_size > 1.
        naive_attention_decode_single (`bool`, *optional*, defaults to `False`):
            Whether to use naive matmul or scaled dot product attention during decode for batch_size == 1.
        kwargs (*optional*):
            Remaining keyword arguments, passed through to the [`LlamaConfig`] initializer.
    """

    # The LLaMA model type identifier is used here rather than a DeciCoder-specific one.
    model_type = "llama"
    # NOTE(review): presumably read by transformers inference/evaluation utilities
    # to drop cached key/values from model outputs -- confirm against the library.
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        naive_attention_prefill: bool = False,
        naive_attention_decode_batched: bool = True,
        naive_attention_decode_single: bool = False,
        **kwargs,
    ) -> None:
        # The DeciCoder-specific flags are stored first; the parent initializer
        # then handles everything supplied through ``kwargs``.
        self.naive_attention_prefill = naive_attention_prefill
        self.naive_attention_decode_batched = naive_attention_decode_batched
        self.naive_attention_decode_single = naive_attention_decode_single

        super().__init__(**kwargs)